You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/09/19 10:48:39 UTC

svn commit: r1387488 [5/5] - in /incubator/stanbol/branches/stanbol-nlp-processing: ./ data/ data/bundlelists/sentiment/ data/bundlelists/sentiment/src/ data/bundlelists/sentiment/src/main/ data/bundlelists/sentiment/src/main/bundles/ data/opennlp/lang...

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,179 @@
+package org.apache.stanbol.enhancer.nlp.utils;
+
+import static java.util.Collections.singleton;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility class for {@link EnhancementEngine} implementations that
+ * do use the {@link AnalysedText} content part
+ * @author Rupert Westenthaler
+ *
+ */
+public final class NlpEngineHelper {
+    
+    private static final Logger log = LoggerFactory.getLogger(NlpEngineHelper.class);
+
+    private NlpEngineHelper(){}
+    
+    
+    /**
+     * Getter for the AnalysedText for a ContentItem
+     * @param engine the EnhancementEngine calling this method (used for logging)
+     * @param ci the ContentItem
+     * @param exception <code>false</code> if used in {@link EnhancementEngine#canEnhance(ContentItem)}
+     * and <code>true</code> when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
+     * @return the AnalysedText or <code>null</code> if not found.
+     * @throws IllegalStateException if exception is <code>true</code> and the
+     * {@link AnalysedText} could not be retrieved from the parsed {@link ContentItem}.
+     */
+    public static AnalysedText getAnalysedText(EnhancementEngine engine, ContentItem ci, boolean exception) {
+        AnalysedText at;
+        try {
+            at = AnalysedTextUtils.getAnalysedText(ci);
+        }catch (RuntimeException e) {
+            //do not propagate RuntimeExceptions of the lookup, but treat the
+            //content part as missing (handled below)
+            log.warn("Unable to retrieve AnalysedText for ContentItem "
+                + ci + " because of an "+e.getClass().getSimpleName()+" with message "
+                + e.getMessage(),e);
+            at = null;
+        }
+        if(at != null){
+            return at;
+        }
+        if(exception){
+            throw new IllegalStateException("Unable to retrieve AnalysedText from ContentItem "
+                    + ci+". As this is also checked in canEnhance this may indicate a Bug in the "
+                    + "used EnhancementJobManager!");
+        } else {
+            log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
+                    + "ContentItem {} because the AnalysedText ContentPart is "
+                    + "missing. Users might want to add an EnhancementEngine that "
+                    + "creates the AnalysedText ContentPart such as the "
+                    + "POSTaggingEngine (o.a.stanbol.enhancer.engines.opennlp.pos)!",
+                    new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
+            return null;
+        }
+    }
+    
+    /**
+     * Getter for the language of the content
+     * @param engine the EnhancementEngine calling this method (used for logging)
+     * @param ci the ContentItem
+     * @param exception <code>false</code> if used in {@link EnhancementEngine#canEnhance(ContentItem)}
+     * and <code>true</code> when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
+     * @return the language or <code>null</code> if not found.
+     * @throws IllegalStateException if exception is <code>true</code> and the
+     * language could not be retrieved from the parsed {@link ContentItem}.
+     */
+    public static String getLanguage(EnhancementEngine engine, ContentItem ci, boolean exception) {
+        String language = EnhancementEngineHelper.getLanguage(ci);
+        if(language != null) {
+            return language;
+        }
+        if(exception){
+            throw new IllegalStateException("Unable to retrieve the detected language for ContentItem "
+                    + ci+". As this is also checked in canEnhance this may indicate a Bug in the "
+                    + "used EnhancementJobManager!");
+        } else {
+            log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
+                    + "ContentItem {} because the language of "
+                    + "this ContentItem is unknown. Users might want to add a "
+                    + "Language Identification EnhancementEngine to the current "
+                    + "EnhancementChain!",
+                    new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
+            return null;
+        }
+    }
+    /**
+     * Retrieves - or if not present - creates the {@link AnalysedText} content
+     * part for the parsed {@link ContentItem}. If no {@link Blob} with the
+     * mime type '<code>text/plain</code>' is present this method
+     * throws an {@link IllegalStateException} (this method internally uses
+     * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
+     * <code>true</code> as third parameter). Users of this method should call
+     * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
+     * <code>false</code> as third parameter in their 
+     * {@link EnhancementEngine#canEnhance(ContentItem)} implementation.<p>
+     * <i>NOTE:</i> This method is intended for Engines that want to create an
+     * empty {@link AnalysedText} content part. Engines that assume that this
+     * content part is already present (e.g. if they consume already existing
+     * annotations) should use the 
+     * {@link #getAnalysedText(EnhancementEngine, ContentItem, boolean)}
+     * method instead.
+     * @param engine the EnhancementEngine calling this method (used for logging)
+     * @param analysedTextFactory the {@link AnalysedTextFactory} used to create
+     * the {@link AnalysedText} instance (if not present).
+     * @param ci the {@link ContentItem}
+     * @return the AnalysedText
+     * @throws EngineException on any exception while accessing the 
+     * '<code>text/plain</code>' Blob
+     * @throws IllegalStateException if no '<code>text/plain</code>' Blob is
+     * present as content part of the parsed {@link ContentItem}. NOTE that if
+     * the {@link AnalysedText} content part is already present no Exception will
+     * be thrown even if no plain text {@link Blob} is present in the parsed
+     * {@link ContentItem}
+     */
+    public static AnalysedText initAnalysedText(EnhancementEngine engine, 
+                                                AnalysedTextFactory analysedTextFactory,
+                                                ContentItem ci) throws EngineException {
+        AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
+        if(at == null){
+            Entry<UriRef,Blob> textBlob = getPlainText(engine, ci, true);
+            log.debug(" ... create new AnalysedText instance for Engine {}", engine.getName());
+            try {
+                at = analysedTextFactory.createAnalysedText(ci, textBlob.getValue());
+            } catch (IOException e) {
+                throw new EngineException("Unable to create AnalysedText instance for Blob "
+                    + textBlob.getKey()+ " of ContentItem "+ci.getUri()+"!",e);
+            }
+        } else {
+            log.debug(" ... use existing AnalysedText instance for Engine {}", engine.getName());
+        }
+        return at;
+    }
+    
+    /**
+     * Getter for the '<code>text/plain</code>' {@link Blob} of the parsed
+     * {@link ContentItem}.
+     * @param engine the EnhancementEngine calling this method (used for logging)
+     * @param ci the ContentItem
+     * @param exception <code>false</code> if used in {@link EnhancementEngine#canEnhance(ContentItem)}
+     * and <code>true</code> when called from {@link EnhancementEngine#computeEnhancements(ContentItem)}
+     * @return the 'text/plain' Blob (with its UriRef) or <code>null</code> if not found.
+     * @throws IllegalStateException if exception is <code>true</code> and no
+     * '<code>text/plain</code>' {@link Blob} could be retrieved from the parsed
+     * {@link ContentItem}.
+     */
+    public static Entry<UriRef,Blob> getPlainText(EnhancementEngine engine, ContentItem ci, boolean exception) {
+        Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(
+            ci, singleton("text/plain"));
+        if(textBlob != null) {
+            return textBlob;
+        }
+        if(exception){
+            throw new IllegalStateException("Unable to retrieve 'text/plain' ContentPart for ContentItem "
+                    + ci+". As this is also checked in canEnhance this may indicate a Bug in the "
+                    + "used EnhancementJobManager!");
+        } else {
+            log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
+                    + "ContentItem {} because no 'text/plain' ContentPart is "
+                    + "present in this ContentItem. Users that need to enhance "
+                    + "non-plain-text Content need to add an EnhancementEngine "
+                    + "that supports the conversion of '{}' files to plain text "
+                    + "to the current EnhancementChain!",
+                    new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci,ci.getMimeType()});
+            return null;
+        }
+    }
+    
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,403 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+import static junit.framework.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import junit.framework.Assert;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Unit tests for the {@link AnalysedText} content part: creation of the
+ * Sentence/Chunk/Token span hierarchy (with absolute and relative offsets),
+ * iteration over enclosed spans and Annotation handling.
+ * @author westei
+ *
+ */
+public class AnalysedTextTest {
+
+    private static Logger log = LoggerFactory.getLogger(AnalysedTextTest.class);
+
+    //NOTE: "trafeling" is an intentional part of the fixed test data and is
+    //referenced by expected values below - do not "fix" the spelling here.
+    public static final String text = "The Stanbol enhancer can detect famous " +
+            "cities such as Paris and people such as Bob Marley. With " +
+            "disambiguation it would even be able to detect the Comedian " +
+            "Bob Marley trafeling to Paris in Texas.";
+    
+    public static final Annotation<String,Number> testAnnotation = 
+            new Annotation<String,Number>("test", Number.class);
+    
+    /* -----
+     * Test data created within the BeforeClass
+     * -----
+     */
+    /**
+     * AnalysedText instance filled in {@link #setup()} with test data
+     */
+    private static AnalysedText analysedTextWithData;
+    //insertion-ordered maps from the created spans to the text they are
+    //expected to cover (iterated in parallel with the AnalysedText iterators)
+    private static LinkedHashMap<Sentence,String> expectedSentences = new LinkedHashMap<Sentence,String>();
+    private static LinkedHashMap<Chunk,String> expectedChunks = new LinkedHashMap<Chunk,String>();
+    private static LinkedHashMap<Token,String> expectedTokens = new LinkedHashMap<Token,String>();
+    
+    /* -----
+     * Test data created before every single test
+     * -----
+     */
+    /**
+     * Empty AnalysedText instance created before each test
+     */
+    private static AnalysedText at;
+
+    private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
+    private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance();
+    
+    private static ContentItem ci;
+    
+    /**
+     * Fills {@link #analysedTextWithData} with sentences, chunks and tokens
+     * (using both hard-coded offsets and offsets computed via indexOf) and
+     * records the expected covered text for every created span.
+     * @throws IOException from creating the in-memory ContentItem
+     */
+    @BeforeClass
+    public static final void setup() throws IOException {
+        analysedTextWithData = createAnalysedText();
+        int sentence = text.indexOf('.')+1;
+        Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
+        expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " +
+            "cities such as Paris and people such as Bob Marley.");
+        
+        Sentence sent2 = analysedTextWithData.addSentence(sentence+1, text.length());
+        expectedSentences.put(sent2, "With disambiguation it would even be able " +
+        		"to detect the Comedian Bob Marley trafeling to Paris in Texas.");
+        
+        Token the = sent1.addToken(0, 3);
+        expectedTokens.put(the, "The");
+        Token stanbol = sent1.addToken(4,11);
+        expectedTokens.put(stanbol, "Stanbol");
+        //use index to create Tokens
+        int enhancerStart = sent1.getSpan().toString().indexOf("enhancer");
+        Token enhancer = sent1.addToken(enhancerStart,enhancerStart+"enhancer".length());
+        expectedTokens.put(enhancer, "enhancer");
+
+        //create a chunk
+        Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
+        expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");
+        
+        int parisStart = sent1.getSpan().toString().indexOf("Paris");
+        Token paris = sent1.addToken(parisStart, parisStart+5);
+        expectedTokens.put(paris, "Paris");
+
+        int bobMarleyStart = sent1.getSpan().toString().indexOf("Bob Marley");
+        Chunk bobMarley = sent1.addChunk(bobMarleyStart, bobMarleyStart+10);
+        expectedChunks.put(bobMarley, "Bob Marley");
+        //Tokens added to a Chunk use offsets relative to the Chunk
+        Token bob = bobMarley.addToken(0, 3);
+        expectedTokens.put(bob, "Bob");
+        Token marley = bobMarley.addToken(4, 10);
+        expectedTokens.put(marley, "Marley");
+
+        Token with = sent2.addToken(0, 4);
+        expectedTokens.put(with, "With");
+        Token disambiguation = sent2.addToken(5, 5+"disambiguation".length());
+        expectedTokens.put(disambiguation, "disambiguation");
+        
+        int comedianBobMarleyIndex = sent2.getSpan().toString().indexOf("Comedian");
+        Chunk comedianBobMarley = sent2.addChunk(comedianBobMarleyIndex, 
+            comedianBobMarleyIndex+"Comedian Bob Marley".length());
+        expectedChunks.put(comedianBobMarley, "Comedian Bob Marley");
+        Token comedian = comedianBobMarley.addToken(0, "Comedian".length());
+        expectedTokens.put(comedian, "Comedian");
+        Token bobSent2 = comedianBobMarley.addToken(9,9+"Bob".length());
+        expectedTokens.put(bobSent2, "Bob");
+        Token marleySent2 = comedianBobMarley.addToken(13, 13+"Marley".length());
+        expectedTokens.put(marleySent2, "Marley");
+
+        int parisIndex = sent2.getSpan().toString().indexOf("Paris");
+        Chunk parisInTexas = sent2.addChunk(parisIndex, parisIndex+"Paris in Texas".length());
+        expectedChunks.put(parisInTexas, "Paris in Texas");
+        Token parisSent2 = parisInTexas.addToken(0, "Paris".length());
+        expectedTokens.put(parisSent2, "Paris");
+        int inIndex = parisInTexas.getSpan().indexOf("in");
+        Token in = parisInTexas.addToken(inIndex,
+            inIndex+2);
+        expectedTokens.put(in, "in");
+        Token texasSent2 = parisInTexas.addToken(parisInTexas.getSpan().toString().indexOf("Texas"),
+            parisInTexas.getSpan().toString().indexOf("Texas")+"Texas".length());
+        expectedTokens.put(texasSent2, "Texas");
+        
+    }
+
+
+    @Before
+    public void initAnalysedText() throws Exception {
+        at = createAnalysedText();
+    }
+    /**
+     * Creates a new in-memory ContentItem for {@link #text} and an empty
+     * AnalysedText for its 'text/plain' Blob.
+     * @return the empty AnalysedText
+     * @throws IOException from creating the ContentItem
+     */
+    private static AnalysedText createAnalysedText() throws IOException {
+        ci = ciFactory.createContentItem(new StringSource(text));
+        Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
+        return  atFactory.createAnalysedText(ci, textBlob.getValue());
+    }
+    
+    
+    /**
+     * Checks that the type-specific iterators of the pre-built
+     * {@link #analysedTextWithData} return the spans in the same order as
+     * they are stored in the expected* maps and that every span covers the
+     * expected text.
+     */
+    @Test
+    public void testSpanFilter(){
+        Iterator<Sentence> sentences = analysedTextWithData.getSentences();
+        Iterator<Chunk> chunks = analysedTextWithData.getChunks();
+        Iterator<Token> tokens = analysedTextWithData.getTokens();
+        for(Entry<Sentence,String> sentEntry : expectedSentences.entrySet()){
+            Sentence sent = sentences.next();
+            Assert.assertEquals(sentEntry.getKey(), sent);
+            Assert.assertEquals(sentEntry.getValue(), sent.getSpan().toString());
+        }
+        for(Entry<Chunk,String> chunkEntry : expectedChunks.entrySet()){
+            Chunk chunk = chunks.next();
+            Assert.assertEquals(chunkEntry.getKey(), chunk);
+            Assert.assertEquals(chunkEntry.getValue(), chunk.getSpan().toString());
+        }
+        for(Entry<Token,String> tokenEntry : expectedTokens.entrySet()){
+            Token token = tokens.next();
+            Assert.assertEquals(tokenEntry.getKey(), token);
+            Assert.assertEquals(tokenEntry.getValue(), token.getSpan().toString());
+        }
+    }
+    
+    /**
+     * The AnalysedText itself is a span covering the whole text.
+     */
+    @Test
+    public void testAnalysedText(){
+        Assert.assertEquals(text, at.getText());
+        //NOTE(review): assumes at.getSpan() equals the String text (and is not
+        //e.g. a CharSequence with identity equals) - confirm against the impl
+        Assert.assertEquals(text, at.getSpan());
+        Assert.assertEquals(0, at.getStart());
+        Assert.assertEquals(text.length(), at.getEnd());
+    }
+    /**
+     * Spans created relative to an other MUST NOT exceed the span of the 
+     * other one
+     */
+    @Test(expected=IllegalArgumentException.class)
+    public void testExceedsRelativeSpan(){
+        Sentence sent = at.addSentence(0, 10);
+        sent.addChunk(5, 15); //Invalid
+    }
+
+    @Test(expected=IllegalArgumentException.class)
+    public void testNegativeStart(){
+        at.addSentence(-1, 10);
+    }
+    
+    @Test(expected=IllegalArgumentException.class)
+    public void testRelativeNegativeStart(){
+        Sentence sent = at.addSentence(0, 10);
+        sent.addToken(-1, 5);
+    }
+    /**
+     * Spans of all types added directly on the AnalysedText must all be
+     * returned by {@link AnalysedText#getEnclosed(EnumSet)}.
+     */
+    @Test
+    public void testAnalysedTextaddSpanMethods(){
+        Collection<Span> spans = new HashSet<Span>();
+        //add some span of different types 
+        spans.add(at.addToken(4, 11));
+        spans.add(at.addChunk(4,19));
+        spans.add(at.addSentence(0, 91));
+        Set<Span> atSpans = AnalysedTextUtils.asSet(at.getEnclosed(EnumSet.allOf(SpanTypeEnum.class)));
+        Assert.assertTrue(spans.containsAll(atSpans));
+        Assert.assertTrue(atSpans.containsAll(spans));
+    }
+    /**
+     * Test relative additions (with relative indexes) as well as iterators
+     * over this hierarchy
+     */
+    @Test
+    public void testSpanHierarchy(){
+        //each sentence gets startPos.length chunks, each chunk gets
+        //startPos.length tokens; lengths are chosen so that all spans nest
+        int[] startPos = new int[]{0,1,2};
+        int[] endPos = new int[]{1,2,3};
+        int maxVal = endPos[endPos.length-1];
+        int tokenLength = 5;
+        int chunkLength = tokenLength*maxVal;
+        int sentenceLength = tokenLength*maxVal*maxVal;
+        List<Sentence> sentences = new ArrayList<Sentence>(startPos.length);
+        List<Chunk> chunks = new ArrayList<Chunk>(startPos.length*2);
+        List<Token> tokens = new ArrayList<Token>(startPos.length*3);
+        int start;
+        int end;
+        //1. test relative add and absolute start/end
+        log.info("--- adding Spans ---");
+        for(int s=0;s<startPos.length;s++){
+            start = startPos[s]*sentenceLength;
+            end = endPos[s]*sentenceLength;
+            Sentence sent = at.addSentence(start, end);
+            log.info("add {}",sent);
+            assertEquals(start, sent.getStart());
+            assertEquals(end, sent.getEnd());
+            sentences.add(sent);
+        }
+        //1.b iterate over the sentences while adding Chunks and Tokens to
+        //    test that returned Iterators MUST NOT throw 
+        //    ConcurrentModificationExceptions when adding Spans to the AnalysedText
+        Iterator<Sentence> sentenceIt = at.getSentences();
+        while(sentenceIt.hasNext()){
+            Sentence sent = sentenceIt.next();
+            for(int c=0;c<startPos.length;c++){
+                start = startPos[c]*chunkLength;
+                end = endPos[c]*chunkLength;
+                Chunk chunk = sent.addChunk(start, end);
+                log.info("  add {}",chunk);
+                //relative offsets: expected absolute = parent start + relative
+                start = sent.getStart() + start;
+                end = sent.getStart() + end;
+                assertEquals(start, chunk.getStart());
+                assertEquals(end, chunk.getEnd());
+                chunks.add(chunk);
+                for(int t=0;t<startPos.length;t++){
+                    start = startPos[t]*tokenLength;
+                    end = endPos[t]*tokenLength;
+                    Token token = chunk.addToken(start, end);
+                    log.info("    add {}",token);
+                    start = chunk.getStart() + start;
+                    end = chunk.getStart() + end;
+                    assertEquals(start, token.getStart());
+                    assertEquals(end, token.getEnd());
+                    tokens.add(token);
+                }
+            }
+        }
+        //2. test iterations of enclosed
+        int chunksInSentence = startPos.length;
+        int tokensInChunk = chunksInSentence;
+        int tokensInSentence = chunksInSentence*tokensInChunk;
+        Iterator<Sentence> sentIt = at.getSentences();
+        int s = 0;
+        int c = 0;
+        int t = 0;
+        log.info("--- iterating over Spans ---");
+        log.info("{}",at);
+        for(;sentIt.hasNext();s++){
+            assertTrue(sentences.size()+" Sentences Expected (found: "+(s+1)+")",s < sentences.size());
+            Sentence sent = sentIt.next();
+            log.info("  {}",sent);
+            assertEquals(sentences.get(s), sent);
+            Iterator<Chunk> chunkIt = sent.getChunks();
+            int foundChunks = 0;
+            for(;chunkIt.hasNext();c++){
+                assertTrue(chunks.size()+" Chunks Expected (found: "+(c+1)+")",c < chunks.size());
+                Chunk chunk = chunkIt.next();
+                log.info("    {}",chunk);
+                assertEquals(chunks.get(c), chunk);
+                Iterator<Token> tokenIt = chunk.getTokens();
+                int foundTokens = 0;
+                for(;tokenIt.hasNext();t++){
+                    assertTrue(tokens.size()+" Tokens Expected (found: "+(t+1)+")",t < tokens.size());
+                    Token token = tokenIt.next();
+                    log.info("      {}",token);
+                    assertEquals(tokens.get(t), token);
+                    foundTokens++;
+                }
+                assertEquals(tokensInChunk+" Tokens expected in Chunk", tokensInChunk,foundTokens);
+                foundChunks++;
+            }
+            assertEquals(chunksInSentence+" Chunks expected in Sentence", chunksInSentence,foundChunks);
+            //also iterate over tokens within a sentence
+            log.info("  {}",sent);
+            Iterator<Token> tokenIt = sent.getTokens();
+            int foundTokens = 0;
+            for(;tokenIt.hasNext();foundTokens++){
+                Token token = tokenIt.next();
+                log.info("    {}",token);
+                assertEquals(tokens.get(s*tokensInSentence+foundTokens), token);
+            }
+            assertEquals(tokensInSentence+" Tokens expected in Sentence", tokensInSentence,foundTokens);
+        }
+        assertEquals(sentences.size()+" Sentences Expected (found: "+s+")", sentences.size(),s);
+        assertEquals(chunks.size()+" Chunks Expected (found: "+c+")", chunks.size(),c);
+        assertEquals(tokens.size()+" Sentences Expected (found: "+t+")", tokens.size(),t);
+        //also iterate over Chunks in AnalysedText
+        Iterator<Chunk> chunkIt = at.getChunks();
+        int foundChunks = 0;
+        log.info("{}",at);
+        for(;chunkIt.hasNext();foundChunks++){
+            Chunk chunk = chunkIt.next();
+            log.info("  {}",chunk);
+            assertEquals(chunks.get(foundChunks), chunk);
+        }
+        assertEquals(chunks.size()+" Chunks expected in AnalysedText", chunks.size(),foundChunks);
+        //also iterate over Tokens in AnalysedText
+        Iterator<Token> tokenIt = at.getTokens();
+        int foundTokens = 0;
+        log.info("{}",at);
+        for(;tokenIt.hasNext();foundTokens++){
+            Token token = tokenIt.next();
+            log.info("  {}",token);
+            assertEquals(tokens.get(foundTokens), token);
+        }
+        assertEquals(tokens.size()+" Tokens expected in AnalysedText", tokens.size(),foundTokens);
+
+      //Finally iterate over multiple token types
+      Iterator<Span> sentencesAndChunks = at.getEnclosed(
+          EnumSet.of(SpanTypeEnum.Sentence,SpanTypeEnum.Chunk));
+      s=0;
+      c=0;
+      log.info("{} >> Iterate over Sentences and Chunks",at);
+      while(sentencesAndChunks.hasNext()){
+          Span span = sentencesAndChunks.next();
+          log.info("  {}",span);
+          if(span.getType() == SpanTypeEnum.Chunk){
+              assertEquals(chunks.get(c), span);
+              c++;
+          } else if(span.getType() == SpanTypeEnum.Sentence){
+              assertEquals(sentences.get(s), span);
+              s++;
+          } else {
+              Assert.fail("Unexpected SpanType '"+span.getType()+" (Span: "+span.getClass()+")");
+          }
+      }
+      assertEquals(sentences.size()+" Sentences expected in AnalysedText", sentences.size(),s);
+      assertEquals((sentences.size()*chunksInSentence)+" Chunks expected in AnalysedText", 
+          (sentences.size()*chunksInSentence),c);
+    }
+    
+    /**
+     * Tests that Annotations with probabilities are sorted by probability
+     * (highest first) and that Annotations without probability keep their
+     * insertion order.
+     */
+    @Test
+    public void testAnnotation(){
+        List<Value<Number>> values = new ArrayList<Value<Number>>();
+        values.add(new Value<Number>(26,0.6));
+        values.add(new Value<Number>(27l));
+        values.add(new Value<Number>(28.0f));
+        values.add(new Value<Number>(25.0,0.8));
+        at.addAnnotations(testAnnotation, values);
+        //getAnnotation returns the value with the highest probability
+        Value<Number> value = at.getAnnotation(testAnnotation);
+        assertNotNull(value);
+        assertEquals(Double.valueOf(25.0), value.value());
+        assertEquals(0.8d, value.probability());
+        Number prev = Float.valueOf(24f);
+        for(Value<Number> v : at.getAnnotations(testAnnotation)){
+            assertNotNull(v);
+            assertTrue(v.value().doubleValue() > prev.doubleValue());
+            prev = v.value();
+        }
+        //check that the order of Annotations without probability is kept
+        at.addAnnotation(testAnnotation, new Value<Number>(29));
+        prev = Integer.valueOf(24);
+        for(Value<Number> v : at.getAnnotations(testAnnotation)){
+            assertNotNull(v);
+            assertTrue(v.value().intValue() > prev.intValue());
+            prev = v.value();
+        }
+        
+    }
+    
+}

Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,40 @@
+package org.apache.stanbol.enhancer.nlp.utils;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.junit.Test;
+
+import junit.framework.Assert;
+
+/**
+ * Unit tests for the NIF (NLP Interchange Format) URI schemes created by
+ * the NIFHelper: fragment ('#char=start,end'), offset ('#offset_start_end')
+ * and context-hash based URIs.
+ */
+public class NIFHelperTest {
+
+    // base URI all NIF URIs are created relative to
+    static UriRef base = new UriRef("http://stanbol.apache.org/test/nif/nif-helper");
+    static String text = "This is a test for the NLP Interchange format!";
+    
+    
+    /** NIF fragment URIs use the '#char=start,end' scheme. */
+    @Test
+    public void testFragmentURI(){
+        Assert.assertEquals(
+            new UriRef(base.getUnicodeString()+"#char=23,26"), 
+            NIFHelper.getNifFragmentURI(base, 23, 26));
+    }
+    /** NIF offset URIs use the '#offset_start_end' scheme. */
+    @Test
+    public void testOffsetURI(){
+        Assert.assertEquals(
+            base.getUnicodeString()+"#offset_23_26", 
+            NIFHelper.getNifOffsetURI(base, 23, 26).getUnicodeString());
+    }
+    /**
+     * NIF hash URIs use '#hash_{contextLength}_{spanLength}_{md5}_{firstChars}'
+     * where the MD5 is computed over 10 chars of context before and after the
+     * '(selected)' span. Expected value is computed independently here.
+     */
+    @Test
+    public void testHashURI() throws IOException {
+        String selected = text.substring(23,26);
+        String context = text.substring(13,23)+'('+selected+')'+text.substring(26,36);
+        byte[] contextData = context.getBytes(Charset.forName("UTF8"));
+        String md5 = ContentItemHelper.streamDigest(new ByteArrayInputStream(contextData), null, "MD5");
+        UriRef expected = new UriRef(base.getUnicodeString()+"#hash_10_3_"+md5+"_NLP");
+        Assert.assertEquals(expected, NIFHelper.getNifHashURI(base, 23, 26, text));
+    }    
+}

Modified: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml?rev=1387488&r1=1387487&r2=1387488&view=diff
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml (original)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml Wed Sep 19 08:48:32 2012
@@ -54,6 +54,7 @@
     <module>generic/core</module>
     <module>generic/test</module>
     <module>generic/rdfentities</module>
+    <module>generic/nlp</module>
     <module>jobmanager</module>
 
     <module>chain/allactive</module>
@@ -61,6 +62,8 @@
     <module>chain/weighted</module>
     <module>chain/list</module>
     
+    <module>engines</module>
+      
     <module>jersey</module>
     <module>ldpath</module>
     <module>benchmark</module>

Propchange: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Sep 19 08:48:32 2012
@@ -0,0 +1,7 @@
+.project
+
+.settings
+
+target
+
+.classpath

Added: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,212 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+  <modelVersion>4.0.0</modelVersion>
+  <parent>
+    <groupId>org.apache.stanbol</groupId>
+    <artifactId>stanbol-parent</artifactId>
+    <version>2-incubating-SNAPSHOT</version>
+    <relativePath>../../parent</relativePath>
+  </parent>
+
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.launchers.enhancer-nlp</artifactId>
+  <version>0.10.0-incubating-SNAPSHOT</version>
+  <packaging>jar</packaging>
+
+  <name>Apache Stanbol Launchers for the NLP processing branch</name>
+  <description>
+        Runnable jar configured to test engines included in the
+        NLP processing branch (STANBOL-733)
+  </description>
+
+  <scm>
+    <url>http://incubator.apache.org/stanbol/</url>
+  </scm>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-clean-plugin</artifactId>
+        <configuration>
+          <filesets>
+            <fileset>
+              <directory>.</directory>
+              <includes>
+                <include>stanbol/**</include>
+              </includes>
+            </fileset>
+          </filesets>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.sling</groupId>
+        <artifactId>maven-launchpad-plugin</artifactId>
+        <!--
+            TODO the maven-launchpad-plugin can also generate a war file and
+            Karaf description, we could add this. See
+            http://sling.apache.org/site/maven-launchpad-plugin.html
+          -->
+        <executions>
+          <execution>
+            <id>prepare-package</id>
+            <goals>
+              <goal>prepare-package</goal>
+              <goal>attach-bundle-list</goal>
+            </goals>
+            <configuration>
+              <includeDefaultBundles>false</includeDefaultBundles>
+              <!-- Standalone jar requires an OSGi http service implementation -->
+              <jarWebSupport>
+                <groupId>org.apache.felix</groupId>
+                <artifactId>org.apache.felix.http.jetty</artifactId>
+                <version>2.2.0</version>
+              </jarWebSupport>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-jar-plugin</artifactId>
+        <configuration>
+          <archive>
+            <manifest>
+              <!-- make the generated jar runnable -->
+              <addClasspath>true</addClasspath>
+              <mainClass>org.apache.stanbol.launchpad.Main</mainClass>
+              <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
+            </manifest>
+          </archive>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.maven.plugins</groupId>
+        <artifactId>maven-shade-plugin</artifactId>
+        <version>1.2</version>
+        <executions>
+          <execution>
+            <phase>package</phase>
+            <goals>
+              <goal>shade</goal>
+            </goals>
+            <configuration>
+              <artifactSet>
+              <!-- Use this to in/exclude only specific dependencies -->
+                <includes>
+                  <include>org.apache.stanbol:org.apache.stanbol.launchpad</include>
+                </includes>
+              </artifactSet>
+              <transformers>
+                <transformer implementation="org.apache.maven.plugins.shade.resource.ComponentsXmlResourceTransformer" />
+              </transformers>
+            </configuration>
+          </execution>
+        </executions>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <excludes>
+          </excludes>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <dependencies>
+    <dependency>
+      <!-- The Apache Stanbol launchpad -->
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.launchpad</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+    </dependency>
+    <dependency>
+      <!-- maven-launchpad-plugin builds on the launchpad.base app -->
+      <groupId>org.apache.sling</groupId>
+      <artifactId>org.apache.sling.launchpad.base</artifactId>
+      <classifier>app</classifier>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- OSGi Framework Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.launchers.bundlelists.osgiframework</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- Stanbol Commons Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.launchers.bundlelists.stanbolcommons</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- Stanbol Enhancer Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.bundlelist</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- Stanbol Data Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.data.bundlelists.defaultdata</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+    <!-- OpenNLP Data Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.data.bundlelists.opennlp</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+    <!-- Sentiment Data Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.data.bundlelists.sentiment</artifactId>
+      <version>0.10.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+
+    <!-- Stanbol Entityhub Bundle List -->
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.entityhub.bundlelist</artifactId>
+      <version>0.11.0-incubating-SNAPSHOT</version>
+      <type>partialbundlelist</type>
+      <scope>provided</scope>
+    </dependency>
+  </dependencies>
+  
+</project>

Added: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one or more
+  contributor license agreements.  See the NOTICE file distributed with
+  this work for additional information regarding copyright ownership.
+  The ASF licenses this file to You under the Apache License, Version 2.0
+  (the "License"); you may not use this file except in compliance with
+  the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+  <!--
+		List of initial bundles for the Stanbol Sling-based standalone launcher.
+	-->
+<bundles>
+  <!-- General-purpose libraries -->
+
+  <!-- *********************************************************************
+          start level 20 TO 24 reserved for Stanbol Framework
+          (Enhancer, Entityhub, Contenthub, Factstore ... incl. Web Fragments)
+       ********************************************************************* -->
+
+    
+
+  <!-- *********************************************************************
+          start levels >= 30 are unused
+       ********************************************************************* -->
+
+</bundles>

Added: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties Wed Sep 19 08:48:32 2012
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This file is loaded by Apache Sling during startup. Properties defined
+# in this file are copied over to the sling.properties file in the {sling.home}
+# directory.
+
+# The stanbol home directory
+# by default this is set to the same value as sling.home
+stanbol.home=${sling.home}
+org.osgi.framework.startlevel.beginning=40

Added: incubator/stanbol/branches/stanbol-nlp-processing/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/pom.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/pom.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/pom.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor 
+  license agreements. See the NOTICE file distributed with this work for additional 
+  information regarding copyright ownership. The ASF licenses this file to 
+  You under the Apache License, Version 2.0 (the "License"); you may not use 
+  this file except in compliance with the License. You may obtain a copy of 
+  the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required 
+  by applicable law or agreed to in writing, software distributed under the 
+  License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS 
+  OF ANY KIND, either express or implied. See the License for the specific 
+  language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+  xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.stanbolnlpprocessing.reactor</artifactId>
+  <version>0.10.0-incubating-SNAPSHOT</version>
+  <packaging>pom</packaging>
+
+  <name>Apache Stanbol NLP Processing Branch Reactor</name>
+  <description>
+    Dummy reactor to compile all modules in the Stanbol NLP processing branch (STANBOL-733)
+  </description>
+
+  <modules>
+    <module>data</module>
+    <module>enhancer</module>
+    <module>nlp-launcher</module>
+  </modules>
+
+</project>