You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/09/19 10:48:39 UTC
svn commit: r1387488 [5/5] - in
/incubator/stanbol/branches/stanbol-nlp-processing: ./ data/
data/bundlelists/sentiment/ data/bundlelists/sentiment/src/
data/bundlelists/sentiment/src/main/
data/bundlelists/sentiment/src/main/bundles/ data/opennlp/lang...
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/utils/NlpEngineHelper.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,179 @@
+package org.apache.stanbol.enhancer.nlp.utils;
+
+import static java.util.Collections.singleton;
+
+import java.io.IOException;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextFactory;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Utility class for {@link EnhancementEngine} implementations that
+ * do use the {@link AnalysedText} content part
+ * @author Rupert Westenthaler
+ *
+ */
+public final class NlpEngineHelper {
+
+ private static final Logger log = LoggerFactory.getLogger(NlpEngineHelper.class);
+
+ private NlpEngineHelper(){}
+
+
+ /**
+ * Getter for the AnalysedText for a ContentItem
+ * @param engine the EnhancementEngine calling this method (used for logging)
+ * @param ci the ContentItem
+ * @param exception <code>false</code> id used in {@link #canEnhance(ContentItem)}
+ * and <code>true</code> when called from {@link #computeEnhancements(ContentItem)}
+ * @return the AnalysedText or <code>null</code> if not found.
+ * @throws IllegalStateException if exception is <code>true</code> and the
+ * {@link AnalysedText} could not be retrieved from the parsed {@link ContentItem}.
+ */
+ public static AnalysedText getAnalysedText(EnhancementEngine engine, ContentItem ci, boolean exception) {
+ AnalysedText at;
+ try {
+ at = AnalysedTextUtils.getAnalysedText(ci);
+ }catch (RuntimeException e) {
+ log.warn("Unable to retrieve AnalysedText for ContentItem "
+ + ci + "because of an "+e.getClass().getSimpleName()+" with message "
+ + e.getMessage(),e);
+ at = null;
+ }
+ if(at != null){
+ return at;
+ }
+ if(exception){
+ throw new IllegalStateException("Unable to retrieve AnalysedText from ContentItem "
+ + ci+". As this is also checked in canEnhancer this may indicate an Bug in the "
+ + "used EnhancementJobManager!");
+ } else {
+ log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
+ + "ContentItem {} because the AnalysedText ContentPart is "
+ + "missing. Users might want to add an EnhancementEngine that "
+ + "creates the AnalysedText ContentPart such as the "
+ + "POSTaggingEngine (o.a.stanbol.enhancer.engines.opennlp.pos)!",
+ new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
+ return null;
+ }
+ }
+
+ /**
+ * Getter for the language of the content
+ * @param ci the ContentItem
+ * @param exception <code>false</code> id used in {@link #canEnhance(ContentItem)}
+ * and <code>true</code> when called from {@link #computeEnhancements(ContentItem)}
+ * @return the AnalysedText or <code>null</code> if not found.
+ * @throws IllegalStateException if exception is <code>true</code> and the
+ * language could not be retrieved from the parsed {@link ContentItem}.
+ */
+ public static String getLanguage(EnhancementEngine engine, ContentItem ci, boolean exception) {
+ String language = EnhancementEngineHelper.getLanguage(ci);
+ if(language != null) {
+ return language;
+ }
+ if(exception){
+ throw new IllegalStateException("Unable to retrieve the detected language for ContentItem "
+ + ci+". As this is also checked in canEnhancer this may indicate an Bug in the "
+ + "used EnhancementJobManager!");
+ } else {
+ log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
+ + "ContentItem {} because the langauge of "
+ + "this ContentItem is unknown. Users might want to add a "
+ + "Language Identification EnhancementEngine to the current "
+ + "EnhancementChain!",
+ new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci});
+ return null;
+ }
+ }
+ /**
+ * Retrieves - or if not present - creates the {@link AnalysedText} content
+ * part for the parsed {@link ContentItem}. If the {@link Blob} with the
+ * mime type '<code>text/plain</code>' is present this method
+ * throws an {@link IllegalStateException} (this method internally uses
+ * {@link #getPlainText(EnhancementEngine, ContentItem, boolean)} with
+ * <code>true</code> as third parameters. Users of this method should call
+ * this method with <code>false</code> as third parameter in their
+ * {@link EnhancementEngine#canEnhance(ContentItem)} implementation.<p>
+ * <i>NOTE:</i> This method is intended for Engines that want to create an
+ * empty {@link AnalysedText} content part. Engines that assume that this
+ * content part is already present (e.g. if the consume already existing
+ * annotations) should use the
+ * {@link #getAnalysedText(EnhancementEngine, ContentItem, boolean)}
+ * method instead.
+ * @param engine the EnhancementEngine calling this method (used for logging)
+ * @param analysedTextFactory the {@link AnalysedTextFactory} used to create
+ * the {@link AnalysedText} instance (if not present).
+ * @param ci the {@link ContentItem}
+ * @return the AnalysedText
+ * @throws EngineException on any exception while accessing the
+ * '<code>text/plain</code>' Blob
+ * @throws IllegalStateException if no '<code>text/plain</code>' Blob is
+ * present as content part of the parsed {@link ContentItem}. NOTE that if
+ * the {@link AnalysedText} content part is already present no Exception will
+ * be thrown even if no plain text {@link Blob} is present in the parsed
+ * {@link ContentItem}
+ */
+ public static AnalysedText initAnalysedText(EnhancementEngine engine,
+ AnalysedTextFactory analysedTextFactory,
+ ContentItem ci) throws EngineException {
+ AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
+ if(at == null){
+ Entry<UriRef,Blob> textBlob = getPlainText(engine, ci, true);
+ log.debug(" ... create new AnalysedText instance for Engine {}", engine.getName());
+ try {
+ at = analysedTextFactory.createAnalysedText(ci, textBlob.getValue());
+ } catch (IOException e) {
+ throw new EngineException("Unable to create AnalysetText instance for Blob "
+ + textBlob.getKey()+ " of ContentItem "+ci.getUri()+"!",e);
+ }
+ } else {
+ log.debug(" ... use existing AnalysedText instance for Engine {}", engine.getName());
+ }
+ return at;
+ }
+
+ /**
+ * Getter for the language of the content
+ * @param ci the ContentItem
+ * @param exception <code>false</code> id used in {@link #canEnhance(ContentItem)}
+ * and <code>true</code> when called from {@link #computeEnhancements(ContentItem)}
+ * @return the AnalysedText or <code>null</code> if not found.
+ * @throws IllegalStateException if exception is <code>true</code> and the
+ * language could not be retrieved from the parsed {@link ContentItem}.
+ */
+ public static Entry<UriRef,Blob> getPlainText(EnhancementEngine engine, ContentItem ci, boolean exception) {
+ Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(
+ ci, singleton("text/plain"));
+ if(textBlob != null) {
+ return textBlob;
+ }
+ if(exception){
+ throw new IllegalStateException("Unable to retrieve 'text/plain' ContentPart for ContentItem "
+ + ci+". As this is also checked in canEnhancer this may indicate an Bug in the "
+ + "used EnhancementJobManager!");
+ } else {
+ log.warn("The Enhancement Engine '{} (impl: {})' CAN NOT enhance "
+ + "ContentItem {} because no 'text/plain' ContentPart is "
+ + "present in this ContentItem. Users that need to enhance "
+ + "non-plain-text Content need to add an EnhancementEngine "
+ + "that supports the conversion of '{}' files to plain text "
+ + "to the current EnhancementChain!",
+ new Object[]{engine.getName(), engine.getClass().getSimpleName(),ci,ci.getMimeType()});
+ return null;
+ }
+ }
+
+}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/model/AnalysedTextTest.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,403 @@
+package org.apache.stanbol.enhancer.nlp.model;
+
+import static junit.framework.Assert.assertEquals;
+import static junit.framework.Assert.assertNotNull;
+import static junit.framework.Assert.assertTrue;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.EnumSet;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map.Entry;
+import java.util.Set;
+
+import junit.framework.Assert;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
+import org.apache.stanbol.enhancer.nlp.model.Span.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.StringSource;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
/**
 * Unit tests for the {@link AnalysedText} content part: span creation
 * (sentences, chunks, tokens), relative-offset addition, span iteration
 * order, and annotation handling.
 * @author westei
 *
 */
public class AnalysedTextTest {

    private static Logger log = LoggerFactory.getLogger(AnalysedTextTest.class);

    // Fixture text. NOTE: "trafeling" is intentionally part of the fixture
    // data; all offsets below are computed against this exact string.
    public static final String text = "The Stanbol enhancer can detect famous " +
            "cities such as Paris and people such as Bob Marley. With " +
            "disambiguation it would even be able to detect the Comedian " +
            "Bob Marley trafeling to Paris in Texas.";

    // Annotation key used by testAnnotation() to attach Number values
    // under the key "test"
    public static final Annotation<String,Number> testAnnotation = 
            new Annotation<String,Number>("test", Number.class);

    /* -----
     * Test data created within the BeforeClass
     * -----
     */
    /**
     * AnalysedText instance filled in {@link #setup()} with test data
     */
    private static AnalysedText analysedTextWithData;
    // Expected spans mapped to their expected surface text. LinkedHashMaps
    // are used because testSpanFilter() relies on insertion order matching
    // the natural (offset) iteration order of the AnalysedText spans.
    private static LinkedHashMap<Sentence,String> expectedSentences = new LinkedHashMap<Sentence,String>();
    private static LinkedHashMap<Chunk,String> expectedChunks = new LinkedHashMap<Chunk,String>();
    private static LinkedHashMap<Token,String> expectedTokens = new LinkedHashMap<Token,String>();

    /* -----
     * Test data created before every single test
     * -----
     */
    /**
     * Empty AnalysedText instance created before each test
     */
    private static AnalysedText at;

    private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
    private static final AnalysedTextFactory atFactory = AnalysedTextFactory.getDefaultInstance();

    private static ContentItem ci;

    /**
     * Builds the shared {@link #analysedTextWithData} fixture: two sentences,
     * several chunks and tokens, added partly with absolute and partly with
     * relative (parent-span) offsets. The expected* maps record insertion
     * order for later iteration checks.
     */
    @BeforeClass
    public static final void setup() throws IOException {
        analysedTextWithData = createAnalysedText();
        // end offset of the first sentence (index of '.' plus one)
        int sentence = text.indexOf('.')+1;
        Sentence sent1 = analysedTextWithData.addSentence(0, sentence);
        expectedSentences.put(sent1, "The Stanbol enhancer can detect famous " +
            "cities such as Paris and people such as Bob Marley.");

        // second sentence starts after the separating space
        Sentence sent2 = analysedTextWithData.addSentence(sentence+1, text.length());
        expectedSentences.put(sent2, "With disambiguation it would even be able " +
                "to detect the Comedian Bob Marley trafeling to Paris in Texas.");

        // tokens added with offsets relative to the sentence
        Token the = sent1.addToken(0, 3);
        expectedTokens.put(the, "The");
        Token stanbol = sent1.addToken(4,11);
        expectedTokens.put(stanbol, "Stanbol");
        //use index to create Tokens
        int enhancerStart = sent1.getSpan().toString().indexOf("enhancer");
        Token enhancer = sent1.addToken(enhancerStart,enhancerStart+"enhancer".length());
        expectedTokens.put(enhancer, "enhancer");

        //create a chunk (with absolute offsets taken from existing tokens)
        Chunk stanbolEnhancer = analysedTextWithData.addChunk(stanbol.getStart(), enhancer.getEnd());
        expectedChunks.put(stanbolEnhancer, "Stanbol enhancer");

        int parisStart = sent1.getSpan().toString().indexOf("Paris");
        Token paris = sent1.addToken(parisStart, parisStart+5);
        expectedTokens.put(paris, "Paris");

        int bobMarleyStart = sent1.getSpan().toString().indexOf("Bob Marley");
        Chunk bobMarley = sent1.addChunk(bobMarleyStart, bobMarleyStart+10);
        expectedChunks.put(bobMarley, "Bob Marley");
        // tokens added with offsets relative to the chunk
        Token bob = bobMarley.addToken(0, 3);
        expectedTokens.put(bob, "Bob");
        Token marley = bobMarley.addToken(4, 10);
        expectedTokens.put(marley, "Marley");

        Token with = sent2.addToken(0, 4);
        expectedTokens.put(with, "With");
        Token disambiguation = sent2.addToken(5, 5+"disambiguation".length());
        expectedTokens.put(disambiguation, "disambiguation");

        int comedianBobMarleyIndex = sent2.getSpan().toString().indexOf("Comedian");
        Chunk comedianBobMarley = sent2.addChunk(comedianBobMarleyIndex,
            comedianBobMarleyIndex+"Comedian Bob Marley".length());
        expectedChunks.put(comedianBobMarley, "Comedian Bob Marley");
        Token comedian = comedianBobMarley.addToken(0, "Comedian".length());
        expectedTokens.put(comedian, "Comedian");
        Token bobSent2 = comedianBobMarley.addToken(9,9+"Bob".length());
        expectedTokens.put(bobSent2, "Bob");
        Token marleySent2 = comedianBobMarley.addToken(13, 13+"Marley".length());
        expectedTokens.put(marleySent2, "Marley");

        int parisIndex = sent2.getSpan().toString().indexOf("Paris");
        Chunk parisInTexas = sent2.addChunk(parisIndex, parisIndex+"Paris in Texas".length());
        expectedChunks.put(parisInTexas, "Paris in Texas");
        Token parisSent2 = parisInTexas.addToken(0, "Paris".length());
        expectedTokens.put(parisSent2, "Paris");
        int inIndex = parisInTexas.getSpan().indexOf("in");
        Token in = parisInTexas.addToken(inIndex,
            inIndex+2);
        expectedTokens.put(in, "in");
        Token texasSent2 = parisInTexas.addToken(parisInTexas.getSpan().toString().indexOf("Texas"),
            parisInTexas.getSpan().toString().indexOf("Texas")+"Texas".length());
        expectedTokens.put(texasSent2, "Texas");

    }


    /**
     * Creates a fresh, empty {@link AnalysedText} in {@link #at} before each
     * test so span-adding tests do not interfere with each other.
     */
    @Before
    public void initAnalysedText() throws Exception {
        at = createAnalysedText();
    }
    /**
     * Creates a new ContentItem over {@link #text} and a new AnalysedText
     * over its 'text/plain' Blob.
     * @throws IOException
     */
    private static AnalysedText createAnalysedText() throws IOException {
        ci = ciFactory.createContentItem(new StringSource(text));
        Entry<UriRef,Blob> textBlob = ContentItemHelper.getBlob(ci, Collections.singleton("text/plain"));
        return atFactory.createAnalysedText(ci, textBlob.getValue());
    }


    /**
     * Checks that the type-filtered iterators (sentences, chunks, tokens)
     * return the spans in the same order as they were recorded in the
     * expected* LinkedHashMaps, with matching surface text.
     */
    @Test
    public void testSpanFilter(){
        Iterator<Sentence> sentences = analysedTextWithData.getSentences();
        Iterator<Chunk> chunks = analysedTextWithData.getChunks();
        Iterator<Token> tokens = analysedTextWithData.getTokens();
        for(Entry<Sentence,String> sentEntry : expectedSentences.entrySet()){
            Sentence sent = sentences.next();
            Assert.assertEquals(sentEntry.getKey(), sent);
            Assert.assertEquals(sentEntry.getValue(), sent.getSpan().toString());
        }
        for(Entry<Chunk,String> chunkEntry : expectedChunks.entrySet()){
            Chunk chunk = chunks.next();
            Assert.assertEquals(chunkEntry.getKey(), chunk);
            Assert.assertEquals(chunkEntry.getValue(), chunk.getSpan().toString());
        }
        for(Entry<Token,String> tokenEntry : expectedTokens.entrySet()){
            Token token = tokens.next();
            Assert.assertEquals(tokenEntry.getKey(), token);
            Assert.assertEquals(tokenEntry.getValue(), token.getSpan().toString());
        }
    }

    /**
     * Basic contract of the AnalysedText root span: it covers the whole text.
     */
    @Test
    public void testAnalysedText(){
        Assert.assertEquals(text, at.getText());
        Assert.assertEquals(text, at.getSpan());
        Assert.assertEquals(0, at.getStart());
        Assert.assertEquals(text.length(), at.getEnd());
    }
    /**
     * Spans created relative to an other MUST NOT exceed the span of the
     * other one
     */
    @Test(expected=IllegalArgumentException.class)
    public void testExceedsRelativeSpan(){
        Sentence sent = at.addSentence(0, 10);
        sent.addChunk(5, 15); //Invalid
    }

    // negative absolute start offset must be rejected
    @Test(expected=IllegalArgumentException.class)
    public void testNegativeStart(){
        at.addSentence(-1, 10);
    }

    // negative relative start offset must be rejected as well
    @Test(expected=IllegalArgumentException.class)
    public void testRelativeNegativeStart(){
        Sentence sent = at.addSentence(0, 10);
        sent.addToken(-1, 5);
    }
    /**
     * Adding spans of different types directly on the AnalysedText and
     * retrieving them via getEnclosed(all types) must be symmetric.
     */
    @Test
    public void testAnalysedTextaddSpanMethods(){
        Collection<Span> spans = new HashSet<Span>();
        //add some span of different types
        spans.add(at.addToken(4, 11));
        spans.add(at.addChunk(4,19));
        spans.add(at.addSentence(0, 91));
        Set<Span> atSpans = AnalysedTextUtils.asSet(at.getEnclosed(EnumSet.allOf(SpanTypeEnum.class)));
        Assert.assertTrue(spans.containsAll(atSpans));
        Assert.assertTrue(atSpans.containsAll(spans));
    }
    /**
     * Test relative additions (with relative indexes) as well as iterators
     * over this hierarchy
     */
    @Test
    public void testSpanHierarchy(){
        // build a synthetic 3-sentence / 3-chunk / 3-token grid so counts
        // and positions are fully predictable
        int[] startPos = new int[]{0,1,2};
        int[] endPos = new int[]{1,2,3};
        int maxVal = endPos[endPos.length-1];
        int tokenLength = 5;
        int chunkLength = tokenLength*maxVal;
        int sentenceLength = tokenLength*maxVal*maxVal;
        List<Sentence> sentences = new ArrayList<Sentence>(startPos.length);
        List<Chunk> chunks = new ArrayList<Chunk>(startPos.length*2);
        List<Token> tokens = new ArrayList<Token>(startPos.length*3);
        int start;
        int end;
        //1. test relative add and absolute start/end
        log.info("--- adding Spans ---");
        for(int s=0;s<startPos.length;s++){
            start = startPos[s]*sentenceLength;
            end = endPos[s]*sentenceLength;
            Sentence sent = at.addSentence(start, end);
            log.info("add {}",sent);
            assertEquals(start, sent.getStart());
            assertEquals(end, sent.getEnd());
            sentences.add(sent);
        }
        //1.b iterate over the sentences while adding Chunks and Tokens to
        //    test that returned Iterators MUST NOT throw
        //    ConcurrentModificationExceptions when adding Spans to the AnalysedText
        Iterator<Sentence> sentenceIt = at.getSentences();
        while(sentenceIt.hasNext()){
            Sentence sent = sentenceIt.next();
            for(int c=0;c<startPos.length;c++){
                start = startPos[c]*chunkLength;
                end = endPos[c]*chunkLength;
                Chunk chunk = sent.addChunk(start, end);
                log.info("  add {}",chunk);
                // relative offsets: absolute = parent start + relative
                start = sent.getStart() + start;
                end = sent.getStart() + end;
                assertEquals(start, chunk.getStart());
                assertEquals(end, chunk.getEnd());
                chunks.add(chunk);
                for(int t=0;t<startPos.length;t++){
                    start = startPos[t]*tokenLength;
                    end = endPos[t]*tokenLength;
                    Token token = chunk.addToken(start, end);
                    log.info("    add {}",token);
                    start = chunk.getStart() + start;
                    end = chunk.getStart() + end;
                    assertEquals(start, token.getStart());
                    assertEquals(end, token.getEnd());
                    tokens.add(token);
                }
            }
        }
        //2. test iterations of enclosed
        int chunksInSentence = startPos.length;
        int tokensInChunk = chunksInSentence;
        int tokensInSentence = chunksInSentence*tokensInChunk;
        Iterator<Sentence> sentIt = at.getSentences();
        int s = 0;
        int c = 0;
        int t = 0;
        log.info("--- iterating over Spans ---");
        log.info("{}",at);
        for(;sentIt.hasNext();s++){
            assertTrue(sentences.size()+" Sentences Expected (found: "+(s+1)+")",s < sentences.size());
            Sentence sent = sentIt.next();
            log.info(" {}",sent);
            assertEquals(sentences.get(s), sent);
            Iterator<Chunk> chunkIt = sent.getChunks();
            int foundChunks = 0;
            for(;chunkIt.hasNext();c++){
                assertTrue(chunks.size()+" Chunks Expected (found: "+(c+1)+")",c < chunks.size());
                Chunk chunk = chunkIt.next();
                log.info("  {}",chunk);
                assertEquals(chunks.get(c), chunk);
                Iterator<Token> tokenIt = chunk.getTokens();
                int foundTokens = 0;
                for(;tokenIt.hasNext();t++){
                    assertTrue(tokens.size()+" Tokens Expected (found: "+(t+1)+")",t < tokens.size());
                    Token token = tokenIt.next();
                    log.info("   {}",token);
                    assertEquals(tokens.get(t), token);
                    foundTokens++;
                }
                assertEquals(tokensInChunk+" Tokens expected in Chunk", tokensInChunk,foundTokens);
                foundChunks++;
            }
            assertEquals(chunksInSentence+" Chunks expected in Sentence", chunksInSentence,foundChunks);
            //also iterate over tokens within a sentence
            log.info(" {}",sent);
            Iterator<Token> tokenIt = sent.getTokens();
            int foundTokens = 0;
            for(;tokenIt.hasNext();foundTokens++){
                Token token = tokenIt.next();
                log.info("   {}",token);
                assertEquals(tokens.get(s*tokensInSentence+foundTokens), token);
            }
            assertEquals(tokensInSentence+" Tokens expected in Sentence", tokensInSentence,foundTokens);
        }
        assertEquals(sentences.size()+" Sentences Expected (found: "+s+")", sentences.size(),s);
        assertEquals(chunks.size()+" Chunks Expected (found: "+c+")", chunks.size(),c);
        assertEquals(tokens.size()+" Sentences Expected (found: "+t+")", tokens.size(),t);
        //also iterate over Chunks in AnalysedText
        Iterator<Chunk> chunkIt = at.getChunks();
        int foundChunks = 0;
        log.info("{}",at);
        for(;chunkIt.hasNext();foundChunks++){
            Chunk chunk = chunkIt.next();
            log.info("  {}",chunk);
            assertEquals(chunks.get(foundChunks), chunk);
        }
        assertEquals(chunks.size()+" Chunks expected in AnalysedText", chunks.size(),foundChunks);
        //also iterate over Tokens in AnalysedText
        Iterator<Token> tokenIt = at.getTokens();
        int foundTokens = 0;
        log.info("{}",at);
        for(;tokenIt.hasNext();foundTokens++){
            Token token = tokenIt.next();
            log.info("   {}",token);
            assertEquals(tokens.get(foundTokens), token);
        }
        assertEquals(tokens.size()+" Tokens expected in AnalysedText", tokens.size(),foundTokens);
        
        //Finally iterate over multiple token types
        Iterator<Span> sentencesAndChunks = at.getEnclosed(
            EnumSet.of(SpanTypeEnum.Sentence,SpanTypeEnum.Chunk));
        s=0;
        c=0;
        log.info("{} >> Iterate over Sentences and Chunks",at);
        while(sentencesAndChunks.hasNext()){
            Span span = sentencesAndChunks.next();
            log.info("  {}",span);
            if(span.getType() == SpanTypeEnum.Chunk){
                assertEquals(chunks.get(c), span);
                c++;
            } else if(span.getType() == SpanTypeEnum.Sentence){
                assertEquals(sentences.get(s), span);
                s++;
            } else {
                Assert.fail("Unexpected SpanType '"+span.getType()+" (Span: "+span.getClass()+")");
            }
        }
        assertEquals(sentences.size()+" Sentences expected in AnalysedText", sentences.size(),s);
        assertEquals((sentences.size()*chunksInSentence)+" Chunks expected in AnalysedText",
            (sentences.size()*chunksInSentence),c);
    }

    /**
     * Checks annotation storage: getAnnotation() returns the value with the
     * highest probability, getAnnotations() returns values sorted by
     * descending probability, and values without a probability keep their
     * insertion order at the end.
     */
    @Test
    public void testAnnotation(){
        List<Value<Number>> values = new ArrayList<Value<Number>>();
        values.add(new Value<Number>(26,0.6));
        values.add(new Value<Number>(27l));
        values.add(new Value<Number>(28.0f));
        values.add(new Value<Number>(25.0,0.8));
        at.addAnnotations(testAnnotation, values);
        // highest probability (0.8) wins
        Value<Number> value = at.getAnnotation(testAnnotation);
        assertNotNull(value);
        assertEquals(Double.valueOf(25.0), value.value());
        assertEquals(0.8d, value.probability());
        Number prev = Float.valueOf(24f);
        // values are strictly increasing in the expected iteration order
        for(Value<Number> v : at.getAnnotations(testAnnotation)){
            assertNotNull(v);
            assertTrue(v.value().doubleValue() > prev.doubleValue());
            prev = v.value();
        }
        //check that the order of Annotations without probability is kept
        at.addAnnotation(testAnnotation, new Value<Number>(29));
        prev = Integer.valueOf(24);
        for(Value<Number> v : at.getAnnotations(testAnnotation)){
            assertNotNull(v);
            assertTrue(v.value().intValue() > prev.intValue());
            prev = v.value();
        }
        
    }

}
Added: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/generic/nlp/src/test/java/org/apache/stanbol/enhancer/nlp/utils/NIFHelperTest.java Wed Sep 19 08:48:32 2012
@@ -0,0 +1,40 @@
+package org.apache.stanbol.enhancer.nlp.utils;
+
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;

import junit.framework.Assert;

import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.junit.Test;
+
+public class NIFHelperTest {
+
+ static UriRef base = new UriRef("http://stanbol.apache.org/test/nif/nif-helper");
+ static String text = "This is a test for the NLP Interchange format!";
+
+
+ @Test
+ public void testFragmentURI(){
+ Assert.assertEquals(
+ new UriRef(base.getUnicodeString()+"#char=23,26"),
+ NIFHelper.getNifFragmentURI(base, 23, 26));
+ }
+ @Test
+ public void testOffsetURI(){
+ Assert.assertEquals(
+ base.getUnicodeString()+"#offset_23_26",
+ NIFHelper.getNifOffsetURI(base, 23, 26).getUnicodeString());
+ }
+ @Test
+ public void testHashURI() throws IOException {
+ String selected = text.substring(23,26);
+ String context = text.substring(13,23)+'('+selected+')'+text.substring(26,36);
+ byte[] contextData = context.getBytes(Charset.forName("UTF8"));
+ String md5 = ContentItemHelper.streamDigest(new ByteArrayInputStream(contextData), null, "MD5");
+ UriRef expected = new UriRef(base.getUnicodeString()+"#hash_10_3_"+md5+"_NLP");
+ Assert.assertEquals(expected, NIFHelper.getNifHashURI(base, 23, 26, text));
+ }
+}
Modified: incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml?rev=1387488&r1=1387487&r2=1387488&view=diff
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml (original)
+++ incubator/stanbol/branches/stanbol-nlp-processing/enhancer/pom.xml Wed Sep 19 08:48:32 2012
@@ -54,6 +54,7 @@
<module>generic/core</module>
<module>generic/test</module>
<module>generic/rdfentities</module>
+ <module>generic/nlp</module>
<module>jobmanager</module>
<module>chain/allactive</module>
@@ -61,6 +62,8 @@
<module>chain/weighted</module>
<module>chain/list</module>
+ <module>engines</module>
+
<module>jersey</module>
<module>ldpath</module>
<module>benchmark</module>
Propchange: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Wed Sep 19 08:48:32 2012
@@ -0,0 +1,7 @@
+.project
+
+.settings
+
+target
+
+.classpath
Added: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/pom.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,212 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>stanbol-parent</artifactId>
+ <version>2-incubating-SNAPSHOT</version>
+ <relativePath>../../parent</relativePath>
+ </parent>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.launchers.enhancer-nlp</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <packaging>jar</packaging>
+
+ <name>Apache Stanbol Launchers for the NLP processing branch</name>
+ <description>
+ Runnable jar configured to test engines included in the
+ NLP processing branch (STANBOL-733)
+ </description>
+
+ <scm>
+ <url>http://incubator.apache.org/stanbol/</url>
+ </scm>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-clean-plugin</artifactId>
+ <configuration>
+ <filesets>
+ <fileset>
+ <directory>.</directory>
+ <includes>
+ <include>stanbol/**</include>
+ </includes>
+ </fileset>
+ </filesets>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.sling</groupId>
+ <artifactId>maven-launchpad-plugin</artifactId>
+ <!--
+ TODO the maven-launchpad-plugin can also generate a war file and
+ Karaf description, we could add this. See
+ http://sling.apache.org/site/maven-launchpad-plugin.html
+ -->
+ <executions>
+ <execution>
+ <id>prepare-package</id>
+ <goals>
+ <goal>prepare-package</goal>
+ <goal>attach-bundle-list</goal>
+ </goals>
+ <configuration>
+ <includeDefaultBundles>false</includeDefaultBundles>
+ <!-- Standalone jar requires an OSGi http service implementation -->
+ <jarWebSupport>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.http.jetty</artifactId>
+ <version>2.2.0</version>
+ </jarWebSupport>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-jar-plugin</artifactId>
+ <configuration>
+ <archive>
+ <manifest>
+ <!-- make the generated jar runnable -->
+ <addClasspath>true</addClasspath>
+ <mainClass>org.apache.stanbol.launchpad.Main</mainClass>
+ <addDefaultImplementationEntries>true</addDefaultImplementationEntries>
+ </manifest>
+ </archive>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-shade-plugin</artifactId>
+ <version>1.2</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>shade</goal>
+ </goals>
+ <configuration>
+ <artifactSet>
+ <!-- Use this to in/exclude only specific dependencies -->
+ <includes>
+ <include>org.apache.stanbol:org.apache.stanbol.launchpad</include>
+ </includes>
+ </artifactSet>
+ <transformers>
+ <transformer implementation="org.apache.maven.plugins.shade.resource.ComponentsXmlResourceTransformer" />
+ </transformers>
+ </configuration>
+ </execution>
+ </executions>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes>
+ </excludes>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <dependencies>
+ <dependency>
+ <!-- The Apache Stanbol launchpad -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.launchpad</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ </dependency>
+ <dependency>
+ <!-- maven-launchpad-plugin builds on the launchpad.base app -->
+ <groupId>org.apache.sling</groupId>
+ <artifactId>org.apache.sling.launchpad.base</artifactId>
+ <classifier>app</classifier>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- OSGi Framework Bundle List -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.launchers.bundlelists.osgiframework</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <type>partialbundlelist</type>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Stanbol Commons Bundle List -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.launchers.bundlelists.stanbolcommons</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <type>partialbundlelist</type>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Stanbol Enhancer Bundle List -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.bundlelist</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <type>partialbundlelist</type>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Stanbol Data Bundle List -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.data.bundlelists.defaultdata</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <type>partialbundlelist</type>
+ <scope>provided</scope>
+ </dependency>
+ <!-- OpenNLP Data Bundle List -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.data.bundlelists.opennlp</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <type>partialbundlelist</type>
+ <scope>provided</scope>
+ </dependency>
+ <!-- Sentiment Data Bundle List -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.data.bundlelists.sentiment</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <type>partialbundlelist</type>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- Stanbol Entityhub Bundle List -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.entityhub.bundlelist</artifactId>
+ <version>0.11.0-incubating-SNAPSHOT</version>
+ <type>partialbundlelist</type>
+ <scope>provided</scope>
+ </dependency>
+ </dependencies>
+
+</project>
Added: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/bundles/list.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,35 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+ <!--
+ List of initial bundles for the Stanbol Sling-based standalone launcher.
+ -->
+<bundles>
+ <!-- General-purpose libraries -->
+
+ <!-- *********************************************************************
+ start level 20 TO 24 reserved for Stanbol Framework
+ (Enhancer, Entityhub, Contenthub, Factstore ... incl. Web Fragments)
+ ********************************************************************* -->
+
+
+
+ <!-- *********************************************************************
+ start level >= 30 are unused
+ ********************************************************************* -->
+
+</bundles>
Added: incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/nlp-launcher/src/main/sling/common.properties Wed Sep 19 08:48:32 2012
@@ -0,0 +1,24 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This file is loaded by Apache Sling during startup. Properties defined
+# in this file are copied over to the sling.properties file in the {sling.home}
+# directory.
+
+# The stanbol home directory
+# by default this is set to the same value as sling.home
+stanbol.home=${sling.home}
+org.osgi.framework.startlevel.beginning=40
Added: incubator/stanbol/branches/stanbol-nlp-processing/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/stanbol-nlp-processing/pom.xml?rev=1387488&view=auto
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/pom.xml (added)
+++ incubator/stanbol/branches/stanbol-nlp-processing/pom.xml Wed Sep 19 08:48:32 2012
@@ -0,0 +1,33 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ You under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.stanbolnlpprocessing.reactor</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <packaging>pom</packaging>
+
+ <name>Apache Stanbol NLP Processing Branch Reactor</name>
+ <description>
+ Dummy reactor to compile all modules in the Stanbol NLP processing branch (STANBOL-733)
+ </description>
+
+ <modules>
+ <module>data</module>
+ <module>enhancer</module>
+ <module>nlp-launcher</module>
+ </modules>
+
+</project>