You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2014/10/22 12:09:42 UTC
svn commit: r1633587 - in /stanbol/trunk:
enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/
enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/
Author: rwesten
Date: Wed Oct 22 10:09:41 2014
New Revision: 1633587
URL: http://svn.apache.org/r1633587
Log:
first implementation of an NLP 2 NIF 2.0 engine (STANBOL-1397)
Added:
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java
stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java
stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java
Added: stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java?rev=1633587&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java (added)
+++ stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20Helper.java Wed Oct 22 10:09:41 2014
@@ -0,0 +1,302 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.nlp2rdf.engine;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.nio.charset.Charset;
+import java.util.Collections;
+import java.util.EnumMap;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.commons.io.IOUtils;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotated;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Annotation;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
+import org.apache.stanbol.enhancer.nlp.nif.Nif20;
+import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+import org.apache.stanbol.enhancer.nlp.pos.PosTag;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+
+public final class Nif20Helper {
+
+ private static final LiteralFactory lf = LiteralFactory.getInstance();
+
+ private Nif20Helper(){}
+
+ public static final Map<SpanTypeEnum,UriRef> SPAN_TYPE_TO_SSO_TYPE;
+ static {
+ Map<SpanTypeEnum,UriRef> mapping = new EnumMap<SpanTypeEnum,UriRef>(SpanTypeEnum.class);
+ //mapping.put(SpanTypeEnum.Text, null);
+ //mapping.put(SpanTypeEnum.TextSection, null);
+ mapping.put(SpanTypeEnum.Sentence, Nif20.Sentence.getUri());
+ mapping.put(SpanTypeEnum.Chunk, Nif20.Phrase.getUri());
+ mapping.put(SpanTypeEnum.Token, Nif20.Word.getUri());
+ SPAN_TYPE_TO_SSO_TYPE = Collections.unmodifiableMap(mapping);
+ }
+
+ /**
+ * Read-only map that maps from the {@link LexicalCategory} to the OLIA
+ * Concept representing the Phrase (e.g. {@link LexicalCategory#Noun} maps
+ * to "<code>http://purl.org/olia/olia.owl#NounPhrase</code>").
+ */
+ public static final Map<LexicalCategory,UriRef> LEXICAL_TYPE_TO_PHRASE_TYPE;
+ static {
+ String olia = "http://purl.org/olia/olia.owl#";
+ Map<LexicalCategory,UriRef> mapping = new EnumMap<LexicalCategory,UriRef>(LexicalCategory.class);
+ mapping.put(LexicalCategory.Noun, new UriRef(olia+"NounPhrase"));
+ mapping.put(LexicalCategory.Verb, new UriRef(olia+"VerbPhrase"));
+ mapping.put(LexicalCategory.Adjective, new UriRef(olia+"AdjectivePhrase"));
+ mapping.put(LexicalCategory.Adverb, new UriRef(olia+"AdverbPhrase"));
+ mapping.put(LexicalCategory.Conjuction, new UriRef(olia+"ConjuctionPhrase"));
+ LEXICAL_TYPE_TO_PHRASE_TYPE = Collections.unmodifiableMap(mapping);
+ }
+ /**
+ * Creates a NIF2.0 Fragment URI using the parsed base URI and the start/end
+ * indexes.
+ * @param base the base URI
+ * @param start the start position. If <code>< 0</code> than zero is added.
+ * @param end the end position or values < 1 when open ended.
+ * @return the NIF 2.0 Fragment URI
+ * @throws IllegalArgumentException if <code>null</code> is parsed as base
+ * {@link UriRef} or the end position is >=0 but <= the parsed start
+ * position.
+ */
+ public static final UriRef getNifFragmentURI(UriRef base, int start,int end){
+ if(base == null){
+ throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
+ }
+ StringBuilder sb = new StringBuilder(base.getUnicodeString());
+ sb.append("#char=");
+ sb.append(start >= 0 ? start : 0).append(',');
+ if(end >= 0){
+ if(end < start){
+ throw new IllegalArgumentException("End index '"+end+"' < start '"+start+"'!");
+ }
+ sb.append(end);
+ } //else open ended ...
+ return new UriRef(sb.toString());
+ }
+
+ public static final UriRef getNifRFC5147URI(UriRef base, int start, int end){
+ if(base == null){
+ throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
+ }
+ assert start >= 0;
+ assert end < 0 || end >= start;
+ StringBuilder sb = new StringBuilder(base.getUnicodeString());
+ sb.append("#char=");
+ sb.append(start >= 0 ? start : 0);
+ if(end >= 0){
+ sb.append(',').append(end);
+ } //else select the whole string ...
+ return new UriRef(sb.toString());
+ }
+
+ public static final int NIF_HASH_CONTEXT_LENGTH = 10;
+ public static final int NIF_HASH_MAX_STRING_LENGTH = 20;
+
+ public static final Charset UTF8 = Charset.forName("UTF8");
+
+ public static final UriRef getNifHashURI(UriRef base, int start, int end, String text){
+ if(base == null){
+ throw new IllegalArgumentException("Base URI MUST NOT be NULL!");
+ }
+ start = start < 0 ? 0 : start;
+ end = end < 0 ? start : end;
+ if(end < start){
+ throw new IllegalArgumentException("End index '"+end+"' < start '"+start+"'!");
+ }
+ if(end >= text.length()){
+ throw new IllegalArgumentException("The End index '"+end+"' exeeds the "
+ + "length of the text '"+text.length()+"'!");
+ }
+ int contextStart = Math.max(0, start-NIF_HASH_CONTEXT_LENGTH);
+ int contextEnd = Math.min(text.length(), end+NIF_HASH_CONTEXT_LENGTH);
+ StringBuilder sb = new StringBuilder(base.getUnicodeString());
+ sb.append("#hash_");
+ sb.append(NIF_HASH_CONTEXT_LENGTH);
+ sb.append('_');
+ sb.append(end-start);
+ sb.append('_');
+ sb.append(getContextDigest(text, contextStart, start, end, contextEnd));
+ sb.append('_');
+ sb.append(text.substring(start,
+ Math.min(end,start+NIF_HASH_MAX_STRING_LENGTH)));
+ return new UriRef(sb.toString());
+ }
+
+ /**
+ * Creates the UTF8 byte representation for the '{prefix}({selected}){suffix}'
+ * calculated based on the parsed parameters
+ * @param text the text
+ * @param contextStart the start index of the prefix
+ * @param start the start index of the selected text part
+ * @param end the end index of the selecte text part
+ * @param contextEnd the end index of the suffix
+ * @return the HASH string representation of the MD5 over
+ * <code>'{prefix}({selected}){suffix}'</code> (NOTE the brackets that are
+ * added at the start/end of the selected text)
+ */
+ private static String getContextDigest(String text, int contextStart, int start, int end, int contextEnd) {
+ ByteArrayOutputStream contextOs = new ByteArrayOutputStream();
+ Writer contextWriter = new OutputStreamWriter(contextOs, UTF8);
+ try {
+ if(contextStart<start){
+ contextWriter.append(text, contextStart, start);
+ }
+ contextWriter.append('(');
+ if(start < end){
+ contextWriter.append(text, start, end);
+ }
+ contextWriter.append(')');
+ if(end < contextEnd){
+ contextWriter.append(text,end,contextEnd);
+ }
+ contextWriter.flush();
+ return ContentItemHelper.streamDigest(
+ new ByteArrayInputStream(contextOs.toByteArray()),
+ null, "MD5");
+ } catch (IOException e) {
+ //NO IOExceptions in in-memory stream implementations
+ throw new IllegalStateException(e);
+ } finally {
+ IOUtils.closeQuietly(contextOs);
+ }
+ }
+
+ /**
+ * Writes basic information of the parsed span by using NIF 1.0 including the
+ * {@link SsoOntology} Sentence/Phrase/Word type based on
+ * the {@link Span#getType()}<p>
+ * As {@link AnalysedText} is based on the plain text version of the ContentItem
+ * this uses the {@link StringOntology#OffsetBasedString} notation.<p>
+ * <i>NOTE:</i> This DOES NOT write string relations, lemma, pos ... information
+ * that might be stored as {@link Annotation} with the parsed {@link Span}.
+ * @param graph the graph to add the triples
+ * @param base the base URI
+ * @param text the {@link AnalysedText}
+ * @param language the {@link Language} or <code>null</code> if not known
+ * @param span the {@link Span} to write.
+ * @return the {@link UriRef} representing the parsed {@link Span} in the
+ * graph
+ */
+ public static UriRef writeSpan(MGraph graph, UriRef base, AnalysedText text, Language language, Span span){
+ UriRef segment = getNifRFC5147URI(base, span.getStart(),
+ span.getType() == SpanTypeEnum.Text ? -1 : span.getEnd());
+ graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.RFC5147String.getUri()));
+ if(span.getEnd() - span.getStart() < 100){
+ graph.add(new TripleImpl(segment, Nif20.anchorOf.getUri(),
+ new PlainLiteralImpl(span.getSpan(),language)));
+ } else {
+ graph.add(new TripleImpl(segment, Nif20.head.getUri(),
+ new PlainLiteralImpl(span.getSpan().substring(0,10),language)));
+ }
+ graph.add(new TripleImpl(segment, Nif20.beginIndex.getUri(),
+ lf.createTypedLiteral(span.getStart())));
+ graph.add(new TripleImpl(segment, Nif20.endIndex.getUri(),
+ lf.createTypedLiteral(span.getEnd())));
+ switch (span.getType()) {
+ case Token:
+ graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Word.getUri()));
+ break;
+ case Chunk:
+ graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Phrase.getUri()));
+ break;
+ case Sentence:
+ graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Sentence.getUri()));
+ break;
+ case Text:
+ graph.add(new TripleImpl(segment, RDF_TYPE, Nif20.Context.getUri()));
+ break;
+ default:
+ // no default:
+ }
+ return segment;
+ }
+
+ /**
+ * Writes the {@link NlpAnnotations#POS_ANNOTATION} as NIF 1.0 to the parsed
+ * RDF graph by using the parsed segmentUri as subject
+ * @param graph the graph
+ * @param annotated the annotated element (e.g. a {@link Token})
+ * @param segmentUri the URI of the resource representing the parsed
+ * annotated element in the graph
+ */
+ public static void writePos(MGraph graph, Annotated annotated, UriRef segmentUri) {
+ Value<PosTag> posTag = annotated.getAnnotation(NlpAnnotations.POS_ANNOTATION);
+ if(posTag != null){
+ if(posTag.value().isMapped()){
+ for(Pos pos : posTag.value().getPos()){
+ graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(),
+ pos.getUri()));
+ }
+ for(LexicalCategory cat : posTag.value().getCategories()){
+ graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(),
+ cat.getUri()));
+ }
+ }
+ graph.add(new TripleImpl(segmentUri, Nif20.posTag.getUri(),
+ lf.createTypedLiteral(posTag.value().getTag())));
+ graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE,
+ lf.createTypedLiteral(posTag.probability())));
+ }
+ }
+
+ /**
+ * Writes a {@link NlpAnnotations#PHRASE_ANNOTATION} as NIF 1.0 to the
+ * parsed RDF graph by using the segmentUri as subject
+ * @param graph the graph
+ * @param annotated the annotated element (e.g. a {@link Chunk})
+ * @param segmentUri the URI of the resource representing the parsed
+ * annotated element in the graph
+ */
+ public static void writePhrase(MGraph graph, Annotated annotated, UriRef segmentUri) {
+ Value<PhraseTag> phraseTag = annotated.getAnnotation(NlpAnnotations.PHRASE_ANNOTATION);
+ if(phraseTag != null){
+ UriRef phraseTypeUri = LEXICAL_TYPE_TO_PHRASE_TYPE.get(phraseTag.value().getCategory());
+ if(phraseTypeUri != null){ //add the oliaLink for the Phrase
+ graph.add(new TripleImpl(segmentUri, Nif20.oliaCategory.getUri(), phraseTypeUri));
+ graph.add(new TripleImpl(segmentUri, ENHANCER_CONFIDENCE,
+ lf.createTypedLiteral(phraseTag.probability())));
+ }
+ }
+ }
+
+}
Added: stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java?rev=1633587&view=auto
==============================================================================
--- stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java (added)
+++ stanbol/trunk/enhancement-engines/nlp2rdf/src/main/java/org/apache/stanbol/enhancer/engines/nlp2rdf/engine/Nif20MetadataEngine.java Wed Oct 22 10:09:41 2014
@@ -0,0 +1,197 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.nlp2rdf.engine;
+
+import static org.apache.stanbol.enhancer.nlp.utils.NlpEngineHelper.getAnalysedText;
+
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.EnumSet;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.ConfigurationPolicy;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.Span;
+import org.apache.stanbol.enhancer.nlp.model.SpanTypeEnum;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.nif.Nif20;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Component(immediate = true, metatype = true,
+ configurationFactory = true, //allow multiple configuration
+ policy = ConfigurationPolicy.OPTIONAL) //create a default instance
+@Service
+@Properties(value={
+ @Property(name= EnhancementEngine.PROPERTY_NAME,value="nif20")
+})
+public class Nif20MetadataEngine extends AbstractEnhancementEngine<RuntimeException,RuntimeException> implements ServiceProperties{
+
+ private final Logger log = LoggerFactory.getLogger(Nif20MetadataEngine.class);
+ //TODO: replace this with a reald ontology
+ private final static UriRef SENTIMENT_PROPERTY = new UriRef(NamespaceEnum.fise+"sentiment-value");
+ private final LiteralFactory lf = LiteralFactory.getInstance();
+
+ /**
+ * Activate and read the properties. Configures and initialises a ChunkerHelper for each language configured in
+ * CONFIG_LANGUAGES.
+ *
+ * @param ce the {@link org.osgi.service.component.ComponentContext}
+ */
+ @Activate
+ protected void activate(ComponentContext ce) throws ConfigurationException {
+ log.info("activating POS tagging engine");
+ super.activate(ce);
+ @SuppressWarnings("unchecked")
+ Dictionary<String, Object> properties = ce.getProperties();
+ //TODO: read configuration
+ }
+
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ return getAnalysedText(this, ci, false) != null ?
+ ENHANCE_ASYNC : CANNOT_ENHANCE;
+ }
+
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+ AnalysedText at = getAnalysedText(this, ci, true);
+ String lang = EnhancementEngineHelper.getLanguage(ci);
+ Language language = lang == null ? null : new Language(lang);
+ //now iterate over the AnalysedText data and create the RDF representation
+ //TODO: make configureable
+ boolean sentences = true;
+ boolean phrases = true;
+ boolean words = true;
+
+ EnumSet<SpanTypeEnum> activeTypes = EnumSet.noneOf(SpanTypeEnum.class);
+ if(sentences){
+ activeTypes.add(SpanTypeEnum.Sentence);
+ }
+ if(phrases){
+ activeTypes.add(SpanTypeEnum.Chunk);
+ }
+ if(words){
+ activeTypes.add(SpanTypeEnum.Token);
+ }
+ MGraph metadata = ci.getMetadata();
+ UriRef base = ci.getUri();
+ ci.getLock().writeLock().lock();
+ try {
+ //write the context
+ UriRef text = Nif20Helper.writeSpan(metadata, base, at, language, at);
+ metadata.add(new TripleImpl(text, Nif20.sourceUrl.getUri(), ci.getUri()));
+
+ Iterator<Span> spans = at.getEnclosed(activeTypes);
+ UriRef sentence = null;
+ UriRef phrase = null;
+ UriRef word = null;
+ boolean firstWordInSentence = true;
+ while(spans.hasNext()){
+ Span span = spans.next();
+ //TODO: filter Spans based on additional requirements
+ //(1) write generic information about the span
+ UriRef current = Nif20Helper.writeSpan(metadata, base, at, language, span);
+ //write the context
+ metadata.add(new TripleImpl(current, Nif20.referenceContext.getUri(), text));
+ //(2) add the relations between the different spans
+ switch (span.getType()) {
+ case Sentence:
+ if(sentence != null){
+ metadata.add(new TripleImpl(sentence, Nif20.nextSentence.getUri(), current));
+ }
+ sentence = current;
+ firstWordInSentence = true;
+ break;
+ case Chunk:
+ if(sentence != null){
+ metadata.add(new TripleImpl(current, Nif20.superString.getUri(), sentence));
+ if(word != null){
+ metadata.add(new TripleImpl(word, Nif20.lastWord.getUri(), sentence));
+ }
+ }
+ phrase = current;
+ break;
+ case Token:
+ if(sentence != null){
+ metadata.add(new TripleImpl(current, Nif20.sentence.getUri(), sentence));
+ if(firstWordInSentence){
+ metadata.add(new TripleImpl(current, Nif20.firstWord.getUri(), sentence));
+ firstWordInSentence = false;
+ }
+ }
+ if(phrase != null){
+ metadata.add(new TripleImpl(current, Nif20.subString.getUri(), phrase));
+ }
+ if(word != null){
+ metadata.add(new TripleImpl(word, Nif20.nextWord.getUri(), current));
+ metadata.add(new TripleImpl(current, Nif20.previousWord.getUri(), word));
+ }
+ word = current;
+ break;
+ default:
+ break;
+ }
+ //(3) add specific information such as POS, chunk type ...
+ Nif20Helper.writePos(metadata, span, current);
+ Nif20Helper.writePhrase(metadata, span, current);
+
+ //OlIA does not include Sentiments
+
+ Value<Double> sentiment = span.getAnnotation(NlpAnnotations.SENTIMENT_ANNOTATION);
+ if(sentiment != null && sentiment.value() != null){
+ metadata.add(new TripleImpl(current, SENTIMENT_PROPERTY,
+ lf.createTypedLiteral(sentiment.value())));
+ }
+ }
+ } finally {
+ ci.getLock().writeLock().unlock();
+ }
+ }
+
+ @Override
+ public Map<String,Object> getServiceProperties() {
+ return Collections.singletonMap(ServiceProperties.ENHANCEMENT_ENGINE_ORDERING,
+ (Object)ServiceProperties.ORDERING_POST_PROCESSING);
+ }
+
+
+
+
+
+}
Added: stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java?rev=1633587&view=auto
==============================================================================
--- stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java (added)
+++ stanbol/trunk/enhancer/generic/nlp/src/main/java/org/apache/stanbol/enhancer/nlp/nif/Nif20.java Wed Oct 22 10:09:41 2014
@@ -0,0 +1,545 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.nlp.nif;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
+import org.apache.stanbol.enhancer.nlp.pos.Pos;
+
+public enum Nif20 {
+ /**
+ * A URI Scheme for NIF which is able to refer to a single, consecutive
+ * string in a context. Note that any scheme subclassing this class,
+ * requires the existence of beginIndex, endIndex and referenceContext .
+ * <p>
+ * This is an abstract class and should not be serialized.
+ */
+ CString,
+ /**
+ * An arbitrary URI (e.g. a URN) for an arbitrary string of the context.
+ * This is roughly the same as TextAnnotations are currently implemented in Stanbol.
+ */
+ CStringInst,
+ /**
+ * The string that serves as a context for its substrings. The Unicode String
+ * given in the nif:isString property must be used to calculate the begin and
+ * endIndex for all nif:Strings that have a nif:referenceContext property to
+ * this URI. For further information, see
+ * http://svn.aksw.org/papers/2013/ISWC_NIF/public.pdf
+ */
+ Context,
+ /**
+ * A collection of contexts used to create an unordered set of context via
+ * the nif:hasContext property. This can be compared to a document collection,
+ * but here it is a collection of nif:Context and therefore a collection of
+ * annotated strings, not documents.
+ */
+ ContextCollection,
+ /**
+ * see <a href="http://jens-lehmann.org/files/2012/ekaw_nif.pdf">
+ * Linked-Data Aware URI Schemes for Referencing Text Fragments</a>
+ * by Sebastian Hellmann, Jens Lehmann und Sören Auer in EKAW 2012 for more
+ * information.
+ */
+ ContextHashBasedString,
+ /**
+ * see <a href="http://jens-lehmann.org/files/2012/ekaw_nif.pdf">
+ * Linked-Data Aware URI Schemes for Referencing Text Fragments</a>
+ * by Sebastian Hellmann, Jens Lehmann und Sören Auer in EKAW 2012 for more
+ * information.
+ */
+ OffsetBasedString,
+ /**
+ * A Paragraph
+ */
+ Paragraph,
+ /**
+ * A nif:Phrase can be a nif:String, that is a chunk of several words or a
+ * word itself (e.g. a NounPhrase as a Named Entity). The term is underspecified
+ * and can be compatible with many defintitions of phrase. Please subClass
+ * it to specify the meaning (e.g. for Chunking or Phrase Structure Grammar).
+ * Example: ((My dog)(also)(likes)(eating (sausage)))
+ */
+ Phrase,
+ /**
+ * URIs of this class have to conform with the syntax of <a
+ * href="http://tools.ietf.org/html/rfc5147">RFC 5147</a> in a way that the
+ * end on a valid identifier, if you remove the prefix. Note that unlike
+ * RFC 5147 NIF does not require '#' URIs. So valid URIs are
+ * http://example.org#char=0,28 , http://example.org/whatever/char=0,28 ,
+ * http://example.org/nif?char=0,28
+ */
+ RFC5147String,
+ /**
+ * A Sentence
+ */
+ Sentence,
+ /**
+ * Individuals of this class are a string, i.e. Unicode characters, who
+ * have been given a URI and are used in the subject of an RDF statement.
+ * <p>
+ * This class is abstract and should not be serialized.
+ * <p>
+ * NIF-Stanbol (nif-stanbol.ttl): subclassOf nifs:Annotation because it
+ * "annotates" strings for example with begin and end index. The class is
+ * similar to fise:TextAnnotation
+ */
+ String,
+ /**
+ * A structure is a more or less arbitrary label for a partitioning of a
+ * string. We do not follow a strict approach for what a word, phrase,
+ * sentence, title, paragraph is. These labels enable the definition
+ * processes for tool chains, e.g. tool analyses nif:Paragraph and
+ * calculates term frequency.
+ * <p>
+ * This is an abstract class and should not be serialized.
+ */
+ Structure,
+ /**
+ * A title within a text.
+ */
+ Title,
+ /**
+ * A URI Scheme for NIF, subclasses need to define guidelines on the URI
+ * Scheme as well as the text it refers to. This class is just to keep some
+ * order, and should not be serialized.
+ * <p>
+ * This is an abstract class and should not be serialized.
+ */
+ URIScheme,
+ /**
+ * The Word class represents strings that are tokens or words. A string is
+ * a Word, if it is a word. We don't nitpick about whether it is a pronoun,
+ * a name, a punctuation mark or an apostrophe or whether it is separated
+ * by white space from another Word or something else. The string
+ * 'He enters the room.' for example has 5 words. Words are assigned by a
+ * tokenizer NIF Implementation. Single word phrases might be tagged as
+ * nif:Word and nif:Phrase.
+ *
+ * Example 1: "The White House" are three Words separated by whitespace
+ *
+ * Comment 1: We adopted the definition style from foaf:Person, see
+ * here: http://xmlns.com/foaf/spec/#term_Person We are well aware that
+ * the world out there is much more complicated, but we are ignorant about
+ * it, for the following reasons:
+ *
+ * Comment 2: <ol>
+ * <li> NIF has a client-server and the client has the ability to
+ * dictate the tokenization to the server (i.e. the NIF Implementation) by
+ * sending properly tokenized NIF annotated with nif:Word. All NIF
+ * Implementations are supposed to honor and respect the current assignment
+ * of the Word class. Thus the client should decide which NIF Implementation
+ * should create the tokenization. Therefore this class is not descriptive,
+ * but prescriptive.
+ * <li>The client may choose to send an existing tokenization to a NIF
+ * Implementation, with the capability to change (for better or for worse)
+ * the tokenization.
+ * </ol>
+ *
+ * The class has not been named 'Token' as the NLP definition of 'token'
+ * is descriptive (and not well-defined), while the assignment of what is
+ * a Word and what not is prescriptive, e.g. "can't" could be described as
+ * one, two or three tokens or defined as being one, two or three words.
+ * For further reading, we refer the reader to: By all these lovely tokens...
+ * Merging conflicting tokenizations by Christian Chiarcos, Julia Ritz, and
+ * Manfred Stede. Language Resources and Evaluation 46(1):53-74 (2012) or
+ * the short form: http://www.aclweb.org/anthology/W09-3005
+ *
+ * There the task at hand is to merge two tokenization T_1 and T_2 which
+ * is normally not the case in the NIF world as tokenization is prescribed,
+ * i.e. given as a baseline (Note that this ideal state might not be
+ * achieved by all implementations.)
+ */
+ Word,
+ //Object Properties
+ /**
+ * see <a href="http://svn.aksw.org/papers/2012/PeoplesWeb/public_preprint.pdf">
+ * Towards Web-Scale Collaborative Knowledge Extraction</a>, page 21
+ */
+ annotation,
+ /**
+ * This property should be used to express that one Context is contained in
+ * another Context, e.g. several sentences of a document are modelled
+ * indivudally and refer to the broader context of the whole document.
+ */
+ broaderContext,
+ /**
+ * A dependency relation pointing from gov to dep.
+ */
+ dependency,
+ /**
+ * Links a nif:ContextCollection to its contexts.
+ */
+ hasContext,
+ /**
+ * This property links sentences to their first word.
+ */
+ firstWord,
+ /**
+ * This property links sentences to their last word.
+ */
+ lastWord,
+ /**
+ * This property links sentences to their words.
+ */
+ word,
+ /**
+ * This object property models a relation between two nif:Strings.
+ * The name "inter" is kept generic and can be used to express any kind of
+ * relation in between (inter) two nif:Strings. Extensions can create
+ * rdfs:subPropertyOf for "head", "dependent", nif:substring and
+ * nif:nextWord.
+ */
+ inter,
+ /**
+ * Defines the language of a substring of the context.
+ * If the language for the nif:Context should be specified,
+ * nif:predominantLanguage must be used.
+ */
+ lang,
+ /**
+ * The inverse of nif:broaderContext
+ */
+ narrowerContext,
+ /**
+ * This property can be used to make resources of
+ * nif:Sentence traversable, it can not be assumed that no gaps
+ * or whitespaces between sentences or words exist, i.e. string adjacency
+ * is not mandatory. The transitivity axioms are included in nif-core-inf.ttl
+ * and need to be included separately to keep a low reasoning profile.
+ * They are modeled after skos:broader and skos:broaderTransitive
+ */
+ nextSentence,
+ /**
+ * transitive version of {@link #nextSentence}
+ */
+ nextSentenceTrans,
+ /**
+ * This property can be used to make resources of
+ * nif:Word traversable, it can not be assumed that no gaps
+ * or whitespaces between sentences or words exist, i.e. string adjacency
+ * is not mandatory. The transitivity axioms are included in nif-core-inf.ttl
+ * and need to be included separately to keep a low reasoning profile.
+ * They are modeled after skos:broader and skos:broaderTransitive
+ */
+ nextWord,
+ /**
+ * transitive version of {@link #nextWord}
+ */
+ nextWordTrans,
+ /**
+ * This property links a string to a URI from one of the OLiA Annotation models,
+ * e.g. members of the {@link Pos} enumeration.
+ */
+ oliaLink,
+ /**
+ * This property is used to link to a <a href="http://marl.gi2mo.org/?page_id=1#overview">marl:Opinion</a>.
+ * We have not investigated marl, so it might be replaced.
+ * <p>
+ * InverseOf marl:extractedFrom
+ */
+ opinion,
+ /**
+ * Defines the predominant language of the text. If this annotation is given
+ * on a nif:Context, all NIF tools have to treat the text to be in this
+ * language unless specified differently for a subpart. To change the
+ * language for a smaller substring nif:lang must be used.
+ * <p>
+ * This property requires a uri as an argument. We expect this to be a URI
+ * from the lexvo.org namespace, e.g. http://lexvo.org/id/iso639-3/eng using
+ * ISO639-3
+ * <p>
+ * Examples:
+ * <p>
+ * "The dealer says: "Rien ne va plus!" "
+ * <p>
+ * has nif:predominantLanguage http://lexvo.org/id/iso639-3/eng and
+ * nif:lang http://www.lexvo.org/page/iso639-3/fra
+ * <p>
+ * see also: http://www.w3.org/TR/its20/#selection-local
+ * <p>
+ * Tests for RDFUnit (not written yet):
+ * <p>
+ * - write a test for RDFUnit, so people do not use
+ * http://www.lexvo.org/page/iso639-3/eng
+ */
+ predLang,
+ /**
+ * This property can be used to make resources of
+ * nif:Sentence traversable, it can not be assumed that no gaps
+ * or whitespaces between sentences or words exist, i.e. string adjacency
+ * is not mandatory. The transitivity axioms are included in nif-core-inf.ttl
+ * and need to be included separately to keep a low reasoning profile.
+ * They are modeled after skos:broader and skos:broaderTransitive
+ */
+ previousSentence,
+ /**
+ * Transitive version of {@link #previousSentence}
+ */
+ previousSentenceTrans,
+ /**
+ * This property can be used to make resources of
+ * nif:Word traversable, it can not be assumed that no gaps
+ * or whitespaces between sentences or words exist, i.e. string adjacency
+ * is not mandatory. The transitivity axioms are included in nif-core-inf.ttl
+ * and need to be included separately to keep a low reasoning profile.
+ * They are modeled after skos:broader and skos:broaderTransitive
+ */
+ previousWord,
+ /**
+ * Transitive version of {@link #previousWord}
+ */
+ previousWordTrans,
+ /**
+ * Links to the URI describing the provenance
+ */
+ oliaProv,
+ /**
+ * Links a URI of a string to its reference context of type nif:Context.
+ * The reference context determines the calculation of begin and end index
+ * <p>
+ * Each String that is not an instance of nif:Context MUST have exactly one
+ * reference context.
+ * <p>
+ * Inferences (nif-core-inf.ttl):
+ * <p>
+ * Instances of nif:Context do have itself as reference context, this is
+ * inferred automatically, MAY be materialized, as well.
+ * <p>
+ * OWL validation (nif-core-val.ttl):
+ * <p>
+ * This property is functional.
+ */
+ referenceContext,
+ /**
+ * This property links words to their sentence.
+ */
+ sentence,
+ /**
+ * The URL the context was extracted from, e.g. the blog or news article url.
+ * Doesn't matter whether it is HTML or XML or plain text. rdfs:range is
+ * foaf:Document. Subproperty of prov:hadPrimarySource. In case the string
+ * comes from another NIF String and gives the exact provenance, please use
+ * nif:wasConvertedFrom or a subProperty thereof.
+ */
+ sourceUrl,
+ /**
+ * This property together with nif:subString, nif:superString, and their
+ * transitive extension can be used to express that one string is contained
+ * in another one. Examples: "a" nif:subString "apple" , "apple"
+ * nif:subString "apple". The transitivity axioms are included in
+ * nif-core-inf.ttl and need to be included separately to keep a low
+ * reasoning profile. They are modeled after skos:broader and
+ * skos:broaderTransitive
+ */
+ subString,
+ /**
+ * Inverse of {@link #subString}
+ */
+ superString,
+ /**
+ * Transitive version of {@link #dependency}
+ */
+ dependencyTrans,
+ /**
+ * Transitive version of {@link #subString}
+ */
+ subStringTrans,
+ /**
+ * Transitive version of {@link #superString}
+ */
+ superStringTrans,
+ /**
+ * This property should be used, when mapping one nif:String or nif:Context
+ * to another and is often confused with nif:sourceUrl.
+ * <p>
+ * While nif:sourceUrl is built on PROV-O and is used to link the nif:Context
+ * to the document URL for provenance information, nif:wasConvertedFrom is more
+ * precise and pinpoints exact locations where a certain NIF String
+ * "wasConvertedFrom".
+ * <p>
+ * nif:wasConvertedFrom is therefore used to provide *exact* provenance
+ * during a conversion process, e.g. when removing tags from XHTML and then
+ * linking XPath URIs to NIF index based URIs (e.g. RFC 5147 with char=x,y).
+ * An example of the usage of this property can be found here:
+ * http://www.w3.org/TR/its20/#conversion-to-nif
+ * <p>
+ * Example
+ * <p>
+ * # "Dublin"
+ * <p>
+ * <http://example.com/myitsservice?informat=html&intype=url&input=http://example.com/doc.html&char=11,17>
+ * <p>
+ * nif:wasConvertedFrom
+ * <p>
+ * <http://example.com/myitsservice?informat=html&intype=url&input=http://example.com/doc.html&xpath=/html/body[1]/h2[1]/span[1]/text()[1]>.
+ */
+ wasConvertedFrom,
+ //Datatype properties
+ /**
+ * For each string you can include a snippet (e.g. 10-40 characters of text),
+ * that occurs immediately after the subject string.
+ */
+ after,
+ /**
+ * The string, which the URI is representing as an RDF Literal. Some use
+ * cases require this property, as it is necessary for certain sparql queries.
+ */
+ anchorOf,
+ /**
+ * For each string you can include a snippet (e.g. 10-40 characters of text),
+ * that occurs immediately before the subject string.
+ */
+ before,
+ /**
+ * The begin index of a character range as defined in
+ * http://tools.ietf.org/html/rfc5147#section-2.2.1 and
+ * http://tools.ietf.org/html/rfc5147#section-2.2.2, measured as the gap
+ * between two characters, starting to count from 0 (the position before
+ * the first character of a text).
+ * <p>
+ * Example: Index "2" is the position between "Mr" and "." in "Mr. Sandman".
+ * <p>
+ * Note: RFC 5147 is re-used for the definition of character ranges. RFC 5147
+ * is assuming a text/plain MIME type. NIF builds upon Unicode and is content
+ * agnostic.
+ * <p>
+ * Requirement (1): This property has the same value as the "Character position"
+ * of RFC 5147 and it MUST therefore be castable to xsd:nonNegativeInteger,
+ * i.e. it MUST not have negative values.
+ * <p>
+ * Requirement (2): The index of the subject string MUST be calculated
+ * relative to the nif:referenceContext of the subject. If available, this
+ * is the rdf:Literal of the nif:isString property.
+ */
+ beginIndex,
+ /**
+ * The confidence is relative to the tool and can be between 0.0 and 1.0,
+ * it is for nif:oliaLink and therefore also for nif:oliaCategory.
+ */
+ oliaConf,
+ /**
+ * The end index of a character range as defined in
+ * http://tools.ietf.org/html/rfc5147#section-2.2.1 and
+ * http://tools.ietf.org/html/rfc5147#section-2.2.2, measured as the gap
+ * between two characters, starting to count from 0 (the position before
+ * the first character of a text).
+ * <p>
+ * Example: Index "2" is the position between "Mr" and "." in "Mr. Sandman".
+ * <p>
+ * Note: RFC 5147 is re-used for the definition of character ranges. RFC 5147
+ * is assuming a text/plain MIME type. NIF builds upon Unicode and is content
+ * agnostic.
+ * <p>
+ * Requirement (1): This property has the same value as the "Character position"
+ * of RFC 5147 and it must therefore be an xsd:nonNegativeInteger .
+ * <p>
+ * Requirement (2): The index of the subject string MUST be calculated
+ * relative to the nif:referenceContext of the subject. If available, this
+ * is the rdf:Literal of the nif:isString property.
+ */
+ endIndex,
+ /**
+ * The first few chars of the nif:anchorOf. Typically used if the nif:anchorOf
+ * is too long for inclusion as RDF literal.
+ */
+ head,
+ /**
+ * The reference text as rdf:Literal for this nif:Context resource.
+ * NIF requires that the reference text (i.e. the context) is always
+ * included in the RDF as an rdf:Literal.
+ * <p>
+ * Note, that the isString property is *the* place to keep the string itself
+ * in RDF.
+ * <p>
+ * All other nif:Strings and nif:URISchemes relate to the text of this
+ * property to calculate character position and indices.
+ */
+ isString,
+ /**
+ * The lemma(s) of the nif:String.
+ */
+ lemma,
+ /**
+ * see <a href="http://svn.aksw.org/papers/2012/PeoplesWeb/public_preprint.pdf">
+ * Towards Web-Scale Collaborative Knowledge Extraction</a> &ndash; page 21.
+ */
+ literalAnnotation,
+ /**
+ * To include the pos tag as it comes out of the NLP tool as RDF Literal.
+ * This property is discouraged to use alone, please use oliaLink and
+ * oliaCategory. We included it, because some people might still want it
+ * and will even create their own property, if the string variant is missing
+ */
+ posTag,
+ /**
+ * Between -1 negative and 1 positive
+ */
+ sentimentValue,
+ /**
+ * The stem(s) of the nif:String.
+ */
+ stem,
+ //Annotation properties
+ /**
+ * A simple annotation for machine learning purposes. The object can be
+ * anything, e.g. the literal "A. PRESS: Reportage" from Brown or any URI.
+ */
+ category,
+ /**
+ * see <a href="http://svn.aksw.org/papers/2012/PeoplesWeb/public_preprint.pdf">
+ * Towards Web-Scale Collaborative Knowledge Extraction</a> &ndash; page 12.
+ */
+ classAnnotation,
+ /**
+ * This property marks the most specific class from itsrdf:taClassRef.
+ * The rule is: from the set S of itsrdf:taClassRef attached to this resource
+ * taMscRef points to the one that does not have any subclasses in the set
+ * S except itself. So if taClassRef is owl:Thing, dbo:Agent, dbo:Person,
+ * dbo:Actor, then taMsClassRef is dbo:Actor.
+ */
+ taMsClassRef,
+ /**
+ * This property links a string URI to classes of the OLiA Reference model.
+ * It provides a direct link for querying, thus it is a redundant optimization.
+ * <p>
+ * Values are expected to be member of {@link Pos}
+ */
+ oliaCategory,
+ ;
+ public final static String NAMESPACE = "http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#";
+
+ UriRef uri;
+
+ /**
+ * Creates the enum constant and eagerly builds its {@link UriRef} by
+ * appending the Java {@link #name()} of the constant to the NIF core
+ * {@link #NAMESPACE} — the enum constant names intentionally match the
+ * local names of the ontology terms.
+ */
+ private Nif20() {
+ uri = new UriRef(NAMESPACE+name());
+ }
+
+ /**
+ * Getter for the local name of this NIF term within {@link #NAMESPACE}
+ * (identical to the Java enum constant {@link #name()}).
+ * @return the local name of this term
+ */
+ public String getLocalName(){
+ return name();
+ }
+
+ /**
+ * Getter for the full URI of this NIF 2.0 term
+ * ({@link #NAMESPACE} + {@link #getLocalName()}).
+ * @return the {@link UriRef} of this term
+ */
+ public UriRef getUri(){
+ return uri;
+ }
+
+ /**
+ * Returns the full URI of the term as unicode string — note that this is
+ * NOT the default enum {@code name()} representation.
+ */
+ @Override
+ public String toString() {
+ return uri.getUnicodeString();
+ }
+
+}