You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/11/26 12:39:38 UTC
svn commit: r1413560 [2/3] - in /stanbol/trunk: data/ data/defaultconfig/
data/defaultconfig/src/main/resources/
data/defaultconfig/src/main/resources/config/ data/opennlp/lang/de/
data/sentiment/ data/sentiment/sentiwordnet/ data/sentiment/sentiwordne...
Modified: stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml (original)
+++ stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml Mon Nov 26 11:39:25 2012
@@ -51,6 +51,11 @@
<artifactId>org.apache.stanbol.enhancer.core</artifactId>
<version>0.10.0-SNAPSHOT</version>
</bundle>
+ <bundle> <!-- NLP processing (STANBOL-733) -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle>
</startLevel>
<!-- LDPath -->
<startLevel level="30">
@@ -146,25 +151,74 @@
<artifactId>org.apache.stanbol.enhancer.engines.tika</artifactId>
<version>0.10.0-SNAPSHOT</version>
</bundle>
-
- <!-- Named Entity Recoqunition (NER)-->
-
+
+ <!-- NLP processing engines (all STANBOL-733 and sub-tasks) -->
+
+ <bundle><!-- sentence detection with OpenNLP -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.opennlp.sentence</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle>
+ <bundle><!-- OpenNLP based tokenizing of Texts -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.opennlp.token</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle>
+ <bundle><!-- POS tagging with OpenNLP -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.opennlp.pos</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle>
+ <bundle><!-- Chunking tagging with OpenNLP -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.opennlp.chunker</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle>
<bundle> <!-- Open NLP based NER -->
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.engines.opennlp.ner</artifactId>
<version>0.10.0-SNAPSHOT</version>
</bundle>
+ <!-- NLP metadata to RDF (using NIF 1.0) - NOT YET READY FOR DEFAULT CONFIG
+ <bundle>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.nlp2rdf</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle> -->
+ <!-- Sentiment Enhancement Engines -->
+ <bundle><!-- Sentiment Word Classifiers -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.sentiment.wordclassifier</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle>
+ <!-- NOT YET READY FOR DEFAULT CONFIG
+ <bundle>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.sentiment.summarization</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle> -->
+
<!-- Entity Extraction/Linking -->
<bundle><!-- NER linking (depends on the Entityhub) -->
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.engine.entitytagging</artifactId>
<version>0.10.0-SNAPSHOT</version>
</bundle>
- <bundle><!-- Keyword Extraction from Text (depends on the Entityhub) -->
+ <bundle><!-- Keyword Extraction from Text DEPRECATED! (depends on the Entityhub) -->
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.engine.keywordextraction</artifactId>
<version>0.10.0-SNAPSHOT</version>
</bundle>
+ <bundle><!-- EntityLinking based on the Entityhub -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.entitylinking</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle>
+ <bundle><!-- EntityLinking for the Stanbol Entityhub -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.entityhublinking</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </bundle>
<!-- Refactor Enhancement Engine -->
Propchange: stanbol/trunk/enhancer/engines/celi/
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Mon Nov 26 11:39:25 2012
@@ -0,0 +1,4 @@
+/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/celi:1374978-1386535
+/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/celi:1386989-1388016
+/incubator/stanbol/trunk/enhancer/engines/celi:1339554,1339557-1339558
+/stanbol/branches/stanbol-nlp-processing/enhancer/engines/celi:1388017-1413353
Modified: stanbol/trunk/enhancer/engines/celi/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/celi/pom.xml?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/celi/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/celi/pom.xml Mon Nov 26 11:39:25 2012
@@ -1,153 +1,154 @@
<?xml version="1.0" encoding="UTF-8"?>
<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
- license agreements. See the NOTICE file distributed with this work for additional
- information regarding copyright ownership. The ASF licenses this file to
- You under the Apache License, Version 2.0 (the "License"); you may not use
- this file except in compliance with the License. You may obtain a copy of
- the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
- by applicable law or agreed to in writing, software distributed under the
- License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
- OF ANY KIND, either express or implied. See the License for the specific
- language governing permissions and limitations under the License. -->
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ You under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
- xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
- <modelVersion>4.0.0</modelVersion>
+ <modelVersion>4.0.0</modelVersion>
- <parent>
- <groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
- <version>0.10.0-SNAPSHOT</version>
- <relativePath>../../parent</relativePath>
- </parent>
-
- <groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.celi</artifactId>
- <packaging>bundle</packaging>
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ <relativePath>../../parent</relativePath>
+ </parent>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.celi</artifactId>
+ <packaging>bundle</packaging>
<name>Apache Stanbol Enhancer Enhancement Engine : CELI Engine</name>
<description>Enhancement Engine using the CELI service.</description>
<inceptionYear>2012</inceptionYear>
- <dependencies>
- <dependency>
- <groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
- <version>0.10.0-SNAPSHOT</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId>
- <version>0.9.0-incubating</version>
- </dependency>
-
- <dependency>
- <groupId>org.apache.clerezza</groupId>
- <artifactId>rdf.core</artifactId>
- </dependency>
-
- <dependency>
- <groupId>org.apache.felix</groupId>
- <artifactId>org.apache.felix.scr.annotations</artifactId>
- <scope>provided</scope>
- </dependency>
-
- <!-- generic tax -->
- <dependency>
- <groupId>commons-lang</groupId>
- <artifactId>commons-lang</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.httpcomponents</groupId>
- <artifactId>httpclient-osgi</artifactId>
- </dependency>
- <dependency>
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-api</artifactId>
- </dependency>
- <dependency>
- <groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.commons.stanboltools.offline</artifactId>
- <version>0.9.0-incubating</version>
- <scope>provided</scope>
- </dependency>
-
- <!-- test -->
- <dependency>
- <groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.test</artifactId>
- <version>0.10.0-SNAPSHOT</version>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.core</artifactId>
- <version>0.10.0-SNAPSHOT</version>
- <scope>test</scope>
- </dependency>
- <dependency><!-- for debugging enhancements -->
- <groupId>org.apache.clerezza</groupId>
- <artifactId>rdf.jena.serializer</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency> <!-- we use log4j 1.2 -->
- <groupId>org.slf4j</groupId>
- <artifactId>slf4j-log4j12</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>log4j</groupId>
- <artifactId>log4j</artifactId>
- <scope>test</scope>
- </dependency>
- <dependency>
- <groupId>junit</groupId>
- <artifactId>junit</artifactId>
- <scope>test</scope>
- </dependency>
-
-
- </dependencies>
-
- <build>
- <plugins>
- <plugin>
- <groupId>org.apache.maven.plugins</groupId>
- <artifactId>maven-surefire-plugin</artifactId>
- <configuration>
- <skipTests>false</skipTests>
- </configuration>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-bundle-plugin</artifactId>
- <extensions>true</extensions>
- <configuration>
- <instructions>
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </dependency>
+ <dependency> <!-- STANBOL-739: adapt Lemmatizer Engine to use AnalyzedText -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId>
+ <version>0.9.0-incubating</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.clerezza</groupId>
+ <artifactId>rdf.core</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- generic tax -->
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient-osgi</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.stanboltools.offline</artifactId>
+ <version>0.9.0-incubating</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- test -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.test</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.core</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency><!-- for debugging enhancements -->
+ <groupId>org.apache.clerezza</groupId>
+ <artifactId>rdf.jena.serializer</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency> <!-- we use log4j 1.2 -->
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <skipTests>false</skipTests>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
<Import-Package>
org.apache.stanbol.enhancer.servicesapi; provide:=true,
org.apache.stanbol.enhancer.servicesapi.impl; provide:=true,
*
</Import-Package>
- <Export-Package>
- org.apache.stanbol.enhancer.engines.celi
- </Export-Package>
- <Private-Package>
- org.apache.stanbol.enhancer.engines.celi.ner.impl,
- org.apache.stanbol.enhancer.engines.celi.langid.impl,
- org.apache.stanbol.enhancer.engines.celi.classification.impl,
- org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl,
+ <Export-Package>
+ org.apache.stanbol.enhancer.engines.celi
+ </Export-Package>
+ <Private-Package>
+ org.apache.stanbol.enhancer.engines.celi.ner.impl,
+ org.apache.stanbol.enhancer.engines.celi.langid.impl,
+ org.apache.stanbol.enhancer.engines.celi.classification.impl,
+ org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl,
org.apache.stanbol.enhancer.engines.celi.utils
- </Private-Package>
- </instructions>
- </configuration>
- </plugin>
- <plugin>
- <groupId>org.apache.felix</groupId>
- <artifactId>maven-scr-plugin</artifactId>
- </plugin>
- </plugins>
- </build>
-
+ </Private-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
</project>
\ No newline at end of file
Modified: stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngine.java?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngine.java (original)
+++ stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngine.java Mon Nov 26 11:39:25 2012
@@ -50,7 +50,18 @@ import org.apache.felix.scr.annotations.
import org.apache.felix.scr.annotations.Service;
import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
import org.apache.stanbol.enhancer.engines.celi.CeliConstants;
+import org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures;
+import org.apache.stanbol.enhancer.engines.celi.CeliTagSetRegistry;
import org.apache.stanbol.enhancer.engines.celi.utils.Utils;
+import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
+import org.apache.stanbol.enhancer.nlp.morpho.Case;
+import org.apache.stanbol.enhancer.nlp.morpho.Gender;
+import org.apache.stanbol.enhancer.nlp.morpho.NumberFeature;
+import org.apache.stanbol.enhancer.nlp.morpho.Person;
+import org.apache.stanbol.enhancer.nlp.morpho.Tense;
+import org.apache.stanbol.enhancer.nlp.morpho.TenseTag;
+import org.apache.stanbol.enhancer.nlp.morpho.VerbMood;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
@@ -67,22 +78,16 @@ import org.slf4j.LoggerFactory;
@Component(immediate = true, metatype = true)
@Service
-@Properties(value = {
- @Property(name = EnhancementEngine.PROPERTY_NAME, value = "celiLemmatizer"),
- @Property(name = CeliConstants.CELI_LICENSE),
- @Property(name = CeliConstants.CELI_TEST_ACCOUNT,boolValue=false)
-})
+@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "celiLemmatizer"), @Property(name = CeliConstants.CELI_LICENSE), @Property(name = CeliConstants.CELI_TEST_ACCOUNT, boolValue = false) })
public class CeliLemmatizerEnhancementEngine extends AbstractEnhancementEngine<IOException, RuntimeException> implements EnhancementEngine, ServiceProperties {
-
+ // TODO: check if it is OK to define new properties in the FISE namespace
+ public static final UriRef hasLemmaForm = new UriRef("http://fise.iks-project.eu/ontology/hasLemmaForm");
+
/**
- * This ensures that no connections to external services are made if Stanbol is started in offline mode
- * as the OnlineMode service will only be available if OfflineMode is deactivated.
+ * This ensures that no connections to external services are made if Stanbol is started in offline mode as the OnlineMode service will only be available if OfflineMode is deactivated.
*/
@Reference
- private OnlineMode onlineMode;
- //TODO: check if it is OK to define new properties in the FISE namespace
- public static final UriRef hasLemmaForm = new UriRef("http://fise.iks-project.eu/ontology/hasLemmaForm");
- public static final UriRef hasMorphoFeature = new UriRef("http://fise.iks-project.eu/ontology/hasMorphologicalFeature");
+ private OnlineMode onlineMode;
private static List<String> supportedLangs = new Vector<String>();
static {
@@ -99,17 +104,14 @@ public class CeliLemmatizerEnhancementEn
public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.celi.langid.impl.CeliLanguageIdentifierEnhancementEngine");
/**
- * The default value for the Execution of this Engine. Currently set to
- * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
+ * The default value for the Execution of this Engine. Currently set to {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
*/
public static final Integer defaultOrder = ServiceProperties.ORDERING_CONTENT_EXTRACTION;
private Logger log = LoggerFactory.getLogger(getClass());
-
/**
- * This contains the only MIME type directly supported by this enhancement
- * engine.
+ * This contains the only MIME type directly supported by this enhancement engine.
*/
private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
@@ -135,7 +137,7 @@ public class CeliLemmatizerEnhancementEn
protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
super.activate(ctx);
Dictionary<String, Object> properties = ctx.getProperties();
- this.licenseKey = Utils.getLicenseKey(properties,ctx.getBundleContext());
+ this.licenseKey = Utils.getLicenseKey(properties, ctx.getBundleContext());
String url = (String) properties.get(SERVICE_URL);
if (url == null || url.isEmpty()) {
throw new ConfigurationException(SERVICE_URL, String.format("%s : please configure the URL of the CELI Web Service (e.g. by" + "using the 'Configuration' tab of the Apache Felix Web Console).", getClass().getSimpleName()));
@@ -159,10 +161,8 @@ public class CeliLemmatizerEnhancementEn
@Override
public int canEnhance(ContentItem ci) throws EngineException {
String language = EnhancementEngineHelper.getLanguage(ci);
- if(language==null) {
- log.warn("Unable to enhance ContentItem {} because language of the Content is unknown." +
- "Please check that a language identification engine is active in this EnhancementChain).",
- ci.getUri());
+ if (language == null) {
+ log.warn("Unable to enhance ContentItem {} because language of the Content is unknown." + "Please check that a language identification engine is active in this EnhancementChain).", ci.getUri());
}
if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null && this.isLangSupported(language))
return ENHANCE_ASYNC;
@@ -172,19 +172,15 @@ public class CeliLemmatizerEnhancementEn
@Override
public void computeEnhancements(ContentItem ci) throws EngineException {
- String language = EnhancementEngineHelper.getLanguage(ci);
- if (!isLangSupported(language)){
- throw new IllegalStateException("Call to computeEnhancement with unsupported language '"
- +language+" for ContentItem "+ ci.getUri() +": This is also checked "
- + "in the canEnhance method! -> This indicated an Bug in the "
- + "implementation of the " + "EnhancementJobManager!");
+ String language = EnhancementEngineHelper.getLanguage(ci);
+ if (!isLangSupported(language)) {
+ throw new IllegalStateException("Call to computeEnhancement with unsupported language '" + language + " for ContentItem " + ci.getUri() + ": This is also checked " + "in the canEnhance method! -> This indicated an Bug in the "
+ + "implementation of the " + "EnhancementJobManager!");
}
- Language lang = new Language(language); //clerezza language for PlainLiterals
+
Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
if (contentPart == null) {
- throw new IllegalStateException("No ContentPart with Mimetype '"
- + TEXT_PLAIN_MIMETYPE + "' found for ContentItem "
- + ci.getUri() + ": This is also checked in the canEnhance method! -> This "
+ throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This "
+ "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
}
String text;
@@ -198,70 +194,80 @@ public class CeliLemmatizerEnhancementEn
return;
}
- MGraph g = ci.getMetadata();
- LiteralFactory literalFactory = LiteralFactory.getInstance();
+ MGraph graph = ci.getMetadata();
if (this.completeMorphoAnalysis) {
- List<LexicalEntry> terms;
- try {
- terms = this.client.performMorfologicalAnalysis(text, language);
- } catch (IOException e) {
- throw new EngineException("Error while calling the CELI Lemmatizer"
- +" service (configured URL: "
- +serviceURL+")!",e);
- } catch (SOAPException e) {
- throw new EngineException("Error wile encoding/decoding the request/"
- +"response to the CELI lemmatizer service!",e);
- }
- //get a write lock before writing the enhancements
- ci.getLock().writeLock().lock();
- try {
- for (LexicalEntry le : terms) {
- if(!le.termReadings.isEmpty()){
- UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
- g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
- new PlainLiteralImpl(le.getWordForm(),lang)));
- if (le.from >= 0 && le.to > 0) {
- g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(le.from)));
- g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(le.to)));
- g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
- new PlainLiteralImpl(getSelectionContext(text, le.getWordForm(), le.from), lang)));
- }
- for (Reading r : le.termReadings) {
- g.add(new TripleImpl(textAnnotation, hasLemmaForm,
- new PlainLiteralImpl(r.getLemma(),lang)));
- for (Entry<String,String> entry : r.lexicalFeatures.entrySet()) {
- g.add(new TripleImpl(textAnnotation, hasMorphoFeature,
- literalFactory.createTypedLiteral(entry.getKey() + "=" + entry.getValue())));
- }
- }
- } //TODO: check if it is OK to ignore lexical entries with no readings
- }
- } finally {
- ci.getLock().writeLock().unlock();
- }
+ this.addMorphoAnalysisEnhancement(ci, text, language, graph);
} else {
- String lemmatizedContents;
- try {
- lemmatizedContents = this.client.lemmatizeContents(text, language);
- } catch (IOException e) {
- throw new EngineException("Error while calling the CELI Lemmatizer"
- +" service (configured URL: "
- +serviceURL+")!",e);
- } catch (SOAPException e) {
- throw new EngineException("Error wile encoding/decoding the request/"
- +"response to the CELI lemmatizer service!",e);
- }
- //get a write lock before writing the enhancements
- ci.getLock().writeLock().lock();
- try {
- UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
- g.add(new TripleImpl(textEnhancement, hasLemmaForm,
- new PlainLiteralImpl(lemmatizedContents,lang)));
- } finally {
- ci.getLock().writeLock().unlock();
- }
+ this.addLemmatizationEnhancement(ci, text, language, graph);
+ }
+ }
+
+ private void addMorphoAnalysisEnhancement(ContentItem ci, String text, String language, MGraph g) throws EngineException {
+ Language lang = new Language(language); // clerezza language for PlainLiterals
+ List<LexicalEntry> terms;
+ try {
+ terms = this.client.performMorfologicalAnalysis(text, language);
+ } catch (IOException e) {
+ throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
+ } catch (SOAPException e) {
+ throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
+ }
+ // get a write lock before writing the enhancements
+ ci.getLock().writeLock().lock();
+ try {
+ LiteralFactory literalFactory = LiteralFactory.getInstance();
+ for (LexicalEntry le : terms) {
+
+ List<CeliMorphoFeatures> mFeatures = this.convertLexicalEntryToMorphFeatures(le, language);
+ for (CeliMorphoFeatures feat : mFeatures) {
+ // Create a text annotation for each interpretation produced by the morphological analyzer
+ UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, new PlainLiteralImpl(le.getWordForm(), lang)));
+ if (le.from >= 0 && le.to > 0) {
+ g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(le.from)));
+ g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(le.to)));
+ g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT, new PlainLiteralImpl(getSelectionContext(text, le.getWordForm(), le.from), lang)));
+ }
+ g.addAll(feat.featuresAsTriples(textAnnotation, lang));
+ }
+ }
+ } finally {
+ ci.getLock().writeLock().unlock();
+ }
+ }
+
+ private void addLemmatizationEnhancement(ContentItem ci, String text, String language, MGraph g) throws EngineException {
+ Language lang = new Language(language); // clerezza language for PlainLiterals
+ String lemmatizedContents;
+ try {
+ lemmatizedContents = this.client.lemmatizeContents(text, language);
+ } catch (IOException e) {
+ throw new EngineException("Error while calling the CELI Lemmatizer" + " service (configured URL: " + serviceURL + ")!", e);
+ } catch (SOAPException e) {
+ throw new EngineException("Error wile encoding/decoding the request/" + "response to the CELI lemmatizer service!", e);
+ }
+ // get a write lock before writing the enhancements
+ ci.getLock().writeLock().lock();
+ try {
+ UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+ g.add(new TripleImpl(textEnhancement, CeliLemmatizerEnhancementEngine.hasLemmaForm, new PlainLiteralImpl(lemmatizedContents, lang)));
+ } finally {
+ ci.getLock().writeLock().unlock();
+ }
+ }
+
+ private List<CeliMorphoFeatures> convertLexicalEntryToMorphFeatures(LexicalEntry le, String lang) {
+ List<CeliMorphoFeatures> result = new Vector<CeliMorphoFeatures>();
+ if (!le.termReadings.isEmpty()) {
+ for (Reading r : le.termReadings) {
+ CeliMorphoFeatures morphoFeature = CeliMorphoFeatures.parseFrom(r, lang);
+ if(morphoFeature != null){
+ result.add(morphoFeature);
+ }
+ }
}
+ return result;
}
private boolean isLangSupported(String language) {
Modified: stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java (original)
+++ stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java Mon Nov 26 11:39:25 2012
@@ -17,6 +17,7 @@
package org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl;
import java.io.BufferedWriter;
+import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
@@ -39,6 +40,7 @@ import javax.xml.soap.SOAPPart;
import javax.xml.transform.stream.StreamSource;
import org.apache.clerezza.rdf.core.impl.util.Base64;
+import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringEscapeUtils;
import org.apache.stanbol.enhancer.engines.celi.utils.Utils;
import org.slf4j.Logger;
@@ -101,7 +103,12 @@ public class LemmatizerClientHTTP {
long start = System.currentTimeMillis();
InputStream stream = con.getInputStream();
log.debug("Request to {} took {}ms",serviceEP,System.currentTimeMillis()-start);
-
+ if(log.isTraceEnabled()){
+ //log the response if trace is enabled
+ String soapResponse = IOUtils.toString(stream,"UTF-8");
+ log.trace("SoapResponse: \n{}\n",soapResponse);
+ stream = new ByteArrayInputStream(soapResponse.getBytes(Charset.forName("UTF-8")));
+ }
// Create SoapMessage
MessageFactory msgFactory = MessageFactory.newInstance();
SOAPMessage message = msgFactory.createMessage();
@@ -132,12 +139,18 @@ public class LemmatizerClientHTTP {
Element lemmaElm = (Element) lemmasList.item(j);
String lemma = lemmaElm.getTextContent();
NodeList features = ((Element)lemmaElm.getParentNode()).getElementsByTagNameNS("*","LexicalFeature");
- Hashtable<String,String> featuresMap=new Hashtable<String,String>();
+ Hashtable<String,List<String>> featuresMap=new Hashtable<String,List<String>>();
for(int k=0;features!=null && k<features.getLength();k++){
Element feat = (Element) features.item(k);
String name = feat.getAttribute("name");
String value = feat.getTextContent();
- featuresMap.put(name, value);
+ List<String> values=null;
+ if(featuresMap.containsKey(name))
+ values=featuresMap.get(name);
+ else
+ values=new Vector<String>();
+ values.add(value);
+ featuresMap.put(name, values);
}
Reading r=new Reading(lemma, featuresMap);
readings.add(r);
Modified: stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/Reading.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/Reading.java?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/Reading.java (original)
+++ stanbol/trunk/enhancer/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/Reading.java Mon Nov 26 11:39:25 2012
@@ -17,13 +17,14 @@
package org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl;
import java.util.Hashtable;
+import java.util.List;
public class Reading {
String lemma;
- Hashtable<String,String> lexicalFeatures;
+ Hashtable<String,List<String>> lexicalFeatures;
- public Reading(String lemma, Hashtable<String, String> lexicalFeatures) {
+ public Reading(String lemma, Hashtable<String, List<String>> lexicalFeatures) {
super();
this.lemma = lemma;
this.lexicalFeatures = lexicalFeatures;
@@ -37,11 +38,11 @@ public class Reading {
this.lemma = lemma;
}
- public Hashtable<String, String> getLexicalFeatures() {
+ public Hashtable<String, List<String>> getLexicalFeatures() {
return lexicalFeatures;
}
- public void setLexicalFeatures(Hashtable<String, String> lexicalFeatures) {
+ public void setLexicalFeatures(Hashtable<String, List<String>> lexicalFeatures) {
this.lexicalFeatures = lexicalFeatures;
}
Modified: stanbol/trunk/enhancer/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngineTest.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngineTest.java?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngineTest.java (original)
+++ stanbol/trunk/enhancer/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngineTest.java Mon Nov 26 11:39:25 2012
@@ -19,7 +19,6 @@ package org.apache.stanbol.enhancer.engi
import static org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine.MORPHOLOGICAL_ANALYSIS;
import static org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine.SERVICE_URL;
import static org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine.hasLemmaForm;
-import static org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine.hasMorphoFeature;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.RDF_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.TechnicalClasses.ENHANCER_TEXTANNOTATION;
@@ -52,8 +51,12 @@ import org.apache.clerezza.rdf.core.impl
import org.apache.clerezza.rdf.ontologies.XSD;
import org.apache.stanbol.enhancer.contentitem.inmemory.InMemoryContentItemFactory;
import org.apache.stanbol.enhancer.engines.celi.CeliConstants;
+import org.apache.stanbol.enhancer.engines.celi.CeliMorphoFeatures;
import org.apache.stanbol.enhancer.engines.celi.testutils.MockComponentContext;
import org.apache.stanbol.enhancer.engines.celi.testutils.TestUtils;
+import org.apache.stanbol.enhancer.nlp.morpho.Gender;
+import org.apache.stanbol.enhancer.nlp.morpho.NumberFeature;
+import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.ContentItemFactory;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
@@ -69,14 +72,15 @@ import org.slf4j.LoggerFactory;
public class CeliLemmatizerEnhancementEngineTest {
- //static CeliLemmatizerEnhancementEngine morphoAnalysisEngine = new CeliLemmatizerEnhancementEngine();
+ static final String OLIA_NAMESPACE = "http://purl.org/olia/olia.owl#";
private static final ContentItemFactory ciFactory = InMemoryContentItemFactory.getInstance();
private static final Logger log = LoggerFactory.getLogger(CeliLemmatizerEnhancementEngine.class);
private static final String TEXT = "Torino è la principale città del Piemonte.";
+ private static final String TERM = "casa";
- public CeliLemmatizerEnhancementEngine initEngine(boolean completeMorphoAnalysis) throws IOException, ConfigurationException {
+ private CeliLemmatizerEnhancementEngine initEngine(boolean completeMorphoAnalysis) throws IOException, ConfigurationException {
Dictionary<String, Object> properties = new Hashtable<String, Object>();
properties.put(EnhancementEngine.PROPERTY_NAME, "celiLemmatizer");
properties.put(CeliConstants.CELI_TEST_ACCOUNT, "true");
@@ -88,11 +92,11 @@ public class CeliLemmatizerEnhancementEn
return morphoAnalysisEngine;
}
- public static void shutdownEngine(CeliLemmatizerEnhancementEngine morphoAnalysisEngine) {
+ private static void shutdownEngine(CeliLemmatizerEnhancementEngine morphoAnalysisEngine) {
morphoAnalysisEngine.deactivate(null);
}
- public static ContentItem wrapAsContentItem(final String text) throws IOException {
+ private static ContentItem wrapAsContentItem(final String text) throws IOException {
return ciFactory.createContentItem(new StringSource(text));
}
@@ -129,8 +133,7 @@ public class CeliLemmatizerEnhancementEn
validateEnhancement(ci.getMetadata(), (UriRef)lemmaTextAnnotation, expectedValues);
//validate the lemma form TextAnnotation
int lemmaForms = validateLemmaFormProperty(ci.getMetadata(), lemmaTextAnnotation,"it");
- assertTrue("Only a single LemmaForm property is expected if '"+
- MORPHOLOGICAL_ANALYSIS+"=false'",lemmaForms == 1);
+ assertTrue("Only a single LemmaForm property is expected if '"+ MORPHOLOGICAL_ANALYSIS+"=false'",lemmaForms == 1);
shutdownEngine(morphoAnalysisEngine);
}
@@ -138,7 +141,7 @@ public class CeliLemmatizerEnhancementEn
@Test
public void testCompleteMorphoAnalysis() throws Exception {
- ContentItem ci = wrapAsContentItem(TEXT);
+ ContentItem ci = wrapAsContentItem(TERM);
//add a simple triple to statically define the language of the test
//content
ci.getMetadata().add(new TripleImpl(ci.getUri(), DC_LANGUAGE, new PlainLiteralImpl("it")));
@@ -167,10 +170,9 @@ public class CeliLemmatizerEnhancementEn
while (textAnnotationIterator.hasNext()) {
UriRef textAnnotation = (UriRef) textAnnotationIterator.next().getSubject();
// test if selected Text is added
- validateTextAnnotation(ci.getMetadata(), textAnnotation,TEXT,expectedValues);
+ validateTextAnnotation(ci.getMetadata(), textAnnotation,TERM,expectedValues);
textAnnotationCount++;
//perform additional tests for "hasMorphologicalFeature" and "hasLemmaForm"
- validateLemmaFormProperty(ci.getMetadata(), textAnnotation,"it");
validateMorphoFeatureProperty(ci.getMetadata(),textAnnotation);
}
log.info("{} TextAnnotations found and validated ...",textAnnotationCount);
@@ -196,8 +198,7 @@ public class CeliLemmatizerEnhancementEn
Resource lemmaForms = lemmaFormsIterator.next().getObject();
assertTrue("Lemma Forms value are expected of type PlainLiteral", lemmaForms instanceof PlainLiteral);
assertFalse("Lemma forms MUST NOT be empty",((PlainLiteral)lemmaForms).getLexicalForm().isEmpty());
- assertNotNull("Language of the Lemma Form literal MUST BE the same as for the parsed text",
- ((PlainLiteral)lemmaForms).getLanguage());
+ assertNotNull("Language of the Lemma Form literal MUST BE not null",((PlainLiteral)lemmaForms).getLanguage());
assertEquals("Language of the Lemma Form literal MUST BE the same as for the parsed text",
lang, ((PlainLiteral)lemmaForms).getLanguage().toString());
}
@@ -209,19 +210,55 @@ public class CeliLemmatizerEnhancementEn
* @param textAnnotation the TextAnnotation to check
*/
private void validateMorphoFeatureProperty(TripleCollection enhancements, NonLiteral textAnnotation) {
- Iterator<Triple> morphoFeatureIterator = enhancements.filter(textAnnotation, hasMorphoFeature, null);
- assertTrue("No Morpho Feature value found for TextAnnotation "+textAnnotation+"!", morphoFeatureIterator.hasNext());
+ //This test checks for known morpho features of a given input (constant TERM)
+ Iterator<Triple> morphoFeatureIterator = enhancements.filter(textAnnotation, RDF_TYPE, null);
+ assertTrue("No POS Morpho Feature value found for TextAnnotation "+textAnnotation+"!", morphoFeatureIterator.hasNext());
while(morphoFeatureIterator.hasNext()){
Resource morphoFeature = morphoFeatureIterator.next().getObject();
- assertTrue("Morpho Feature value are expected of typed literal", morphoFeature instanceof TypedLiteral);
- String feature = ((Literal)morphoFeature).getLexicalForm();
+ assertTrue("Morpho Feature value are expected of typed literal", morphoFeature instanceof UriRef);
+ String feature=((UriRef)morphoFeature).getUnicodeString();
assertFalse("Morpho Feature MUST NOT be empty",feature.isEmpty());
- assertTrue("{key}={value} encoding expected (value:"+feature+")",feature.indexOf('=')>0);
- String[] keyValue = feature.split("=");
- assertTrue("{key}={value} encoding expected(value:"+feature+")",
- keyValue.length == 2 && (!keyValue[0].isEmpty()) && (!keyValue[1].isEmpty()));
- assertEquals("DataType of the Morpho Feature MUST BE xsd:string (for now)",XSD.string,
- ((TypedLiteral)morphoFeature).getDataType());
+ if(feature.startsWith(OLIA_NAMESPACE)){
+ String key=feature.substring(OLIA_NAMESPACE.length());
+ LexicalCategory cat=LexicalCategory.valueOf(key);
+ assertTrue("Part of Speech of "+TERM+" should be "+LexicalCategory.Noun , (cat==LexicalCategory.Noun));
+ }
}
+ morphoFeatureIterator = enhancements.filter(textAnnotation, CeliMorphoFeatures.HAS_GENDER, null);
+ assertTrue("No Gender Morpho Feature value found for TextAnnotation "+textAnnotation+"!", morphoFeatureIterator.hasNext());
+ if(morphoFeatureIterator.hasNext()){
+ Resource morphoFeature = morphoFeatureIterator.next().getObject();
+ assertTrue("Morpho Feature value are expected of typed literal", morphoFeature instanceof UriRef);
+ String feature=((UriRef)morphoFeature).getUnicodeString();
+ assertFalse("Morpho Feature MUST NOT be empty",feature.isEmpty());
+ if(feature.startsWith(OLIA_NAMESPACE)){
+ String key=feature.substring(OLIA_NAMESPACE.length());
+ Gender cat=Gender.valueOf(key);
+ assertTrue("Gender of "+TERM+" should be "+Gender.Feminine , (cat==Gender.Feminine));
+ }
+ }
+ morphoFeatureIterator = enhancements.filter(textAnnotation, CeliMorphoFeatures.HAS_NUMBER, null);
+ assertTrue("No Number Morpho Feature value found for TextAnnotation "+textAnnotation+"!", morphoFeatureIterator.hasNext());
+ if(morphoFeatureIterator.hasNext()){
+ Resource morphoFeature = morphoFeatureIterator.next().getObject();
+ assertTrue("Morpho Feature value are expected of typed literal", morphoFeature instanceof UriRef);
+ String feature=((UriRef)morphoFeature).getUnicodeString();
+ assertFalse("Morpho Feature MUST NOT be empty",feature.isEmpty());
+ if(feature.startsWith(OLIA_NAMESPACE)){
+ String key=feature.substring(OLIA_NAMESPACE.length());
+ NumberFeature cat=NumberFeature.valueOf(key);
+ assertTrue("Number of "+TERM+" should be "+Gender.Feminine , (cat==NumberFeature.Singular));
+ }
+ }
+ morphoFeatureIterator = enhancements.filter(textAnnotation, CeliLemmatizerEnhancementEngine.hasLemmaForm, null);
+ assertTrue("No Number Morpho Feature value found for TextAnnotation "+textAnnotation+"!", morphoFeatureIterator.hasNext());
+ if(morphoFeatureIterator.hasNext()){
+ Resource morphoFeature = morphoFeatureIterator.next().getObject();
+ assertTrue("Lemma Forms value are expected of type PlainLiteral", morphoFeature instanceof PlainLiteral);
+ assertFalse("Lemma forms MUST NOT be empty",((PlainLiteral)morphoFeature).getLexicalForm().isEmpty());
+ String feature=((PlainLiteral)morphoFeature).getLexicalForm();
+ assertTrue("Lemma of "+TERM+" should be "+TERM , (feature.equals(TERM)));
+ }
+
}
}
Propchange: stanbol/trunk/enhancer/engines/dbpedia-spotlight/
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Mon Nov 26 11:39:25 2012
@@ -0,0 +1,4 @@
+/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/dbpedia-spotlight:1374978-1386535
+/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/dbpedia-spotlight:1386989-1388016
+/incubator/stanbol/trunk/enhancer/engines/dbpedia-spotlight:1339554,1339557-1339558
+/stanbol/branches/stanbol-nlp-processing/enhancer/engines/dbpedia-spotlight:1388017-1413353
Propchange: stanbol/trunk/enhancer/engines/entityhublinking/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Nov 26 11:39:25 2012
@@ -0,0 +1,7 @@
+target
+
+.settings
+
+.project
+
+.classpath
Propchange: stanbol/trunk/enhancer/engines/entitylinking/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Nov 26 11:39:25 2012
@@ -0,0 +1,7 @@
+.settings
+
+.classpath
+
+.project
+
+target
Copied: stanbol/trunk/enhancer/engines/entitylinking/pom.xml (from r1413353, stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/pom.xml)
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/entitylinking/pom.xml?p2=stanbol/trunk/enhancer/engines/entitylinking/pom.xml&p1=stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/pom.xml&r1=1413353&r2=1413560&rev=1413560&view=diff
==============================================================================
--- stanbol/branches/stanbol-nlp-processing/enhancer/engines/entitylinking/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/entitylinking/pom.xml Mon Nov 26 11:39:25 2012
@@ -165,7 +165,7 @@
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.data.opennlp.lang.en</artifactId>
- <version>1.0.2-SNAPSHOT</version>
+ <version>1.1.0-SNAPSHOT</version>
<scope>test</scope>
</dependency>
Propchange: stanbol/trunk/enhancer/engines/langdetect/
------------------------------------------------------------------------------
--- svn:mergeinfo (added)
+++ svn:mergeinfo Mon Nov 26 11:39:25 2012
@@ -0,0 +1,4 @@
+/incubator/stanbol/branches/dbpedia-spotlight-engines/engines/langdetect:1374978-1386535
+/incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/langdetect:1386989-1388016
+/incubator/stanbol/trunk/enhancer/engines/langdetect:1339554,1339557-1339558
+/stanbol/branches/stanbol-nlp-processing/enhancer/engines/langdetect:1388017-1413353
Modified: stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java (original)
+++ stanbol/trunk/enhancer/engines/langdetect/src/main/java/org/apache/stanbol/enhancer/engines/langdetect/LanguageDetectionEnhancementEngine.java Mon Nov 26 11:39:25 2012
@@ -87,14 +87,14 @@ public class LanguageDetectionEnhancemen
public static final String MAX_SUGGESTED_PROP = "org.apache.stanbol.enhancer.engines.langdetect.max-suggested";
/**
- * The default value for the Execution of this Engine. Currently set to
- * {@link ServiceProperties#ORDERING_PRE_PROCESSING} - 2<p>
+ * The default value for the Execution of this Engine (
+ * {@link ServiceProperties#ORDERING_NLP_LANGAUGE_DETECTION})<p>
* NOTE: this information is used by the default and weighed {@link Chain}
* implementation to determine the processing order of
* {@link EnhancementEngine}s. Other {@link Chain} implementation do not
* use this information.
*/
- public static final Integer defaultOrder = ORDERING_PRE_PROCESSING - 2;
+ public static final Integer defaultOrder = ServiceProperties.ORDERING_NLP_LANGAUGE_DETECTION;
/**
* This contains the only MIME type directly supported by this enhancement engine.
@@ -266,7 +266,7 @@ public class LanguageDetectionEnhancemen
}
public Map<String, Object> getServiceProperties() {
- return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+ return Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder);
}
}
Modified: stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java (original)
+++ stanbol/trunk/enhancer/engines/langid/src/main/java/org/apache/stanbol/enhancer/engines/langid/LangIdEnhancementEngine.java Mon Nov 26 11:39:25 2012
@@ -75,14 +75,14 @@ public class LangIdEnhancementEngine
/**
- * The default value for the Execution of this Engine. Currently set to
- * {@link ServiceProperties#ORDERING_PRE_PROCESSING} - 2<p>
+ * The default value for the Execution of this Engine (
+ * {@link ServiceProperties#ORDERING_NLP_LANGAUGE_DETECTION})<p>
* NOTE: this information is used by the default and weighed {@link Chain}
* implementation to determine the processing order of
* {@link EnhancementEngine}s. Other {@link Chain} implementation do not
* use this information.
*/
- public static final Integer defaultOrder = ORDERING_PRE_PROCESSING - 2;
+ public static final Integer defaultOrder = ServiceProperties.ORDERING_NLP_LANGAUGE_DETECTION;
/**
* This contains the only MIME type directly supported by this enhancement engine.
@@ -184,7 +184,7 @@ public class LangIdEnhancementEngine
}
public Map<String, Object> getServiceProperties() {
- return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+ return Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder);
}
}
Propchange: stanbol/trunk/enhancer/engines/nlp2rdf/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Nov 26 11:39:25 2012
@@ -0,0 +1,7 @@
+.settings
+
+.project
+
+.classpath
+
+target
Copied: stanbol/trunk/enhancer/engines/nlp2rdf/pom.xml (from r1388016, incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/nlp2rdf/pom.xml)
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/nlp2rdf/pom.xml?p2=stanbol/trunk/enhancer/engines/nlp2rdf/pom.xml&p1=incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/nlp2rdf/pom.xml&r1=1388016&r2=1413560&rev=1413560&view=diff
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/nlp2rdf/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/nlp2rdf/pom.xml Mon Nov 26 11:39:25 2012
@@ -17,13 +17,13 @@
<parent>
<artifactId>org.apache.stanbol.enhancer.parent</artifactId>
<groupId>org.apache.stanbol</groupId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
<relativePath>../../parent</relativePath>
</parent>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.engines.nlp2rdf</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
<packaging>bundle</packaging>
<name>Apache Stanbol Enhancer Enhancement Engine: NLP data to RDF converter</name>
@@ -38,12 +38,12 @@
<scm>
<connection>
- scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/nlp2rdf/
+ scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/nlp2rdf/
</connection>
<developerConnection>
- scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/nlp2rdf/
+ scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/nlp2rdf/
</developerConnection>
- <url>http://incubator.apache.org/stanbol/</url>
+ <url>http://stanbol.apache.org/</url>
</scm>
<build>
@@ -81,17 +81,17 @@
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.commons.opennlp</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.felix</groupId>
Propchange: stanbol/trunk/enhancer/engines/opennlp-chunker/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon Nov 26 11:39:25 2012
@@ -0,0 +1,7 @@
+target
+
+.settings
+
+.classpath
+
+.project
Copied: stanbol/trunk/enhancer/engines/opennlp-chunker/pom.xml (from r1388016, incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-chunker/pom.xml)
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-chunker/pom.xml?p2=stanbol/trunk/enhancer/engines/opennlp-chunker/pom.xml&p1=incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-chunker/pom.xml&r1=1388016&r2=1413560&rev=1413560&view=diff
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-chunker/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/opennlp-chunker/pom.xml Mon Nov 26 11:39:25 2012
@@ -16,13 +16,13 @@
<parent>
<artifactId>org.apache.stanbol.enhancer.parent</artifactId>
<groupId>org.apache.stanbol</groupId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
<relativePath>../../parent</relativePath>
</parent>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.engines.opennlp.chunker</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
<packaging>bundle</packaging>
<name>Apache Stanbol Enhancer Enhancement Engine: Chunking / Noun Phrase Detection</name>
@@ -36,12 +36,12 @@
<scm>
<connection>
- scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/tika/
+ scm:svn:http://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/tika/
</connection>
<developerConnection>
- scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/tika/
+ scm:svn:https://svn.apache.org/repos/asf/stanbol/trunk/enhancer/engines/tika/
</developerConnection>
- <url>http://incubator.apache.org/stanbol/</url>
+ <url>http://stanbol.apache.org/</url>
</scm>
<properties>
@@ -91,17 +91,17 @@
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.commons.opennlp</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
- <version>0.10.0-incubating-SNAPSHOT</version>
+ <version>0.10.0-SNAPSHOT</version>
</dependency>
<dependency>
<groupId>org.apache.felix</groupId>
Copied: stanbol/trunk/enhancer/engines/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java (from r1388016, incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java)
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java?p2=stanbol/trunk/enhancer/engines/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java&p1=incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java&r1=1388016&r2=1413560&rev=1413560&view=diff
==============================================================================
--- incubator/stanbol/branches/stanbol-nlp-processing/enhancer/engines/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java (original)
+++ stanbol/trunk/enhancer/engines/opennlp-chunker/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/chunker/model/PhraseTagSetRegistry.java Mon Nov 26 11:39:25 2012
@@ -5,7 +5,7 @@ import java.util.Map;
import opennlp.tools.chunker.Chunker;
-import org.apache.stanbol.enhancer.nlp.TagSet;
+import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
import org.apache.stanbol.enhancer.nlp.phrase.PhraseTag;
import org.apache.stanbol.enhancer.nlp.pos.LexicalCategory;
Modified: stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml (original)
+++ stanbol/trunk/enhancer/engines/opennlp-ner/pom.xml Mon Nov 26 11:39:25 2012
@@ -87,6 +87,11 @@
<version>0.10.0-SNAPSHOT</version>
</dependency>
<dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.nlp</artifactId>
+ <version>0.10.0-SNAPSHOT</version>
+ </dependency>
+ <dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId>
<version>0.9.0-incubating</version>
Modified: stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java (original)
+++ stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineConfig.java Mon Nov 26 11:39:25 2012
@@ -6,12 +6,15 @@ import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
+import java.util.Map.Entry;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.CopyOnWriteArrayList;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.stanbol.commons.opennlp.OpenNLP;
+import org.apache.stanbol.enhancer.nlp.model.tag.TagSet;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
@@ -32,10 +35,11 @@ public class NEREngineConfig {
}
/**
- * Holds the mappings of rdf:type used by concepts to dc:type values used
- * by TextAnnotations.
+ * Holds the configured {@link NerTag}s - the mappings from the
+ * named entity name to the {@link UriRef} type used for the
+ * <code>dc:type</code> value for <code>fise:TextAnnotation</code>s
*/
- private Map<String,UriRef> typeMappings = new HashMap<String,UriRef>(DEFAULT_ENTITY_TYPE_MAPPINGS);
+ private TagSet<NerTag> nerTagSet = new TagSet<NerTag>("NER TagSet");
private Map<String,Collection<String>> additionalNerModels = new HashMap<String,Collection<String>>();
/**
@@ -50,6 +54,12 @@ public class NEREngineConfig {
private String defaultLanguage;
+ public NEREngineConfig(){
+ for(Entry<String,UriRef> mapping : DEFAULT_ENTITY_TYPE_MAPPINGS.entrySet()){
+ nerTagSet.addTag(new NerTag(mapping.getKey(), mapping.getValue()));
+ }
+ }
+
public synchronized void addCustomNameFinderModel(String lang, String modelFileName){
if(lang == null || lang.isEmpty()){
throw new IllegalArgumentException("The parsed lanaguage MUST NOT be NULL or empty!");
@@ -115,17 +125,40 @@ public class NEREngineConfig {
Collection<String> modelNames = additionalNerModels.get(lang);
return modelNames == null ? Collections.EMPTY_LIST : modelNames;
}
-
- public UriRef getMappedType(String namedEntityType){
- return typeMappings.get(namedEntityType);
+ /**
+ * Getter for the {@link NerTag} of the parsed Named Entity
+ * name. If not yet present a new {@link NerTag} (with no
+ * <code>dc:type</code> mapping) is created and added to the
+ * configuration.
+ * @param namedEntityType the NamedEntity name.
+ * @return the NerTag. Guaranteed to be not <code>null</code>
+ * @throws IllegalArgumentException if the parsed NamedEntity
+ * type is <code>null</code> or an empty String.
+ */
+ public NerTag getNerTag(String namedEntityType){
+ if(namedEntityType == null || namedEntityType.isEmpty()){
+ throw new IllegalArgumentException("The parsed NamedEntity string MUST NOT be NULL nor empty!");
+ }
+ NerTag tag = nerTagSet.getTag(namedEntityType);
+ if(tag == null){
+ tag = new NerTag(namedEntityType);
+ nerTagSet.addTag(tag);
+ }
+ return tag;
}
+ /**
+ * Setter for a NamedEntity name > <code>dc:type</code>
+ * mapping.
+ * @param namedEntityType the Named Entity type (as
+ * used by the OpenNLP NameFinder model)
+ * @param dcType the <code>dc:Type</code> used for the
+ * NamedEntity or <code>null</code> if none
+ * @throws IllegalArgumentException if the parsed NamedEntity
+ * type is <code>null</code> or an empty String.
+ */
public void setMappedType(String namedEntityType,UriRef dcType){
if(namedEntityType != null && !namedEntityType.isEmpty()){
- if(dcType == null){
- typeMappings.remove(namedEntityType);
- } else {
- typeMappings.put(namedEntityType, dcType);
- }
+ nerTagSet.addTag(new NerTag(namedEntityType, dcType));
} else {
throw new IllegalArgumentException("The parsed NamedEntity type MUST NOT be NULL nor empty!");
}
Modified: stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java?rev=1413560&r1=1413559&r2=1413560&view=diff
==============================================================================
--- stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java (original)
+++ stanbol/trunk/enhancer/engines/opennlp-ner/src/main/java/org/apache/stanbol/enhancer/engines/opennlp/impl/NEREngineCore.java Mon Nov 26 11:39:25 2012
@@ -16,6 +16,7 @@
*/
package org.apache.stanbol.enhancer.engines.opennlp.impl;
+import static org.apache.stanbol.enhancer.nlp.NlpAnnotations.NER_ANNOTATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_TYPE;
import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
@@ -41,23 +42,28 @@ import opennlp.tools.namefind.NameFinder
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.sentdetect.SentenceDetectorME;
import opennlp.tools.sentdetect.SentenceModel;
-import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.Span;
import org.apache.clerezza.rdf.core.Language;
-import org.apache.clerezza.rdf.core.Literal;
import org.apache.clerezza.rdf.core.LiteralFactory;
import org.apache.clerezza.rdf.core.MGraph;
-import org.apache.clerezza.rdf.core.Triple;
import org.apache.clerezza.rdf.core.UriRef;
import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
import org.apache.clerezza.rdf.core.impl.TripleImpl;
-import org.apache.commons.io.IOUtils;
import org.apache.commons.lang.StringUtils;
import org.apache.stanbol.commons.opennlp.OpenNLP;
import org.apache.stanbol.commons.stanboltools.datafileprovider.DataFileProvider;
+import org.apache.stanbol.enhancer.nlp.NlpAnnotations;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedText;
+import org.apache.stanbol.enhancer.nlp.model.AnalysedTextUtils;
+import org.apache.stanbol.enhancer.nlp.model.Chunk;
+import org.apache.stanbol.enhancer.nlp.model.Section;
+import org.apache.stanbol.enhancer.nlp.model.Sentence;
+import org.apache.stanbol.enhancer.nlp.model.Token;
+import org.apache.stanbol.enhancer.nlp.model.annotation.Value;
+import org.apache.stanbol.enhancer.nlp.ner.NerTag;
import org.apache.stanbol.enhancer.servicesapi.Blob;
import org.apache.stanbol.enhancer.servicesapi.ContentItem;
import org.apache.stanbol.enhancer.servicesapi.EngineException;
@@ -66,8 +72,6 @@ import org.apache.stanbol.enhancer.servi
import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
-import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
-import org.apache.stanbol.enhancer.servicesapi.rdf.Properties;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -91,6 +95,7 @@ public abstract class NEREngineCore
protected NEREngineConfig config;
+
/** Comments about our models */
public static final Map<String, String> DATA_FILE_COMMENTS;
static {
@@ -135,32 +140,45 @@ public abstract class NEREngineCore
+ "method! -> This indicated an Bug in the implementation of the "
+ "EnhancementJobManager!");
}
- Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
- if(contentPart == null){
- throw new IllegalStateException("No ContentPart with Mimetype '"
- + TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
- + ": This is also checked in the canEnhance method! -> This "
- + "indicated an Bug in the implementation of the "
- + "EnhancementJobManager!");
- }
- String text;
- try {
- text = ContentItemHelper.getText(contentPart.getValue());
- } catch (IOException e) {
- throw new InvalidContentException(this, ci, e);
- }
- if (text.trim().length() == 0) {
- // TODO: make the length of the data a field of the ContentItem
- // interface to be able to filter out empty items in the canEnhance
- // method
- log.warn("ContentPart {} of ContentItem {} does not contain any text" +
- "to extract knowledge from in ContentItem {}",
- contentPart.getKey(),ci);
- return;
+ final AnalysedText at = AnalysedTextUtils.getAnalysedText(ci);
+ //validate data in the AnalysedText
+ final String text;
+ if(at != null && at.getTokens().hasNext()){ //if the AnalysedText is present and tokens are present
+ if(log.isDebugEnabled()){
+ log.debug("computeEnhancements from AnalysedText ContentPart of ContentItem {}: text={}",
+ ci.getUri().getUnicodeString(), StringUtils.abbreviate(at.getSpan(), 100));
+ }
+ text = null;
+ } else { //no AnalysedText with tokens ...
+ //fallback to processing the plain text is still supported
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMETYPES);
+ if(contentPart == null){
+ throw new IllegalStateException("No ContentPart with Mimetype '"
+ + TEXT_PLAIN_MIMETYPE+"' found for ContentItem "+ci.getUri()
+ + ": This is also checked in the canEnhance method! -> This "
+ + "indicated an Bug in the implementation of the "
+ + "EnhancementJobManager!");
+ }
+ try {
+ text = ContentItemHelper.getText(contentPart.getValue());
+ } catch (IOException e) {
+ throw new InvalidContentException(this, ci, e);
+ }
+ if (text.trim().length() == 0) {
+ // TODO: make the length of the data a field of the ContentItem
+ // interface to be able to filter out empty items in the canEnhance
+ // method
+ log.warn("ContentPart {} of ContentItem {} does not contain any text "
+ + "to extract knowledge from",
+ contentPart.getKey(), ci);
+ return;
+ }
+ if(log.isDebugEnabled()){
+ log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}",
+ new Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(),
+ StringUtils.abbreviate(text, 100)});
+ }
}
- log.debug("computeEnhancements from ContentPart {} of ContentItem {}: text={}",
- new Object[]{contentPart.getKey(),ci.getUri().getUnicodeString(),
- StringUtils.abbreviate(text, 100)});
try {
if(config.isProcessedLangage(language)){
for (String defaultModelType : config.getDefaultModelTypes()) {
@@ -168,7 +186,7 @@ public abstract class NEREngineCore
if(nameFinderModel == null){
log.info("No NER Model for {} and language {} available!",defaultModelType,language);
} else {
- findNamedEntities(ci, text, language, nameFinderModel);
+ findNamedEntities(ci, at, text, language, nameFinderModel);
}
}
} //else do not use default models for languages other than the processed one
@@ -178,7 +196,7 @@ public abstract class NEREngineCore
try {
nameFinderModel = openNLP.getModel(TokenNameFinderModel.class,
additionalModel, null);
- findNamedEntities(ci, text, language, nameFinderModel);
+ findNamedEntities(ci, at, text, language, nameFinderModel);
} catch (IOException e) {
log.warn("Unable to load TokenNameFinderModel model for language '"+language
+ "' (model: "+additionalModel+")",e);
@@ -197,6 +215,7 @@ public abstract class NEREngineCore
}
protected void findNamedEntities(final ContentItem ci,
+ final AnalysedText at,
final String text,
final String lang,
final TokenNameFinderModel nameFinderModel) {
@@ -204,8 +223,9 @@ public abstract class NEREngineCore
if (ci == null) {
throw new IllegalArgumentException("Parsed ContentItem MUST NOT be NULL");
}
- if (text == null) {
- log.warn("NULL was parsed as text for content item " + ci.getUri().getUnicodeString() + "! -> call ignored");
+ if (at == null && text == null) {
+ log.warn("NULL was parsed as AnalysedText AND Text for content item "
+ + ci.getUri() + ". One of the two MUST BE present! -> call ignored");
return;
}
final Language language;
@@ -216,11 +236,17 @@ public abstract class NEREngineCore
}
if(log.isDebugEnabled()){
log.debug("findNamedEntities model={}, language={}, text=",
- new Object[]{ nameFinderModel, language, StringUtils.abbreviate(text, 100) });
+ new Object[]{ nameFinderModel, language,
+ StringUtils.abbreviate(at != null ? at.getSpan() : text, 100) });
}
LiteralFactory literalFactory = LiteralFactory.getInstance();
MGraph g = ci.getMetadata();
- Map<String,List<NameOccurrence>> entityNames = extractNameOccurrences(nameFinderModel, text);
+ Map<String,List<NameOccurrence>> entityNames;
+ if(at != null){
+ entityNames = extractNameOccurrences(nameFinderModel, at, lang);
+ } else {
+ entityNames = extractNameOccurrences(nameFinderModel, text,lang);
+ }
//lock the ContentItem while writing the RDF data for found Named Entities
ci.getLock().writeLock().lock();
try {
@@ -282,32 +308,74 @@ public abstract class NEREngineCore
}
}
+ @Deprecated
public Collection<String> extractPersonNames(String text) {
- return extractNames(getNameModel("person","en"),text);
+ return extractPersonNames(text, "en");
+ }
+ public Collection<String> extractPersonNames(String text,String lang) {
+ return extractNames(getNameModel("person",lang),text);
}
+ @Deprecated
public Collection<String> extractLocationNames(String text) {
- return extractNames(getNameModel("location","en"), text);
+ return extractLocationNames(text,"en");
}
-
+
+ public Collection<String> extractLocationNames(String text,String lang) {
+ return extractNames(getNameModel("location",lang), text);
+ }
+
+ @Deprecated
public Collection<String> extractOrganizationNames(String text) {
- return extractNames(getNameModel("organization","en"), text);
+ return extractOrganizationNames(text,"en");
}
-
+ public Collection<String> extractOrganizationNames(String text,String lang) {
+ return extractNames(getNameModel("organization",lang), text);
+ }
+ /**
+ * extracts the PersonName occurrences for English language texts
+ * @param text
+ * @return
+ * @deprecated use {@link #extractPersonNameOccurrences(String,String)} instead
+ */
+ @Deprecated
public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text) {
- return extractNameOccurrences(getNameModel("person","en"), text);
+ return this.extractPersonNameOccurrences(text, "en");
}
-
+ public Map<String,List<NameOccurrence>> extractPersonNameOccurrences(String text, String lang) {
+ return extractNameOccurrences(getNameModel("person",lang), text, lang);
+ }
+ /**
+ * extracts the LocationName occurrences for English language texts
+ * @param text
+ * @return
+ * @deprecated use {@link #extractLocationNameOccurrences(String,String)} instead
+ */
+ @Deprecated
public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text) {
- return extractNameOccurrences(getNameModel("location","en"), text);
+ return extractLocationNameOccurrences(text, "en");
+ }
+
+ public Map<String,List<NameOccurrence>> extractLocationNameOccurrences(String text,String lang) {
+ return extractNameOccurrences(getNameModel("location",lang), text,lang);
}
+ /**
+ * extracts the OrganizationName occurrences for English language texts
+ * @param text
+ * @return
+ * @deprecated use {@link #extractOrganizationNameOccurrences(String,String)} instead
+ */
+ @Deprecated
public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text) {
- return extractNameOccurrences(getNameModel("organization","en"), text);
+ return extractOrganizationNameOccurrences(text,"en");
+ }
+ public Map<String,List<NameOccurrence>> extractOrganizationNameOccurrences(String text,String lang) {
+ return extractNameOccurrences(getNameModel("organization",lang), text,lang);
}
protected Collection<String> extractNames(TokenNameFinderModel nameFinderModel, String text) {
- return extractNameOccurrences(nameFinderModel, text).keySet();
+ return extractNameOccurrences(nameFinderModel, text, nameFinderModel.getLanguage()).keySet();
}
/**
@@ -339,16 +407,28 @@ public abstract class NEREngineCore
type,language),e);
}
}
+ /**
+ * Loads the {@link SentenceModel} for the parsed language or
+ * English as fallback if one for the language is not available
+ * @param language
+ * @return
+ */
private SentenceModel getSentenceModel(String language) {
try {
SentenceModel model = openNLP.getSentenceModel(language);
if(model != null){
return model;
- } else {
- throw new IllegalStateException(String.format(
- "Unable to built Model for extracting sentences from '%s' " +
- "language texts because the model data could not be loaded.",
- language));
+ } else { //fallback to english
+ log.info("No sentence detection model for {}. Falling back to English.", language);
+ model = openNLP.getSentenceModel("en");
+ if(model == null){
+ throw new IllegalStateException(String.format(
+ "Unable to built Model for extracting sentences neither for '%s' " +
+ "nor the fallback language 'en'.",
+ language));
+ } else {
+ return model;
+ }
}
} catch (InvalidFormatException e) {
throw new IllegalStateException(String.format(
@@ -360,10 +440,82 @@ public abstract class NEREngineCore
language),e);
}
}
-
- protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
- String text) {
+ /**
+ * THis method extracts NamedEntity occurrences by using existing {@link Token}s and
+ * {@link Sentence}s in the parsed {@link AnalysedText}.
+ * @param nameFinderModel the model used to find NamedEntities
+ * @param at the Analysed Text
+ * @param language the language of the text
+ * @return the found named Entity Occurrences
+ */
+ protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel,
+ AnalysedText at, String language) {
+ // version with explicit sentence endings to reflect heading / paragraph
+ // structure of an HTML or PDF document converted to text
+ NameFinderME finder = new NameFinderME(nameFinderModel);
+ Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
+ List<Section> sentences = new ArrayList<Section>();
+ //Holds the Sentences of the AnalysedText (or, if none, the whole text as a single Section)
+ AnalysedTextUtils.appandToList(at.getSentences(), sentences);
+ if(sentences.isEmpty()){ //no sentence annotations
+ sentences.add(at); //process the whole text as a single section
+ }
+ for (int i=0;i<sentences.size();i++) {
+ String sentence = sentences.get(i).getSpan();
+
+ // build a context by concatenating three sentences to be used for
+ // similarity ranking / disambiguation + contextual snippet in the
+ // extraction structure
+ List<String> contextElements = new ArrayList<String>();
+ contextElements.add(sentence);
+ //three sentences as context
+ String context = at.getSpan().substring(
+ sentences.get(Math.max(0, i-1)).getStart(),
+ sentences.get(Math.min(sentences.size()-1, i+1)).getEnd());
+
+ // get the tokens, words of the current sentence
+ List<Token> tokens = new ArrayList<Token>(32);
+ List<String> words = new ArrayList<String>(32);
+ for(Iterator<Token> it =sentences.get(i).getTokens();it.hasNext();){
+ Token t = it.next();
+ tokens.add(t);
+ words.add(t.getSpan());
+ }
+ Span[] nameSpans = finder.find(words.toArray(new String[words.size()]));
+ double[] probs = finder.probs();
+ //int lastStartPosition = 0;
+ for (int j = 0; j < nameSpans.length; j++) {
+ String name = at.getSpan().substring(tokens.get(nameSpans[j].getStart()).getStart(),
+ tokens.get(nameSpans[j].getEnd()-1).getEnd());
+ Double confidence = 1.0;
+ for (int k = nameSpans[j].getStart(); k < nameSpans[j].getEnd(); k++) {
+ confidence *= probs[k];
+ }
+ int start = tokens.get(nameSpans[j].getStart()).getStart();
+ int end = start + name.length();
+ NerTag nerTag = config.getNerTag(nameSpans[j].getType());
+ //create the occurrence for writing fise:TextAnnotations
+ NameOccurrence occurrence = new NameOccurrence(name, start, end, nerTag.getType(),
+ context, confidence);
+ List<NameOccurrence> occurrences = nameOccurrences.get(name);
+ if (occurrences == null) {
+ occurrences = new ArrayList<NameOccurrence>();
+ }
+ occurrences.add(occurrence);
+ nameOccurrences.put(name, occurrences);
+ //add also the NerAnnotation to the AnalysedText
+ Chunk chunk = at.addChunk(start, end);
+ //TODO: build AnnotationModel based on the configured Mappings
+ chunk.addAnnotation(NER_ANNOTATION, Value.value(nerTag, confidence));
+ }
+ }
+ finder.clearAdaptiveData();
+ log.debug("{} name occurrences found: {}", nameOccurrences.size(), nameOccurrences);
+ return nameOccurrences;
+ }
+
+ protected Map<String,List<NameOccurrence>> extractNameOccurrences(TokenNameFinderModel nameFinderModel, String text, String language) {
// version with explicit sentence endings to reflect heading / paragraph
// structure of an HTML or PDF document converted to text
String textWithDots = text.replaceAll("\\n\\n", ".\n");
@@ -374,7 +526,7 @@ public abstract class NEREngineCore
Span[] sentenceSpans = sentenceDetector.sentPosDetect(textWithDots);
NameFinderME finder = new NameFinderME(nameFinderModel);
- Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
+ Tokenizer tokenizer = openNLP.getTokenizer(language);
Map<String,List<NameOccurrence>> nameOccurrences = new LinkedHashMap<String,List<NameOccurrence>>();
for (int i = 0; i < sentenceSpans.length; i++) {
String sentence = sentenceSpans[i].getCoveredText(text).toString().trim();
@@ -411,9 +563,9 @@ public abstract class NEREngineCore
int start = tokenSpans[nameSpans[j].getStart()].getStart();
int absoluteStart = sentenceSpans[i].getStart() + start;
int absoluteEnd = absoluteStart + name.length();
- UriRef mappedType = config.getMappedType(nameSpans[j].getType());
+ NerTag nerTag = config.getNerTag(nameSpans[j].getType());
NameOccurrence occurrence = new NameOccurrence(name, absoluteStart, absoluteEnd,
- mappedType, context, confidence);
+ nerTag.getType(),context, confidence);
List<NameOccurrence> occurrences = nameOccurrences.get(name);
if (occurrences == null) {