You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/05/14 23:07:21 UTC
svn commit: r1338425 [1/2] - in
/incubator/stanbol/branches/celi-enhancement-engines:
bundlelist/src/main/bundles/ engines/ engines/celi/ engines/celi/src/
engines/celi/src/main/ engines/celi/src/main/java/
engines/celi/src/main/java/org/ engines/celi/...
Author: rwesten
Date: Mon May 14 21:07:19 2012
New Revision: 1338425
URL: http://svn.apache.org/viewvc?rev=1338425&view=rev
Log:
Initial commit of the latest patch for STANBOL-583, including: applying the Stanbol Enhancement Structure validation introduced by STANBOL-612; making the supported languages configurable; and some other minor changes. NOTE: the unit tests for the CELI NER engine fail, as there seem to be some bugs related to XML entity encoding and/or character encoding, resulting in wrongly selected text and wrong start/end positions.
Added:
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/ (with props)
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/pom.xml
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/CeliLanguageIdentifierEnhancementEngine.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/GuessedLanguage.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/LanguageIdentifierClientHTTP.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngine.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LexicalEntry.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/Reading.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NERserviceClientHTTP.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/NamedEntity.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/resources/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/resources/OSGI-INF/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/resources/OSGI-INF/metatype/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/resources/OSGI-INF/metatype/metatype.properties
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/resources/log4j.properties
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngineTest.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/langid/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/CeliLanguageIdentifierEnhancementEngineTest.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngineTest.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/ner/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngineTest.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/test_utils/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/java/org/apache/stanbol/enhancer/engines/celi/test_utils/MockComponentContext.java
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/resources/
incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/test/resources/log4j.properties
Modified:
incubator/stanbol/branches/celi-enhancement-engines/bundlelist/src/main/bundles/list.xml
incubator/stanbol/branches/celi-enhancement-engines/engines/pom.xml
Modified: incubator/stanbol/branches/celi-enhancement-engines/bundlelist/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/bundlelist/src/main/bundles/list.xml?rev=1338425&r1=1338424&r2=1338425&view=diff
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/bundlelist/src/main/bundles/list.xml (original)
+++ incubator/stanbol/branches/celi-enhancement-engines/bundlelist/src/main/bundles/list.xml Mon May 14 21:07:19 2012
@@ -177,6 +177,11 @@
<artifactId>org.apache.stanbol.enhancer.engines.geonames</artifactId>
<version>0.10.0-incubating-SNAPSHOT</version>
</bundle>
+ <bundle> <!-- http://linguagrid.org/ -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.celi</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ </bundle>
</startLevel>
<!-- Default Configuration for the Stanbol Enhancer -->
Propchange: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Mon May 14 21:07:19 2012
@@ -0,0 +1,7 @@
+.classpath
+
+.project
+
+target
+
+.settings
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/pom.xml?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/pom.xml (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/pom.xml Mon May 14 21:07:19 2012
@@ -0,0 +1,146 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor
+ license agreements. See the NOTICE file distributed with this work for additional
+ information regarding copyright ownership. The ASF licenses this file to
+ You under the Apache License, Version 2.0 (the "License"); you may not use
+ this file except in compliance with the License. You may obtain a copy of
+ the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required
+ by applicable law or agreed to in writing, software distributed under the
+ License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS
+ OF ANY KIND, either express or implied. See the License for the specific
+ language governing permissions and limitations under the License. -->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <relativePath>../../parent</relativePath>
+ </parent>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.celi</artifactId>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Enhancer Enhancement Engine: CELI </name>
+ <description></description>
+ <inceptionYear>2012</inceptionYear>
+
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.stanboltools.datafileprovider</artifactId>
+ <version>0.9.0-incubating</version>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.clerezza</groupId>
+ <artifactId>rdf.core</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- generic tax -->
+ <dependency>
+ <groupId>commons-lang</groupId>
+ <artifactId>commons-lang</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient-osgi</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.commons.stanboltools.offline</artifactId>
+ <version>0.9.0-incubating</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <!-- test -->
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.test</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.core</artifactId>
+ <version>0.10.0-incubating-SNAPSHOT</version>
+ <scope>test</scope>
+ </dependency>
+ <dependency> <!-- we use log4j 1.2 -->
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-log4j12</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>log4j</groupId>
+ <artifactId>log4j</artifactId>
+ <scope>test</scope>
+ </dependency>
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+
+
+ </dependencies>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.maven.plugins</groupId>
+ <artifactId>maven-surefire-plugin</artifactId>
+ <configuration>
+ <skipTests>false</skipTests>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Private-Package>
+ org.apache.stanbol.enhancer.engines.celi.ner.impl.*,
+ org.apache.stanbol.enhancer.engines.celi.langid.impl.*,
+ org.apache.stanbol.enhancer.engines.celi.classification.impl.*,
+ org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.*
+ </Private-Package>
+ <!-- <Embed-Dependency>true</Embed-Dependency>
+ <Embed-Transitive>true</Embed-Transitive> -->
+ <Import-Package>
+ org.apache.http,
+ *;resolution:=optional
+ </Import-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ </plugins>
+ </build>
+
+</project>
\ No newline at end of file
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/CeliClassificationEnhancementEngine.java Mon May 14 21:07:19 2012
@@ -0,0 +1,252 @@
+package org.apache.stanbol.enhancer.engines.celi.classification.impl;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_CREATOR;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_RELATION;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+import java.io.IOException;
+import java.net.URL;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Vector;
+import java.util.Map.Entry;
+
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.NoConvertorException;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.NamespaceEnum;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+@Component(immediate = true, metatype = true)
+@Service
+@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "celiClassification") })
+public class CeliClassificationEnhancementEngine extends AbstractEnhancementEngine<IOException, RuntimeException> implements EnhancementEngine, ServiceProperties {
+
+ /**
+ * This ensures that no connections to external services are made if Stanbol is started in offline mode
+ * as the OnlineMode service will only be available if OfflineMode is deactivated.
+ */
+ @Reference
+ private OnlineMode onlineMode;
+
+ private static List<String> supportedLangs = new Vector<String>();
+ static {
+ supportedLangs.add("en");
+ supportedLangs.add("fr");
+ supportedLangs.add("de");
+ supportedLangs.add("it");
+ supportedLangs.add("es");
+ supportedLangs.add("pt");
+ supportedLangs.add("pl");
+ supportedLangs.add("nl");
+ }
+
+ /**
+ * The literal representing the LangIDEngine as creator.
+ */
+ public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.celi.langid.impl.CeliLanguageIdentifierEnhancementEngine");
+
+ /**
+ * The default value for the Execution of this Engine. Currently set to
+ * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
+ */
+ public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;
+
+ private Logger log = LoggerFactory.getLogger(getClass());
+
+ private String language = null;
+
+ /**
+ * This contains the only MIME type directly supported by this enhancement
+ * engine.
+ */
+ private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+
+ /**
+ * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+ */
+ private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
+
+ @Property
+ public static final String LICENSE_KEY = "org.apache.stanbol.enhancer.engines.celi.classification.impl.CeliClassificationEnhancementEngine.license";
+
+ @Property(value = "http://linguagrid.org/LSGrid/ws/dbpedia-classification")
+ public static final String SERVICE_URL = "org.apache.stanbol.enhancer.engines.celi.classification.impl.CeliClassificationEnhancementEngine.url";
+
+ private String licenseKey;
+ private URL serviceURL;
+
+ private ClassificationClientHTTP client;
+
+ @Override
+ @Activate
+ protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
+ super.activate(ctx);
+ Dictionary<String, Object> properties = ctx.getProperties();
+ this.licenseKey = (String) properties.get(LICENSE_KEY);
+ if (licenseKey == null || licenseKey.isEmpty()) {
+ log.warn("no CELI license key configured for this Engine, a guest account will be used (max 100 requests per day). Go on http://linguagrid.org for getting a proper license key.");
+ }
+ String url = (String) properties.get(SERVICE_URL);
+ if (url == null || url.isEmpty()) {
+ throw new ConfigurationException(SERVICE_URL, String.format("%s : please configure the URL of the CELI Web Service (e.g. by" + "using the 'Configuration' tab of the Apache Felix Web Console).", getClass().getSimpleName()));
+ }
+ this.serviceURL = new URL(url);
+ this.client = new ClassificationClientHTTP(this.serviceURL, this.licenseKey);
+ }
+
+ @Override
+ @Deactivate
+ protected void deactivate(ComponentContext ce) {
+ super.deactivate(ce);
+ }
+
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ this.language = extractLanguage(ci);
+ if (language == null) {
+ throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+ }
+
+ if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null && this.isLangSupported(language))
+ return ENHANCE_ASYNC;
+ else
+ return CANNOT_ENHANCE;
+ }
+
+
+ /**
+ * Extracts the language of the parsed ContentItem from the metadata
+ * @param ci the content item
+ * @return the language
+ */
+ private String extractLanguage(ContentItem ci) {
+ MGraph metadata = ci.getMetadata();
+ Iterator<Triple> langaugeEnhancementCreatorTriples =
+ metadata.filter(null, DC_CREATOR, LANG_ID_ENGINE_NAME);
+ if(langaugeEnhancementCreatorTriples.hasNext()){
+ String lang = EnhancementEngineHelper.getString(metadata,
+ langaugeEnhancementCreatorTriples.next().getSubject(), DC_LANGUAGE);
+ if(lang != null){
+ return lang;
+ } else {
+ log.info("Unable to extract language for ContentItem "+ci.getUri().getUnicodeString()+"! The Enhancement of the "+LANG_ID_ENGINE_NAME.getLexicalForm()+
+ " is missing the "+DC_LANGUAGE+" property ... return '{}' as default");
+ return null;
+ }
+ } else {
+
+ Iterator<Triple> it = metadata.filter(null, DC_LANGUAGE, null);
+ if (it.hasNext()) {
+ Resource res = it.next().getObject();
+ if (res instanceof Literal) {
+ return ((Literal) res).getLexicalForm();
+ } else {
+ return res.toString();
+ }
+ }
+
+ log.warn("Unable to extract language for ContentItem "+ci.getUri().getUnicodeString()+"! Is the "+LANG_ID_ENGINE_NAME.getLexicalForm()+" active? ... return '{}' as default");
+ return null;
+ }
+ }
+
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+ if (this.language == null)
+ this.language = extractLanguage(ci);
+
+ Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
+ if (contentPart == null) {
+ throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This "
+ + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+ }
+ String text = "";
+ try {
+ text = ContentItemHelper.getText(contentPart.getValue());
+ } catch (IOException e) {
+ throw new InvalidContentException(this, ci, e);
+ }
+ if (text.trim().length() == 0) {
+ log.info("No text contained in ContentPart {"+contentPart.getKey()+"} of ContentItem {"+ci.getUri()+"}");
+ return;
+ }
+
+ try {
+
+ List<Concept> lista = this.client.extractConcepts(text, language);
+ LiteralFactory literalFactory = LiteralFactory.getInstance();
+
+ MGraph g = ci.getMetadata();
+
+ UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+
+ for (Concept ne : lista) {
+ List<UriRef> uris = this.getEntityRefForType(ne.getClassLabel());
+
+ try {
+ for (UriRef uri : uris)
+ g.add(new TripleImpl(textAnnotation, DC_RELATION, uri));
+ g.add(new TripleImpl(textAnnotation, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(ne.getConfidence())));
+ } catch (NoConvertorException e) {
+ log.error(e.getMessage(),e);
+ }
+ }
+ } catch (Exception e) {
+ log.error(e.getMessage(),e);
+ }
+
+ }
+
+ private boolean isLangSupported(String language) {
+ if (supportedLangs.contains(language))
+ return true;
+ else
+ return false;
+ }
+
+ private List<UriRef> getEntityRefForType(String classificationLabels) {
+ List<UriRef> refs = new Vector<UriRef>();
+ String[] tmps = classificationLabels.split(" ");
+ for (String dbPediaLabel : tmps) {
+ refs.add(new UriRef(NamespaceEnum.dbpedia_ont + dbPediaLabel));
+ }
+ return refs;
+ }
+
+ @Override
+ public Map<String, Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+ }
+
+}
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/ClassificationClientHTTP.java Mon May 14 21:07:19 2012
@@ -0,0 +1,133 @@
+package org.apache.stanbol.enhancer.engines.celi.classification.impl;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Vector;
+
+import javax.xml.soap.MessageFactory;
+import javax.xml.soap.SOAPBody;
+import javax.xml.soap.SOAPMessage;
+import javax.xml.soap.SOAPPart;
+import javax.xml.transform.stream.StreamSource;
+
+import org.apache.clerezza.rdf.core.impl.util.Base64;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringEscapeUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * Minimal SOAP-over-HTTP client for the CELI classification service: it posts a
+ * <code>classify</code> request and converts the returned labels and scores into
+ * {@link Concept} instances.
+ */
+public class ClassificationClientHTTP {
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    /** Upper bound on the number of <code>return</code> elements of a response that are evaluated. */
+    private static final int maxResultToReturn = 3;
+
+    private URL serviceEP;
+    private String licenseKey;
+
+
+    /**
+     * @param serviceUrl the endpoint of the classification service
+     * @param licenseKey the license key sent as HTTP Basic credentials; if <code>null</code>
+     *        or empty no <code>Authorization</code> header is sent
+     */
+    public ClassificationClientHTTP(URL serviceUrl, String licenseKey){
+        this.serviceEP=serviceUrl;
+        this.licenseKey=licenseKey;
+    }
+
+
+    /**
+     * Sends an HTTP POST request and returns the response body decoded as UTF-8.
+     *
+     * @param url the URL to post to
+     * @param body the request body (sent as UTF-8 encoded <code>text/xml</code>) or
+     *        <code>null</code> to send no body
+     * @return the response body
+     * @throws IOException on any error while sending the request or reading the response
+     */
+    public String doPostRequest(URL url, String body) throws IOException {
+
+        HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
+        try {
+            urlConn.setRequestMethod("POST");
+            urlConn.setDoInput(true);
+            urlConn.setDoOutput(null != body);
+            urlConn.setUseCaches(false);
+            String contentType = "text/xml; charset=utf-8";
+            urlConn.setRequestProperty("Content-Type", contentType);
+            // without a license key the service is called anonymously
+            if (this.licenseKey != null && !this.licenseKey.isEmpty()) {
+                String encoded = Base64.encode(this.licenseKey.getBytes("UTF-8"));
+                urlConn.setRequestProperty("Authorization", "Basic "+encoded);
+            }
+
+            // send POST output
+            if (null != body) {
+                OutputStreamWriter printout = new OutputStreamWriter(urlConn.getOutputStream(), "UTF-8");
+                try {
+                    printout.write(body);
+                    printout.flush();
+                } finally {
+                    printout.close();
+                }
+            }
+
+            // get response data; the response has to be read BEFORE the connection is released
+            java.io.InputStream response = urlConn.getInputStream();
+            try {
+                return IOUtils.toString(response, "UTF-8");
+            } finally {
+                IOUtils.closeQuietly(response);
+            }
+        } finally {
+            //close connection
+            urlConn.disconnect();
+        }
+    }
+
+
+    /**
+     * Calls the classification service for the given text and parses the response.
+     * Errors are logged and never propagated: whatever could be parsed until an error
+     * occurred is returned.
+     *
+     * @param text the text to classify
+     * @param lang the language of the text; sent as the name of the model to use
+     * @return the extracted concepts without duplicate labels; empty if the request failed
+     */
+    public List<Concept> extractConcepts(String text,String lang) {
+        List<Concept> extractedConcepts = new Vector<Concept>();
+
+        try {
+            // both values end up as XML character data of the request
+            String txt = StringEscapeUtils.escapeXml(text);
+            String modelName = StringEscapeUtils.escapeXml(lang);
+            String xmldata = "<soapenv:Envelope xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\" xmlns:clas=\"http://linguagrid.org/v20110204/classification\"><soapenv:Header/><soapenv:Body> <clas:classify>"
+                +"<clas:user>wiki</clas:user><clas:model>"+modelName+"</clas:model><clas:text>"+txt+"</clas:text></clas:classify></soapenv:Body></soapenv:Envelope>";
+
+
+            String responseXml = doPostRequest(this.serviceEP, xmldata);
+            log.debug(responseXml);
+
+            // Create SoapMessage
+            MessageFactory msgFactory = MessageFactory.newInstance();
+            SOAPMessage message = msgFactory.createMessage();
+            SOAPPart soapPart = message.getSOAPPart();
+
+            // Load the SOAP text into a stream source
+            ByteArrayInputStream stream = new ByteArrayInputStream(responseXml.getBytes("UTF-8"));
+            StreamSource source = new StreamSource(stream);
+
+            // Set contents of message
+            soapPart.setContent(source);
+
+            SOAPBody soapBody = message.getSOAPBody();
+            NodeList nlist = soapBody.getElementsByTagNameNS("*","return");
+            HashSet<String> inserted=new HashSet<String>();
+            for (int i = 0; i < nlist.getLength() && i<maxResultToReturn; i++) {
+                try {
+                    Element result = (Element) nlist.item(i);
+
+                    String model = result.getElementsByTagNameNS("*","label").item(0).getTextContent();
+                    // strip the first and the last character of the label
+                    model=model.substring(1, model.length()-1);
+                    String conf=result.getElementsByTagNameNS("*","score").item(0).getTextContent();
+                    float confidence=Float.parseFloat(conf);
+
+                    String[] tmps=model.split(" ");
+
+                    // a label may list several classes; each class is reported only once
+                    for(String t: tmps){
+                        if(!inserted.contains(t)){
+                            extractedConcepts.add(new Concept(t, confidence));
+                            inserted.add(t);
+                        }
+                    }
+                } catch (Exception e) {
+                    // a malformed result must not prevent the evaluation of the other ones
+                    log.warn("Unable to parse classification result " + i + " of the response", e);
+                }
+
+            }
+        } catch (Exception e) {
+            log.error("Error while calling the classification service " + this.serviceEP, e);
+        }
+
+        return extractedConcepts;
+    }
+
+}
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/classification/impl/Concept.java Mon May 14 21:07:19 2012
@@ -0,0 +1,28 @@
+package org.apache.stanbol.enhancer.engines.celi.classification.impl;
+
+/**
+ * A classification result: a class label together with the confidence
+ * score that was assigned to it.
+ */
+public class Concept {
+
+    /** Label of the class this concept stands for. */
+    private String classLabel;
+
+    /** Confidence score associated with {@link #classLabel}. */
+    private float confidence;
+
+    /**
+     * Creates a concept from its label and confidence.
+     *
+     * @param classLabel the class label
+     * @param confidence the confidence score for the label
+     */
+    public Concept(String classLabel, float confidence) {
+        this.confidence = confidence;
+        this.classLabel = classLabel;
+    }
+
+    /** @return the class label */
+    public String getClassLabel() {
+        return this.classLabel;
+    }
+
+    /** @return the confidence score */
+    public float getConfidence() {
+        return this.confidence;
+    }
+
+    /** @param classLabel the new class label */
+    public void setClassLabel(String classLabel) {
+        this.classLabel = classLabel;
+    }
+
+    /** @param confidence the new confidence score */
+    public void setConfidence(float confidence) {
+        this.confidence = confidence;
+    }
+}
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/CeliLanguageIdentifierEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/CeliLanguageIdentifierEnhancementEngine.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/CeliLanguageIdentifierEnhancementEngine.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/CeliLanguageIdentifierEnhancementEngine.java Mon May 14 21:07:19 2012
@@ -0,0 +1,160 @@
+package org.apache.stanbol.enhancer.engines.celi.langid.impl;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_CONFIDENCE;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+
+/**
+ * Enhancement engine that sends the <code>text/plain</code> content of a
+ * ContentItem to the CELI language identifier web service and stores the
+ * first returned guess (language and confidence) as a text enhancement in
+ * the metadata of the ContentItem.
+ */
+@Component(immediate = true, metatype = true)
+@Service
+@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "celiLangid") })
+public class CeliLanguageIdentifierEnhancementEngine extends AbstractEnhancementEngine<IOException, RuntimeException> implements EnhancementEngine, ServiceProperties {
+    /**
+     * This ensures that no connections to external services are made if Stanbol is started in offline mode
+     * as the OnlineMode service will only be available if OfflineMode is deactivated.
+     */
+    @Reference
+    private OnlineMode onlineMode;
+
+    /**
+     * The ordering value reported through {@link #getServiceProperties()}.
+     */
+    public static final Integer defaultOrder = ServiceProperties.ORDERING_PRE_PROCESSING - 2;
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    /**
+     * This contains the only MIME type directly supported by this enhancement
+     * engine.
+     */
+    private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+
+    /**
+     * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+     */
+    private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
+
+    /**
+     * Texts with at most this many whitespace separated words are sent to
+     * the "query" variant of the service, longer texts to the regular one.
+     */
+    private static final int MAX_QUERY_WORDS = 5;
+
+    @Property
+    public static final String LICENSE_KEY = "org.apache.stanbol.enhancer.engines.celi.langid.impl.CeliLanguageIdentifierEnhancementEngine.license";
+
+    @Property(value = "http://linguagrid.org/LSGrid/ws/language-identifier")
+    public static final String SERVICE_URL = "org.apache.stanbol.enhancer.engines.celi.langid.impl.CeliLanguageIdentifierEnhancementEngine.url";
+
+    private String licenseKey;
+    private URL serviceURL;
+
+    private LanguageIdentifierClientHTTP client;
+
+    /**
+     * Reads the license key and service URL from the component
+     * configuration and creates the web service client.
+     *
+     * @param ctx the component context providing the configuration
+     * @throws IOException if the configured URL is malformed
+     * @throws ConfigurationException if no service URL is configured
+     */
+    @Override
+    @Activate
+    public void activate(ComponentContext ctx) throws IOException, ConfigurationException {
+        super.activate(ctx);
+        Dictionary<String, Object> properties = ctx.getProperties();
+        this.licenseKey = (String) properties.get(LICENSE_KEY);
+        if (licenseKey == null || licenseKey.isEmpty()) {
+            log.warn("no CELI license key configured for this Engine, a guest account will be used (max 100 requests per day). Go on http://linguagrid.org for getting a proper license key.");
+        }
+        String url = (String) properties.get(SERVICE_URL);
+        if (url == null || url.isEmpty()) {
+            // the original message was concatenated without a blank ("byusing")
+            throw new ConfigurationException(SERVICE_URL, String.format("%s : please configure the URL of the CELI Web Service (e.g. by using the 'Configuration' tab of the Apache Felix Web Console).", getClass().getSimpleName()));
+        }
+        this.serviceURL = new URL(url);
+        this.client = new LanguageIdentifierClientHTTP(this.serviceURL, this.licenseKey);
+    }
+
+    @Override
+    @Deactivate
+    protected void deactivate(ComponentContext ce) {
+        super.deactivate(ce);
+    }
+
+    /**
+     * @return {@link #ENHANCE_ASYNC} if the ContentItem has a
+     *         <code>text/plain</code> blob, {@link #CANNOT_ENHANCE} otherwise
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null) {
+            return ENHANCE_ASYNC;
+        } else {
+            return CANNOT_ENHANCE;
+        }
+    }
+
+    /**
+     * Asks the CELI service for the language of the plain text content and
+     * adds the first guess to the metadata. Failures of the remote call are
+     * logged and do not abort the enhancement chain.
+     *
+     * @param ci the ContentItem to enhance
+     * @throws InvalidContentException if the text of the blob cannot be read
+     * @throws IllegalStateException if the ContentItem has no
+     *         <code>text/plain</code> blob
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
+        if (contentPart == null) {
+            throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This "
+                    + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+        }
+        String text;
+        try {
+            text = ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            throw new InvalidContentException(this, ci, e);
+        }
+        String trimmed = text.trim();
+        if (trimmed.length() == 0) {
+            log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
+            return;
+        }
+
+        try {
+            // count words on any whitespace so that line breaks, tabs and
+            // repeated blanks do not distort the short/long text decision
+            int wordCount = trimmed.split("\\s+").length;
+            List<GuessedLanguage> lista;
+            if (wordCount > MAX_QUERY_WORDS) {
+                lista = this.client.guessLanguage(text);
+            } else {
+                lista = this.client.guessQueryLanguage(text);
+            }
+            // the client returns an empty list when the request failed or
+            // the service had no answer; do not create an empty enhancement
+            if (lista == null || lista.isEmpty()) {
+                log.warn("The CELI language identifier returned no guess for ContentItem {}", ci.getUri());
+                return;
+            }
+            GuessedLanguage gl = lista.get(0);
+
+            LiteralFactory literalFactory = LiteralFactory.getInstance();
+            MGraph g = ci.getMetadata();
+            UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+            g.add(new TripleImpl(textEnhancement, DC_LANGUAGE, new PlainLiteralImpl(gl.getLang())));
+            g.add(new TripleImpl(textEnhancement, ENHANCER_CONFIDENCE, literalFactory.createTypedLiteral(gl.getConfidence())));
+        } catch (Exception e) {
+            // best effort: a failing language detection must not break the chain
+            log.error("Unable to identify the language of ContentItem " + ci.getUri(), e);
+        }
+    }
+
+    /**
+     * @return an unmodifiable map holding {@link #defaultOrder} under the
+     *         {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} key
+     */
+    @Override
+    public Map<String, Object> getServiceProperties() {
+        return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+    }
+
+}
\ No newline at end of file
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/GuessedLanguage.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/GuessedLanguage.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/GuessedLanguage.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/GuessedLanguage.java Mon May 14 21:07:19 2012
@@ -0,0 +1,30 @@
+package org.apache.stanbol.enhancer.engines.celi.langid.impl;
+
+/**
+ * One language guess: a language identifier together with the confidence
+ * reported for it.
+ */
+public class GuessedLanguage {
+
+    /** The guessed language. */
+    private String lang;
+
+    /** Confidence reported for {@link #lang}. */
+    private double confidence;
+
+    /**
+     * Creates a guess.
+     *
+     * @param lang the guessed language
+     * @param d the confidence of the guess
+     */
+    public GuessedLanguage(String lang, double d) {
+        this.confidence = d;
+        this.lang = lang;
+    }
+
+    /** @return the guessed language */
+    public String getLang() {
+        return this.lang;
+    }
+
+    /** @return the confidence of the guess */
+    public double getConfidence() {
+        return this.confidence;
+    }
+
+    /** @param lang the new language */
+    public void setLang(String lang) {
+        this.lang = lang;
+    }
+
+    /** @param confidence the new confidence */
+    public void setConfidence(double confidence) {
+        this.confidence = confidence;
+    }
+}
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/LanguageIdentifierClientHTTP.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/LanguageIdentifierClientHTTP.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/LanguageIdentifierClientHTTP.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/langid/impl/LanguageIdentifierClientHTTP.java Mon May 14 21:07:19 2012
@@ -0,0 +1,165 @@
+package org.apache.stanbol.enhancer.engines.celi.langid.impl;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.net.HttpURLConnection;
+import java.net.URI;
+import java.net.URL;
+import java.util.List;
+import java.util.Vector;
+
+import javax.xml.soap.MessageFactory;
+import javax.xml.soap.SOAPBody;
+import javax.xml.soap.SOAPMessage;
+import javax.xml.soap.SOAPPart;
+import javax.xml.transform.stream.StreamSource;
+
+import org.apache.clerezza.rdf.core.impl.util.Base64;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringEscapeUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * Minimal SOAP-over-HTTP client for the CELI language identifier web
+ * service. Both guess operations are best effort: failures are logged and
+ * an empty (or partial) result list is returned instead of an exception.
+ */
+public class LanguageIdentifierClientHTTP {
+
+    private final URL serviceEP;
+    private final String licenseKey;
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    /**
+     * @param serviceUrl the endpoint all guess requests are posted to
+     * @param licenseKey the license key sent as HTTP Basic credentials;
+     *        <code>null</code> or empty to send no credentials
+     */
+    public LanguageIdentifierClientHTTP(URL serviceUrl, String licenseKey){
+        this.serviceEP=serviceUrl;
+        this.licenseKey=licenseKey;
+    }
+
+    /**
+     * Posts the parsed body as <code>text/xml</code> to the parsed URL and
+     * returns the response decoded as UTF-8.
+     *
+     * @param url the URL to post to
+     * @param body the request body or <code>null</code> to send none
+     * @return the response body
+     * @throws IOException on any connection, write or read error
+     */
+    public String doPostRequest(URL url, String body) throws IOException {
+        HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
+        try {
+            urlConn.setRequestMethod("POST");
+            urlConn.setDoInput(true);
+            urlConn.setDoOutput(body != null);
+            urlConn.setUseCaches(false);
+            urlConn.setRequestProperty("Content-Type", "text/xml; charset=utf-8");
+            // no key configured -> no credentials (guest access)
+            if (this.licenseKey != null && this.licenseKey.length() > 0) {
+                String encoded = Base64.encode(this.licenseKey.getBytes("UTF-8"));
+                urlConn.setRequestProperty("Authorization", "Basic " + encoded);
+            }
+
+            // send POST output
+            if (body != null) {
+                OutputStreamWriter printout = new OutputStreamWriter(urlConn.getOutputStream(), "UTF-8");
+                try {
+                    printout.write(body);
+                    printout.flush();
+                } finally {
+                    printout.close();
+                }
+            }
+
+            // get response data; the stream has to be consumed BEFORE the
+            // connection is released
+            java.io.InputStream in = urlConn.getInputStream();
+            try {
+                return IOUtils.toString(in, "UTF-8");
+            } finally {
+                IOUtils.closeQuietly(in);
+            }
+        } finally {
+            urlConn.disconnect();
+        }
+    }
+
+    /**
+     * Calls the <code>guessQueryLanguage</code> operation of the service.
+     *
+     * @param text the text to guess the language for
+     * @return the guesses in the order returned by the service; empty if
+     *         the request failed
+     */
+    public List<GuessedLanguage> guessQueryLanguage(String text){
+        return guess("guessQueryLanguage", text);
+    }
+
+    /**
+     * Calls the <code>guessLanguage</code> operation of the service. The
+     * request is sent to the endpoint parsed to the constructor.
+     *
+     * @param text the text to guess the language for
+     * @return the guesses in the order returned by the service; empty if
+     *         the request failed
+     */
+    public List<GuessedLanguage> guessLanguage(String text) {
+        return guess("guessLanguage", text);
+    }
+
+    /**
+     * Sends the SOAP request for the parsed operation and converts every
+     * <code>return</code> element of the response to a GuessedLanguage.
+     */
+    private List<GuessedLanguage> guess(String operation, String text) {
+        List<GuessedLanguage> guesses = new Vector<GuessedLanguage>();
+
+        try {
+            String txt = StringEscapeUtils.escapeXml(text);
+            String xmldata = "<soapenv:Envelope xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\" xmlns:lan=\"http://research.celi.it/LanguageIdentifierWS\"><soapenv:Header/><soapenv:Body>"
+                    + "<lan:" + operation + "><textToGuess>" + txt + "</textToGuess></lan:" + operation + "></soapenv:Body></soapenv:Envelope>";
+
+            String responseXml = doPostRequest(this.serviceEP, xmldata);
+            log.debug("CELI language identifier response: {}", responseXml);
+
+            // Create SoapMessage
+            MessageFactory msgFactory = MessageFactory.newInstance();
+            SOAPMessage message = msgFactory.createMessage();
+            SOAPPart soapPart = message.getSOAPPart();
+
+            // Load the SOAP text into a stream source
+            ByteArrayInputStream stream = new ByteArrayInputStream(responseXml.getBytes("UTF-8"));
+            StreamSource source = new StreamSource(stream);
+
+            // Set contents of message
+            soapPart.setContent(source);
+
+            SOAPBody soapBody = message.getSOAPBody();
+            NodeList nlist = soapBody.getElementsByTagNameNS("*","return");
+            for (int i = 0; i < nlist.getLength(); i++) {
+                try {
+                    Element result = (Element) nlist.item(i);
+                    String lang = result.getAttribute("language");
+                    double d = Double.parseDouble(result.getAttribute("guessConfidence"));
+                    guesses.add(new GuessedLanguage(lang, d));
+                } catch (RuntimeException e) {
+                    // skip only the malformed entry, keep the others
+                    log.warn("Unable to parse language guess " + i + " of the '" + operation + "' response", e);
+                }
+            }
+        } catch (Exception e) {
+            log.error("The '" + operation + "' request to " + this.serviceEP + " failed", e);
+        }
+
+        return guesses;
+    }
+}
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngine.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngine.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/CeliLemmatizerEnhancementEngine.java Mon May 14 21:07:19 2012
@@ -0,0 +1,259 @@
+package org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_CREATOR;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.DC_LANGUAGE;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_END;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_SELECTED_TEXT;
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.ENHANCER_START;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Vector;
+import java.util.Map.Entry;
+
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Enhancement engine that sends the <code>text/plain</code> content of a
+ * ContentItem to the CELI morphological analyser web service. Depending on
+ * the {@link #MORPHOLOGICAL_ANALYSIS} setting it either adds one text
+ * enhancement per analysed word (lemmas and morphological features) or a
+ * single text enhancement holding the lemmatized text.
+ */
+@Component(immediate = true, metatype = true)
+@Service
+@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "celiLemmatizer") })
+public class CeliLemmatizerEnhancementEngine extends AbstractEnhancementEngine<IOException, RuntimeException> implements EnhancementEngine, ServiceProperties {
+
+    /**
+     * This ensures that no connections to external services are made if Stanbol is started in offline mode
+     * as the OnlineMode service will only be available if OfflineMode is deactivated.
+     */
+    @Reference
+    private OnlineMode onlineMode;
+
+    public static final UriRef hasLemmaForm = new UriRef("http://fise.iks-project.eu/ontology/hasLemmaForm");
+    public static final UriRef hasMorphoFeature = new UriRef("http://fise.iks-project.eu/ontology/hasMorphologicalFeature");
+
+    /** Languages for which {@link #canEnhance(ContentItem)} accepts content. */
+    private static final List<String> supportedLangs = new Vector<String>();
+    static {
+        supportedLangs.add("it");
+        supportedLangs.add("da");
+        supportedLangs.add("de");
+        supportedLangs.add("ru");
+        supportedLangs.add("ro");
+    }
+
+    /**
+     * The literal representing the LangIDEngine as creator.
+     */
+    public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.celi.langid.impl.CeliLanguageIdentifierEnhancementEngine");
+
+    /**
+     * The default value for the Execution of this Engine. Currently set to
+     * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
+     */
+    public static final Integer defaultOrder = ServiceProperties.ORDERING_CONTENT_EXTRACTION;
+
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    /**
+     * This contains the only MIME type directly supported by this enhancement
+     * engine.
+     */
+    private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+
+    /**
+     * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+     */
+    private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
+
+    @Property
+    public static final String LICENSE_KEY = "org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine.license";
+
+    @Property(value = "http://linguagrid.org/LSGrid/ws/morpho-analyser")
+    public static final String SERVICE_URL = "org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine.url";
+
+    @Property(boolValue = false)
+    public static final String MORPHOLOGICAL_ANALYSIS = "org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl.CeliLemmatizerEnhancementEngine.morphoAnalysis";
+
+    private String licenseKey;
+    private URL serviceURL;
+    private boolean completeMorphoAnalysis;
+
+    private LemmatizerClientHTTP client;
+
+    /**
+     * Reads license key, service URL and the morphological analysis switch
+     * from the component configuration and creates the web service client.
+     *
+     * @param ctx the component context providing the configuration
+     * @throws IOException if the configured URL is malformed
+     * @throws ConfigurationException if no service URL is configured
+     */
+    @Override
+    @Activate
+    protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
+        super.activate(ctx);
+        Dictionary<String, Object> properties = ctx.getProperties();
+        this.licenseKey = (String) properties.get(LICENSE_KEY);
+        if (licenseKey == null || licenseKey.isEmpty()) {
+            log.warn("no CELI license key configured for this Engine, a guest account will be used (max 100 requests per day). Go on http://linguagrid.org for getting a proper license key.");
+        }
+        String url = (String) properties.get(SERVICE_URL);
+        if (url == null || url.isEmpty()) {
+            // the original message was concatenated without a blank ("byusing")
+            throw new ConfigurationException(SERVICE_URL, String.format("%s : please configure the URL of the CELI Web Service (e.g. by using the 'Configuration' tab of the Apache Felix Web Console).", getClass().getSimpleName()));
+        }
+        this.serviceURL = new URL(url);
+
+        // accept both a Boolean and its String form; anything else -> false
+        Object morpho = properties.get(MORPHOLOGICAL_ANALYSIS);
+        if (morpho instanceof Boolean) {
+            this.completeMorphoAnalysis = ((Boolean) morpho).booleanValue();
+        } else {
+            this.completeMorphoAnalysis = morpho != null && Boolean.parseBoolean(morpho.toString());
+        }
+        this.client = new LemmatizerClientHTTP(this.serviceURL, this.licenseKey);
+    }
+
+    @Override
+    @Deactivate
+    protected void deactivate(ComponentContext ce) {
+        super.deactivate(ce);
+    }
+
+    /**
+     * @return {@link #ENHANCE_ASYNC} if the ContentItem has a
+     *         <code>text/plain</code> blob and its language is known and
+     *         supported, {@link #CANNOT_ENHANCE} otherwise
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        // the language is kept in a local variable: this engine is a shared
+        // service instance and must not hold per-request state in fields
+        String language = extractLanguage(ci);
+        if (language == null) {
+            // extractLanguage(..) already logged why no language is available
+            return CANNOT_ENHANCE;
+        }
+        if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null && isLangSupported(language)) {
+            return ENHANCE_ASYNC;
+        }
+        return CANNOT_ENHANCE;
+    }
+
+    /**
+     * Extracts the language of the parsed ContentItem from the metadata
+     *
+     * @param ci
+     *            the content item
+     * @return the language or <code>null</code> if the metadata do not
+     *         contain one
+     */
+    private String extractLanguage(ContentItem ci) {
+        MGraph metadata = ci.getMetadata();
+        Iterator<Triple> langaugeEnhancementCreatorTriples = metadata.filter(null, DC_CREATOR, LANG_ID_ENGINE_NAME);
+        if (langaugeEnhancementCreatorTriples.hasNext()) {
+            String lang = EnhancementEngineHelper.getString(metadata, langaugeEnhancementCreatorTriples.next().getSubject(), DC_LANGUAGE);
+            if (lang == null) {
+                log.info("Unable to extract language for ContentItem {}! The Enhancement of the {} is missing the {} property",
+                        new Object[] { ci.getUri().getUnicodeString(), LANG_ID_ENGINE_NAME.getLexicalForm(), DC_LANGUAGE });
+            }
+            return lang;
+        }
+
+        // no enhancement of the CELI language identifier: fall back to any
+        // dc:language value present in the metadata
+        Iterator<Triple> it = metadata.filter(null, DC_LANGUAGE, null);
+        if (it.hasNext()) {
+            Resource res = it.next().getObject();
+            if (res instanceof Literal) {
+                return ((Literal) res).getLexicalForm();
+            } else {
+                return res.toString();
+            }
+        }
+
+        log.warn("Unable to extract language for ContentItem {}! Is the {} active?", ci.getUri().getUnicodeString(), LANG_ID_ENGINE_NAME.getLexicalForm());
+        return null;
+    }
+
+    /**
+     * Calls the CELI service for the plain text content and writes the
+     * results to the metadata. Failures of the remote call are logged and
+     * do not abort the enhancement chain.
+     *
+     * @param ci the ContentItem to enhance
+     * @throws InvalidContentException if the text of the blob cannot be read
+     * @throws IllegalStateException if no language can be extracted or the
+     *         ContentItem has no <code>text/plain</code> blob
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        String language = extractLanguage(ci);
+        if (language == null) {
+            throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+        }
+
+        Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
+        if (contentPart == null) {
+            throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This "
+                    + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+        }
+        String text;
+        try {
+            text = ContentItemHelper.getText(contentPart.getValue());
+        } catch (IOException e) {
+            throw new InvalidContentException(this, ci, e);
+        }
+        if (text.trim().length() == 0) {
+            log.info("No text contained in ContentPart {} of ContentItem {}", contentPart.getKey(), ci.getUri());
+            return;
+        }
+
+        try {
+            MGraph g = ci.getMetadata();
+            LiteralFactory literalFactory = LiteralFactory.getInstance();
+
+            if (this.completeMorphoAnalysis) {
+                List<LexicalEntry> terms = this.client.performMorfologicalAnalysis(text, language);
+                for (LexicalEntry le : terms) {
+                    UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+                    g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT, literalFactory.createTypedLiteral(le.getWordForm())));
+                    // offset 0 is a valid start (first word of the text)
+                    if (le.from >= 0 && le.to > 0) {
+                        // offsets are written as integer literals, not as strings
+                        g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(Integer.valueOf(le.from))));
+                        g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(Integer.valueOf(le.to))));
+                    }
+                    if (le.termReadings != null) {
+                        for (Reading r : le.termReadings) {
+                            g.add(new TripleImpl(textAnnotation, hasLemmaForm, literalFactory.createTypedLiteral(r.getLemma())));
+                            for (Entry<String, String> feature : r.lexicalFeatures.entrySet()) {
+                                g.add(new TripleImpl(textAnnotation, hasMorphoFeature, literalFactory.createTypedLiteral(feature.getKey() + "=" + feature.getValue())));
+                            }
+                        }
+                    }
+                }
+            } else {
+                String lemmatizedContents = this.client.lemmatizeContents(text, language);
+
+                UriRef textEnhancement = EnhancementEngineHelper.createTextEnhancement(ci, this);
+                g.add(new TripleImpl(textEnhancement, hasLemmaForm, literalFactory.createTypedLiteral(lemmatizedContents)));
+            }
+        } catch (Exception e) {
+            // best effort: a failing lemmatization must not break the chain
+            log.error("Unable to lemmatize the content of ContentItem " + ci.getUri(), e);
+        }
+
+    }
+
+    /** @return whether the parsed language is one of {@link #supportedLangs} */
+    private boolean isLangSupported(String language) {
+        return supportedLangs.contains(language);
+    }
+
+    /**
+     * @return an unmodifiable map holding {@link #defaultOrder} under the
+     *         {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} key
+     */
+    @Override
+    public Map<String, Object> getServiceProperties() {
+        return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+    }
+
+}
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LemmatizerClientHTTP.java Mon May 14 21:07:19 2012
@@ -0,0 +1,187 @@
+package org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl;
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.util.HashSet;
+import java.util.Hashtable;
+import java.util.List;
+import java.util.Vector;
+
+import javax.xml.soap.MessageFactory;
+import javax.xml.soap.SOAPBody;
+import javax.xml.soap.SOAPMessage;
+import javax.xml.soap.SOAPPart;
+import javax.xml.transform.stream.StreamSource;
+
+import org.apache.clerezza.rdf.core.impl.util.Base64;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.lang.StringEscapeUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * Minimal SOAP-over-HTTP client for the CELI morphological analyser web
+ * service. Both analysis operations are best effort: failures are logged
+ * and an empty (or partial) result is returned instead of an exception.
+ */
+public class LemmatizerClientHTTP {
+
+    private final URL serviceEP;
+    private final String licenseKey;
+    private final Logger log = LoggerFactory.getLogger(getClass());
+
+    /**
+     * @param serviceUrl the endpoint all requests are posted to
+     * @param licenseKey the license key sent as HTTP Basic credentials;
+     *        <code>null</code> or empty to send no credentials
+     */
+    public LemmatizerClientHTTP(URL serviceUrl, String licenseKey){
+        this.serviceEP=serviceUrl;
+        this.licenseKey=licenseKey;
+    }
+
+    /**
+     * Posts the parsed body as <code>text/xml</code> to the parsed URL and
+     * returns the response decoded as UTF-8.
+     *
+     * @param url the URL to post to
+     * @param body the request body or <code>null</code> to send none
+     * @return the response body
+     * @throws IOException on any connection, write or read error
+     */
+    public String doPostRequest(URL url, String body) throws IOException {
+        HttpURLConnection urlConn = (HttpURLConnection) url.openConnection();
+        try {
+            urlConn.setRequestMethod("POST");
+            urlConn.setDoInput(true);
+            urlConn.setDoOutput(body != null);
+            urlConn.setUseCaches(false);
+            urlConn.setRequestProperty("Content-Type", "text/xml; charset=utf-8");
+            // no key configured -> no credentials (guest access)
+            if (this.licenseKey != null && this.licenseKey.length() > 0) {
+                String encoded = Base64.encode(this.licenseKey.getBytes("UTF-8"));
+                urlConn.setRequestProperty("Authorization", "Basic " + encoded);
+            }
+            // send POST output
+            if (body != null) {
+                OutputStreamWriter printout = new OutputStreamWriter(urlConn.getOutputStream(), "UTF-8");
+                try {
+                    printout.write(body);
+                    printout.flush();
+                } finally {
+                    printout.close();
+                }
+            }
+
+            // get response data; the stream has to be consumed BEFORE the
+            // connection is released
+            java.io.InputStream in = urlConn.getInputStream();
+            try {
+                return IOUtils.toString(in, "UTF-8");
+            } finally {
+                IOUtils.closeQuietly(in);
+            }
+        } finally {
+            urlConn.disconnect();
+        }
+    }
+
+    /**
+     * Sends the text to the service and returns, for every
+     * <code>LexicalEntry</code> of the response, the word form, its offsets
+     * and all readings (lemma plus lexical features).
+     *
+     * @param text the text to analyse
+     * @param lang the language of the text
+     * @return the parsed entries; empty if the request failed
+     */
+    public List<LexicalEntry> performMorfologicalAnalysis(String text,String lang) {
+        List<LexicalEntry> lista=new Vector<LexicalEntry>();
+        try {
+            NodeList nlist = requestLexicalEntries(text, lang);
+            for (int i = 0; i < nlist.getLength() ; i++) {
+                try {
+                    Element result = (Element) nlist.item(i);
+                    String wordForm = result.getAttribute("WordForm");
+                    int from = Integer.parseInt(result.getAttribute("OffsetFrom"));
+                    int to = Integer.parseInt(result.getAttribute("OffsetTo"));
+                    LexicalEntry le=new LexicalEntry(wordForm, from, to);
+
+                    List<Reading> readings = new Vector<Reading>();
+                    NodeList lemmasList = result.getElementsByTagNameNS("*","Lemma");
+                    for(int j=0;lemmasList!=null && j<lemmasList.getLength();j++){
+                        Element lemmaElm = (Element) lemmasList.item(j);
+                        String lemma = lemmaElm.getTextContent();
+                        // the features are looked up below the parent element of the lemma
+                        NodeList features = ((Element)lemmaElm.getParentNode()).getElementsByTagNameNS("*","LexicalFeature");
+                        Hashtable<String,String> featuresMap=new Hashtable<String,String>();
+                        for(int k=0;features!=null && k<features.getLength();k++){
+                            Element feat = (Element) features.item(k);
+                            featuresMap.put(feat.getAttribute("name"), feat.getTextContent());
+                        }
+                        readings.add(new Reading(lemma, featuresMap));
+                    }
+
+                    le.setTermReadings(readings);
+                    lista.add(le);
+                } catch (RuntimeException e) {
+                    // skip only the malformed entry, keep the others
+                    log.warn("Unable to parse LexicalEntry " + i + " of the morphological analysis response", e);
+                }
+            }
+        } catch (Exception e) {
+            log.error("The morphological analysis request to " + this.serviceEP + " failed", e);
+        }
+
+        return lista;
+    }
+
+    /**
+     * Sends the text to the service and returns a blank separated string
+     * holding, for every <code>LexicalEntry</code> of the response, its
+     * distinct lemmas or - if it has none - its word form.
+     *
+     * @param text the text to lemmatize
+     * @param lang the language of the text
+     * @return the lemmatized text; empty if the request failed
+     */
+    public String lemmatizeContents(String text,String lang) {
+        StringBuilder buff = new StringBuilder();
+        try {
+            NodeList nlist = requestLexicalEntries(text, lang);
+            for (int i = 0; i < nlist.getLength() ; i++) {
+                try {
+                    Element result = (Element) nlist.item(i);
+                    NodeList lemmasList = result.getElementsByTagNameNS("*","Lemma");
+                    if(lemmasList!=null && lemmasList.getLength()>0){
+                        HashSet<String> lemmas=new HashSet<String>();
+                        for(int j=0;j<lemmasList.getLength();j++){
+                            lemmas.add(lemmasList.item(j).getTextContent());
+                        }
+                        for(String lemma: lemmas){
+                            buff.append(lemma).append(' ');
+                        }
+                    } else {
+                        // getAttributeNS has no "*" wildcard and always returned "";
+                        // read the attribute the same way as in performMorfologicalAnalysis
+                        buff.append(result.getAttribute("WordForm")).append(' ');
+                    }
+                } catch (RuntimeException e) {
+                    // skip only the malformed entry, keep the others
+                    log.warn("Unable to parse LexicalEntry " + i + " of the lemmatizer response", e);
+                }
+            }
+        } catch (Exception e) {
+            log.error("The lemmatization request to " + this.serviceEP + " failed", e);
+        }
+
+        return buff.toString().trim();
+    }
+
+    /**
+     * Posts the <code>inputText</code> SOAP request for the parsed text and
+     * language and returns all <code>LexicalEntry</code> elements of the
+     * response body.
+     */
+    private NodeList requestLexicalEntries(String text, String lang) throws IOException, javax.xml.soap.SOAPException {
+        // both values end up in XML attributes and therefore need escaping
+        String txt = StringEscapeUtils.escapeXml(text);
+        String lng = StringEscapeUtils.escapeXml(lang);
+        String xmldata = "<soapenv:Envelope xmlns:soapenv=\"http://schemas.xmlsoap.org/soap/envelope/\" xmlns:mor=\"http://research.celi.it/MorphologicalAnalyzer\"><soapenv:Header/><soapenv:Body>"+
+                "<mor:inputText lang=\""+lng+"\" text=\""+txt+"\"/></soapenv:Body></soapenv:Envelope>";
+
+        String responseXml = doPostRequest(this.serviceEP, xmldata);
+        log.debug("CELI morphological analyser response: {}", responseXml);
+
+        // Create SoapMessage
+        MessageFactory msgFactory = MessageFactory.newInstance();
+        SOAPMessage message = msgFactory.createMessage();
+        SOAPPart soapPart = message.getSOAPPart();
+
+        // Load the SOAP text into a stream source
+        ByteArrayInputStream stream = new ByteArrayInputStream(responseXml.getBytes("UTF-8"));
+        StreamSource source = new StreamSource(stream);
+
+        // Set contents of message
+        soapPart.setContent(source);
+
+        SOAPBody soapBody = message.getSOAPBody();
+        return soapBody.getElementsByTagNameNS("*","LexicalEntry");
+    }
+
+}
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LexicalEntry.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LexicalEntry.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LexicalEntry.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/LexicalEntry.java Mon May 14 21:07:19 2012
@@ -0,0 +1,53 @@
+package org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl;
+
+import java.util.List;
+
+ /**
+  * Simple mutable value holder for one lexical entry: a word form, two
+  * integer positions ({@code from}/{@code to} - presumably the offsets of
+  * the word form within the analysed text, TODO confirm against the
+  * producer of these objects) and an optional list of {@link Reading}s.
+  */
+ public class LexicalEntry {
+
+     /** The word form of this entry. */
+     String wordForm;
+     /** The 'from' position of this entry. */
+     int from;
+     /** The 'to' position of this entry. */
+     int to;
+
+     /** The readings of this entry; {@code null} until explicitly set. */
+     List<Reading> termReadings;
+
+     /**
+      * Creates an entry without any readings.
+      *
+      * @param wordForm the word form
+      * @param from the 'from' position
+      * @param to the 'to' position
+      */
+     public LexicalEntry(String wordForm, int from, int to) {
+         this.wordForm = wordForm;
+         this.from = from;
+         this.to = to;
+     }
+
+     /** @return the word form */
+     public String getWordForm() {
+         return this.wordForm;
+     }
+
+     /** @return the 'from' position */
+     public int getFrom() {
+         return this.from;
+     }
+
+     /** @return the 'to' position */
+     public int getTo() {
+         return this.to;
+     }
+
+     /** @return the readings, or {@code null} if none were set */
+     public List<Reading> getTermReadings() {
+         return this.termReadings;
+     }
+
+     /** @param wordForm the new word form */
+     public void setWordForm(String wordForm) {
+         this.wordForm = wordForm;
+     }
+
+     /** @param from the new 'from' position */
+     public void setFrom(int from) {
+         this.from = from;
+     }
+
+     /** @param to the new 'to' position */
+     public void setTo(int to) {
+         this.to = to;
+     }
+
+     /** @param termReadings the new readings (may be {@code null}) */
+     public void setTermReadings(List<Reading> termReadings) {
+         this.termReadings = termReadings;
+     }
+
+ }
+
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/Reading.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/Reading.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/Reading.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/lemmatizer/impl/Reading.java Mon May 14 21:07:19 2012
@@ -0,0 +1,33 @@
+package org.apache.stanbol.enhancer.engines.celi.lemmatizer.impl;
+
+import java.util.Hashtable;
+
+ /**
+  * Simple mutable value holder pairing a lemma with a table of lexical
+  * features (String keys mapped to String values).
+  */
+ public class Reading {
+
+     /** The lemma of this reading. */
+     String lemma;
+     /** The lexical features of this reading. */
+     Hashtable<String, String> lexicalFeatures;
+
+     /**
+      * Creates a reading. Both arguments are stored as passed in (the
+      * feature table is not copied).
+      *
+      * @param lemma the lemma
+      * @param lexicalFeatures the lexical features
+      */
+     public Reading(String lemma, Hashtable<String, String> lexicalFeatures) {
+         this.lexicalFeatures = lexicalFeatures;
+         this.lemma = lemma;
+     }
+
+     /** @return the lemma */
+     public String getLemma() {
+         return this.lemma;
+     }
+
+     /** @return the lexical features (the stored instance, not a copy) */
+     public Hashtable<String, String> getLexicalFeatures() {
+         return this.lexicalFeatures;
+     }
+
+     /** @param lemma the new lemma */
+     public void setLemma(String lemma) {
+         this.lemma = lemma;
+     }
+
+     /** @param lexicalFeatures the new lexical features */
+     public void setLexicalFeatures(Hashtable<String, String> lexicalFeatures) {
+         this.lexicalFeatures = lexicalFeatures;
+     }
+
+ }
Added: incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java?rev=1338425&view=auto
==============================================================================
--- incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java (added)
+++ incubator/stanbol/branches/celi-enhancement-engines/engines/celi/src/main/java/org/apache/stanbol/enhancer/engines/celi/ner/impl/CeliNamedEntityExtractionEnhancementEngine.java Mon May 14 21:07:19 2012
@@ -0,0 +1,345 @@
+package org.apache.stanbol.enhancer.engines.celi.ner.impl;
+
+import static org.apache.stanbol.enhancer.servicesapi.rdf.Properties.*;
+
+import java.io.IOException;
+import java.net.URL;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Dictionary;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.Map.Entry;
+import java.util.Vector;
+
+import org.apache.clerezza.rdf.core.Language;
+import org.apache.clerezza.rdf.core.Literal;
+import org.apache.clerezza.rdf.core.LiteralFactory;
+import org.apache.clerezza.rdf.core.MGraph;
+import org.apache.clerezza.rdf.core.NoConvertorException;
+import org.apache.clerezza.rdf.core.Resource;
+import org.apache.clerezza.rdf.core.Triple;
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.clerezza.rdf.core.impl.PlainLiteralImpl;
+import org.apache.clerezza.rdf.core.impl.TripleImpl;
+import org.apache.felix.scr.annotations.Activate;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Deactivate;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Reference;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.commons.stanboltools.offline.OnlineMode;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.InvalidContentException;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.impl.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper;
+import org.apache.stanbol.enhancer.servicesapi.rdf.OntologicalClasses;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+@Component(immediate = true, metatype = true)
+@Service
+@Properties(value = { @Property(name = EnhancementEngine.PROPERTY_NAME, value = "celiNer") })
+public class CeliNamedEntityExtractionEnhancementEngine extends AbstractEnhancementEngine<IOException, RuntimeException> implements EnhancementEngine, ServiceProperties {
+
+ /**
+ * This ensures that no connections to external services are made if Stanbol is started in offline mode
+ * as the OnlineMode service will only be available if OfflineMode is deactivated.
+ */
+ @SuppressWarnings("unused")
+ @Reference
+ private OnlineMode onlineMode;
+
+ /**
+ * The literal representing the LangIDEngine as creator.
+ */
+ public static final Literal LANG_ID_ENGINE_NAME = LiteralFactory.getInstance().createTypedLiteral("org.apache.stanbol.enhancer.engines.celi.langid.impl.CeliLanguageIdentifierEnhancementEngine");
+
+ private static Map<String, UriRef> entityTypes = new HashMap<String, UriRef>();
+ static {
+ entityTypes.put("pers", OntologicalClasses.DBPEDIA_PERSON);
+ entityTypes.put("loc", OntologicalClasses.DBPEDIA_PLACE);
+ entityTypes.put("org", OntologicalClasses.DBPEDIA_ORGANISATION);
+
+ entityTypes.put("time", OntologicalClasses.SKOS_CONCEPT);
+ entityTypes.put("prod", OntologicalClasses.SKOS_CONCEPT);
+ entityTypes.put("amount", OntologicalClasses.SKOS_CONCEPT);
+ }
+ /**
+ * The supported languages (configured via the {@link #SUPPORTED_LANGUAGES}
+ * configuration property).
+ */
+ private Collection<String> supportedLangs;
+
+ /**
+ * The default value for the Execution of this Engine. Currently set to
+ * {@link ServiceProperties#ORDERING_CONTENT_EXTRACTION}
+ */
+ public static final Integer defaultOrder = ORDERING_CONTENT_EXTRACTION;
+
+ private Logger log = LoggerFactory.getLogger(getClass());
+
+ /**
+ * This contains the only MIME type directly supported by this enhancement
+ * engine.
+ */
+ private static final String TEXT_PLAIN_MIMETYPE = "text/plain";
+
+ /**
+ * Set containing the only supported mime type {@link #TEXT_PLAIN_MIMETYPE}
+ */
+ private static final Set<String> SUPPORTED_MIMTYPES = Collections.singleton(TEXT_PLAIN_MIMETYPE);
+
+ @Property
+ public static final String LICENSE_KEY = "org.apache.stanbol.enhancer.engines.celi.ner.license";
+
+
+ @Property(value = "http://linguagrid.org/LSGrid/ws/com.celi-france.linguagrid.namedentityrecognition.v0u0.demo")
+ public static final String SERVICE_URL = "org.apache.stanbol.enhancer.engines.celi.ner.url";
+
+ @Property(value = "fr",cardinality=1000)
+ public static final String SUPPORTED_LANGUAGES = "org.apache.stanbol.enhancer.engines.celi.ner.languages";
+
+ private String licenseKey;
+ private URL serviceURL;
+
+ private NERserviceClientHTTP client;
+
+ /**
+  * Activates this engine from its component configuration.
+  * <p>
+  * Reads the optional license key ({@link #LICENSE_KEY}), the mandatory
+  * service URL ({@link #SERVICE_URL}) and the supported languages
+  * ({@link #SUPPORTED_LANGUAGES}) and creates the HTTP client for the
+  * NER service. The languages may be configured as a single String
+  * (multiple values separated by ';'), as an array or as an
+  * {@link Iterable}.
+  *
+  * @param ctx the component context providing the configuration
+  * @throws IOException if the configured service URL is malformed (or if
+  *         thrown by the super class)
+  * @throws ConfigurationException if the service URL is missing or if no
+  *         valid supported language is configured
+  */
+ @Override
+ @Activate
+ protected void activate(ComponentContext ctx) throws IOException, ConfigurationException {
+     super.activate(ctx);
+     @SuppressWarnings("unchecked")
+     Dictionary<String, Object> properties = ctx.getProperties();
+
+     this.licenseKey = (String) properties.get(LICENSE_KEY);
+     if (licenseKey == null || licenseKey.isEmpty()) {
+         log.warn("no CELI license key configured for this Engine, a guest account will be used (max 100 requests per day). Go on http://linguagrid.org for getting a proper license key.");
+     }
+     String url = (String) properties.get(SERVICE_URL);
+     if (url == null || url.isEmpty()) {
+         throw new ConfigurationException(SERVICE_URL, String.format("%s : please configure the URL of the CELI Web Service (e.g. by using the 'Configuration' tab of the Apache Felix Web Console).", getClass().getSimpleName()));
+     }
+     this.serviceURL = new URL(url);
+
+     this.client = new NERserviceClientHTTP(this.serviceURL, this.licenseKey);
+
+     //init the supported languages (now configurable)
+     Object languagObject = properties.get(SUPPORTED_LANGUAGES);
+     Set<String> languages = parseLanguages(languagObject);
+     if (languages.isEmpty()) {
+         throw new ConfigurationException(SUPPORTED_LANGUAGES, String.format(
+             "Missing or invalid configuration of the supported languages (config :'%s')",
+             languagObject instanceof Object[] ?
+                 Arrays.toString((Object[]) languagObject) : //nicer logging for arrays
+                 languagObject));
+     }
+     this.supportedLangs = Collections.unmodifiableSet(languages);
+ }
+
+ /**
+  * Parses the configured value of the supported languages.
+  * <p>
+  * Accepts a String (values separated by ';'), an Object array or an
+  * {@link Iterable}. <code>null</code> and empty elements are logged and
+  * ignored.
+  *
+  * @param config the raw configuration value (may be <code>null</code>)
+  * @return the parsed languages; empty if the value is missing, of an
+  *         unsupported type or contains no valid element
+  */
+ private Set<String> parseLanguages(Object config) {
+     Iterable<?> values;
+     if (config instanceof String) {
+         //support splitting multiple languages with ';'
+         values = Arrays.asList(((String) config).split(";"));
+     } else if (config instanceof Object[]) {
+         //multi valued properties may be provided as array
+         values = Arrays.asList((Object[]) config);
+     } else if (config instanceof Iterable<?>) {
+         values = (Iterable<?>) config;
+     } else {
+         return Collections.<String>emptySet();
+     }
+     Set<String> languages = new HashSet<String>();
+     for (Object value : values) {
+         if (value != null && !value.toString().isEmpty()) {
+             languages.add(value.toString());
+         } else {
+             log.warn("Language configuration '{}' contained illegal value '{}' -> removed",
+                 config instanceof Object[] ?
+                     Arrays.toString((Object[]) config) : //nicer logging for arrays
+                     config, value);
+         }
+     }
+     return languages;
+ }
+
+ /**
+  * Deactivates this engine: delegates to the super class and then drops
+  * the references to the service URL, the client and the supported
+  * languages that were set on activation.
+  *
+  * @param ce the component context
+  */
+ @Override
+ @Deactivate
+ protected void deactivate(ComponentContext ce) {
+     super.deactivate(ce);
+     // release everything created in activate(..)
+     this.serviceURL = null;
+     this.client = null;
+     this.supportedLangs = null;
+ }
+
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ String language = extractLanguage(ci);
+ if (language == null) {
+ log.info("Unable to extract language annotation for ContentItem -> will not enhance",
+ ci.getUri());
+ return CANNOT_ENHANCE;
+ } else if(isLangSupported(language)){
+ log.debug("Language '{}' of contentItem {} is not supported (supported: {}) -> will not enhance",
+ new Object[]{language,ci.getUri(),supportedLangs});
+ return CANNOT_ENHANCE;
+ }
+
+ if (ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES) != null)
+ return ENHANCE_ASYNC;
+ else
+ log.debug("No Content of type {} found in ConentItem {} -> will not enhance",
+ SUPPORTED_MIMTYPES,ci.getUri());
+ return CANNOT_ENHANCE;
+ }
+
+ /**
+  * Extracts the language of the parsed ContentItem from the metadata.
+  * <p>
+  * The language is first looked up on an enhancement created by the
+  * language identification engine ({@link #LANG_ID_ENGINE_NAME}). If no
+  * such enhancement exists, the object of the first dc:language triple
+  * found in the metadata is used.
+  *
+  * @param ci
+  *            the content item
+  * @return the language or <code>null</code> if no language could be
+  *         extracted
+  */
+ private String extractLanguage(ContentItem ci) {
+     MGraph metadata = ci.getMetadata();
+     Iterator<Triple> creatorTriples = metadata.filter(null, DC_CREATOR, LANG_ID_ENGINE_NAME);
+     if (creatorTriples.hasNext()) {
+         String lang = EnhancementEngineHelper.getString(metadata, creatorTriples.next().getSubject(), DC_LANGUAGE);
+         if (lang == null) {
+             log.info("Unable to extract language for ContentItem {}! The Enhancement of the {} is missing the {} property",
+                 new Object[]{ci.getUri().getUnicodeString(), LANG_ID_ENGINE_NAME.getLexicalForm(), DC_LANGUAGE});
+         }
+         return lang;
+     }
+     // fallback: use any dc:language value present in the metadata
+     Iterator<Triple> it = metadata.filter(null, DC_LANGUAGE, null);
+     if (it.hasNext()) {
+         Resource res = it.next().getObject();
+         if (res instanceof Literal) {
+             return ((Literal) res).getLexicalForm();
+         }
+         return res.toString();
+     }
+     log.warn("Unable to extract language for ContentItem {}! Is the {} active?",
+         ci.getUri().getUnicodeString(), LANG_ID_ENGINE_NAME.getLexicalForm());
+     return null;
+ }
+
+ /**
+  * Sends the plain text content of the ContentItem to the NER service
+  * and adds a TextAnnotation to the metadata for every returned entity.
+  * <p>
+  * Each annotation gets the selected text, the dc:type mapped from the
+  * entity type (omitted if the type is unknown) and - if the entity
+  * provides positions - start, end and selection context.
+  * Errors while calling the service or creating the annotations are
+  * logged and not propagated.
+  *
+  * @param ci the content item to enhance
+  * @throws EngineException (as {@link InvalidContentException}) if the
+  *         text of the content part can not be read
+  * @throws IllegalStateException if no supported content part or no
+  *         language is available for the ContentItem
+  */
+ @Override
+ public void computeEnhancements(ContentItem ci) throws EngineException {
+     Entry<UriRef, Blob> contentPart = ContentItemHelper.getBlob(ci, SUPPORTED_MIMTYPES);
+     if (contentPart == null) {
+         throw new IllegalStateException("No ContentPart with Mimetype '" + TEXT_PLAIN_MIMETYPE + "' found for ContentItem " + ci.getUri() + ": This is also checked in the canEnhance method! -> This "
+             + "indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+     }
+     String text;
+     try {
+         text = ContentItemHelper.getText(contentPart.getValue());
+     } catch (IOException e) {
+         throw new InvalidContentException(this, ci, e);
+     }
+     if (text.trim().length() == 0) {
+         log.info("No text contained in ContentPart {" + contentPart.getKey() + "} of ContentItem {" + ci.getUri() + "}");
+         return;
+     }
+     String language = extractLanguage(ci);
+     if (language == null) {
+         throw new IllegalStateException("Unable to extract Language for " + "ContentItem " + ci.getUri() + ": This is also checked in the canEnhance " + "method! -> This indicated an Bug in the implementation of the " + "EnhancementJobManager!");
+     }
+     Language lang = new Language(language); //used for the plain literals in TextAnnotations
+     try {
+         List<NamedEntity> lista = this.client.extractEntities(text);
+         LiteralFactory literalFactory = LiteralFactory.getInstance();
+
+         MGraph g = ci.getMetadata();
+
+         for (NamedEntity ne : lista) {
+             try {
+                 UriRef textAnnotation = EnhancementEngineHelper.createTextEnhancement(ci, this);
+                 //add selected text as PlainLiteral in the language extracted from the text
+                 g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTED_TEXT,
+                     new PlainLiteralImpl(ne.getFormKind(),lang)));
+                 //a triple must not have a null object: only add the
+                 //dc:type if the entity type is mapped
+                 Resource entityType = getEntityRefForType(ne.type);
+                 if (entityType != null) {
+                     g.add(new TripleImpl(textAnnotation, DC_TYPE, entityType));
+                 } else {
+                     log.debug("No mapping for entity type '{}' -> no dc:type added", ne.type);
+                 }
+                 if (ne.getFrom() != null && ne.getTo() != null) {
+                     g.add(new TripleImpl(textAnnotation, ENHANCER_START, literalFactory.createTypedLiteral(
+                         ne.getFrom().intValue())));
+                     g.add(new TripleImpl(textAnnotation, ENHANCER_END, literalFactory.createTypedLiteral(
+                         ne.getTo().intValue())));
+                     g.add(new TripleImpl(textAnnotation, ENHANCER_SELECTION_CONTEXT,
+                         new PlainLiteralImpl(getSelectionContext(text, ne.getFormKind(), ne.getFrom().intValue()), lang)));
+                 }
+             } catch (NoConvertorException e) {
+                 log.error(e.getMessage(), e);
+             }
+         }
+     } catch (Exception e) {
+         //best effort: failures of the remote service are only logged
+         log.error(e.getMessage(), e);
+     }
+
+ }
+ /**
+ * The maximum size of the prefix/suffix for the selection context
+ */
+ private static final int SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE = 50;
+ /**
+ * Extracts the selection context based on the content, selection and
+ * the start char offset of the selection
+ * @param content
+ * @param selection
+ * @param current
+ * @return
+ */
+ private String getSelectionContext(String content, String selection,int current){
+ //extract the selection context
+ int beginPos;
+ if(current <= SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE){
+ beginPos = 0;
+ } else {
+ int start = current-SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
+ beginPos = content.indexOf(' ',start);
+ if(beginPos < 0 || beginPos >= current){ //no words
+ beginPos = start; //begin within a word
+ }
+ }
+ int endPos;
+ if(current+selection.length()+SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE >= content.length()){
+ endPos = content.length();
+ } else {
+ int start = current+selection.length()+SELECTION_CONTEXT_PREFIX_SUFFIX_SIZE;
+ endPos = content.lastIndexOf(' ', start);
+ if(endPos <= current+selection.length()){
+ endPos = start; //end within a word;
+ }
+ }
+ return content.substring(beginPos, endPos);
+ }
+
+
+ private boolean isLangSupported(String language) {
+ return supportedLangs.contains(language);
+ }
+
+ /**
+  * Looks up the resource registered for the parsed entity type.
+  *
+  * @param type the entity type
+  * @return the mapped resource or <code>null</code> if the type is not
+  *         mapped
+  */
+ private Resource getEntityRefForType(String type) {
+     // Map#get(..) already returns null for keys that are not mapped
+     return entityTypes.get(type);
+ }
+
+ @Override
+ public Map<String, Object> getServiceProperties() {
+ return Collections.unmodifiableMap(Collections.singletonMap(ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+ }
+
+}