You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/02/28 16:36:06 UTC
svn commit: r1294695 - in /incubator/stanbol/trunk:
enhancer/bundlelist/src/main/bundles/ enhancer/engines/
enhancer/engines/langid/ enhancer/engines/tika/ enhancer/engines/tika/src/
enhancer/engines/tika/src/license/ enhancer/engines/tika/src/main/ en...
Author: rwesten
Date: Tue Feb 28 15:36:04 2012
New Revision: 1294695
URL: http://svn.apache.org/viewvc?rev=1294695&view=rev
Log:
STANBOL-512: First version of the Apache Stanbol Enhancement Engine for Apache Tika
Supports:
* mime type detection (if none or "application/octet-stream")
* plain text extraction
* xhtml extraction
missing:
* processing of the extracted metadata
Added:
incubator/stanbol/trunk/enhancer/engines/tika/ (with props)
incubator/stanbol/trunk/enhancer/engines/tika/pom.xml (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/
incubator/stanbol/trunk/enhancer/engines/tika/src/license/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/main/resources/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/resources/OSGI-INF/
incubator/stanbol/trunk/enhancer/engines/tika/src/main/resources/OSGI-INF/metatype/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.doc (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.odt (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pages (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pdf (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html (with props)
incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.pdf (with props)
Modified:
incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml
incubator/stanbol/trunk/enhancer/engines/langid/pom.xml
incubator/stanbol/trunk/enhancer/engines/pom.xml
incubator/stanbol/trunk/parent/pom.xml
Modified: incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml?rev=1294695&r1=1294694&r2=1294695&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml (original)
+++ incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml Tue Feb 28 15:36:04 2012
@@ -17,6 +17,21 @@
-->
<bundles>
+ <!-- *********************************************************************
+ start level 10 TO 19 reserved for required libraries (internal and external)
+ ********************************************************************* -->
+ <startLevel level="17">
+ <bundle> <!-- Apache Tika core (required by the LangId and TikaEngine) -->
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-core</artifactId>
+ <version>1.0</version>
+ </bundle>
+ <bundle> <!-- Apache Tika bundle (required by the TikaEngine) -->
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-bundle</artifactId>
+ <version>1.0</version>
+ </bundle>
+ </startLevel>
<!-- Stanbol Enhancer infrastructure and required libraries -->
<startLevel level="20">
<bundle>
@@ -91,46 +106,62 @@
<!-- Stanbol Enhancer plug-ins (the Enhancement Engines) -->
<startLevel level="25">
- <bundle>
+
+ <!-- language identification -->
+
+ <bundle>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.engines.langid</artifactId>
<version>0.9.0-incubating-SNAPSHOT</version>
</bundle>
- <bundle>
+
+ <!-- Content conversion (2 engines)-->
+
+ <bundle> <!-- Metaxa -->
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.engines.metaxa</artifactId>
<version>0.9.0-incubating-SNAPSHOT</version>
</bundle>
- <bundle>
+ <bundle><!-- Apache Tika Engine -->
<groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.opencalais</artifactId>
+ <artifactId>org.apache.stanbol.enhancer.engines.tika</artifactId>
<version>0.9.0-incubating-SNAPSHOT</version>
</bundle>
- <bundle>
+
+ <!-- Named Entity Recognition (NER)-->
+
+ <bundle> <!-- Open NLP based NER -->
<groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.zemanta</artifactId>
+ <artifactId>org.apache.stanbol.enhancer.engines.opennlp.ner</artifactId>
<version>0.9.0-incubating-SNAPSHOT</version>
</bundle>
- <bundle>
+ <!-- Entity Extraction/Linking -->
+ <bundle><!-- NER linking (depends on the Entityhub) -->
<groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.opennlp.ner</artifactId>
+ <artifactId>org.apache.stanbol.enhancer.engine.entitytagging</artifactId>
<version>0.9.0-incubating-SNAPSHOT</version>
</bundle>
- <bundle>
+ <bundle><!-- Keyword Extraction from Text (depends on the Entityhub) -->
<groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engines.geonames</artifactId>
+ <artifactId>org.apache.stanbol.enhancer.engine.keywordextraction</artifactId>
<version>0.9.0-incubating-SNAPSHOT</version>
</bundle>
- <!-- Entity Tagging Engine (depends on the Entityhub) -->
- <bundle>
+
+ <!-- External Service Integration -->
+
+ <bundle> <!-- http://www.opencalais.com/ -->
<groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engine.entitytagging</artifactId>
+ <artifactId>org.apache.stanbol.enhancer.engines.opencalais</artifactId>
<version>0.9.0-incubating-SNAPSHOT</version>
</bundle>
- <!-- Keyword Extraction Engine (depends on the Entityhub) -->
- <bundle>
+ <bundle> <!-- http://www.zemanta.com/ -->
<groupId>org.apache.stanbol</groupId>
- <artifactId>org.apache.stanbol.enhancer.engine.keywordextraction</artifactId>
+ <artifactId>org.apache.stanbol.enhancer.engines.zemanta</artifactId>
+ <version>0.9.0-incubating-SNAPSHOT</version>
+ </bundle>
+ <bundle> <!-- http://www.geonames.org/ -->
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.geonames</artifactId>
<version>0.9.0-incubating-SNAPSHOT</version>
</bundle>
</startLevel>
Modified: incubator/stanbol/trunk/enhancer/engines/langid/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/pom.xml?rev=1294695&r1=1294694&r2=1294695&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/pom.xml Tue Feb 28 15:36:04 2012
@@ -61,7 +61,6 @@
org.apache.stanbol.enhancer.engines.langid;version=${project.version}
</Export-Package>
<Embed-Dependency>
- textcat
</Embed-Dependency>
</instructions>
</configuration>
@@ -91,8 +90,6 @@
<dependency>
<groupId>org.apache.stanbol</groupId>
<artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
- <version>0.9.0-incubating-SNAPSHOT</version>
- <scope>provided</scope>
</dependency>
<dependency>
Modified: incubator/stanbol/trunk/enhancer/engines/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/pom.xml?rev=1294695&r1=1294694&r2=1294695&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/pom.xml Tue Feb 28 15:36:04 2012
@@ -46,11 +46,11 @@
<module>langid</module>
<module>topic</module>
<module>metaxa</module>
+ <module>tika</module>
<module>geonames</module>
<module>entitytagging</module>
<module>keywordextraction</module>
- <!-- Entityhub based enhancement engine(s) -->
<module>opencalais</module>
<module>zemanta</module>
</modules>
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Tue Feb 28 15:36:04 2012
@@ -0,0 +1,7 @@
+.classpath
+
+.project
+
+.settings
+
+target
Added: incubator/stanbol/trunk/enhancer/engines/tika/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/pom.xml?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/pom.xml (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/pom.xml Tue Feb 28 15:36:04 2012
@@ -0,0 +1,133 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent>
+ <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+ <groupId>org.apache.stanbol</groupId>
+ <version>0.9.0-incubating-SNAPSHOT</version>
+ <relativePath>../../parent</relativePath>
+ </parent>
+
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.engines.tika</artifactId>
+ <packaging>bundle</packaging>
+
+ <name>Apache Stanbol Enhancer Enhancement Engine : Apache Tika </name>
+ <description>Enhancement Engine that uses Apache Tika to convert parsed
+ documents to plain text and xhtml
+ </description>
+
+ <inceptionYear>2012</inceptionYear>
+
+ <scm>
+ <connection>
+ scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/tika/
+ </connection>
+ <developerConnection>
+ scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/tika/
+ </developerConnection>
+ <url>http://incubator.apache.org/stanbol/</url>
+ </scm>
+
+ <build>
+ <plugins>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-bundle-plugin</artifactId>
+ <extensions>true</extensions>
+ <configuration>
+ <instructions>
+ <Export-Package>
+ org.apache.stanbol.enhancer.engines.tika;version=${project.version}
+ </Export-Package>
+ </instructions>
+ </configuration>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>maven-scr-plugin</artifactId>
+ </plugin>
+ <plugin>
+ <groupId>org.apache.rat</groupId>
+ <artifactId>apache-rat-plugin</artifactId>
+ <configuration>
+ <excludes>
+ <!-- AL20 License -->
+ <exclude>src/license/THIRD-PARTY.properties</exclude>
+ <!-- AL20 License for test resources (see src/test/resources/README) -->
+ <exclude>src/test/resources/test.doc</exclude>
+ <exclude>src/test/resources/test.html</exclude>
+ <exclude>src/test/resources/test.xhtml</exclude>
+ <exclude>src/test/resources/test.odt</exclude>
+ <exclude>src/test/resources/test.pages</exclude>
+ <exclude>src/test/resources/test.pdf</exclude>
+ <exclude>src/test/resources/test.rtf</exclude>
+ <exclude>src/test/resources/test2.html</exclude>
+ <exclude>src/test/resources/test2.pdf</exclude>
+ </excludes>
+ </configuration>
+ </plugin>
+ </plugins>
+ </build>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies>
+ <dependency>
+ <groupId>org.apache.stanbol</groupId>
+ <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+ <version>0.9.0-incubating-SNAPSHOT</version>
+ <scope>provided</scope>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>org.apache.felix</groupId>
+ <artifactId>org.apache.felix.scr.annotations</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.clerezza</groupId>
+ <artifactId>rdf.core</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>commons-io</groupId>
+ <artifactId>commons-io</artifactId>
+ </dependency>
+ <dependency>
+ <groupId>org.slf4j</groupId>
+ <artifactId>slf4j-api</artifactId>
+ </dependency>
+
+ <dependency>
+ <groupId>junit</groupId>
+ <artifactId>junit</artifactId>
+ <scope>test</scope>
+ </dependency>
+ </dependencies>
+
+</project>
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/pom.xml
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,222 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.tika;
+
+import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.randomUUID;
+import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.commons.io.IOUtils;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.engines.tika.handler.MultiHandler;
+import org.apache.stanbol.enhancer.engines.tika.handler.PlainTextHandler;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.InMemoryBlob;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+
+/**
+ * EnhancementEngine based on Apache Tika that converts the content of parsed
+ * content items to xhtml and plain text. Converting the extracted metadata
+ * to RDF and adding them to the {@link ContentItem#getMetadata()} is planned
+ * but not yet implemented.
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+@Component(immediate = true, metatype = true, inherit=true)
+@Service
+@Properties(value={
+ @Property(name=EnhancementEngine.PROPERTY_NAME,value="tika")
+})
+public class TikaEngine
+ extends AbstractEnhancementEngine<RuntimeException,RuntimeException>
+ implements EnhancementEngine, ServiceProperties {
+ private final Logger log = LoggerFactory.getLogger(TikaEngine.class);
+ /**
+ * The default value for the Execution of this Engine. Currently set to
+ * {@link ServiceProperties#ORDERING_PRE_PROCESSING}
+ */
+ public static final Integer defaultOrder = ORDERING_PRE_PROCESSING;
+
+ protected static MediaType XHTML = new MediaType("application", "xhtml+xml");
+
+ private TikaConfig config;
+ private Parser parser;
+ private Detector detector;
+
+ private static class MediaTypeAndStream {
+ MediaType mediaType;
+ InputStream in;
+ }
+
+ /**
+ * Always returns {@code ENHANCE_ASYNC}, regardless of the parsed
+ * ContentItem. No check of the content type is performed here.
+ * @param ci the content item (not inspected by this method)
+ * @return always {@code ENHANCE_ASYNC}
+ */
+ @Override
+ public int canEnhance(ContentItem ci) throws EngineException {
+ return ENHANCE_ASYNC;
+ }
+
+    /**
+     * Converts the content of the parsed ContentItem to plain text and - if
+     * the content is not already XHTML - also to XHTML by using the Tika
+     * parser. The results are added as additional parts
+     * (<code>urn:tika:text:{uuid}</code> and <code>urn:tika:xhtml:{uuid}</code>)
+     * to the ContentItem.<p>
+     * Nothing is done if the media type can not be determined, if the content
+     * is already plain text or if the media type is not supported by the parser.<p>
+     * The stream read from the ContentItem (including a stream that was
+     * already opened for detecting the media type) is closed on every exit
+     * path of this method.
+     * @param ci the content item to process
+     * @throws EngineException if Tika fails to parse the content
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        MediaTypeAndStream mtas = extractMediaType(ci);
+        //only != null if the stream was already opened to detect the media type
+        InputStream in = mtas.in;
+        try {
+            if(mtas.mediaType == null){
+                return; //unable to parse and detect content type
+            }
+            MediaType plainMediaType = mtas.mediaType.getBaseType();
+            if(plainMediaType.equals(MediaType.TEXT_PLAIN)){
+                return; //we need not to process plain text!
+            }
+            ParseContext context = new ParseContext();
+            context.set(Parser.class,parser);
+            if(!parser.getSupportedTypes(context).contains(plainMediaType)) {
+                return; //not supported format
+            }
+            if(in == null){ //no detection was needed -> open the stream now
+                in = ci.getStream();
+            }
+            Metadata metadata = new Metadata();
+            //set the already parsed contentType
+            metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
+            final StringWriter writer = new StringWriter();
+            final ContentHandler textHandler = new BodyContentHandler( //only the Body
+                new PlainTextHandler(writer, true,false)); //skip ignorable
+            final ToXMLContentHandler xhtmlHandler;
+            final ContentHandler mainHandler;
+            if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
+                xhtmlHandler = new ToXMLContentHandler();
+                mainHandler = new MultiHandler(textHandler,xhtmlHandler);
+            } else {
+                mainHandler = textHandler;
+                xhtmlHandler = null;
+            }
+            try {
+                parser.parse(in, mainHandler, metadata, context);
+            } catch (Exception e) {
+                throw new EngineException("Unable to convert ContentItem "+
+                    ci.getUri()+" with mimeType '"+ci.getMimeType()+"' to "+
+                    "plain text!",e);
+            }
+            //both parts share the same random id
+            String random = randomUUID().toString();
+            UriRef textBlobUri = new UriRef("urn:tika:text:"+random);
+            ci.addPart(textBlobUri,
+                new InMemoryBlob(writer.toString(),
+                    TEXT_PLAIN.toString())); //string -> no encoding
+            if(xhtmlHandler != null){
+                UriRef xhtmlBlobUri = new UriRef("urn:tika:xhtml:"+random);
+                ci.addPart(xhtmlBlobUri,
+                    new InMemoryBlob(xhtmlHandler.toString(),
+                        "application/xhtml+xml")); //string -> no encoding
+            }
+            //TODO:
+            // * add also the Metadata extracted by Apache Tika
+        } finally {
+            //null-safe; also covers the early returns and parse failures
+            IOUtils.closeQuietly(in);
+        }
+    }
+    /**
+     * Getter for the contentType. If not set or {@link MediaType#OCTET_STREAM}
+     * than the media type is detected.<p>
+     * This method returns the MediaType and the Stream used to detect the
+     * MimeType. This allows to reuse the stream and the mediaType.<p>
+     * If the detection fails with an {@link IOException} the stream is closed,
+     * the returned stream is <code>null</code> and the media type is left as
+     * determined from the Blob.
+     * @param ci the content item
+     * @return the media type (may be <code>null</code>) and the stream opened
+     * for the detection (<code>null</code> if no detection was needed or the
+     * detection failed). Callers are responsible for closing a returned stream.
+     */
+    private MediaTypeAndStream extractMediaType(ContentItem ci) {
+        MediaTypeAndStream mtas = new MediaTypeAndStream();
+        mtas.mediaType = getMediaType(ci.getBlob());
+        if(mtas.mediaType == null || mtas.mediaType.equals(MediaType.OCTET_STREAM)){
+            //wrapped in a BufferedInputStream so that the same stream can be
+            //handed back to the caller after detection
+            mtas.in = new BufferedInputStream(ci.getStream());
+            try {
+                mtas.mediaType = detector.detect(mtas.in, new Metadata());
+            } catch (IOException e) {
+                log.warn("Exception while detecting the MediaType of the " +
+                    "parsed ContentItem "+ci.getUri(),e);
+                IOUtils.closeQuietly(mtas.in);
+                mtas.in = null;
+            }
+        }
+        return mtas;
+    }
+
+ /**
+ * Creates the {@link MediaType} for the parsed Blob based on its mime type
+ * (split at '/' into type and sub-type) and its parameters.
+ * @param blob the blob
+ * @return the media type or <code>null</code> if the mime type of the blob
+ * does not consist of exactly two '/' separated parts (a warning is logged
+ * in that case)
+ */
+ private MediaType getMediaType(Blob blob) {
+ String[] mediaTypeArray = blob.getMimeType().split("/");
+ if(mediaTypeArray.length != 2){
+ log.warn("Encounterd illegal formatted mediaType '{}' -> will try " +
+ "to detect the mediaType based on the parsed content!",
+ blob.getMimeType());
+ return null;
+ } else {
+ return new MediaType(mediaTypeArray[0], mediaTypeArray[1],
+ blob.getParameter());
+ }
+ }
+ @Override
+ protected void activate(ComponentContext ctx) throws ConfigurationException {
+ super.activate(ctx);
+ config = TikaConfig.getDefaultConfig();
+ this.detector = config.getDetector();
+ this.parser = new AutoDetectParser(config);
+ }
+    /**
+     * Deactivates the engine: releases the Tika detector, parser and
+     * configuration before the super class is deactivated.
+     * @param ctx the component context
+     */
+    @Override
+    protected void deactivate(ComponentContext ctx) throws RuntimeException {
+        detector = null;
+        parser = null;
+        config = null;
+        super.deactivate(ctx);
+    }
+
+    /**
+     * Getter for the service properties of this engine.
+     * @return an unmodifiable map with the single entry
+     * <code>ENHANCEMENT_ENGINE_ORDERING</code> -&gt; {@link #defaultOrder}
+     */
+    public Map<String, Object> getServiceProperties() {
+        final Map<String, Object> ordering = Collections.singletonMap(
+            ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder);
+        return Collections.unmodifiableMap(ordering);
+    }
+
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,127 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.tika.handler;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * Similar to {@link ContentHandlerDecorator} - as it processes the exact same
+ * methods - but supports forwarding such calls to several parsed {@link ContentHandler}s.
+ * <p>
+ * Each event is forwarded to the handlers in the order they were parsed to
+ * the constructor. If a handler throws a {@link SAXException} the event is
+ * not forwarded to the remaining handlers and the exception is propagated.
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class MultiHandler extends DefaultHandler {
+
+    /** The handlers all SAX events are forwarded to (never empty, no null entries) */
+    List<ContentHandler> handlers;
+
+    /**
+     * Creates a handler that forwards all SAX events to the parsed handlers.
+     * @param handlers the handlers to forward the events to
+     * @throws IllegalArgumentException if the parsed array is <code>null</code>,
+     * empty or contains a <code>null</code> entry
+     */
+    public MultiHandler(ContentHandler...handlers) {
+        if(handlers == null || handlers.length < 1){
+            throw new IllegalArgumentException("The parsed ContentHandler array MUST NOT be NULL or empty!");
+        }
+        //copy the array so that later changes by the caller can not bypass
+        //the validation below
+        this.handlers = Arrays.asList(handlers.clone());
+        if(this.handlers.contains(null)){
+            throw new IllegalArgumentException("The parsed ContentHandlers array MUST NOT contain an NULL entry!");
+        }
+    }
+
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.startPrefixMapping(prefix, uri);
+        }
+    }
+
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.endPrefixMapping(prefix);
+        }
+    }
+
+    @Override
+    public void processingInstruction(String target, String data) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.processingInstruction(target, data);
+        }
+    }
+
+    @Override
+    public void setDocumentLocator(Locator locator) {
+        for(ContentHandler handler : handlers){
+            handler.setDocumentLocator(locator);
+        }
+    }
+
+    @Override
+    public void startDocument() throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.startDocument();
+        }
+    }
+
+    @Override
+    public void endDocument() throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.endDocument();
+        }
+    }
+
+    @Override
+    public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.startElement(uri, localName, name, atts);
+        }
+    }
+
+    @Override
+    public void endElement(String uri, String localName, String name) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.endElement(uri, localName, name);
+        }
+    }
+
+    /**
+     * Forwards the same <code>char[]</code> instance to all handlers.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.characters(ch, start, length);
+        }
+    }
+
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.ignorableWhitespace(ch, start, length);
+        }
+    }
+
+    @Override
+    public void skippedEntity(String name) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.skippedEntity(name);
+        }
+    }
+
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,83 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.tika.handler;
+
+import java.io.Writer;
+
+import org.apache.tika.sax.ToTextContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+/**
+ * Small extensions to the default {@link ToTextContentHandler}. This allows
+ * to <ul>
+ * <li>skip ignorable whitespaces
+ * <li>skip linebreaks within literals
+ * </ul>
+ *
+ * @author Rupert Westenthaler
+ *
+ */
+public class PlainTextHandler extends ToTextContentHandler {
+
+    private final boolean skipWhitespaces;
+    private final boolean skipLinebreakes;
+
+    /**
+     * Creates a plain text handler writing to the parsed writer.
+     * @param writer the writer the text is written to
+     * @param skipIgnoreableWhitespaces if <code>true</code> ignorable
+     * whitespaces are not written
+     * @param skipLinebreaksWithinLiterals if <code>true</code> line breaks
+     * (<code>'\n'</code>) within character data are not written
+     */
+    public PlainTextHandler(Writer writer, boolean skipIgnoreableWhitespaces, boolean skipLinebreaksWithinLiterals) {
+        super(writer);
+        this.skipWhitespaces = skipIgnoreableWhitespaces;
+        this.skipLinebreakes = skipLinebreaksWithinLiterals;
+    }
+
+    /**
+     * Forwards ignorable whitespaces to the super class only if skipping of
+     * ignorable whitespaces is not enabled.
+     */
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        if(!skipWhitespaces){
+            super.ignorableWhitespace(ch, start, length);
+        } //else ignore
+    }
+
+    /**
+     * Forwards the characters to the super class. If skipping of line breaks
+     * is enabled all <code>'\n'</code> within the range
+     * <code>[start, start+length)</code> are left out.
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if(!skipLinebreakes){
+            super.characters(ch, start, length);
+            return;
+        }
+        //Forward the segments between the line breaks. This neither copies
+        //nor modifies the parsed char[] - it is owned by the parser and
+        //might also be processed by other handlers.
+        final int end = start + length;
+        int segmentStart = start;
+        for(int it = start; it < end; it++){
+            if(ch[it] == '\n'){
+                if(it > segmentStart){
+                    super.characters(ch, segmentStart, it - segmentStart);
+                }
+                segmentStart = it + 1; //continue after the line break
+            }
+        }
+        if(end > segmentStart){ //chars after the last line break
+            super.characters(ch, segmentStart, end - segmentStart);
+        } //else only line breaks -> nothing (more) to add
+    }
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.tika;
+
+import java.io.File;
+import java.io.InputStream;
+import java.util.Dictionary;
+import java.util.Hashtable;
+
+import org.osgi.framework.Bundle;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.BundleException;
+import org.osgi.framework.BundleListener;
+import org.osgi.framework.Filter;
+import org.osgi.framework.FrameworkListener;
+import org.osgi.framework.InvalidSyntaxException;
+import org.osgi.framework.ServiceListener;
+import org.osgi.framework.ServiceReference;
+import org.osgi.framework.ServiceRegistration;
+import org.osgi.service.component.ComponentContext;
+import org.osgi.service.component.ComponentInstance;
+
+/**
+ * Minimal {@link ComponentContext} for unit tests. It only carries a
+ * dictionary of component properties; every other operation is a no-op that
+ * returns <code>null</code> (or <code>false</code>).
+ */
+public class MockComponentContext implements ComponentContext {
+
+    /** The component properties handed out by {@link #getProperties()}. */
+    protected final Dictionary<String, Object> properties;
+
+    /** Creates a context with an empty, modifiable set of properties. */
+    public MockComponentContext() {
+        this(new Hashtable<String, Object>());
+    }
+
+    /**
+     * Creates a context backed by the given properties. The dictionary is
+     * stored as is (no copy is made).
+     */
+    public MockComponentContext(Dictionary<String, Object> properties) {
+        this.properties = properties;
+    }
+
+    @Override
+    public Dictionary<String, Object> getProperties() {
+        return properties;
+    }
+
+    /**
+     * Returns a new stub {@link BundleContext} on every call. All of its
+     * methods do nothing and return <code>null</code>/<code>false</code>,
+     * except <code>getDataFile</code>, which returns the directory named by
+     * the <code>java.io.tmpdir</code> system property regardless of the
+     * requested file name.
+     */
+    @Override
+    public BundleContext getBundleContext() {
+        return new BundleContext() {
+
+            // ---- properties and data files ----
+
+            @Override
+            public String getProperty(String key) {
+                return null;
+            }
+
+            @Override
+            public File getDataFile(String filename) {
+                String tmpDir = System.getProperty("java.io.tmpdir");
+                return new File(tmpDir);
+            }
+
+            // ---- bundles ----
+
+            @Override
+            public Bundle getBundle() {
+                return null;
+            }
+
+            @Override
+            public Bundle getBundle(long id) {
+                return null;
+            }
+
+            @Override
+            public Bundle[] getBundles() {
+                return null;
+            }
+
+            @Override
+            public Bundle installBundle(String location) throws BundleException {
+                return null;
+            }
+
+            @Override
+            public Bundle installBundle(String location, InputStream input)
+                    throws BundleException {
+                return null;
+            }
+
+            // ---- services ----
+
+            @Override
+            public ServiceRegistration registerService(String clazz,
+                    Object service, Dictionary properties) {
+                return null;
+            }
+
+            @Override
+            public ServiceRegistration registerService(String[] clazzes,
+                    Object service, Dictionary properties) {
+                return null;
+            }
+
+            @Override
+            public ServiceReference getServiceReference(String clazz) {
+                return null;
+            }
+
+            @Override
+            public ServiceReference[] getServiceReferences(String clazz,
+                    String filter) throws InvalidSyntaxException {
+                return null;
+            }
+
+            @Override
+            public ServiceReference[] getAllServiceReferences(String clazz,
+                    String filter) throws InvalidSyntaxException {
+                return null;
+            }
+
+            @Override
+            public Object getService(ServiceReference reference) {
+                return null;
+            }
+
+            @Override
+            public boolean ungetService(ServiceReference reference) {
+                return false;
+            }
+
+            @Override
+            public Filter createFilter(String filter)
+                    throws InvalidSyntaxException {
+                return null;
+            }
+
+            // ---- listeners (all registrations are ignored) ----
+
+            @Override
+            public void addServiceListener(ServiceListener listener) {
+            }
+
+            @Override
+            public void addServiceListener(ServiceListener listener,
+                    String filter) throws InvalidSyntaxException {
+            }
+
+            @Override
+            public void removeServiceListener(ServiceListener listener) {
+            }
+
+            @Override
+            public void addBundleListener(BundleListener listener) {
+            }
+
+            @Override
+            public void removeBundleListener(BundleListener listener) {
+            }
+
+            @Override
+            public void addFrameworkListener(FrameworkListener listener) {
+            }
+
+            @Override
+            public void removeFrameworkListener(FrameworkListener listener) {
+            }
+        };
+    }
+
+    // ---- remaining ComponentContext operations: unsupported no-ops ----
+
+    @Override
+    public void enableComponent(String name) {
+    }
+
+    @Override
+    public void disableComponent(String name) {
+    }
+
+    @Override
+    public ComponentInstance getComponentInstance() {
+        return null;
+    }
+
+    @Override
+    public ServiceReference getServiceReference() {
+        return null;
+    }
+
+    @Override
+    public Bundle getUsingBundle() {
+        return null;
+    }
+
+    @Override
+    public Object locateService(String name) {
+        return null;
+    }
+
+    @Override
+    public Object locateService(String name, ServiceReference reference) {
+        return null;
+    }
+
+    @Override
+    public Object[] locateServices(String name) {
+        return null;
+    }
+
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,367 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.tika;
+
+import static java.util.Collections.singleton;
+import static org.apache.commons.io.IOUtils.closeQuietly;
+import static org.apache.commons.io.IOUtils.toByteArray;
+import static org.apache.stanbol.enhancer.engines.tika.TikaEngine.XHTML;
+import static org.apache.stanbol.enhancer.servicesapi.EnhancementEngine.CANNOT_ENHANCE;
+import static org.apache.tika.mime.MediaType.OCTET_STREAM;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.Map.Entry;
+import java.util.regex.Pattern;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.LineIterator;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.InMemoryContentItem;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.osgi.service.cm.ConfigurationException;
+
+public class TikaEngineTest {
+
+ private static TikaEngine engine;
+ private static MockComponentContext context;
+
+ @BeforeClass
+ public static void setUpServices() throws IOException {
+ context = new MockComponentContext();
+ context.properties.put(TikaEngine.PROPERTY_NAME, "tika");
+ }
+
+ @Before
+ public void bindServices() throws ConfigurationException {
+ if(engine == null){
+ engine = new TikaEngine();
+ engine.activate(context);
+ }
+ }
+
+ @Test
+ public void testHtml() throws EngineException, IOException {
+ ContentItem ci = createContentItem("test.html", "text/html; charset=UTF-8");
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
+ singleton("text/plain"));
+ assertNotNull(contentPart);
+ Blob plainTextBlob = contentPart.getValue();
+ assertNotNull(plainTextBlob);
+ assertContentRegexp(plainTextBlob,
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
+ //validate XHTML results
+ contentPart = ContentItemHelper.getBlob(ci,
+ singleton("application/xhtml+xml"));
+ assertNotNull(contentPart);
+ Blob xhtmlBlob = contentPart.getValue();
+ assertNotNull(xhtmlBlob);
+ assertContentRegexp(xhtmlBlob,
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+ "<head>",
+ "<meta name=",
+ "<title>The Apache Stanbol Enhancer</title>",
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities",
+ "</body></html>");
+ }
+ @Test
+ public void testPdf() throws EngineException, IOException {
+ //PDF created by Apple Pages
+ ContentItem ci = createContentItem("test.pdf", "application/pdf");
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
+ singleton("text/plain"));
+ assertNotNull(contentPart);
+ Blob plainTextBlob = contentPart.getValue();
+ assertNotNull(plainTextBlob);
+ assertContentRegexp(plainTextBlob,
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities ");
+ //validate XHTML results
+ contentPart = ContentItemHelper.getBlob(ci,
+ singleton("application/xhtml+xml"));
+ assertNotNull(contentPart);
+ Blob xhtmlBlob = contentPart.getValue();
+ assertNotNull(xhtmlBlob);
+ assertContentRegexp(xhtmlBlob,
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+ "<head>",
+ "<meta name=",
+ "<div class=\"page\">",
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities",
+ "</body></html>");
+
+ //PDF created by OpenOffice
+ ci = createContentItem("test2.pdf", "application/pdf");
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ //validate plain text results
+ contentPart = ContentItemHelper.getBlob(ci,
+ singleton("text/plain"));
+ assertNotNull(contentPart);
+ plainTextBlob = contentPart.getValue();
+ assertNotNull(plainTextBlob);
+ assertContentRegexp(plainTextBlob,
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities");
+ //validate XHTML results
+ contentPart = ContentItemHelper.getBlob(ci,
+ singleton("application/xhtml+xml"));
+ assertNotNull(contentPart);
+ xhtmlBlob = contentPart.getValue();
+ assertNotNull(xhtmlBlob);
+ assertContentRegexp(xhtmlBlob,
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+ "<head>",
+ "<meta name=",
+ "<div class=\"page\">",
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities",
+ "</body></html>");
+
+ }
+ @Test
+ public void testMsWord() throws EngineException, IOException {
+ ContentItem ci = createContentItem("test.doc", "application/msword");
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
+ singleton("text/plain"));
+ assertNotNull(contentPart);
+ Blob plainTextBlob = contentPart.getValue();
+ assertNotNull(plainTextBlob);
+ assertContentRegexp(plainTextBlob,
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
+ //validate XHTML results
+ contentPart = ContentItemHelper.getBlob(ci,
+ singleton("application/xhtml+xml"));
+ assertNotNull(contentPart);
+ Blob xhtmlBlob = contentPart.getValue();
+ assertNotNull(xhtmlBlob);
+ assertContentRegexp(xhtmlBlob,
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+ "<head>",
+ "<meta name=",
+ "<title>",
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities",
+ "</body></html>"); }
+ @Test
+ public void testRtf() throws EngineException, IOException {
+ ContentItem ci = createContentItem("test.rtf", "application/rtf");
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
+ singleton("text/plain"));
+ assertNotNull(contentPart);
+ Blob plainTextBlob = contentPart.getValue();
+ assertNotNull(plainTextBlob);
+ assertContentRegexp(plainTextBlob,
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
+ //validate XHTML results
+ contentPart = ContentItemHelper.getBlob(ci,
+ singleton("application/xhtml+xml"));
+ assertNotNull(contentPart);
+ Blob xhtmlBlob = contentPart.getValue();
+ assertNotNull(xhtmlBlob);
+ assertContentRegexp(xhtmlBlob,
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+ "<head>",
+ "<meta name=",
+ "<title>",
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities",
+ "</body></html>");
+ }
+ @Test
+ public void testOdt() throws EngineException, IOException {
+ ContentItem ci = createContentItem("test.odt", "application/vnd.oasis.opendocument.text");
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
+ singleton("text/plain"));
+ assertNotNull(contentPart);
+ Blob plainTextBlob = contentPart.getValue();
+ assertNotNull(plainTextBlob);
+ assertContentRegexp(plainTextBlob,
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
+ //validate XHTML results
+ contentPart = ContentItemHelper.getBlob(ci,
+ singleton("application/xhtml+xml"));
+ assertNotNull(contentPart);
+ Blob xhtmlBlob = contentPart.getValue();
+ assertNotNull(xhtmlBlob);
+ assertContentRegexp(xhtmlBlob,
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+ "<head>",
+ "<meta name=",
+ "<title>",
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities",
+ "</body></html>");
+ }
+ @Test
+ public void testContentTypeDetection() throws EngineException, IOException {
+ ContentItem ci = createContentItem("test.pdf", OCTET_STREAM.toString());
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
+ singleton("text/plain"));
+ assertNotNull(contentPart);
+ Blob plainTextBlob = contentPart.getValue();
+ assertNotNull(plainTextBlob);
+ assertContentRegexp(plainTextBlob,
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities");
+ //validate XHTML results
+ contentPart = ContentItemHelper.getBlob(ci,
+ singleton("application/xhtml+xml"));
+ assertNotNull(contentPart);
+ Blob xhtmlBlob = contentPart.getValue();
+ assertNotNull(xhtmlBlob);
+ assertContentRegexp(xhtmlBlob,
+ "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+ "<head>",
+ "<meta name=",
+ "<div class=\"page\">",
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities",
+ "</body></html>");
+ }
+ /**
+ * Tests that text is not processed
+ */
+ @Test
+ public void testText() throws EngineException {
+ byte[] data = ("The Stanbol enhancer can " +
+ "detect famous cities such as Paris and people such as Bob " +
+ "Marley.").getBytes(Charset.forName("UTF-8"));
+ ContentItem ci = new InMemoryContentItem(data,"text/plain; charset=UTF-8");
+ Assert.assertEquals(1, ContentItemHelper.getContentParts(ci, Blob.class).size());
+ }
+ @Test
+ public void testUnsupported() throws EngineException, IOException {
+ ContentItem ci = createContentItem("test.pages", "application/x-iwork-pages-sffpages");
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
+ singleton("text/plain"));
+ //it MUST NOT give an error but also not add a content part
+ assertNull(contentPart);
+ //only the original content
+ assertEquals(1, ContentItemHelper.getContentParts(ci, Blob.class).size());
+
+
+ }
+ @Test
+ public void testXhtml() throws EngineException, IOException {
+ ContentItem ci = createContentItem("test.xhtml", XHTML.toString()+"; charset=UTF-8");
+ assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+ engine.computeEnhancements(ci);
+ Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci,
+ singleton("text/plain"));
+ assertNotNull(contentPart);
+ Blob plainTextBlob = contentPart.getValue();
+ assertNotNull(plainTextBlob);
+ assertContentRegexp(plainTextBlob,
+ "The Apache Stanbol Enhancer",
+ "The Stanbol enhancer can detect famous cities");
+ //only the original and the plain text
+ // this asserts that no xhtml is parsed from the parsed xhtml content
+ assertEquals(2, ContentItemHelper.getContentParts(ci, Blob.class).size());
+ }
+
+ private ContentItem createContentItem(String resourceName, String contentType){
+ InputStream in = TikaEngineTest.class.getClassLoader().getResourceAsStream(resourceName);
+ assertNotNull(in);
+ byte[] data;
+ try {
+ data = toByteArray(in);
+ } catch (IOException e) {
+ throw new IllegalStateException("Unable to read test data!",e);
+ }
+ closeQuietly(in);
+ UriRef ref = new UriRef("urn:contentItem:content-"+ContentItemHelper.toHexString(data));
+ return new InMemoryContentItem(data,contentType);
+ }
+ /**
+ * Tests if the parsed regex pattern are contained in any line of the parsed
+ * test
+ * @throws IOException
+ */
+ public void assertContentRegexp(Blob blob, String... regexp) throws IOException {
+ Charset charset;
+ if(blob.getParameter().containsKey("charset")){
+ charset = Charset.forName(blob.getParameter().get("charset"));
+ } else {
+ charset = Charset.defaultCharset();
+ }
+ Reader reader = null;
+ nextPattern:
+ for (String expr : regexp) {
+ if(reader != null){
+ closeQuietly(reader);
+ }
+ final Pattern p = Pattern.compile(".*" + expr + ".*");
+ reader = new InputStreamReader(blob.getStream(), charset);
+ final LineIterator it = new LineIterator(reader);
+ while (it.hasNext()) {
+ final String line = it.nextLine();
+ if (p.matcher(line).matches()) {
+ continue nextPattern;
+ }
+ }
+ fail(this + ": no match for regexp '" + expr + "', content=\n" +
+ IOUtils.toString(blob.getStream(), charset.toString()));
+ }
+ }
+ @After
+ public void unbindServices() {/*nothing to do */}
+
+ @AfterClass
+ public static void shutdownServices() {
+ engine.deactivate(context);
+ engine = null;
+ }
+
+}
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README Tue Feb 28 15:36:04 2012
@@ -0,0 +1,30 @@
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements. See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+All files within this directory are provided under the
+
+ Apache License, Version 2.0
+
+This includes the following files:
+
+ test.doc
+ test.html
+ test.xhtml
+ test.odt
+ test.pages
+ test.pdf
+ test.rtf
+ test2.html
+ test2.pdf
\ No newline at end of file
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.doc
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.doc?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.doc
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html Tue Feb 28 15:36:04 2012
@@ -0,0 +1,11 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+ <title>The Apache Stanbol Enhancer</title>
+</head>
+<body>
+<h1>The Apache Stanbol Enhancer</h1>
+
+<p>The <b>Stanbol enhancer</b> can detect famous cities such as <i>Paris</i> and people such as <i>Bob Marley</i>.</p>
+</body>
+</html>
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.odt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.odt?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.odt
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pages
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pages?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pages
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pdf
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pdf?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf Tue Feb 28 15:36:04 2012
@@ -0,0 +1,14 @@
+{\rtf1\ansi\ansicpg1252\cocoartf1138\cocoasubrtf320
+{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
+{\colortbl;\red255\green255\blue255;}
+\margl1440\margr1440\margb1800\margt1800
+\deftab708
+\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardeftab708\pardirnatural
+
+\f0\b\fs36 \cf0 \expnd0\expndtw0\kerning0
+\up0 \nosupersub \ulnone \outl0\strokewidth0 \strokec0 The Apache Stanbol Enhancer\
+\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardeftab708\pardirnatural
+
+\b0\fs24 \expnd0\expndtw0\kerning0
+\
+The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.}
\ No newline at end of file
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml Tue Feb 28 15:36:04 2012
@@ -0,0 +1,8 @@
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>XHTML test</title>
+</head>
+<body>
+<h1>The Apache Stanbol Enhancer</h1>
+<p>The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.</p>
+</body></html>
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html Tue Feb 28 15:36:04 2012
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html
+ PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/TR/MathML2/dtd/xhtml-math11-f.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml"><!--This file was converted to xhtml by OpenOffice.org - see http://xml.openoffice.org/odf2xhtml for more info.--><head profile="http://dublincore.org/documents/dcmi-terms/"><meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8"/><title xml:lang="en-US">The Apache Stanbol Enhancer</title><meta name="DCTERMS.title" content="The Apache Stanbol Enhancer" xml:lang="en-US"/><meta name="DCTERMS.language" content="en-US" scheme="DCTERMS.RFC4646"/><meta name="DCTERMS.source" content="http://xml.openoffice.org/odf2xhtml"/><meta name="DCTERMS.creator" content="Szaby Grünwald"/><meta name="DCTERMS.issued" content="2012-02-28T08:40:48" scheme="DCTERMS.W3CDTF"/><meta name="DCTERMS.contributor" content="Szaby Grünwald"/><meta name="DCTERMS.modified" content="2012-02-28T08:42:03" scheme="DCTERMS.W3CDTF"/><meta name="DCTERMS.provenance" content="" xml:lang="en-US"/><meta name="DCTERMS.subject" content="," xml:lang="en-
US"/><link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" hreflang="en"/><link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" hreflang="en"/><link rel="schema.DCTYPE" href="http://purl.org/dc/dcmitype/" hreflang="en"/><link rel="schema.DCAM" href="http://purl.org/dc/dcam/" hreflang="en"/><base href="."/><style type="text/css">
+ @page { }
+ table { border-collapse:collapse; border-spacing:0; empty-cells:show }
+ td, th { vertical-align:top; font-size:12pt;}
+ h1, h2, h3, h4, h5, h6 { clear:both }
+ ol, ul { margin:0; padding:0;}
+ li { list-style: none; margin:0; padding:0;}
+ <!-- "li span.odfLiEnd" - IE 7 issue-->
+ li span. { clear: both; line-height:0; width:0; height:0; margin:0; padding:0; }
+ span.footnodeNumber { padding-right:1em; }
+ span.annotation_style_by_filter { font-size:95%; font-family:Arial; background-color:#fff000; margin:0; border:0; padding:0; }
+ * { margin:0;}
+ .Heading_20_1 { font-size:115%; margin-bottom:0.0835in; margin-top:0.1665in; font-family:Arial; writing-mode:page; font-weight:bold; }
+ .P1 { font-size:12pt; font-family:Arial; writing-mode:page; }
+ .Standard { font-size:12pt; font-family:Times New Roman; writing-mode:page; }
+ <!-- ODF styles with no properties representable as CSS -->
+ { }
+ </style></head><body dir="ltr" style="max-width:8.2681in;margin-top:0.7874in; margin-bottom:0.7874in; margin-left:0.7874in; margin-right:0.7874in; "><h1 class="Heading_20_1"><a id="a__The_Apache_Stanbol_Enhancer"><span/></a>The Apache Stanbol Enhancer</h1><p class="Standard">Â </p><p class="P1">The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.</p></body></html>
\ No newline at end of file
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.pdf
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.pdf?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.
Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.pdf
------------------------------------------------------------------------------
svn:mime-type = application/octet-stream
Modified: incubator/stanbol/trunk/parent/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/parent/pom.xml?rev=1294695&r1=1294694&r2=1294695&view=diff
==============================================================================
--- incubator/stanbol/trunk/parent/pom.xml (original)
+++ incubator/stanbol/trunk/parent/pom.xml Tue Feb 28 15:36:04 2012
@@ -1599,13 +1599,18 @@
</dependency>
- <!-- Apache Tika core -->
+ <!-- Apache Tika -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
- <version>0.9</version>
+ <version>1.0</version>
</dependency>
-
+ <dependency>
+ <groupId>org.apache.tika</groupId>
+ <artifactId>tika-parsers</artifactId>
+ <version>1.0</version>
+ </dependency>
+
<!-- Aperture -->
<dependency>
<groupId>org.semanticdesktop.aperture</groupId>