You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2012/02/28 16:36:06 UTC

svn commit: r1294695 - in /incubator/stanbol/trunk: enhancer/bundlelist/src/main/bundles/ enhancer/engines/ enhancer/engines/langid/ enhancer/engines/tika/ enhancer/engines/tika/src/ enhancer/engines/tika/src/license/ enhancer/engines/tika/src/main/ en...

Author: rwesten
Date: Tue Feb 28 15:36:04 2012
New Revision: 1294695

URL: http://svn.apache.org/viewvc?rev=1294695&view=rev
Log:
STANBOL-512: First version of the Apache Stanbol Enhancement Engine for Apache Tika

Supports:

* mime type detection (if none or "application/octet-stream")
* plain text extraction
* xhtml extraction

missing:

* processing of the extracted metadata

Added:
    incubator/stanbol/trunk/enhancer/engines/tika/   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/pom.xml   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/
    incubator/stanbol/trunk/enhancer/engines/tika/src/license/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/resources/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/resources/OSGI-INF/
    incubator/stanbol/trunk/enhancer/engines/tika/src/main/resources/OSGI-INF/metatype/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.doc   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.odt   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pages   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pdf   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html   (with props)
    incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.pdf   (with props)
Modified:
    incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml
    incubator/stanbol/trunk/enhancer/engines/langid/pom.xml
    incubator/stanbol/trunk/enhancer/engines/pom.xml
    incubator/stanbol/trunk/parent/pom.xml

Modified: incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml?rev=1294695&r1=1294694&r2=1294695&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml (original)
+++ incubator/stanbol/trunk/enhancer/bundlelist/src/main/bundles/list.xml Tue Feb 28 15:36:04 2012
@@ -17,6 +17,21 @@
 -->
 
 <bundles>
+  <!-- ********************************************************************* 
+    start level 10 TO 19 reserved for required libraries (internal and external) 
+    ********************************************************************* -->
+  <startLevel level="17">
+    <bundle> <!-- Apache Tika core (required by the LangId and TikaEngine) -->
+        <groupId>org.apache.tika</groupId>
+        <artifactId>tika-core</artifactId>
+        <version>1.0</version>
+    </bundle>
+    <bundle> <!-- Apache Tika bundle (required by the TikaEngine) -->
+        <groupId>org.apache.tika</groupId>
+        <artifactId>tika-bundle</artifactId>
+        <version>1.0</version>
+    </bundle>
+  </startLevel>
   <!-- Stanbol Enhancer infrastructure and required libraries -->
   <startLevel level="20">
     <bundle>
@@ -91,46 +106,62 @@
 
   <!-- Stanbol Enhancer plug-ins (the Enhancement Engines) -->
   <startLevel level="25">
-    <bundle>
+
+    <!-- language identification -->
+
+    <bundle> 
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.enhancer.engines.langid</artifactId>
       <version>0.9.0-incubating-SNAPSHOT</version>
     </bundle>
-    <bundle>
+
+    <!-- Content conversion (2 engines)-->
+
+    <bundle> <!-- Metaxa -->
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.enhancer.engines.metaxa</artifactId>
       <version>0.9.0-incubating-SNAPSHOT</version>
     </bundle>
-    <bundle>
+    <bundle><!-- Apache Tika Engine -->
       <groupId>org.apache.stanbol</groupId>
-      <artifactId>org.apache.stanbol.enhancer.engines.opencalais</artifactId>
+      <artifactId>org.apache.stanbol.enhancer.engines.tika</artifactId>
       <version>0.9.0-incubating-SNAPSHOT</version>
     </bundle>
-    <bundle>
+
+    <!-- Named Entity Recognition (NER)-->
+
+    <bundle> <!-- Open NLP based NER -->
       <groupId>org.apache.stanbol</groupId>
-      <artifactId>org.apache.stanbol.enhancer.engines.zemanta</artifactId>
+      <artifactId>org.apache.stanbol.enhancer.engines.opennlp.ner</artifactId>
       <version>0.9.0-incubating-SNAPSHOT</version>
     </bundle>
-    <bundle>
+    <!-- Entity Extraction/Linking -->
+    <bundle><!-- NER linking (depends on the Entityhub) -->
       <groupId>org.apache.stanbol</groupId>
-      <artifactId>org.apache.stanbol.enhancer.engines.opennlp.ner</artifactId>
+      <artifactId>org.apache.stanbol.enhancer.engine.entitytagging</artifactId>
       <version>0.9.0-incubating-SNAPSHOT</version>
     </bundle>
-    <bundle>
+    <bundle><!-- Keyword Extraction from Text (depends on the Entityhub) -->
       <groupId>org.apache.stanbol</groupId>
-      <artifactId>org.apache.stanbol.enhancer.engines.geonames</artifactId>
+      <artifactId>org.apache.stanbol.enhancer.engine.keywordextraction</artifactId>
       <version>0.9.0-incubating-SNAPSHOT</version>
     </bundle>
-    <!-- Entity Tagging Engine (depends on the Entityhub) -->
-    <bundle>
+
+    <!-- External Service Integration -->
+
+    <bundle> <!-- http://www.opencalais.com/ -->
       <groupId>org.apache.stanbol</groupId>
-      <artifactId>org.apache.stanbol.enhancer.engine.entitytagging</artifactId>
+      <artifactId>org.apache.stanbol.enhancer.engines.opencalais</artifactId>
       <version>0.9.0-incubating-SNAPSHOT</version>
     </bundle>
-    <!-- Keyword Extraction Engine (depends on the Entityhub) -->
-    <bundle>
+    <bundle> <!-- http://www.zemanta.com/ -->
       <groupId>org.apache.stanbol</groupId>
-      <artifactId>org.apache.stanbol.enhancer.engine.keywordextraction</artifactId>
+      <artifactId>org.apache.stanbol.enhancer.engines.zemanta</artifactId>
+      <version>0.9.0-incubating-SNAPSHOT</version>
+    </bundle>
+    <bundle> <!-- http://www.geonames.org/ -->
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.engines.geonames</artifactId>
       <version>0.9.0-incubating-SNAPSHOT</version>
     </bundle>
   </startLevel>

Modified: incubator/stanbol/trunk/enhancer/engines/langid/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/langid/pom.xml?rev=1294695&r1=1294694&r2=1294695&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/langid/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/langid/pom.xml Tue Feb 28 15:36:04 2012
@@ -61,7 +61,6 @@
               org.apache.stanbol.enhancer.engines.langid;version=${project.version}
             </Export-Package>
             <Embed-Dependency>
-              textcat
             </Embed-Dependency>
           </instructions>
         </configuration>
@@ -91,8 +90,6 @@
     <dependency>
       <groupId>org.apache.stanbol</groupId>
       <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
-      <version>0.9.0-incubating-SNAPSHOT</version>
-      <scope>provided</scope>
     </dependency>
 
     <dependency>

Modified: incubator/stanbol/trunk/enhancer/engines/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/pom.xml?rev=1294695&r1=1294694&r2=1294695&view=diff
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/pom.xml (original)
+++ incubator/stanbol/trunk/enhancer/engines/pom.xml Tue Feb 28 15:36:04 2012
@@ -46,11 +46,11 @@
     <module>langid</module>
     <module>topic</module>
     <module>metaxa</module>
+    <module>tika</module>
     <module>geonames</module>
     <module>entitytagging</module>
     <module>keywordextraction</module>
 
-    <!-- Entityhub based enhancement engine(s) -->
     <module>opencalais</module>
     <module>zemanta</module>
   </modules>

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/
------------------------------------------------------------------------------
--- svn:ignore (added)
+++ svn:ignore Tue Feb 28 15:36:04 2012
@@ -0,0 +1,7 @@
+.classpath
+
+.project
+
+.settings
+
+target

Added: incubator/stanbol/trunk/enhancer/engines/tika/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/pom.xml?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/pom.xml (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/pom.xml Tue Feb 28 15:36:04 2012
@@ -0,0 +1,133 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+        http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
+
+  <modelVersion>4.0.0</modelVersion>
+
+  <parent>
+    <artifactId>org.apache.stanbol.enhancer.parent</artifactId>
+    <groupId>org.apache.stanbol</groupId>
+    <version>0.9.0-incubating-SNAPSHOT</version>
+    <relativePath>../../parent</relativePath>
+  </parent>
+
+  <groupId>org.apache.stanbol</groupId>
+  <artifactId>org.apache.stanbol.enhancer.engines.tika</artifactId>
+  <packaging>bundle</packaging>
+
+  <name>Apache Stanbol Enhancer Enhancement Engine : Apache Tika </name>
+  <description>Enhancement Engine that uses Apache Tika to convert parsed
+  documents to plain text and xhtml
+  </description>
+
+  <inceptionYear>2012</inceptionYear>
+
+  <scm>
+    <connection>
+      scm:svn:http://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/tika/
+    </connection>
+    <developerConnection>
+      scm:svn:https://svn.apache.org/repos/asf/incubator/stanbol/trunk/enhancer/engines/tika/
+    </developerConnection>
+    <url>http://incubator.apache.org/stanbol/</url>
+  </scm>
+
+  <build>
+    <plugins>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-bundle-plugin</artifactId>
+        <extensions>true</extensions>
+        <configuration>
+          <instructions>
+            <Export-Package>
+              org.apache.stanbol.enhancer.engines.tika;version=${project.version}
+            </Export-Package>
+          </instructions>
+        </configuration>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.felix</groupId>
+        <artifactId>maven-scr-plugin</artifactId>
+      </plugin>
+      <plugin>
+        <groupId>org.apache.rat</groupId>
+        <artifactId>apache-rat-plugin</artifactId>
+        <configuration>
+          <excludes>
+            <!-- AL20 License  -->
+            <exclude>src/license/THIRD-PARTY.properties</exclude>
+            <!-- AL20 License for test resources (see src/test/resources/README) -->
+            <exclude>src/test/resources/test.doc</exclude>
+            <exclude>src/test/resources/test.html</exclude>
+            <exclude>src/test/resources/test.xhtml</exclude>
+            <exclude>src/test/resources/test.odt</exclude>
+            <exclude>src/test/resources/test.pages</exclude>
+            <exclude>src/test/resources/test.pdf</exclude>
+            <exclude>src/test/resources/test.rtf</exclude>
+            <exclude>src/test/resources/test2.html</exclude>
+            <exclude>src/test/resources/test2.pdf</exclude>
+          </excludes>
+        </configuration>
+      </plugin>
+    </plugins>
+  </build>
+
+  <properties>
+    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+  </properties>
+
+  <dependencies>
+    <dependency>
+      <groupId>org.apache.stanbol</groupId>
+      <artifactId>org.apache.stanbol.enhancer.servicesapi</artifactId>
+      <version>0.9.0-incubating-SNAPSHOT</version>
+      <scope>provided</scope>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>org.apache.felix</groupId>
+      <artifactId>org.apache.felix.scr.annotations</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.apache.clerezza</groupId>
+      <artifactId>rdf.core</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>commons-io</groupId>
+      <artifactId>commons-io</artifactId>
+    </dependency>
+    <dependency>
+      <groupId>org.slf4j</groupId>
+      <artifactId>slf4j-api</artifactId>
+    </dependency>
+
+    <dependency>
+      <groupId>junit</groupId>
+      <artifactId>junit</artifactId>
+      <scope>test</scope>
+    </dependency>
+  </dependencies>
+
+</project>

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/pom.xml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,222 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.tika;
+
+import static org.apache.stanbol.enhancer.servicesapi.helper.EnhancementEngineHelper.randomUUID;
+import static org.apache.tika.mime.MediaType.TEXT_PLAIN;
+
+import java.io.BufferedInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.StringWriter;
+import java.util.Collections;
+import java.util.Map;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.commons.io.IOUtils;
+import org.apache.felix.scr.annotations.Component;
+import org.apache.felix.scr.annotations.Properties;
+import org.apache.felix.scr.annotations.Property;
+import org.apache.felix.scr.annotations.Service;
+import org.apache.stanbol.enhancer.engines.tika.handler.MultiHandler;
+import org.apache.stanbol.enhancer.engines.tika.handler.PlainTextHandler;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.EnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.ServiceProperties;
+import org.apache.stanbol.enhancer.servicesapi.helper.AbstractEnhancementEngine;
+import org.apache.stanbol.enhancer.servicesapi.helper.InMemoryBlob;
+import org.apache.tika.config.TikaConfig;
+import org.apache.tika.detect.Detector;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.BodyContentHandler;
+import org.apache.tika.sax.ToXMLContentHandler;
+import org.osgi.service.cm.ConfigurationException;
+import org.osgi.service.component.ComponentContext;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+import org.xml.sax.ContentHandler;
+
+/**
+ * EnhancementEngine based on Apache Tika that converts the content of parsed 
+ * content items to plain text and XHTML. The converted content is added as
+ * additional parts to the processed {@link ContentItem}.
+ * <p>
+ * The media type is taken from the {@link Blob} of the ContentItem. If it is
+ * malformed or {@link MediaType#OCTET_STREAM} the media type is detected based
+ * on the content.
+ * <p>
+ * NOTE: Metadata extracted by Apache Tika are not yet converted to RDF and
+ * added to {@link ContentItem#getMetadata()} (see the TODO in
+ * {@link #computeEnhancements(ContentItem)}).
+ * 
+ * @author Rupert Westenthaler
+ *
+ */
+@Component(immediate = true, metatype = true, inherit=true)
+@Service
+@Properties(value={
+    @Property(name=EnhancementEngine.PROPERTY_NAME,value="tika")
+})
+public class TikaEngine 
+        extends AbstractEnhancementEngine<RuntimeException,RuntimeException> 
+        implements EnhancementEngine, ServiceProperties {
+    private static final Logger log = LoggerFactory.getLogger(TikaEngine.class);
+    /**
+     * The default value for the Execution of this Engine. Currently set to
+     * {@link ServiceProperties#ORDERING_PRE_PROCESSING}
+     */
+    public static final Integer defaultOrder = ORDERING_PRE_PROCESSING;
+
+    /**
+     * The XHTML media type. Content of this type is only converted to plain
+     * text, because creating XHTML from XHTML would be pointless.
+     */
+    protected static final MediaType XHTML = new MediaType("application", "xhtml+xml");
+    
+    /** Tika configuration; set in activate(..) and reset in deactivate(..) */
+    private TikaConfig config;
+    /** Parser used for the conversion; set in activate(..) */
+    private Parser parser;
+    /** Detector used if the media type is unknown; set in activate(..) */
+    private Detector detector;
+
+    /**
+     * Result of {@link TikaEngine#extractMediaType(ContentItem)}: the media
+     * type and - only if the content was used for detection - the stream that
+     * was opened for that purpose.
+     */
+    private static class MediaTypeAndStream {
+        /** the media type or <code>null</code> if it could not be determined */
+        MediaType mediaType;
+        /** the stream used for the detection or <code>null</code> if none was opened */
+        InputStream in;
+    }
+   
+    /**
+     * Always returns {@link EnhancementEngine#ENHANCE_ASYNC}. Unsupported
+     * content is silently ignored by {@link #computeEnhancements(ContentItem)}.
+     */
+    @Override
+    public int canEnhance(ContentItem ci) throws EngineException {
+        return ENHANCE_ASYNC;
+    }
+    
+    /**
+     * Converts the content of the parsed ContentItem to plain text and - if
+     * the content is not already XHTML - also to XHTML. The results are added
+     * as parts with the URIs <code>urn:tika:text:{uuid}</code> and
+     * <code>urn:tika:xhtml:{uuid}</code>. Nothing is added if the media type 
+     * can not be determined, is <code>text/plain</code> or is not supported 
+     * by the parser.
+     * @param ci the ContentItem to process
+     * @throws EngineException if Tika fails to parse the content
+     */
+    @Override
+    public void computeEnhancements(ContentItem ci) throws EngineException {
+        MediaTypeAndStream mtas = extractMediaType(ci);
+        //reuse the stream opened for the detection (may be null)
+        InputStream in = mtas.in;
+        try {
+            if(mtas.mediaType == null){
+                return; //unable to parse and detect content type
+            }
+            MediaType plainMediaType = mtas.mediaType.getBaseType();
+            if(plainMediaType.equals(MediaType.TEXT_PLAIN)){
+                return; //we need not to process plain text!
+            }
+            ParseContext context = new ParseContext();
+            context.set(Parser.class,parser);
+            if(!parser.getSupportedTypes(context).contains(plainMediaType)) {
+                return; //not supported format
+            }
+            if(in == null){ //no detection was needed -> open the stream now
+                in = ci.getStream();
+            }
+            Metadata metadata = new Metadata();
+            //set the already parsed contentType
+            metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
+            final StringWriter writer = new StringWriter();
+            final ContentHandler textHandler = new BodyContentHandler( //only the Body
+                new PlainTextHandler(writer, true,false)); //skip ignoreable
+            final ToXMLContentHandler xhtmlHandler;
+            final ContentHandler mainHandler;
+            if(!plainMediaType.equals(XHTML)){ //do not parse XHTML from XHTML
+                xhtmlHandler = new ToXMLContentHandler();
+                mainHandler = new MultiHandler(textHandler,xhtmlHandler);
+            } else {
+                mainHandler = textHandler;
+                xhtmlHandler = null;
+            }
+            try {
+                parser.parse(in, mainHandler, metadata, context);
+            } catch (Exception e) {
+                throw new EngineException("Unable to convert ContentItem "+
+                        ci.getUri()+" with mimeType '"+ci.getMimeType()+"' to "+
+                        "plain text!",e);
+            }
+            //use the same random id for both parts so that they can be related
+            String random = randomUUID().toString();
+            UriRef textBlobUri = new UriRef("urn:tika:text:"+random);
+            ci.addPart(textBlobUri, 
+                new InMemoryBlob(writer.toString(), 
+                    TEXT_PLAIN.toString())); //string -> no encoding
+            if(xhtmlHandler != null){
+                UriRef xhtmlBlobUri = new UriRef("urn:tika:xhtml:"+random);
+                ci.addPart(xhtmlBlobUri, 
+                    new InMemoryBlob(xhtmlHandler.toString(),
+                        "application/xhtml+xml")); //string -> no encoding
+            }
+            //TODO:
+            // * add also the Metadata extracted by Apache Tika
+        } finally {
+            //close the stream on ALL paths: early returns (plain text, 
+            //unsupported type), parse errors and success. NULL is ignored.
+            IOUtils.closeQuietly(in);
+        }
+    }
+    /**
+     * Getter for the contentType. If the type of the Blob is malformed or 
+     * {@link MediaType#OCTET_STREAM} than the media type is detected based on
+     * the content.<p>
+     * This method returns the MediaType and the Stream used to detect the
+     * MimeType. This allows to reuse the stream and the mediaType. The caller
+     * is responsible for closing the returned stream.<p>
+     * If the detection fails with an {@link IOException} the stream is closed,
+     * the returned stream is <code>null</code> and the media type is left at
+     * the value read from the Blob.
+     * @param ci the ContentItem
+     * @return the media type (may be <code>null</code>) and the stream opened
+     * for the detection (<code>null</code> if no detection was needed)
+     */
+    private MediaTypeAndStream extractMediaType(ContentItem ci) {
+        MediaTypeAndStream mtas = new MediaTypeAndStream();
+        mtas.mediaType = getMediaType(ci.getBlob());
+        if(mtas.mediaType == null || mtas.mediaType.equals(MediaType.OCTET_STREAM)){
+            //buffered, because the Detector needs mark/reset support to leave
+            //the stream reusable for the parser
+            mtas.in = new BufferedInputStream(ci.getStream());
+            try {
+                mtas.mediaType = detector.detect(mtas.in, new Metadata());
+            } catch (IOException e) {
+                log.warn("Exception while detection the MediaType of the " +
+                        "parsed ContentItem "+ci.getUri(),e);
+                IOUtils.closeQuietly(mtas.in);
+                mtas.in = null;
+            }
+        }
+        return mtas;
+    }
+
+    /**
+     * Creates the Tika {@link MediaType} for the mime type and the parameters
+     * of the parsed Blob.
+     * @param blob the Blob (assumes that {@link Blob#getMimeType()} does not
+     * return <code>null</code> - TODO confirm)
+     * @return the media type or <code>null</code> if the mime type of the Blob
+     * is not of the form "{type}/{subtype}"
+     */
+    private MediaType getMediaType(Blob blob) {
+        String[] mediaTypeArray = blob.getMimeType().split("/");
+        if(mediaTypeArray.length != 2){
+            log.warn("Encounterd illegal formatted mediaType '{}'  -> will try " +
+            		"to detect the mediaType based on the parsed content!",
+                blob.getMimeType());
+            return null;
+        } else {
+            return new MediaType(mediaTypeArray[0], mediaTypeArray[1],
+                blob.getParameter());
+        }
+    }
+    /**
+     * Activates the engine and initialises the Tika configuration, detector
+     * and parser based on the default {@link TikaConfig}.
+     */
+    @Override
+    protected void activate(ComponentContext ctx) throws ConfigurationException {
+        super.activate(ctx);
+        config = TikaConfig.getDefaultConfig();
+        this.detector = config.getDetector();
+        this.parser = new AutoDetectParser(config);
+    }
+    /**
+     * Releases the Tika configuration, detector and parser and deactivates
+     * the engine.
+     */
+    @Override
+    protected void deactivate(ComponentContext ctx) throws RuntimeException {
+        this.config = null;
+        this.parser = null;
+        this.detector = null;
+        super.deactivate(ctx);
+    }
+
+    /**
+     * @return unmodifiable map with {@link #defaultOrder} as value of the
+     * {@link ServiceProperties#ENHANCEMENT_ENGINE_ORDERING} property
+     */
+    public Map<String, Object> getServiceProperties() {
+        return Collections.unmodifiableMap(
+            Collections.singletonMap(
+                ENHANCEMENT_ENGINE_ORDERING, (Object) defaultOrder));
+    }
+
+}

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,127 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.tika.handler;
+
+import java.util.Arrays;
+import java.util.List;
+
+import org.apache.tika.sax.ContentHandlerDecorator;
+import org.xml.sax.Attributes;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.Locator;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+/**
+ * {@link ContentHandler} that forwards every SAX event it receives to all the
+ * {@link ContentHandler}s parsed to the constructor - in the order they were
+ * parsed. Similar to {@link ContentHandlerDecorator} but with support for
+ * forwarding calls to several delegates.
+ * <p>
+ * If one of the delegates throws an exception, the event is not forwarded to
+ * the remaining delegates.
+ * 
+ * @author Rupert Westenthaler
+ *
+ */
+public class MultiHandler extends DefaultHandler {
+
+    /**
+     * The delegates: fixed-size list over a private copy of the array parsed
+     * to the constructor. Never empty and never contains <code>null</code>.
+     */
+    final List<ContentHandler> handlers;
+
+    /**
+     * Creates a handler that forwards all events to the parsed handlers.
+     * @param handlers the delegates. MUST NOT be <code>null</code>, empty or
+     * contain <code>null</code> elements
+     * @throws IllegalArgumentException if the parsed array is 
+     * <code>null</code>, empty or contains a <code>null</code> element
+     */
+    public MultiHandler(ContentHandler...handlers) {
+        if(handlers == null || handlers.length < 1){
+            throw new IllegalArgumentException("The parsed ContentHandler array MUST NOT be NULL or empty!");
+        }
+        //Arrays.asList(..) is only a view over the array. Copy the array so 
+        //that later changes by the caller to its array can neither bypass the
+        //NULL check below nor change the delegates of this instance.
+        this.handlers = Arrays.asList(handlers.clone());
+        if(this.handlers.contains(null)){
+            throw new IllegalArgumentException("Tha parsed ContentHandlers array MUST NOT contain an NULL entry!");
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void startPrefixMapping(String prefix, String uri) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.startPrefixMapping(prefix, uri);
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void endPrefixMapping(String prefix) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.endPrefixMapping(prefix);
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void processingInstruction(String target, String data) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.processingInstruction(target, data);
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void setDocumentLocator(Locator locator) {
+        for(ContentHandler handler : handlers){
+            handler.setDocumentLocator(locator);
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void startDocument() throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.startDocument();
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void endDocument() throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.endDocument();
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void startElement(String uri, String localName, String name, Attributes atts) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.startElement(uri, localName, name, atts);
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void endElement(String uri, String localName, String name) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.endElement(uri, localName, name);
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.characters(ch, start, length);
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.ignorableWhitespace(ch, start, length);
+        }
+    }
+
+    /** Forwards the call to all delegates */
+    @Override
+    public void skippedEntity(String name) throws SAXException {
+        for(ContentHandler handler : handlers){
+            handler.skippedEntity(name);
+        }
+    }
+    
+}

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/MultiHandler.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,83 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.tika.handler;
+
+import java.io.Writer;
+
+import org.apache.tika.sax.ToTextContentHandler;
+import org.xml.sax.Attributes;
+import org.xml.sax.SAXException;
+/**
+ * Small extension to the default {@link ToTextContentHandler}. It allows
+ * to optionally <ul>
+ * <li>skip ignorable whitespace
+ * <li>skip line breaks within literals (character data)
+ * </ul>
+ * 
+ * @author Rupert Westenthaler
+ *
+ */
+public class PlainTextHandler extends ToTextContentHandler {
+
+    /** If <code>true</code> ignorable whitespace events are dropped */
+    private final boolean skipWhitespaces;
+    /** If <code>true</code> <code>'\n'</code> chars are removed from character data */
+    private final boolean skipLinebreaks;
+
+    /**
+     * Creates a handler that hands the parsed writer to the
+     * {@link ToTextContentHandler} super class.
+     * @param writer the writer passed to the super class
+     * @param skipIgnoreableWhitespaces if <code>true</code> calls to
+     * {@link #ignorableWhitespace(char[], int, int)} are ignored
+     * @param skipLinebreaksWithinLiterals if <code>true</code> all
+     * <code>'\n'</code> chars are removed from the data parsed to
+     * {@link #characters(char[], int, int)}
+     */
+    public PlainTextHandler(Writer writer, boolean skipIgnoreableWhitespaces, boolean skipLinebreaksWithinLiterals) {
+        super(writer);
+        this.skipWhitespaces = skipIgnoreableWhitespaces;
+        this.skipLinebreaks = skipLinebreaksWithinLiterals;
+    }
+
+    /**
+     * Forwards ignorable whitespace to the super class unless this handler
+     * was configured to skip it.
+     */
+    @Override
+    public void ignorableWhitespace(char[] ch, int start, int length) throws SAXException {
+        if(!skipWhitespaces){
+            super.ignorableWhitespace(ch, start, length);
+        } //else ignore
+    }
+
+    /**
+     * Forwards character data to the super class. If skipping of line breaks
+     * is enabled, the range <code>[start, start+length)</code> is split at
+     * each <code>'\n'</code> and only the runs in between are forwarded. If
+     * the range consists of line breaks only (or is empty) nothing is
+     * forwarded.<p>
+     * The parsed array is neither copied nor modified. It is owned by the
+     * caller and might be dispatched to other handlers as well.
+     * @param ch the buffer holding the characters
+     * @param start the index of the first character within the buffer
+     * @param length the number of characters (NOT the end index)
+     */
+    @Override
+    public void characters(char[] ch, int start, int length) throws SAXException {
+        if(!skipLinebreaks){
+            super.characters(ch, start, length);
+            return;
+        }
+        final int end = start + length; //exclusive end index of the range
+        int runStart = start; //first char of the current run without '\n'
+        for(int it = start; it < end; it++){
+            if(ch[it] == '\n'){
+                if(it > runStart){ //do not forward empty runs
+                    super.characters(ch, runStart, it - runStart);
+                }
+                runStart = it + 1; //continue after the line break
+            }
+        }
+        if(end > runStart){ //trailing run after the last line break
+            super.characters(ch, runStart, end - runStart);
+        }
+    }
+}

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/handler/PlainTextHandler.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,199 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.stanbol.enhancer.engines.tika;
+
+import java.io.File;
+import java.io.InputStream;
+import java.util.Dictionary;
+import java.util.Hashtable;
+
+import org.osgi.framework.Bundle;
+import org.osgi.framework.BundleContext;
+import org.osgi.framework.BundleException;
+import org.osgi.framework.BundleListener;
+import org.osgi.framework.Filter;
+import org.osgi.framework.FrameworkListener;
+import org.osgi.framework.InvalidSyntaxException;
+import org.osgi.framework.ServiceListener;
+import org.osgi.framework.ServiceReference;
+import org.osgi.framework.ServiceRegistration;
+import org.osgi.service.component.ComponentContext;
+import org.osgi.service.component.ComponentInstance;
+
+public class MockComponentContext implements ComponentContext {
+
+    protected final Dictionary<String, Object> properties;
+
+    public MockComponentContext() {
+        properties = new Hashtable<String, Object>();
+    }
+
+    public MockComponentContext(Dictionary<String, Object> properties) {
+        this.properties = properties;
+    }
+
+    public void disableComponent(String name) {
+    }
+
+    public void enableComponent(String name) {
+    }
+
+    public BundleContext getBundleContext() {
+        return new BundleContext() {
+
+            @Override
+            public boolean ungetService(ServiceReference reference) {
+                return false;
+            }
+
+            @Override
+            public void removeServiceListener(ServiceListener listener) {
+            }
+
+            @Override
+            public void removeFrameworkListener(FrameworkListener listener) {
+            }
+
+            @Override
+            public void removeBundleListener(BundleListener listener) {
+            }
+
+            @Override
+            public ServiceRegistration registerService(String clazz,
+                    Object service, Dictionary properties) {
+                return null;
+            }
+
+            @Override
+            public ServiceRegistration registerService(String[] clazzes,
+                    Object service, Dictionary properties) {
+                return null;
+            }
+
+            @Override
+            public Bundle installBundle(String location, InputStream input)
+                    throws BundleException {
+                return null;
+            }
+
+            @Override
+            public Bundle installBundle(String location) throws BundleException {
+                return null;
+            }
+
+            @Override
+            public ServiceReference[] getServiceReferences(String clazz,
+                    String filter) throws InvalidSyntaxException {
+                return null;
+            }
+
+            @Override
+            public ServiceReference getServiceReference(String clazz) {
+                return null;
+            }
+
+            @Override
+            public Object getService(ServiceReference reference) {
+                return null;
+            }
+
+            @Override
+            public String getProperty(String key) {
+                return null;
+            }
+
+            @Override
+            public File getDataFile(String filename) {
+                return new File(System.getProperty("java.io.tmpdir"));
+            }
+
+            @Override
+            public Bundle[] getBundles() {
+                return null;
+            }
+
+            @Override
+            public Bundle getBundle(long id) {
+                return null;
+            }
+
+            @Override
+            public Bundle getBundle() {
+                return null;
+            }
+
+            @Override
+            public ServiceReference[] getAllServiceReferences(String clazz,
+                    String filter) throws InvalidSyntaxException {
+                return null;
+            }
+
+            @Override
+            public Filter createFilter(String filter)
+                    throws InvalidSyntaxException {
+                return null;
+            }
+
+            @Override
+            public void addServiceListener(ServiceListener listener,
+                    String filter) throws InvalidSyntaxException {
+
+            }
+
+            @Override
+            public void addServiceListener(ServiceListener listener) {
+            }
+
+            @Override
+            public void addFrameworkListener(FrameworkListener listener) {
+            }
+
+            @Override
+            public void addBundleListener(BundleListener listener) {
+            }
+        };
+    }
+
+    public ComponentInstance getComponentInstance() {
+        return null;
+    }
+
+    public Dictionary<String, Object> getProperties() {
+        return properties;
+    }
+
+    public ServiceReference getServiceReference() {
+        return null;
+    }
+
+    public Bundle getUsingBundle() {
+        return null;
+    }
+
+    public Object locateService(String name) {
+        return null;
+    }
+
+    public Object locateService(String name, ServiceReference reference) {
+        return null;
+    }
+
+    public Object[] locateServices(String name) {
+        return null;
+    }
+
+}

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/MockComponentContext.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java Tue Feb 28 15:36:04 2012
@@ -0,0 +1,367 @@
+/*
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements.  See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License.  You may obtain a copy of the License at
+*
+*     http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.stanbol.enhancer.engines.tika;
+
+import static java.util.Collections.singleton;
+import static org.apache.commons.io.IOUtils.closeQuietly;
+import static org.apache.commons.io.IOUtils.toByteArray;
+import static org.apache.stanbol.enhancer.engines.tika.TikaEngine.XHTML;
+import static org.apache.stanbol.enhancer.servicesapi.EnhancementEngine.CANNOT_ENHANCE;
+import static org.apache.tika.mime.MediaType.OCTET_STREAM;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+import static org.junit.Assert.fail;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.Map.Entry;
+import java.util.regex.Pattern;
+
+import org.apache.clerezza.rdf.core.UriRef;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.LineIterator;
+import org.apache.stanbol.enhancer.servicesapi.Blob;
+import org.apache.stanbol.enhancer.servicesapi.ContentItem;
+import org.apache.stanbol.enhancer.servicesapi.EngineException;
+import org.apache.stanbol.enhancer.servicesapi.helper.ContentItemHelper;
+import org.apache.stanbol.enhancer.servicesapi.helper.InMemoryContentItem;
+import org.junit.After;
+import org.junit.AfterClass;
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.osgi.service.cm.ConfigurationException;
+
+public class TikaEngineTest {
+
+    private static TikaEngine engine;
+    private static MockComponentContext context;
+    
+    @BeforeClass
+    public static void setUpServices() throws IOException {
+        context = new MockComponentContext();
+        context.properties.put(TikaEngine.PROPERTY_NAME, "tika");
+    }
+
+    @Before
+    public void bindServices() throws ConfigurationException {
+        if(engine == null){
+            engine = new TikaEngine();
+            engine.activate(context);
+        }
+    }
+
+    @Test
+    public void testHtml() throws EngineException, IOException {
+        ContentItem ci = createContentItem("test.html", "text/html; charset=UTF-8");
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("text/plain"));
+        assertNotNull(contentPart);
+        Blob plainTextBlob = contentPart.getValue();
+        assertNotNull(plainTextBlob);
+        assertContentRegexp(plainTextBlob, 
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
+        //validate XHTML results
+        contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("application/xhtml+xml"));
+        assertNotNull(contentPart);
+        Blob xhtmlBlob = contentPart.getValue();
+        assertNotNull(xhtmlBlob);
+        assertContentRegexp(xhtmlBlob,
+            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+            "<head>",
+            "<meta name=",
+            "<title>The Apache Stanbol Enhancer</title>",
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities",
+            "</body></html>");
+    }
+    @Test
+    public void testPdf() throws EngineException, IOException {
+        //PDF created by Apple Pages
+        ContentItem ci = createContentItem("test.pdf", "application/pdf");
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("text/plain"));
+        assertNotNull(contentPart);
+        Blob plainTextBlob = contentPart.getValue();
+        assertNotNull(plainTextBlob);
+        assertContentRegexp(plainTextBlob, 
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities ");        
+        //validate XHTML results
+        contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("application/xhtml+xml"));
+        assertNotNull(contentPart);
+        Blob xhtmlBlob = contentPart.getValue();
+        assertNotNull(xhtmlBlob);
+        assertContentRegexp(xhtmlBlob,
+            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+            "<head>",
+            "<meta name=",
+            "<div class=\"page\">",
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities",
+            "</body></html>");
+        
+        //PDF created by OpenOffice
+        ci = createContentItem("test2.pdf", "application/pdf");
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        //validate plain text results
+        contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("text/plain"));
+        assertNotNull(contentPart);
+        plainTextBlob = contentPart.getValue();
+        assertNotNull(plainTextBlob);
+        assertContentRegexp(plainTextBlob, 
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities");
+        //validate XHTML results
+        contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("application/xhtml+xml"));
+        assertNotNull(contentPart);
+        xhtmlBlob = contentPart.getValue();
+        assertNotNull(xhtmlBlob);
+        assertContentRegexp(xhtmlBlob,
+            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+            "<head>",
+            "<meta name=",
+            "<div class=\"page\">",
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities",
+            "</body></html>");
+
+    }
+    @Test
+    public void testMsWord() throws EngineException, IOException {
+        ContentItem ci = createContentItem("test.doc", "application/msword");
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("text/plain"));
+        assertNotNull(contentPart);
+        Blob plainTextBlob = contentPart.getValue();
+        assertNotNull(plainTextBlob);
+        assertContentRegexp(plainTextBlob, 
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
+        //validate XHTML results
+        contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("application/xhtml+xml"));
+        assertNotNull(contentPart);
+        Blob xhtmlBlob = contentPart.getValue();
+        assertNotNull(xhtmlBlob);
+        assertContentRegexp(xhtmlBlob,
+            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+            "<head>",
+            "<meta name=",
+            "<title>",
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities",
+            "</body></html>");    }
+    @Test
+    public void testRtf() throws EngineException, IOException {
+        ContentItem ci = createContentItem("test.rtf", "application/rtf");
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("text/plain"));
+        assertNotNull(contentPart);
+        Blob plainTextBlob = contentPart.getValue();
+        assertNotNull(plainTextBlob);
+        assertContentRegexp(plainTextBlob, 
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
+        //validate XHTML results
+        contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("application/xhtml+xml"));
+        assertNotNull(contentPart);
+        Blob xhtmlBlob = contentPart.getValue();
+        assertNotNull(xhtmlBlob);
+        assertContentRegexp(xhtmlBlob,
+            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+            "<head>",
+            "<meta name=",
+            "<title>",
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities",
+            "</body></html>");
+    }
+    @Test
+    public void testOdt() throws EngineException, IOException {
+        ContentItem ci = createContentItem("test.odt", "application/vnd.oasis.opendocument.text");
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("text/plain"));
+        assertNotNull(contentPart);
+        Blob plainTextBlob = contentPart.getValue();
+        assertNotNull(plainTextBlob);
+        assertContentRegexp(plainTextBlob, 
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.");
+        //validate XHTML results
+        contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("application/xhtml+xml"));
+        assertNotNull(contentPart);
+        Blob xhtmlBlob = contentPart.getValue();
+        assertNotNull(xhtmlBlob);
+        assertContentRegexp(xhtmlBlob,
+            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+            "<head>",
+            "<meta name=",
+            "<title>",
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities",
+            "</body></html>");
+    }
+    @Test
+    public void testContentTypeDetection() throws EngineException, IOException {
+        ContentItem ci = createContentItem("test.pdf", OCTET_STREAM.toString());
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("text/plain"));
+        assertNotNull(contentPart);
+        Blob plainTextBlob = contentPart.getValue();
+        assertNotNull(plainTextBlob);
+        assertContentRegexp(plainTextBlob, 
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities");
+        //validate XHTML results
+        contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("application/xhtml+xml"));
+        assertNotNull(contentPart);
+        Blob xhtmlBlob = contentPart.getValue();
+        assertNotNull(xhtmlBlob);
+        assertContentRegexp(xhtmlBlob,
+            "<html xmlns=\"http://www.w3.org/1999/xhtml\">",
+            "<head>",
+            "<meta name=",
+            "<div class=\"page\">",
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities",
+            "</body></html>");
+    }
+    /**
+     * Tests that text is not processed
+     */
+    @Test
+    public void testText() throws EngineException {
+        byte[] data = ("The Stanbol enhancer can " +
+                "detect famous cities such as Paris and people such as Bob " +
+                "Marley.").getBytes(Charset.forName("UTF-8"));
+        ContentItem ci = new InMemoryContentItem(data,"text/plain; charset=UTF-8");
+        Assert.assertEquals(1, ContentItemHelper.getContentParts(ci, Blob.class).size());
+    }
+    @Test
+    public void testUnsupported() throws EngineException, IOException {
+        ContentItem ci = createContentItem("test.pages", "application/x-iwork-pages-sffpages");
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("text/plain"));
+        //it MUST NOT give an error but also not add a content part
+        assertNull(contentPart);
+        //only the original content
+        assertEquals(1, ContentItemHelper.getContentParts(ci, Blob.class).size());
+        
+        
+    }
+    @Test
+    public void testXhtml() throws EngineException, IOException {
+        ContentItem ci = createContentItem("test.xhtml", XHTML.toString()+"; charset=UTF-8");
+        assertFalse(engine.canEnhance(ci) == CANNOT_ENHANCE);
+        engine.computeEnhancements(ci);
+        Entry<UriRef,Blob> contentPart = ContentItemHelper.getBlob(ci, 
+            singleton("text/plain"));
+        assertNotNull(contentPart);
+        Blob plainTextBlob = contentPart.getValue();
+        assertNotNull(plainTextBlob);
+        assertContentRegexp(plainTextBlob, 
+            "The Apache Stanbol Enhancer",
+            "The Stanbol enhancer can detect famous cities");
+        //only the original and the plain text
+        // this asserts that no xhtml is parsed from the parsed xhtml content
+        assertEquals(2, ContentItemHelper.getContentParts(ci, Blob.class).size());
+    }
+    
+    private ContentItem createContentItem(String resourceName, String contentType){
+        InputStream in = TikaEngineTest.class.getClassLoader().getResourceAsStream(resourceName);
+        assertNotNull(in);
+        byte[] data;
+        try {
+            data = toByteArray(in);
+        } catch (IOException e) {
+            throw new IllegalStateException("Unable to read test data!",e);
+        }
+        closeQuietly(in);
+        UriRef ref = new UriRef("urn:contentItem:content-"+ContentItemHelper.toHexString(data));
+        return new InMemoryContentItem(data,contentType);
+    }
+    /**
+     * Tests if the parsed regex pattern are contained in any line of the parsed
+     * test
+     * @throws IOException 
+     */
+    public void assertContentRegexp(Blob blob, String... regexp) throws IOException {
+        Charset charset;
+        if(blob.getParameter().containsKey("charset")){
+            charset = Charset.forName(blob.getParameter().get("charset"));
+        } else {
+            charset = Charset.defaultCharset();
+        }
+        Reader reader = null;
+        nextPattern:
+        for (String expr : regexp) {
+            if(reader != null){
+                closeQuietly(reader);
+            }
+            final Pattern p = Pattern.compile(".*" + expr + ".*");
+            reader = new InputStreamReader(blob.getStream(), charset);
+            final LineIterator it = new LineIterator(reader);
+            while (it.hasNext()) {
+                final String line = it.nextLine();
+                if (p.matcher(line).matches()) {
+                    continue nextPattern;
+                }
+            }
+            fail(this + ": no match for regexp '" + expr + "', content=\n" + 
+                    IOUtils.toString(blob.getStream(), charset.toString()));
+        }
+    }
+    @After
+    public void unbindServices() {/*nothing to do */}
+
+    @AfterClass
+    public static void shutdownServices() {
+        engine.deactivate(context);
+        engine = null;
+    }
+
+}

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/java/org/apache/stanbol/enhancer/engines/tika/TikaEngineTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/README Tue Feb 28 15:36:04 2012
@@ -0,0 +1,30 @@
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+
+All files within this directory are provided under the 
+
+    Apache License, Version 2.0
+    
+This includes the following files:
+
+    test.doc
+    test.html
+    test.xhtml
+    test.odt
+    test.pages
+    test.pdf
+    test.rtf
+    test2.html
+    test2.pdf
\ No newline at end of file

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.doc
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.doc?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.doc
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html Tue Feb 28 15:36:04 2012
@@ -0,0 +1,11 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+<head>
+    <title>The Apache Stanbol Enhancer</title>
+</head>
+<body>
+<h1>The Apache Stanbol Enhancer</h1>
+
+<p>The <b>Stanbol enhancer</b> can detect famous cities such as <i>Paris</i> and people such as <i>Bob Marley</i>.</p>
+</body>
+</html>
\ No newline at end of file

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.html
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.odt
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.odt?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.odt
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pages
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pages?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pages
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pdf
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pdf?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.rtf Tue Feb 28 15:36:04 2012
@@ -0,0 +1,14 @@
+{\rtf1\ansi\ansicpg1252\cocoartf1138\cocoasubrtf320
+{\fonttbl\f0\fswiss\fcharset0 Helvetica;}
+{\colortbl;\red255\green255\blue255;}
+\margl1440\margr1440\margb1800\margt1800
+\deftab708
+\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardeftab708\pardirnatural
+
+\f0\b\fs36 \cf0 \expnd0\expndtw0\kerning0
+\up0 \nosupersub \ulnone \outl0\strokewidth0 \strokec0 The Apache Stanbol Enhancer\
+\pard\tx560\tx1120\tx1680\tx2240\tx2800\tx3360\tx3920\tx4480\tx5040\tx5600\tx6160\tx6720\pardeftab708\pardirnatural
+
+\b0\fs24 \expnd0\expndtw0\kerning0
+\
+The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.}
\ No newline at end of file

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml Tue Feb 28 15:36:04 2012
@@ -0,0 +1,8 @@
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>XHTML test</title>
+</head>
+<body>
+<h1>The Apache Stanbol Enhancer</h1>
+<p>The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.</p>
+</body></html> 
\ No newline at end of file

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test.xhtml
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html?rev=1294695&view=auto
==============================================================================
--- incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html (added)
+++ incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html Tue Feb 28 15:36:04 2012
@@ -0,0 +1,21 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE html
+  PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/TR/MathML2/dtd/xhtml-math11-f.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml"><!--This file was converted to xhtml by OpenOffice.org - see http://xml.openoffice.org/odf2xhtml for more info.--><head profile="http://dublincore.org/documents/dcmi-terms/"><meta http-equiv="Content-Type" content="application/xhtml+xml; charset=utf-8"/><title xml:lang="en-US">The Apache Stanbol Enhancer</title><meta name="DCTERMS.title" content="The Apache Stanbol Enhancer" xml:lang="en-US"/><meta name="DCTERMS.language" content="en-US" scheme="DCTERMS.RFC4646"/><meta name="DCTERMS.source" content="http://xml.openoffice.org/odf2xhtml"/><meta name="DCTERMS.creator" content="Szaby Grünwald"/><meta name="DCTERMS.issued" content="2012-02-28T08:40:48" scheme="DCTERMS.W3CDTF"/><meta name="DCTERMS.contributor" content="Szaby Grünwald"/><meta name="DCTERMS.modified" content="2012-02-28T08:42:03" scheme="DCTERMS.W3CDTF"/><meta name="DCTERMS.provenance" content="" xml:lang="en-US"/><meta name="DCTERMS.subject" content="," xml:lang="en-US"/><link rel="schema.DC" href="http://purl.org/dc/elements/1.1/" hreflang="en"/><link rel="schema.DCTERMS" href="http://purl.org/dc/terms/" hreflang="en"/><link rel="schema.DCTYPE" href="http://purl.org/dc/dcmitype/" hreflang="en"/><link rel="schema.DCAM" href="http://purl.org/dc/dcam/" hreflang="en"/><base href="."/><style type="text/css">
+	@page {  }
+	table { border-collapse:collapse; border-spacing:0; empty-cells:show }
+	td, th { vertical-align:top; font-size:12pt;}
+	h1, h2, h3, h4, h5, h6 { clear:both }
+	ol, ul { margin:0; padding:0;}
+	li { list-style: none; margin:0; padding:0;}
+	<!-- "li span.odfLiEnd" - IE 7 issue-->
+	li span. { clear: both; line-height:0; width:0; height:0; margin:0; padding:0; }
+	span.footnodeNumber { padding-right:1em; }
+	span.annotation_style_by_filter { font-size:95%; font-family:Arial; background-color:#fff000;  margin:0; border:0; padding:0;  }
+	* { margin:0;}
+	.Heading_20_1 { font-size:115%; margin-bottom:0.0835in; margin-top:0.1665in; font-family:Arial; writing-mode:page; font-weight:bold; }
+	.P1 { font-size:12pt; font-family:Arial; writing-mode:page; }
+	.Standard { font-size:12pt; font-family:Times New Roman; writing-mode:page; }
+	<!-- ODF styles with no properties representable as CSS -->
+	{ }
+	</style></head><body dir="ltr" style="max-width:8.2681in;margin-top:0.7874in; margin-bottom:0.7874in; margin-left:0.7874in; margin-right:0.7874in; "><h1 class="Heading_20_1"><a id="a__The_Apache_Stanbol_Enhancer"><span/></a>The Apache Stanbol Enhancer</h1><p class="Standard"> </p><p class="P1">The Stanbol enhancer can detect famous cities such as Paris and people such as Bob Marley.</p></body></html>
\ No newline at end of file

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.html
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.pdf
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.pdf?rev=1294695&view=auto
==============================================================================
Binary file - no diff available.

Propchange: incubator/stanbol/trunk/enhancer/engines/tika/src/test/resources/test2.pdf
------------------------------------------------------------------------------
    svn:mime-type = application/octet-stream

Modified: incubator/stanbol/trunk/parent/pom.xml
URL: http://svn.apache.org/viewvc/incubator/stanbol/trunk/parent/pom.xml?rev=1294695&r1=1294694&r2=1294695&view=diff
==============================================================================
--- incubator/stanbol/trunk/parent/pom.xml (original)
+++ incubator/stanbol/trunk/parent/pom.xml Tue Feb 28 15:36:04 2012
@@ -1599,13 +1599,18 @@
     </dependency>
     
 
-    <!-- Apache Tika core -->
+    <!-- Apache Tika -->
     <dependency>
       <groupId>org.apache.tika</groupId>
       <artifactId>tika-core</artifactId>
-      <version>0.9</version>
+      <version>1.0</version>
     </dependency>
-
+    <dependency>
+      <groupId>org.apache.tika</groupId>
+      <artifactId>tika-parsers</artifactId>
+      <version>1.0</version>
+    </dependency>
+    
     <!-- Aperture -->
     <dependency>
       <groupId>org.semanticdesktop.aperture</groupId>