You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/03/01 07:41:03 UTC

svn commit: r1451480 - in /stanbol/trunk/enhancement-engines/tika/src/main: java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java resources/OSGI-INF/metatype/metatype.properties

Author: rwesten
Date: Fri Mar  1 06:41:03 2013
New Revision: 1451480

URL: http://svn.apache.org/r1451480
Log:
STANBOL-970: The TikaEngine now sets the ContextClassLoader to the BundleClassloader; minor: if present, the charset is now explicitly set in the Tika Metadata

Modified:
    stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
    stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties

Modified: stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java?rev=1451480&r1=1451479&r2=1451480&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java (original)
+++ stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java Fri Mar  1 06:41:03 2013
@@ -67,7 +67,6 @@ import org.apache.stanbol.enhancer.servi
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.MSOffice;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
 import org.apache.tika.parser.AutoDetectParser;
@@ -215,6 +214,11 @@ public class TikaEngine 
             final Metadata metadata = new Metadata();
             //set the already parsed contentType
             metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
+            //also explicitly set the charset as contentEncoding
+            String charset = mtas.mediaType.getParameters().get("charset");
+            if(charset != null){
+                metadata.set(Metadata.CONTENT_ENCODING, charset);
+            }
             ContentSink plainTextSink;
             try {
                 plainTextSink = ciFactory.createContentSink(TEXT_PLAIN +"; charset="+UTF8.name());
@@ -248,6 +252,22 @@ public class TikaEngine 
                     xhtmlHandler = null;
                     xhtmlSink = null;
                 }
+                /* 
+                 * We need to replace the context Classloader with the Bundle ClassLoader
+                 * to ensure that Singleton instances of XML frameworks (such as node4j) 
+                 * do not leak into the OSGI environment.
+                 * 
+                 * Most Java XML libs prefer to load implementations by using the 
+                 * {@link Thread#getContextClassLoader()}. However OSGI has no control over
+                 * this {@link ClassLoader}. Because of that there can be situations where
+                 * Interfaces are loaded via the Bundle Classloader and the implementations
+                 * are taken from the context Classloader. This can cause 
+                 * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
+                 * 
+                 * Setting the context Classloader to the Bundle classloader helps to avoid
+                 * those situations.
+                 */
+                ClassLoader contextClassLoader = updateContextClassLoader();
                 try {
                     AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
                         public Object run() throws IOException, SAXException, TikaException {
@@ -264,6 +284,9 @@ public class TikaEngine 
                     } else { //runtime exception
                         throw RuntimeException.class.cast(e);
                     }
+                } finally {
+                    //reset the previous context ClassLoader
+                    Thread.currentThread().setContextClassLoader(contextClassLoader);
                 }
             } finally { //ensure that the writers are closed correctly
                 IOUtils.closeQuietly(in);
@@ -356,6 +379,32 @@ public class TikaEngine 
                 blob.getParameter());
         }
     }
+    
+    /**
+     * Sets the {@link ClassLoader} of the {@link TikaEngine} class (the Bundle
+     * ClassLoader) as context ClassLoader of the {@link Thread#currentThread()}.
+     * <p>
+     * Users of this utility method need to make sure that the context
+     * ClassLoader is reset to its original value - as returned by this
+     * method - by wrapping the affected code in a try/finally block:
+     * <pre><code>
+     *     ClassLoader classLoader = updateContextClassLoader();
+     *     try {
+     *         //the code that needs to be executed
+     *     } finally {
+     *         Thread.currentThread().setContextClassLoader(classLoader);
+     *     }
+     * </code></pre>
+     * @return the context {@link ClassLoader} of {@link Thread#currentThread()}
+     * as it was before calling this method
+     */
+    private ClassLoader updateContextClassLoader() {
+        ClassLoader classLoader = Thread.currentThread().getContextClassLoader(); //remember the previous value
+        Thread.currentThread().setContextClassLoader(TikaEngine.class.getClassLoader()); //switch to the loader of this class
+        return classLoader; //the caller is responsible for restoring this value
+    }
+    
+    
     @Override
     protected void activate(ComponentContext ctx) throws ConfigurationException {
         super.activate(ctx);

Modified: stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1451480&r1=1451479&r2=1451480&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties Fri Mar  1 06:41:03 2013
@@ -68,3 +68,8 @@ stanbol.engine.tika.mapping.geo.name=GEO
 stanbol.engine.tika.mapping.geo.description=Encodes latitude, longitude and \
 altitude information extracted by Apache Tika by using the W3C wgs84 Ontology
 
+stanbol.engine.tika.mapping.unmapped.name=Unmapped Properties
+stanbol.engine.tika.mapping.unmapped.description=This allows to include Tika Properties \
+not mapped by any of the above mappings to 'urn:tika.apache.org:tika:{property-name}'. \
+Only Tika properties following the '{ns}:{localname}' naming schema are considered.
+