You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@stanbol.apache.org by rw...@apache.org on 2013/03/01 07:41:03 UTC
svn commit: r1451480 - in /stanbol/trunk/enhancement-engines/tika/src/main:
java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
resources/OSGI-INF/metatype/metatype.properties
Author: rwesten
Date: Fri Mar 1 06:41:03 2013
New Revision: 1451480
URL: http://svn.apache.org/r1451480
Log:
STANBOL-970: The TikaEngine now sets the ContextClassLoader to the BundleClassloader; minor: if present, the charset is now explicitly set on the Tika Metadata
Modified:
stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties
Modified: stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java?rev=1451480&r1=1451479&r2=1451480&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java (original)
+++ stanbol/trunk/enhancement-engines/tika/src/main/java/org/apache/stanbol/enhancer/engines/tika/TikaEngine.java Fri Mar 1 06:41:03 2013
@@ -67,7 +67,6 @@ import org.apache.stanbol.enhancer.servi
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.metadata.MSOffice;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AutoDetectParser;
@@ -215,6 +214,11 @@ public class TikaEngine
final Metadata metadata = new Metadata();
//set the already parsed contentType
metadata.set(Metadata.CONTENT_TYPE, mtas.mediaType.toString());
+ //also explicitly set the charset as contentEncoding
+ String charset = mtas.mediaType.getParameters().get("charset");
+ if(charset != null){
+ metadata.set(Metadata.CONTENT_ENCODING, charset);
+ }
ContentSink plainTextSink;
try {
plainTextSink = ciFactory.createContentSink(TEXT_PLAIN +"; charset="+UTF8.name());
@@ -248,6 +252,22 @@ public class TikaEngine
xhtmlHandler = null;
xhtmlSink = null;
}
+ /*
+ * We need to replace the context Classloader with the Bundle ClassLoader
+ * to ensure that Singleton instances of XML frameworks (such as node4j)
+ * do not leak into the OSGI environment.
+ *
+ * Most Java XML libs prefer to load implementations by using the
+ * {@link Thread#getContextClassLoader()}. However OSGI has no control over
+ * this {@link ClassLoader}. Because of that there can be situations where
+ * Interfaces are loaded via the Bundle Classloader and the implementations
+ * are taken from the context Classloader. This can cause
+ * {@link ClassCastException}, {@link ExceptionInInitializerError}s, ...
+ *
+ * Setting the context Classloader to the Bundle classloader helps to avoid
+ * those situations.
+ */
+ ClassLoader contextClassLoader = updateContextClassLoader();
try {
AccessController.doPrivileged(new PrivilegedExceptionAction<Object>() {
public Object run() throws IOException, SAXException, TikaException {
@@ -264,6 +284,9 @@ public class TikaEngine
} else { //runtime exception
throw RuntimeException.class.cast(e);
}
+ } finally {
+ //reset the previous context ClassLoader
+ Thread.currentThread().setContextClassLoader(contextClassLoader);
}
} finally { //ensure that the writers are closed correctly
IOUtils.closeQuietly(in);
@@ -356,6 +379,32 @@ public class TikaEngine
blob.getParameter());
}
}
+
+ /**
+ * Sets the Bundle {@link ClassLoader} as the context Classloader of the
+ * {@link Thread#currentThread()}.
+ * <p>
+ * Users of this utility method need to make sure that the
+ * ClassLoader is reset to the original value - as
+ * returned by this method by adding a
+ * <pre><code>
+ * ClassLoader classLoader = updateContextClassLoader();
+ * try {
+ * //the code that needs to be executed
+ * } finally {
+ * Thread.currentThread().setContextClassLoader(classLoader);
+ * }
+ * </code></pre><p>
+ * @return the {@link ClassLoader} of {@link Thread#currentThread()} before
+ * calling this method
+ */
+ private ClassLoader updateContextClassLoader() {
+ ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
+ Thread.currentThread().setContextClassLoader(TikaEngine.class.getClassLoader());
+ return classLoader;
+ }
+
+
@Override
protected void activate(ComponentContext ctx) throws ConfigurationException {
super.activate(ctx);
Modified: stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties
URL: http://svn.apache.org/viewvc/stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties?rev=1451480&r1=1451479&r2=1451480&view=diff
==============================================================================
--- stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties (original)
+++ stanbol/trunk/enhancement-engines/tika/src/main/resources/OSGI-INF/metatype/metatype.properties Fri Mar 1 06:41:03 2013
@@ -68,3 +68,8 @@ stanbol.engine.tika.mapping.geo.name=GEO
stanbol.engine.tika.mapping.geo.description=Encodes latitude, longitude and \
altitude information extracted by Apache Tika by using the W3C wgs84 Ontology
+stanbol.engine.tika.mapping.unmapped.name=Unmapped Properties
+stanbol.engine.tika.mapping.unmapped.description=This allows to include Tika Properties \
+not mapped by any of the above mappings to 'urn:tika.apache.org:tika:{property-name}'. \
+Only Tika properties following the '{ns}:{localname}' naming schema are considered.
+