You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/08/03 17:50:49 UTC
[tika] 02/03: improve xml reading
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
commit bd9d75d8b0a85af2937047bfad04288c3044b2a6
Author: TALLISON <ta...@apache.org>
AuthorDate: Fri Aug 3 13:16:09 2018 -0400
improve xml reading
---
.../java/org/apache/tika/utils/XMLReaderUtils.java | 97 +++++++++++++++++++++-
1 file changed, 95 insertions(+), 2 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 10c2274..382be2d 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -45,6 +45,7 @@ import java.io.IOException;
import java.io.InputStream;
import java.io.Serializable;
import java.io.StringReader;
+import java.lang.reflect.Method;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -70,6 +71,8 @@ public class XMLReaderUtils implements Serializable {
*/
private static int POOL_SIZE = 10;
+ private static long LAST_LOG = -1;
+
//TODO: figure out if the rw lock is any better than a simple lock
private static final ReentrantReadWriteLock SAX_READ_WRITE_LOCK = new ReentrantReadWriteLock();
private static final ReentrantReadWriteLock DOM_READ_WRITE_LOCK = new ReentrantReadWriteLock();
@@ -138,7 +141,9 @@ public class XMLReaderUtils implements Serializable {
*/
public static SAXParser getSAXParser() throws TikaException {
try {
- return getSAXParserFactory().newSAXParser();
+ SAXParser parser = getSAXParserFactory().newSAXParser();
+ trySetXercesSecurityManager(parser);
+ return parser;
} catch (ParserConfigurationException e) {
throw new TikaException("Unable to configure a SAX parser", e);
} catch (SAXException e) {
@@ -202,6 +207,7 @@ public class XMLReaderUtils implements Serializable {
trySetSAXFeature(factory, "http://xml.org/sax/features/external-parameter-entities", false);
trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
trySetSAXFeature(factory, "http://apache.org/xml/features/nonvalidating/load-dtd-grammar", false);
+ trySetXercesSecurityManager(factory);
return factory;
}
@@ -244,6 +250,7 @@ public class XMLReaderUtils implements Serializable {
tryToSetStaxProperty(factory, XMLInputFactory.IS_VALIDATING, false);
factory.setXMLResolver(IGNORING_STAX_ENTITY_RESOLVER);
+ trySetStaxSecurityManager(factory);
return factory;
}
@@ -261,7 +268,7 @@ public class XMLReaderUtils implements Serializable {
try {
factory.setProperty(key, value);
} catch (IllegalArgumentException e) {
- //swallow
+ LOG.log(Level.WARNING, "StAX Feature unsupported: " + key, e);
}
}
@@ -499,4 +506,90 @@ public class XMLReaderUtils implements Serializable {
}
POOL_SIZE = poolSize;
}
+
+ private static void trySetXercesSecurityManager(DocumentBuilderFactory factory) {
+ // Best-effort entity-expansion cap (approach taken from POI): install a Xerces SecurityManager limited to 4096 expansions on the factory.
+ // Only the standalone Xerces class is tried; the JVM-internal candidate is commented out in the list below.
+ for (String securityManagerClassName : new String[] {
+ //"com.sun.org.apache.xerces.internal.util.SecurityManager",
+ "org.apache.xerces.util.SecurityManager"
+ }) {
+ try {
+ Object mgr = Class.forName(securityManagerClassName).newInstance();
+ Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
+ setLimit.invoke(mgr, 4096);
+ factory.setAttribute("http://apache.org/xml/properties/security-manager", mgr);
+ // Stop once one can be set up without error; the fallback below is then skipped
+ return;
+ } catch (ClassNotFoundException e) {
+ // continue without log, this is expected when standalone Xerces is not on the classpath
+ } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here
+ // throttle via the static LAST_LOG timestamp (shared by the sibling helpers): at most one warning per 5 minutes
+ if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
+ LOG.log(Level.WARNING, "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e);
+ LAST_LOG = System.currentTimeMillis();
+ }
+ }
+ }
+
+ // reached whenever no Xerces security manager was installed above (class missing or setup failed) => set the JAXP limit attribute directly
+ try {
+ factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", 4096);
+ } catch (IllegalArgumentException e) { // NOSONAR - only IllegalArgumentException (attribute rejected by the factory) is handled here
+ // same 5-minute throttle as above, using the same LAST_LOG timestamp
+ if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
+ LOG.log(Level.WARNING, "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e);
+ LAST_LOG = System.currentTimeMillis();
+ }
+ }
+ }
+
+ private static void trySetXercesSecurityManager(SAXParser parser) {
+ // Best-effort entity-expansion cap (approach taken from POI): install a Xerces SecurityManager limited to 4096 expansions on the parser.
+ // Only the standalone Xerces class is tried; the JVM-internal candidate is commented out in the list below.
+ for (String securityManagerClassName : new String[] {
+ //"com.sun.org.apache.xerces.internal.util.SecurityManager",
+ "org.apache.xerces.util.SecurityManager"
+ }) {
+ try {
+ Object mgr = Class.forName(securityManagerClassName).newInstance();
+ Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
+ setLimit.invoke(mgr, 4096);
+ parser.setProperty("http://apache.org/xml/properties/security-manager", mgr);
+ // Stop once one can be set up without error; the fallback below is then skipped
+ return;
+ } catch (ClassNotFoundException e) {
+ // continue without log, this is expected when standalone Xerces is not on the classpath
+ } catch (Throwable e) { // NOSONAR - also catch things like NoClassDefError here
+ // throttle via the static LAST_LOG timestamp (shared by the sibling helpers): at most one warning per 5 minutes
+ if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
+ LOG.log(Level.WARNING, "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e);
+ LAST_LOG = System.currentTimeMillis();
+ }
+ }
+ }
+
+ // reached whenever no Xerces security manager was installed above (class missing or setup failed) => set the JAXP limit property directly
+ try {
+ parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", 4096);
+ } catch (SAXException e) { // NOSONAR - only SAXException (property rejected by the parser) is handled here
+ // same 5-minute throttle as above, using the same LAST_LOG timestamp
+ if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
+ LOG.log(Level.WARNING, "SAX Security Manager could not be setup [log suppressed for 5 minutes]", e);
+ LAST_LOG = System.currentTimeMillis();
+ }
+ }
+ }
+
+ private static void trySetStaxSecurityManager(XMLInputFactory inputFactory) { // best-effort: cap entity count at 4096 on the StAX factory
+ try {
+ inputFactory.setProperty("com.ctc.wstx.maxEntityCount", 4096); // implementation-specific key (com.ctc.wstx namespace); other factories may reject it
+ } catch (IllegalArgumentException e) {
+ // property not accepted by this factory; throttle the warning via the shared LAST_LOG timestamp (at most one per 5 minutes)
+ if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
+ LOG.log(Level.WARNING, "StAX max entity count could not be set [log suppressed for 5 minutes]", e);
+ LAST_LOG = System.currentTimeMillis();
+ }
+ }
+ }
}