You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/09/06 12:57:58 UTC
[tika] branch master updated: improve xml parsing
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/master by this push:
new 4e67928 improve xml parsing
4e67928 is described below
commit 4e67928412ad56333d400f3728ecdb59d07d9d63
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Sep 6 08:57:47 2018 -0400
improve xml parsing
---
.../java/org/apache/tika/utils/XMLReaderUtils.java | 43 +++++++++++++++++++---
.../org/apache/tika/TestXMLEntityExpansion.java | 28 ++++++++++----
2 files changed, 58 insertions(+), 13 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 382be2d..0069a9a 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -46,6 +46,7 @@ import java.io.InputStream;
import java.io.Serializable;
import java.io.StringReader;
import java.lang.reflect.Method;
+import java.util.Properties;
import java.util.concurrent.ArrayBlockingQueue;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -73,6 +74,25 @@ public class XMLReaderUtils implements Serializable {
private static long LAST_LOG = -1;
+ private static final String JAXP_ENTITY_EXPANSION_LIMIT_KEY = "jdk.xml.entityExpansionLimit";
+ private static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20;
+
+ private static int MAX_ENTITY_EXPANSIONS = determineMaxEntityExpansions();
+
+ private static int determineMaxEntityExpansions() {
+ Properties properties = System.getProperties();
+ if (properties != null && properties.containsKey(JAXP_ENTITY_EXPANSION_LIMIT_KEY)) {
+ try {
+ return Integer.parseInt(properties.getProperty(JAXP_ENTITY_EXPANSION_LIMIT_KEY));
+ } catch (NumberFormatException e) {
+ LOG.log(Level.WARNING, "Couldn't parse an integer for the entity expansion limit:"+
+ properties.getProperty(JAXP_ENTITY_EXPANSION_LIMIT_KEY)+
+ "; backing off to default: "+DEFAULT_MAX_ENTITY_EXPANSIONS);
+ }
+ }
+ return DEFAULT_MAX_ENTITY_EXPANSIONS;
+ }
+
//TODO: figure out if the rw lock is any better than a simple lock
private static final ReentrantReadWriteLock SAX_READ_WRITE_LOCK = new ReentrantReadWriteLock();
private static final ReentrantReadWriteLock DOM_READ_WRITE_LOCK = new ReentrantReadWriteLock();
@@ -105,6 +125,19 @@ public class XMLReaderUtils implements Serializable {
};
/**
+ * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing.
+ * <b>NOTE:</b>A value less than or equal to zero indicates no limit.
+ * This will override the system property {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY}
+ * and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value for pa
+ *
+ * @param maxEntityExpansions -- maximum number of allowable entity expansions
+ * @since Apache Tika 1.19
+ */
+ public static void setMaxEntityExpansions(int maxEntityExpansions) {
+ MAX_ENTITY_EXPANSIONS = maxEntityExpansions;
+ }
+
+ /**
* Returns the XMLReader specified in this parsing context. If a reader
* is not explicitly specified, then one is created using the specified
* or the default SAX parser.
@@ -517,7 +550,7 @@ public class XMLReaderUtils implements Serializable {
try {
Object mgr = Class.forName(securityManagerClassName).newInstance();
Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
- setLimit.invoke(mgr, 4096);
+ setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
factory.setAttribute("http://apache.org/xml/properties/security-manager", mgr);
// Stop once one can be setup without error
return;
@@ -534,7 +567,7 @@ public class XMLReaderUtils implements Serializable {
// separate old version of Xerces not found => use the builtin way of setting the property
try {
- factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", 4096);
+ factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS);
} catch (IllegalArgumentException e) { // NOSONAR - also catch things like NoClassDefError here
// throttle the log somewhat as it can spam the log otherwise
if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
@@ -554,7 +587,7 @@ public class XMLReaderUtils implements Serializable {
try {
Object mgr = Class.forName(securityManagerClassName).newInstance();
Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
- setLimit.invoke(mgr, 4096);
+ setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
parser.setProperty("http://apache.org/xml/properties/security-manager", mgr);
// Stop once one can be setup without error
return;
@@ -571,7 +604,7 @@ public class XMLReaderUtils implements Serializable {
// separate old version of Xerces not found => use the builtin way of setting the property
try {
- parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", 4096);
+ parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS);
} catch (SAXException e) { // NOSONAR - also catch things like NoClassDefError here
// throttle the log somewhat as it can spam the log otherwise
if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
@@ -583,7 +616,7 @@ public class XMLReaderUtils implements Serializable {
private static void trySetStaxSecurityManager(XMLInputFactory inputFactory) {
try {
- inputFactory.setProperty("com.ctc.wstx.maxEntityCount", 4096);
+ inputFactory.setProperty("com.ctc.wstx.maxEntityCount", MAX_ENTITY_EXPANSIONS);
} catch (IllegalArgumentException e) {
// throttle the log somewhat as it can spam the log otherwise
if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java b/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
index 77a166b..54e3a7c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
+++ b/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
@@ -26,17 +26,14 @@ import org.xml.sax.SAXParseException;
import java.io.ByteArrayInputStream;
import java.nio.charset.StandardCharsets;
-import static org.apache.tika.XMLTestBase.injectXML;
-import static org.apache.tika.XMLTestBase.parse;
import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
/**
* Tests to confirm defenses against entity expansion attacks.
*/
@Ignore("initial draft, needs more work")
-public class TestXMLEntityExpansion
-{
+public class TestXMLEntityExpansion extends XMLTestBase {
+
private static final byte[] ENTITY_EXPANSION_BOMB = new String(
"<!DOCTYPE kaboom [ " +
"<!ENTITY a \"1234567890\" > " +
@@ -61,13 +58,28 @@ public class TestXMLEntityExpansion
"]> " +
"<kaboom>&s;</kaboom>").getBytes(StandardCharsets.UTF_8);
- //a truly vulnerable parser, say xerces2, doesn't oom, it thrashes with gc.
//Set a reasonable amount of time as the timeout
+ //Make sure that the test apparatus actually works.
@Test(timeout = 20000)
- public void testInjectedXML() throws Exception {
+ public void testVulnerableParser() throws Exception {
byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>".getBytes(StandardCharsets.UTF_8);
byte[] injected = injectXML(bytes, ENTITY_EXPANSION_BOMB);
- parse("injected", new ByteArrayInputStream(injected), new XMLTestBase.VulnerableSAXParser());
+
+ Thread thread = new Thread() {
+ @Override
+ public void run() {
+ try {
+ parse("injected", new ByteArrayInputStream(injected), new XMLTestBase.VulnerableSAXParser());
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+ };
+ thread.start();
+ Thread.sleep(10000);
+ assertTrue(thread.isAlive());
+ thread.interrupt();
+
}
@Test(timeout = 20000)//