You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/09/06 12:57:58 UTC

[tika] branch master updated: improve xml parsing

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new 4e67928  improve xml parsing
4e67928 is described below

commit 4e67928412ad56333d400f3728ecdb59d07d9d63
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Sep 6 08:57:47 2018 -0400

    improve xml parsing
---
 .../java/org/apache/tika/utils/XMLReaderUtils.java | 43 +++++++++++++++++++---
 .../org/apache/tika/TestXMLEntityExpansion.java    | 28 ++++++++++----
 2 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 382be2d..0069a9a 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -46,6 +46,7 @@ import java.io.InputStream;
 import java.io.Serializable;
 import java.io.StringReader;
 import java.lang.reflect.Method;
+import java.util.Properties;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -73,6 +74,25 @@ public class XMLReaderUtils implements Serializable {
 
     private static long LAST_LOG = -1;
 
+    private static final String JAXP_ENTITY_EXPANSION_LIMIT_KEY = "jdk.xml.entityExpansionLimit";
+    private static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20;
+
+    private static int MAX_ENTITY_EXPANSIONS = determineMaxEntityExpansions();
+
+    private static int determineMaxEntityExpansions() {
+        Properties properties = System.getProperties();
+        if (properties != null && properties.containsKey(JAXP_ENTITY_EXPANSION_LIMIT_KEY)) {
+            try {
+                return Integer.parseInt(properties.getProperty(JAXP_ENTITY_EXPANSION_LIMIT_KEY));
+            } catch (NumberFormatException e) {
+                LOG.log(Level.WARNING, "Couldn't parse an integer for the entity expansion limit:"+
+                        properties.getProperty(JAXP_ENTITY_EXPANSION_LIMIT_KEY)+
+                        "; backing off to default: "+DEFAULT_MAX_ENTITY_EXPANSIONS);
+            }
+        }
+        return DEFAULT_MAX_ENTITY_EXPANSIONS;
+    }
+
     //TODO: figure out if the rw lock is any better than a simple lock
     private static final ReentrantReadWriteLock SAX_READ_WRITE_LOCK = new ReentrantReadWriteLock();
     private static final ReentrantReadWriteLock DOM_READ_WRITE_LOCK = new ReentrantReadWriteLock();
@@ -105,6 +125,19 @@ public class XMLReaderUtils implements Serializable {
             };
 
     /**
+     * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing.
+     * <b>NOTE:</b>A value less than or equal to zero indicates no limit.
+     * This will override the system property {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY}
+     * and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value for pa
+     *
+     * @param maxEntityExpansions -- maximum number of allowable entity expansions
+     * @since Apache Tika 1.19
+     */
+    public static void setMaxEntityExpansions(int maxEntityExpansions) {
+        MAX_ENTITY_EXPANSIONS = maxEntityExpansions;
+    }
+
+    /**
      * Returns the XMLReader specified in this parsing context. If a reader
      * is not explicitly specified, then one is created using the specified
      * or the default SAX parser.
@@ -517,7 +550,7 @@ public class XMLReaderUtils implements Serializable {
             try {
                 Object mgr = Class.forName(securityManagerClassName).newInstance();
                 Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
-                setLimit.invoke(mgr, 4096);
+                setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
                 factory.setAttribute("http://apache.org/xml/properties/security-manager", mgr);
                 // Stop once one can be setup without error
                 return;
@@ -534,7 +567,7 @@ public class XMLReaderUtils implements Serializable {
 
         // separate old version of Xerces not found => use the builtin way of setting the property
         try {
-            factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", 4096);
+            factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS);
         } catch (IllegalArgumentException e) {     // NOSONAR - also catch things like NoClassDefError here
             // throttle the log somewhat as it can spam the log otherwise
             if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
@@ -554,7 +587,7 @@ public class XMLReaderUtils implements Serializable {
             try {
                 Object mgr = Class.forName(securityManagerClassName).newInstance();
                 Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
-                setLimit.invoke(mgr, 4096);
+                setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
                 parser.setProperty("http://apache.org/xml/properties/security-manager", mgr);
                 // Stop once one can be setup without error
                 return;
@@ -571,7 +604,7 @@ public class XMLReaderUtils implements Serializable {
 
         // separate old version of Xerces not found => use the builtin way of setting the property
         try {
-            parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", 4096);
+            parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS);
         } catch (SAXException e) {     // NOSONAR - also catch things like NoClassDefError here
             // throttle the log somewhat as it can spam the log otherwise
             if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
@@ -583,7 +616,7 @@ public class XMLReaderUtils implements Serializable {
 
     private static void trySetStaxSecurityManager(XMLInputFactory inputFactory) {
         try {
-            inputFactory.setProperty("com.ctc.wstx.maxEntityCount", 4096);
+            inputFactory.setProperty("com.ctc.wstx.maxEntityCount", MAX_ENTITY_EXPANSIONS);
         } catch (IllegalArgumentException e) {
             // throttle the log somewhat as it can spam the log otherwise
             if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java b/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
index 77a166b..54e3a7c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
+++ b/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
@@ -26,17 +26,14 @@ import org.xml.sax.SAXParseException;
 import java.io.ByteArrayInputStream;
 import java.nio.charset.StandardCharsets;
 
-import static org.apache.tika.XMLTestBase.injectXML;
-import static org.apache.tika.XMLTestBase.parse;
 import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
 
 /**
  * Tests to confirm defenses against entity expansion attacks.
  */
 @Ignore("initial draft, needs more work")
-public class TestXMLEntityExpansion
-{
+public class TestXMLEntityExpansion extends XMLTestBase {
+
     private static final byte[] ENTITY_EXPANSION_BOMB = new String(
             "<!DOCTYPE kaboom [ " +
                     "<!ENTITY a \"1234567890\" > " +
@@ -61,13 +58,28 @@ public class TestXMLEntityExpansion
                     "]> " +
                     "<kaboom>&s;</kaboom>").getBytes(StandardCharsets.UTF_8);
 
-    //a truly vulnerable parser, say xerces2, doesn't oom, it thrashes with gc.
     //Set a reasonable amount of time as the timeout
+    //Make sure that the test apparatus actually works.
     @Test(timeout = 20000)
-    public void testInjectedXML() throws Exception {
+    public void testVulnerableParser() throws Exception {
         byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>".getBytes(StandardCharsets.UTF_8);
         byte[] injected = injectXML(bytes, ENTITY_EXPANSION_BOMB);
-        parse("injected", new ByteArrayInputStream(injected), new XMLTestBase.VulnerableSAXParser());
+
+        Thread thread = new Thread() {
+            @Override
+            public void run() {
+                try {
+                    parse("injected", new ByteArrayInputStream(injected), new XMLTestBase.VulnerableSAXParser());
+                } catch (Exception e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        };
+        thread.start();
+        Thread.sleep(10000);
+        assertTrue(thread.isAlive());
+        thread.interrupt();
+
     }
 
     @Test(timeout = 20000)//