You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2017/10/11 12:44:02 UTC

[tika] branch master updated: add tests for xml vulnerabilities. More work remains on entity expansion...

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new f444fd7  add tests for xml vulnerabilities.  More work remains on entity expansion...
f444fd7 is described below

commit f444fd784b99b181cd7bd54cdec9fbd132b4ef93
Author: tballison <ta...@mitre.org>
AuthorDate: Wed Oct 11 08:43:52 2017 -0400

    add tests for xml vulnerabilities.  More work remains on entity expansion...
---
 .../org/apache/tika/TestXMLEntityExpansion.java    |  84 +++++++++++++
 .../test/java/org/apache/tika/TestXXEInXML.java    | 118 +++---------------
 .../src/test/java/org/apache/tika/XMLTestBase.java | 137 +++++++++++++++++++++
 3 files changed, 235 insertions(+), 104 deletions(-)

diff --git a/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java b/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
new file mode 100644
index 0000000..056f056
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
@@ -0,0 +1,84 @@
+package org.apache.tika;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.parser.AutoDetectParser;
+import org.apache.tika.parser.Parser;
+import org.junit.Ignore;
+import org.junit.Test;
+import org.xml.sax.SAXParseException;
+
+import java.io.ByteArrayInputStream;
+import java.nio.charset.StandardCharsets;
+
+import static org.apache.tika.XMLTestBase.injectXML;
+import static org.apache.tika.XMLTestBase.parse;
+import static org.junit.Assert.assertTrue;
+import static org.junit.Assert.fail;
+
+/**
+ * Tests to confirm defenses against entity expansion attacks
+ * (the "billion laughs" style of XML bomb).
+ * <p>
+ * The whole class is currently disabled through {@code @Ignore}; it is an initial draft.
+ */
+@Ignore("initial draft, needs more work")
+public class TestXMLEntityExpansion
+{
+    /** Small well-formed XML document that the bomb is injected into. */
+    private static final byte[] BENIGN_XML =
+            "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>".getBytes(StandardCharsets.UTF_8);
+
+    /**
+     * DOCTYPE that declares the nested entities {@code a} through {@code s}, followed by an
+     * element that references {@code &s;}.  Entity {@code a} is ten characters long and every
+     * later entity references its predecessor ten times, so the fully expanded text grows by a
+     * factor of ten per level.
+     */
+    private static final byte[] ENTITY_EXPANSION_BOMB = (
+            "<!DOCTYPE kaboom [ " +
+                    "<!ENTITY a \"1234567890\" > " +
+                    "<!ENTITY b \"&a;&a;&a;&a;&a;&a;&a;&a;&a;&a;\" >" +
+                    "<!ENTITY c \"&b;&b;&b;&b;&b;&b;&b;&b;&b;&b;\" > " +
+                    "<!ENTITY d \"&c;&c;&c;&c;&c;&c;&c;&c;&c;&c;\" > " +
+                    "<!ENTITY e \"&d;&d;&d;&d;&d;&d;&d;&d;&d;&d;\" > " +
+                    "<!ENTITY f \"&e;&e;&e;&e;&e;&e;&e;&e;&e;&e;\" > " +
+                    "<!ENTITY g \"&f;&f;&f;&f;&f;&f;&f;&f;&f;&f;\" > " +
+                    "<!ENTITY h \"&g;&g;&g;&g;&g;&g;&g;&g;&g;&g;\" > " +
+                    "<!ENTITY i \"&h;&h;&h;&h;&h;&h;&h;&h;&h;&h;\" > " +
+                    "<!ENTITY j \"&i;&i;&i;&i;&i;&i;&i;&i;&i;&i;\" > " +
+                    "<!ENTITY k \"&j;&j;&j;&j;&j;&j;&j;&j;&j;&j;\" > " +
+                    "<!ENTITY l \"&k;&k;&k;&k;&k;&k;&k;&k;&k;&k;\" > " +
+                    "<!ENTITY m \"&l;&l;&l;&l;&l;&l;&l;&l;&l;&l;\" > " +
+                    "<!ENTITY n \"&m;&m;&m;&m;&m;&m;&m;&m;&m;&m;\" > " +
+                    "<!ENTITY o \"&n;&n;&n;&n;&n;&n;&n;&n;&n;&n;\" > " +
+                    "<!ENTITY p \"&o;&o;&o;&o;&o;&o;&o;&o;&o;&o;\" > " +
+                    "<!ENTITY q \"&p;&p;&p;&p;&p;&p;&p;&p;&p;&p;\" > " +
+                    "<!ENTITY r \"&q;&q;&q;&q;&q;&q;&q;&q;&q;&q;\" > " +
+                    "<!ENTITY s \"&r;&r;&r;&r;&r;&r;&r;&r;&r;&r;\" > " +
+                    "]> " +
+                    "<kaboom>&s;</kaboom>").getBytes(StandardCharsets.UTF_8);
+
+    /**
+     * Feeds the bomb to the deliberately unprotected SAX parser.
+     * <p>
+     * A truly vulnerable parser, say xerces2, doesn't OOM, it thrashes with gc,
+     * so a reasonable amount of time is set as the timeout.
+     * NOTE(review): the method makes no assertion of its own; confirm what outcome
+     * (exception, timeout, clean return) is meant to count as a pass.
+     */
+    @Test(timeout = 20000)
+    public void testInjectedXML() throws Exception {
+        byte[] injected = injectXML(BENIGN_XML, ENTITY_EXPANSION_BOMB);
+        parse("injected", new ByteArrayInputStream(injected), new XMLTestBase.VulnerableSAXParser());
+    }
+
+    /**
+     * Feeds the bomb to {@link AutoDetectParser} and requires that parsing is rejected with
+     * an exception that mentions "entity expansions".
+     */
+    @Test(timeout = 20000)
+    public void testProtectedXML() throws Exception {
+        byte[] injected = injectXML(BENIGN_XML, ENTITY_EXPANSION_BOMB);
+        test("injected", injected, new AutoDetectParser());
+    }
+
+    /**
+     * Parses {@code bytes} with {@code parser} and asserts that parsing fails because of
+     * entity expansion.
+     *
+     * @param testFileName label handed through to {@code XMLTestBase.parse}
+     * @param bytes        document to parse
+     * @param parser       parser under test
+     * @throws RuntimeException if a {@link SAXParseException} or {@link TikaException} is
+     *                          thrown without "entity expansions" in the relevant message
+     * @throws Exception        any other exception thrown by the parser
+     */
+    private static void test(String testFileName, byte[] bytes, Parser parser) throws Exception {
+        boolean ex = false;
+        try {
+            parse(testFileName, new ByteArrayInputStream(bytes), parser);
+        } catch (SAXParseException e) {
+            if (! mentionsEntityExpansions(e)) {
+                throw new RuntimeException("Should have seen 'entity expansions' in the msg", e);
+            }
+            ex = true;
+        } catch (TikaException e) {
+            //for a TikaException it is the cause, not the wrapper, that must carry the message
+            if (! mentionsEntityExpansions(e.getCause())) {
+                throw new RuntimeException("Cause should have mentioned 'entity expansions'", e);
+            }
+            ex = true;
+        }
+        assertTrue("should have had an exception", ex);
+    }
+
+    /** Returns true if {@code t} is non-null and its message contains "entity expansions". */
+    private static boolean mentionsEntityExpansions(Throwable t) {
+        return t != null && t.getMessage() != null
+                && t.getMessage().contains("entity expansions");
+    }
+}
diff --git a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
index 3dd394d..1e922b0 100644
--- a/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
+++ b/tika-parsers/src/test/java/org/apache/tika/TestXXEInXML.java
@@ -1,42 +1,26 @@
 package org.apache.tika;
 
 import org.apache.tika.config.TikaConfig;
-import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.IOUtils;
 import org.apache.tika.metadata.Metadata;
-import org.apache.tika.mime.MediaType;
-import org.apache.tika.parser.AbstractParser;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.ParseContext;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
-import org.apache.tika.sax.TaggedContentHandler;
-import org.apache.tika.sax.TextContentHandler;
 import org.apache.tika.sax.ToHTMLContentHandler;
+import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;
-import org.xml.sax.SAXException;
-import org.xml.sax.helpers.DefaultHandler;
 
-import javax.xml.parsers.ParserConfigurationException;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;
 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
-import java.io.File;
 import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.IOException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.Set;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipFile;
-import java.util.zip.ZipOutputStream;
 
 import static org.junit.Assert.fail;
 
@@ -45,15 +29,17 @@ import static org.junit.Assert.fail;
  * It does not test for XXE prevention in files that may contain xml
  * files, such as PDFs and other XMP-containing files.
  */
-public class TestXXEInXML extends TikaTest {
+@Ignore
+public class TestXXEInXML extends XMLTestBase {
     //TODO: figure out how to test XFA and xmp in PDFs
 
-    private static final String EVIL = "<!DOCTYPE roottag PUBLIC \"-//OXML/XXE/EN\" \"file:///couldnt_possibly_exist\">";
+    private static final byte[] XXE =
+            "<!DOCTYPE roottag PUBLIC \"-//OXML/XXE/EN\" \"file:///couldnt_possibly_exist\">".getBytes(StandardCharsets.UTF_8);
 
     @Test
     public void testConfirmVulnerable() throws Exception {
         try {
-            parse("testXXE.xml", getResourceAsStream("/test-documents/testXXE.xml"), new VulnerableXMLParser());
+            parse("testXXE.xml", getResourceAsStream("/test-documents/testXXE.xml"), new VulnerableSAXParser());
             fail("should have failed!!!");
         } catch (FileNotFoundException e) {
 
@@ -68,9 +54,9 @@ public class TestXXEInXML extends TikaTest {
     @Test
     public void testInjectedXML() throws Exception {
         byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>".getBytes(StandardCharsets.UTF_8);
-        byte[] injected = injectXML(bytes);
+        byte[] injected = injectXML(bytes, XXE);
         try {
-            parse("injected", new ByteArrayInputStream(injected), new VulnerableXMLParser());
+            parse("injected", new ByteArrayInputStream(injected), new VulnerableSAXParser());
             fail("injected should have triggered xxe");
         } catch (FileNotFoundException e) {
 
@@ -82,14 +68,14 @@ public class TestXXEInXML extends TikaTest {
         InputStream is = getResourceAsStream("/test-documents/testWORD_2003ml.xml");
         ByteArrayOutputStream bos = new ByteArrayOutputStream();
         IOUtils.copy(is, bos);
-        byte[] injected = injectXML(bos.toByteArray());
+        byte[] injected = injectXML(bos.toByteArray(), XXE);
         parse("testWORD_2003ml.xml", new ByteArrayInputStream(injected), new AutoDetectParser());
         is.close();
 
         is = getResourceAsStream("/test-documents/testWORD_2006ml.xml");
         bos = new ByteArrayOutputStream();
         IOUtils.copy(is, bos);
-        injected = injectXML(bos.toByteArray());
+        injected = injectXML(bos.toByteArray(), XXE);
         parse("testWORD_2006ml.xml", new ByteArrayInputStream(injected), new AutoDetectParser());
     }
 
@@ -118,7 +104,7 @@ public class TestXXEInXML extends TikaTest {
     private void _testOOXML(String fileName) throws Exception {
 
         Path originalOOXML = getResourceAsFile("/test-documents/"+fileName).toPath();
-        Path injected = injectZippedXMLs(originalOOXML, false);
+        Path injected = injectZippedXMLs(originalOOXML, XXE, false);
 
         Parser p = new AutoDetectParser();
         ContentHandler xhtml = new ToHTMLContentHandler();
@@ -143,7 +129,7 @@ public class TestXXEInXML extends TikaTest {
             parseContext.set(OfficeParserConfig.class, officeParserConfig);
             officeParserConfig.setUseSAXDocxExtractor(true);
             officeParserConfig.setUseSAXPptxExtractor(true);
-            injected = injectZippedXMLs(originalOOXML, true);
+            injected = injectZippedXMLs(originalOOXML, XXE, true);
 
             p.parse(Files.newInputStream(injected), xhtml, metadata, parseContext);
         } catch (FileNotFoundException e) {
@@ -159,7 +145,7 @@ public class TestXXEInXML extends TikaTest {
     //handlePart
     public void testDocxWithIncorrectSAXConfiguration() throws Exception {
         Path originalDocx = getResourceAsFile("/test-documents/testWORD_macros.docm").toPath();
-        Path injected = injectZippedXMLs(originalDocx, true);
+        Path injected = injectZippedXMLs(originalDocx, XXE,true);
         Parser p = new AutoDetectParser();
         ContentHandler xhtml = new ToHTMLContentHandler();
         ParseContext parseContext = new ParseContext();
@@ -183,89 +169,13 @@ public class TestXXEInXML extends TikaTest {
                 getResourceAsStream("/org/apache/tika/config/TIKA-1558-blacklist.xml") ) {
             ByteArrayOutputStream bos = new ByteArrayOutputStream();
             IOUtils.copy(is, bos);
-            byte[] injected = injectXML(bos.toByteArray());
+            byte[] injected = injectXML(bos.toByteArray(), XXE);
             TikaConfig tikaConfig = new TikaConfig(new ByteArrayInputStream(injected));
         }
     }
 
-    private Path injectZippedXMLs(Path original, boolean includeSlides) throws IOException {
-        ZipFile input = new ZipFile(original.toFile());
-        File output = Files.createTempFile("tika-xxe-", ".zip").toFile();
-        ZipOutputStream outZip = new ZipOutputStream(new FileOutputStream(output));
-        Enumeration<? extends ZipEntry> zipEntryEnumeration = input.entries();
-        while (zipEntryEnumeration.hasMoreElements()) {
-            ZipEntry entry = zipEntryEnumeration.nextElement();
-            ByteArrayOutputStream bos = new ByteArrayOutputStream();
-            IOUtils.copy(input.getInputStream(entry), bos);
-            byte[] bytes = bos.toByteArray();
-            if (entry.getName().endsWith(".xml") &&
-                    //don't inject the slides because you'll get a bean exception
-                    //Unexpected node
-                    (! includeSlides && ! entry.getName().contains("slides/slide"))) {
-                bytes = injectXML(bytes);
-            }
-            ZipEntry outEntry = new ZipEntry(entry.getName());
-            outZip.putNextEntry(outEntry);
-            outZip.write(bytes);
-            outZip.closeEntry();
-        }
-        outZip.flush();
-        outZip.close();
-
-        return output.toPath();
-    }
-
-    private byte[] injectXML(byte[] input) throws IOException {
-
-        int startXML = -1;
-        int endXML = -1;
-        for (int i = 0; i < input.length; i++) {
-            if (input[i] == '<' && i+1 < input.length && input[i+1] == '?') {
-                    startXML = i;
-            }
-            if (input[i] == '?' && i+1 < input.length && input[i+1] == '>') {
-                endXML = i+1;
-                break;
-            }
-        }
-        ByteArrayOutputStream bos = new ByteArrayOutputStream();
-        if (startXML > -1 && endXML > -1) {
-            bos.write(input, startXML, endXML-startXML+1);
-        }
-        bos.write(EVIL.getBytes(StandardCharsets.UTF_8));
-        bos.write(input, endXML+1, (input.length-endXML-1));
-        return bos.toByteArray();
-    }
-
-    private void parse(String testFile, InputStream is, Parser parser) throws Exception {
-        parser.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
-    }
 
 
-    private static class VulnerableXMLParser extends AbstractParser {
-
-        @Override
-        public Set<MediaType> getSupportedTypes(ParseContext context) {
-            return Collections.singleton(MediaType.APPLICATION_XML);
-        }
 
-        @Override
-        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
-
-            TaggedContentHandler tagged = new TaggedContentHandler(handler);
-            try {
-                SAXParserFactory saxParserFactory = SAXParserFactory.newInstance();
-                SAXParser parser = saxParserFactory.newSAXParser();
-                parser.parse( stream,
-                        new TextContentHandler(handler,
-                        true));
-            } catch (SAXException e) {
-                //there will be one...ignore it
-            } catch (ParserConfigurationException e) {
-                throw new TikaException("parser config ex", e);
-            }
-
-        }
-    }
 
 }
diff --git a/tika-parsers/src/test/java/org/apache/tika/XMLTestBase.java b/tika-parsers/src/test/java/org/apache/tika/XMLTestBase.java
new file mode 100644
index 0000000..5534ca9
--- /dev/null
+++ b/tika-parsers/src/test/java/org/apache/tika/XMLTestBase.java
@@ -0,0 +1,137 @@
+package org.apache.tika;
+
+import org.apache.tika.exception.TikaException;
+import org.apache.tika.io.IOUtils;
+import org.apache.tika.metadata.Metadata;
+import org.apache.tika.mime.MediaType;
+import org.apache.tika.parser.AbstractParser;
+import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.Parser;
+import org.apache.tika.sax.TaggedContentHandler;
+import org.apache.tika.sax.TextContentHandler;
+import org.xml.sax.ContentHandler;
+import org.xml.sax.SAXException;
+import org.xml.sax.helpers.DefaultHandler;
+
+import javax.xml.parsers.ParserConfigurationException;
+import javax.xml.parsers.SAXParser;
+import javax.xml.parsers.SAXParserFactory;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.Set;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipFile;
+import java.util.zip.ZipOutputStream;
+
+public class XMLTestBase extends TikaTest {
+
+    static byte[] injectXML(byte[] input, byte[] toInject) throws IOException {
+
+        int startXML = -1;
+        int endXML = -1;
+        for (int i = 0; i < input.length; i++) {
+            if (input[i] == '<' && i+1 < input.length && input[i+1] == '?') {
+                startXML = i;
+            }
+            if (input[i] == '?' && i+1 < input.length && input[i+1] == '>') {
+                endXML = i+1;
+                break;
+            }
+        }
+        ByteArrayOutputStream bos = new ByteArrayOutputStream();
+        if (startXML > -1 && endXML > -1) {
+            bos.write(input, startXML, endXML-startXML+1);
+        }
+        bos.write(toInject);
+        bos.write(input, endXML+1, (input.length-endXML-1));
+        return bos.toByteArray();
+    }
+
+    static Path injectZippedXMLs(Path original, byte[] toInject, boolean includeSlides) throws IOException {
+        ZipFile input = new ZipFile(original.toFile());
+        File output = Files.createTempFile("tika-xxe-", ".zip").toFile();
+        ZipOutputStream outZip = new ZipOutputStream(new FileOutputStream(output));
+        Enumeration<? extends ZipEntry> zipEntryEnumeration = input.entries();
+        while (zipEntryEnumeration.hasMoreElements()) {
+            ZipEntry entry = zipEntryEnumeration.nextElement();
+            ByteArrayOutputStream bos = new ByteArrayOutputStream();
+            IOUtils.copy(input.getInputStream(entry), bos);
+            byte[] bytes = bos.toByteArray();
+            if (entry.getName().endsWith(".xml") &&
+                    //don't inject the slides because you'll get a bean exception
+                    //Unexpected node
+                    (! includeSlides && ! entry.getName().contains("slides/slide"))) {
+                bytes = injectXML(bytes, toInject);
+            }
+            ZipEntry outEntry = new ZipEntry(entry.getName());
+            outZip.putNextEntry(outEntry);
+            outZip.write(bytes);
+            outZip.closeEntry();
+        }
+        outZip.flush();
+        outZip.close();
+
+        return output.toPath();
+    }
+
+    static class VulnerableDOMParser extends AbstractParser {
+
+        @Override
+        public Set<MediaType> getSupportedTypes(ParseContext context) {
+            return Collections.singleton(MediaType.APPLICATION_XML);
+        }
+
+        @Override
+        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+
+            TaggedContentHandler tagged = new TaggedContentHandler(handler);
+            try {
+                SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(
+                        "org.apache.xerces.parsers.SAXParser", this.getClass().getClassLoader());
+                SAXParser parser = saxParserFactory.newSAXParser();
+                parser.parse( stream,
+                        new TextContentHandler(handler,
+                                true));
+            } catch (ParserConfigurationException e) {
+                throw new TikaException("parser config ex", e);
+            }
+
+        }
+    }
+
+    static class VulnerableSAXParser extends AbstractParser {
+
+        @Override
+        public Set<MediaType> getSupportedTypes(ParseContext context) {
+            return Collections.singleton(MediaType.APPLICATION_XML);
+        }
+
+        @Override
+        public void parse(InputStream stream, ContentHandler handler, Metadata metadata, ParseContext context) throws IOException, SAXException, TikaException {
+
+            TaggedContentHandler tagged = new TaggedContentHandler(handler);
+            try {
+                SAXParserFactory saxParserFactory = SAXParserFactory.newInstance(
+                        "org.apache.xerces.jaxp.SAXParserFactoryImpl", this.getClass().getClassLoader());
+                SAXParser parser = saxParserFactory.newSAXParser();
+                parser.parse( stream,
+                        new TextContentHandler(handler,
+                                true));
+            } catch (ParserConfigurationException e) {
+                throw new TikaException("parser config ex", e);
+            }
+
+        }
+    }
+    static void parse(String testFileName, InputStream is, Parser parser) throws Exception {
+        parser.parse(is, new DefaultHandler(), new Metadata(), new ParseContext());
+    }
+}

-- 
To stop receiving notification emails like this one, please contact
['"commits@tika.apache.org" <co...@tika.apache.org>'].