You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2018/09/06 13:37:52 UTC

[tika] branch branch_1x updated (2fd54ff -> 4f85418)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git.


    from 2fd54ff  TIKA-2722 -- clean up setting calendar values, take2
     new 1ff63b0  improve xml parsing
     new 39f69ef  Mime magic for "MIME Encapsulation of Aggregate HTML Documents" (MHTML), pulled out from rfc822 (may not be fully correct long-term...)
     new 4f85418  Changes update

The 3 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 CHANGES.txt                                        | 33 +++++++++++------
 .../java/org/apache/tika/utils/XMLReaderUtils.java | 43 +++++++++++++++++++---
 .../org/apache/tika/mime/tika-mimetypes.xml        | 22 ++++++++++-
 .../org/apache/tika/TestXMLEntityExpansion.java    | 28 ++++++++++----
 4 files changed, 99 insertions(+), 27 deletions(-)


[tika] 02/03: Mime magic for "MIME Encapsulation of Aggregate HTML Documents" (MHTML), pulled out from rfc822 (may not be fully correct long-term...)

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 39f69efc642ade48c5b8021d0500e7363c8f4ac5
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu Sep 6 09:28:14 2018 +0100

    Mime magic for "MIME Encapsulation of Aggregate HTML Documents" (MHTML), pulled out from rfc822 (may not be fully correct long-term...)
---
 .../org/apache/tika/mime/tika-mimetypes.xml        | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
index 61a1634..fc34cf8 100644
--- a/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
+++ b/tika-core/src/main/resources/org/apache/tika/mime/tika-mimetypes.xml
@@ -5796,9 +5796,28 @@
     </magic>
     <glob pattern="*.eml"/>
     <glob pattern="*.mime"/>
+    <sub-class-of type="text/x-tika-text-based-message"/>
+  </mime-type>
+
+  <!-- TODO See TIKA-2723 for discussions on the mime type hierarchy -->
+  <!--  and best parser structure for these email-like formats -->
+  <mime-type type="multipart/related">
+    <acronym>MHTML</acronym>
+    <_comment>MIME Encapsulation of Aggregate HTML Documents</_comment>
+    <tika:link>http://tools.ietf.org/html/rfc2557</tika:link>
+    <alias type="application/x-mimearchive"/>
+    <alias type="message/rfc2557"/>
+    <!-- higher priority than message/rfc822 -->
+    <magic priority="60">
+      <match value="From: \x3cSaved by Windows Internet Explorer 8\x3e" type="stringignorecase" offset="0"/>
+      <match value="From: \x22Saved by Internet Explorer 11\x22" type="stringignorecase" offset="0"/>
+      <match value="MIME-Version: 1.0" type="string" offset="0">
+        <match value="\nContent-Type: multipart/related" type="string" offset="16:512"/>
+      </match>
+    </magic>
     <glob pattern="*.mht"/>
     <glob pattern="*.mhtml"/>
-    <sub-class-of type="text/x-tika-text-based-message"/>
+    <sub-class-of type="message/rfc822"/>
   </mime-type>
 
   <mime-type type="message/s-http"/>
@@ -5900,7 +5919,6 @@
   <mime-type type="multipart/header-set"/>
   <mime-type type="multipart/mixed"/>
   <mime-type type="multipart/parallel"/>
-  <mime-type type="multipart/related"/>
   <mime-type type="multipart/report"/>
   <mime-type type="multipart/signed"/>
   <mime-type type="multipart/voice-message"/>


[tika] 01/03: improve xml parsing

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 1ff63b0c8dd5a21e7eb5f83e1a96f1fbd5107966
Author: TALLISON <ta...@apache.org>
AuthorDate: Thu Sep 6 08:57:47 2018 -0400

    improve xml parsing
---
 .../java/org/apache/tika/utils/XMLReaderUtils.java | 43 +++++++++++++++++++---
 .../org/apache/tika/TestXMLEntityExpansion.java    | 28 ++++++++++----
 2 files changed, 58 insertions(+), 13 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
index 382be2d..0069a9a 100644
--- a/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/utils/XMLReaderUtils.java
@@ -46,6 +46,7 @@ import java.io.InputStream;
 import java.io.Serializable;
 import java.io.StringReader;
 import java.lang.reflect.Method;
+import java.util.Properties;
 import java.util.concurrent.ArrayBlockingQueue;
 import java.util.concurrent.TimeUnit;
 import java.util.concurrent.locks.ReentrantReadWriteLock;
@@ -73,6 +74,25 @@ public class XMLReaderUtils implements Serializable {
 
     private static long LAST_LOG = -1;
 
+    private static final String JAXP_ENTITY_EXPANSION_LIMIT_KEY = "jdk.xml.entityExpansionLimit";
+    private static final int DEFAULT_MAX_ENTITY_EXPANSIONS = 20;
+
+    private static int MAX_ENTITY_EXPANSIONS = determineMaxEntityExpansions();
+
+    private static int determineMaxEntityExpansions() {
+        Properties properties = System.getProperties();
+        if (properties != null && properties.containsKey(JAXP_ENTITY_EXPANSION_LIMIT_KEY)) {
+            try {
+                return Integer.parseInt(properties.getProperty(JAXP_ENTITY_EXPANSION_LIMIT_KEY));
+            } catch (NumberFormatException e) {
+                LOG.log(Level.WARNING, "Couldn't parse an integer for the entity expansion limit:"+
+                        properties.getProperty(JAXP_ENTITY_EXPANSION_LIMIT_KEY)+
+                        "; backing off to default: "+DEFAULT_MAX_ENTITY_EXPANSIONS);
+            }
+        }
+        return DEFAULT_MAX_ENTITY_EXPANSIONS;
+    }
+
     //TODO: figure out if the rw lock is any better than a simple lock
     private static final ReentrantReadWriteLock SAX_READ_WRITE_LOCK = new ReentrantReadWriteLock();
     private static final ReentrantReadWriteLock DOM_READ_WRITE_LOCK = new ReentrantReadWriteLock();
@@ -105,6 +125,19 @@ public class XMLReaderUtils implements Serializable {
             };
 
     /**
+     * Set the maximum number of entity expansions allowable in SAX/DOM/StAX parsing.
+     * <b>NOTE:</b> A value less than or equal to zero indicates no limit.
+     * This will override the system property {@link #JAXP_ENTITY_EXPANSION_LIMIT_KEY}
+     * and the {@link #DEFAULT_MAX_ENTITY_EXPANSIONS} value for parsing.
+     *
+     * @param maxEntityExpansions -- maximum number of allowable entity expansions
+     * @since Apache Tika 1.19
+     */
+    public static void setMaxEntityExpansions(int maxEntityExpansions) {
+        MAX_ENTITY_EXPANSIONS = maxEntityExpansions;
+    }
+
+    /**
      * Returns the XMLReader specified in this parsing context. If a reader
      * is not explicitly specified, then one is created using the specified
      * or the default SAX parser.
@@ -517,7 +550,7 @@ public class XMLReaderUtils implements Serializable {
             try {
                 Object mgr = Class.forName(securityManagerClassName).newInstance();
                 Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
-                setLimit.invoke(mgr, 4096);
+                setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
                 factory.setAttribute("http://apache.org/xml/properties/security-manager", mgr);
                 // Stop once one can be setup without error
                 return;
@@ -534,7 +567,7 @@ public class XMLReaderUtils implements Serializable {
 
         // separate old version of Xerces not found => use the builtin way of setting the property
         try {
-            factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", 4096);
+            factory.setAttribute("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS);
         } catch (IllegalArgumentException e) {     // NOSONAR - also catch things like NoClassDefError here
             // throttle the log somewhat as it can spam the log otherwise
             if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
@@ -554,7 +587,7 @@ public class XMLReaderUtils implements Serializable {
             try {
                 Object mgr = Class.forName(securityManagerClassName).newInstance();
                 Method setLimit = mgr.getClass().getMethod("setEntityExpansionLimit", Integer.TYPE);
-                setLimit.invoke(mgr, 4096);
+                setLimit.invoke(mgr, MAX_ENTITY_EXPANSIONS);
                 parser.setProperty("http://apache.org/xml/properties/security-manager", mgr);
                 // Stop once one can be setup without error
                 return;
@@ -571,7 +604,7 @@ public class XMLReaderUtils implements Serializable {
 
         // separate old version of Xerces not found => use the builtin way of setting the property
         try {
-            parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", 4096);
+            parser.setProperty("http://www.oracle.com/xml/jaxp/properties/entityExpansionLimit", MAX_ENTITY_EXPANSIONS);
         } catch (SAXException e) {     // NOSONAR - also catch things like NoClassDefError here
             // throttle the log somewhat as it can spam the log otherwise
             if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
@@ -583,7 +616,7 @@ public class XMLReaderUtils implements Serializable {
 
     private static void trySetStaxSecurityManager(XMLInputFactory inputFactory) {
         try {
-            inputFactory.setProperty("com.ctc.wstx.maxEntityCount", 4096);
+            inputFactory.setProperty("com.ctc.wstx.maxEntityCount", MAX_ENTITY_EXPANSIONS);
         } catch (IllegalArgumentException e) {
             // throttle the log somewhat as it can spam the log otherwise
             if(System.currentTimeMillis() > LAST_LOG + TimeUnit.MINUTES.toMillis(5)) {
diff --git a/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java b/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
index 77a166b..54e3a7c 100644
--- a/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
+++ b/tika-parsers/src/test/java/org/apache/tika/TestXMLEntityExpansion.java
@@ -26,17 +26,14 @@ import org.xml.sax.SAXParseException;
 import java.io.ByteArrayInputStream;
 import java.nio.charset.StandardCharsets;
 
-import static org.apache.tika.XMLTestBase.injectXML;
-import static org.apache.tika.XMLTestBase.parse;
 import static org.junit.Assert.assertTrue;
-import static org.junit.Assert.fail;
 
 /**
  * Tests to confirm defenses against entity expansion attacks.
  */
 @Ignore("initial draft, needs more work")
-public class TestXMLEntityExpansion
-{
+public class TestXMLEntityExpansion extends XMLTestBase {
+
     private static final byte[] ENTITY_EXPANSION_BOMB = new String(
             "<!DOCTYPE kaboom [ " +
                     "<!ENTITY a \"1234567890\" > " +
@@ -61,13 +58,28 @@ public class TestXMLEntityExpansion
                     "]> " +
                     "<kaboom>&s;</kaboom>").getBytes(StandardCharsets.UTF_8);
 
-    //a truly vulnerable parser, say xerces2, doesn't oom, it thrashes with gc.
     //Set a reasonable amount of time as the timeout
+    //Make sure that the test apparatus actually works.
     @Test(timeout = 20000)
-    public void testInjectedXML() throws Exception {
+    public void testVulnerableParser() throws Exception {
         byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>".getBytes(StandardCharsets.UTF_8);
         byte[] injected = injectXML(bytes, ENTITY_EXPANSION_BOMB);
-        parse("injected", new ByteArrayInputStream(injected), new XMLTestBase.VulnerableSAXParser());
+
+        Thread thread = new Thread() {
+            @Override
+            public void run() {
+                try {
+                    parse("injected", new ByteArrayInputStream(injected), new XMLTestBase.VulnerableSAXParser());
+                } catch (Exception e) {
+                    throw new RuntimeException(e);
+                }
+            }
+        };
+        thread.start();
+        Thread.sleep(10000);
+        assertTrue(thread.isAlive());
+        thread.interrupt();
+
     }
 
     @Test(timeout = 20000)//


[tika] 03/03: Changes update

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git

commit 4f8541836b78d7ee0cd293ccb5b6207a98730eff
Author: Nick Burch <ni...@gagravarr.org>
AuthorDate: Thu Sep 6 09:28:24 2018 +0100

    Changes update
    
    # Conflicts:
    #	CHANGES.txt
---
 CHANGES.txt | 33 +++++++++++++++++++++------------
 1 file changed, 21 insertions(+), 12 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index ae7627d..5846387 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,15 @@
+Release 2.0.0 - ???
+   BREAKING CHANGES in 2.0.0
+
+   * Remove deprecated Metadata keys/properties (TIKA-1974).
+
+   Other changes
+
 Release 1.19 ???
 
+   * Mime magic improvements for Olympus RAW (TIKA-2658), interpreted
+     server-side languages via HTTP (TIKA-2648), MHTML (TIKA-2723)
+
    * Add absolute timeout to ForkParser rather than testing
      for active (TIKA-2656).
 
@@ -19,18 +29,6 @@ Release 1.19 ???
    * Add the RecursiveParserWrapperHandler to improve the RecursiveParserWrapper
      API slightly (TIKA-2644).
 
-   * Support for SAS7BDAT data files (TIKA-2462)
-
-   * Handle .epub files using .htm rather than .html extensions for the
-     embedded contents (TIKA-1288)
-
-   * Mime magic for ACES Images (TIKA-2628) and DPX Images (TIKA-2629)
-
-   * For sparse XLSX and XLSB files, always output missing cells to
-     the left of filled ones (matching XLS), and optionally output
-     missing rows on all 3 formats if requested via the
-     OfficeParserContext (TIKA-2479)
-
 
 Release 1.18 - 4/20/2018
 
@@ -100,6 +98,17 @@ Release 1.18 - 4/20/2018
    * Added local Docker image build using dockerfile-maven-plugin to allow
      images to be built from source (TIKA-1518).
 
+   * Support for SAS7BDAT data files (TIKA-2462)
+
+   * Handle .epub files using .htm rather than .html extensions for the
+     embedded contents (TIKA-1288)
+
+   * Mime magic for ACES Images (TIKA-2628) and DPX Images (TIKA-2629)
+
+   * For sparse XLSX and XLSB files, always output missing cells to
+     the left of filled ones (matching XLS), and optionally output
+     missing rows on all 3 formats if requested via the
+     OfficeParserContext (TIKA-2479)
 
 Release 1.17 - 12/8/2017