You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/05/18 21:11:57 UTC

svn commit: r1124385 - in /tika/trunk/tika-core/src: main/java/org/apache/tika/io/ main/java/org/apache/tika/parser/ main/java/org/apache/tika/sax/ test/java/org/apache/tika/sax/

Author: jukka
Date: Wed May 18 19:11:56 2011
New Revision: 1124385

URL: http://svn.apache.org/viewvc?rev=1124385&view=rev
Log:
TIKA-645: Parsers can't get at an underlying TikaInputStream to get the file if they wanted one

Make SecureContentHandler use TikaInputStream instead of a CountingInputStream.

This is a minor backwards-compatibility issue, but since I believe SecureContentHandler is not widely used outside AutoDetectParser, I don't think this is a big problem.

Modified:
    tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
    tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
    tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=1124385&r1=1124384&r2=1124385&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Wed May 18 19:11:56 2011
@@ -404,7 +404,7 @@ public class TikaInputStream extends Tag
      * Marked position, or -1 if there is no current mark.
      */
     private long mark = -1;
-    
+
     /**
      * A opened container, such as a POIFS FileSystem
      *  for an OLE2 document, or a Zip file for a
@@ -538,6 +538,15 @@ public class TikaInputStream extends Tag
         return length;
     }
 
+    /**
+     * Returns the current position within the stream.
+     *
+     * @return stream position
+     */
+    public long getPosition() {
+        return position;
+    }
+
     @Override
     public int available() throws IOException {
         if (in == null) {

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=1124385&r1=1124384&r2=1124385&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java Wed May 18 19:11:56 2011
@@ -16,7 +16,6 @@
  */
 package org.apache.tika.parser;
 
-import java.io.BufferedInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 
@@ -24,7 +23,7 @@ import org.apache.tika.config.TikaConfig
 import org.apache.tika.detect.DefaultDetector;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.CountingInputStream;
+import org.apache.tika.io.TemporaryFiles;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.mime.MediaType;
@@ -115,28 +114,26 @@ public class AutoDetectParser extends Co
             InputStream stream, ContentHandler handler,
             Metadata metadata, ParseContext context)
             throws IOException, SAXException, TikaException {
-        if(stream instanceof TikaInputStream || stream instanceof BufferedInputStream) {
-           // Input stream can be trusted for type detection
-        } else {
-           // We need (reliable!) mark support for type detection before parsing
-           stream = new BufferedInputStream(stream);
-        }
-
-        // Automatically detect the MIME type of the document
-        MediaType type = detector.detect(stream, metadata);
-        metadata.set(Metadata.CONTENT_TYPE, type.toString());
-
-        // TIKA-216: Zip bomb prevention
-        CountingInputStream count = new CountingInputStream(stream);
-        SecureContentHandler secure = new SecureContentHandler(handler, count);
-
-        // Parse the document
+        TemporaryFiles tmp = new TemporaryFiles();
         try {
-            super.parse(count, secure, metadata, context);
-        } catch (SAXException e) {
-            // Convert zip bomb exceptions to TikaExceptions
-            secure.throwIfCauseOf(e);
-            throw e;
+            TikaInputStream tis = TikaInputStream.get(stream, tmp);
+
+            // Automatically detect the MIME type of the document
+            MediaType type = detector.detect(tis, metadata);
+            metadata.set(Metadata.CONTENT_TYPE, type.toString());
+
+            // TIKA-216: Zip bomb prevention
+            SecureContentHandler sch = new SecureContentHandler(handler, tis);
+            try {
+                // Parse the document
+                super.parse(tis, sch, metadata, context);
+            } catch (SAXException e) {
+                // Convert zip bomb exceptions to TikaExceptions
+                sch.throwIfCauseOf(e);
+                throw e;
+            }
+        } finally {
+            tmp.dispose();
         }
     }
 

Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java?rev=1124385&r1=1124384&r2=1124385&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java Wed May 18 19:11:56 2011
@@ -17,7 +17,7 @@
 package org.apache.tika.sax;
 
 import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.CountingInputStream;
+import org.apache.tika.io.TikaInputStream;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
@@ -38,7 +38,7 @@ public class SecureContentHandler extend
     /**
      * The input stream that Tika is parsing.
      */
-    private final CountingInputStream stream;
+    private final TikaInputStream stream;
 
     /**
      * Number of output characters that Tika has produced so far.
@@ -62,11 +62,10 @@ public class SecureContentHandler extend
      * the given counting input stream.
      *
      * @param handler the content handler to be decorated
-     * @param stream the input stream to be parsed, wrapped into
-     *        a {@link CountingInputStream} decorator
+     * @param stream the input stream to be parsed
      */
     public SecureContentHandler(
-            ContentHandler handler, CountingInputStream stream) {
+            ContentHandler handler, TikaInputStream stream) {
         super(handler);
         this.stream = stream;
     }
@@ -141,7 +140,7 @@ public class SecureContentHandler extend
     private void advance(int length) throws SAXException {
         characterCount += length;
         if (characterCount > threshold
-                && characterCount > stream.getByteCount() * ratio) {
+                && characterCount > stream.getPosition() * ratio) {
             throw new SecureSAXException();
         }
     }
@@ -169,7 +168,7 @@ public class SecureContentHandler extend
 
         public SecureSAXException() {
             super("Suspected zip bomb: "
-                    + stream.getByteCount() + " input bytes produced "
+                    + stream.getPosition() + " input bytes produced "
                     + characterCount + " output characters");
         }
 

Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java?rev=1124385&r1=1124384&r2=1124385&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java Wed May 18 19:11:56 2011
@@ -18,13 +18,13 @@ package org.apache.tika.sax;
 
 import java.io.IOException;
 
-import org.apache.tika.io.CountingInputStream;
+import junit.framework.TestCase;
+
 import org.apache.tika.io.NullInputStream;
+import org.apache.tika.io.TikaInputStream;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
-import junit.framework.TestCase;
-
 /**
  * Tests for the {@link SecureContentHandler} class.
  */
@@ -32,12 +32,12 @@ public class SecureContentHandlerTest ex
 
     private static final int MANY_BYTES = 2000000;
 
-    private CountingInputStream stream;
+    private TikaInputStream stream;
 
     private SecureContentHandler handler;
 
     protected void setUp() {
-        stream = new CountingInputStream(new NullInputStream(MANY_BYTES));
+        stream = TikaInputStream.get(new NullInputStream(MANY_BYTES));
         handler = new SecureContentHandler(new DefaultHandler(), stream);
     }