You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ju...@apache.org on 2011/05/18 21:11:57 UTC
svn commit: r1124385 - in /tika/trunk/tika-core/src:
main/java/org/apache/tika/io/ main/java/org/apache/tika/parser/
main/java/org/apache/tika/sax/ test/java/org/apache/tika/sax/
Author: jukka
Date: Wed May 18 19:11:56 2011
New Revision: 1124385
URL: http://svn.apache.org/viewvc?rev=1124385&view=rev
Log:
TIKA-645: Parsers can't get at an underlying TikaInputStream to get the file if they wanted one
Make SecureContentHandler use TikaInputStream instead of a CountingInputStream.
This is a minor backwards-compatibility issue, but since I believe SecureContentHandler is not widely used outside AutoDetectParser I don't think this is a big problem.
Modified:
tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java?rev=1124385&r1=1124384&r2=1124385&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java Wed May 18 19:11:56 2011
@@ -404,7 +404,7 @@ public class TikaInputStream extends Tag
* Marked position, or -1 if there is no current mark.
*/
private long mark = -1;
-
+
/**
* A opened container, such as a POIFS FileSystem
* for an OLE2 document, or a Zip file for a
@@ -538,6 +538,15 @@ public class TikaInputStream extends Tag
return length;
}
+ /**
+ * Returns the current position within the stream.
+ *
+ * @return stream position
+ */
+ public long getPosition() {
+ return position;
+ }
+
@Override
public int available() throws IOException {
if (in == null) {
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java?rev=1124385&r1=1124384&r2=1124385&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java Wed May 18 19:11:56 2011
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser;
-import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
@@ -24,7 +23,7 @@ import org.apache.tika.config.TikaConfig
import org.apache.tika.detect.DefaultDetector;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.CountingInputStream;
+import org.apache.tika.io.TemporaryFiles;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.mime.MediaType;
@@ -115,28 +114,26 @@ public class AutoDetectParser extends Co
InputStream stream, ContentHandler handler,
Metadata metadata, ParseContext context)
throws IOException, SAXException, TikaException {
- if(stream instanceof TikaInputStream || stream instanceof BufferedInputStream) {
- // Input stream can be trusted for type detection
- } else {
- // We need (reliable!) mark support for type detection before parsing
- stream = new BufferedInputStream(stream);
- }
-
- // Automatically detect the MIME type of the document
- MediaType type = detector.detect(stream, metadata);
- metadata.set(Metadata.CONTENT_TYPE, type.toString());
-
- // TIKA-216: Zip bomb prevention
- CountingInputStream count = new CountingInputStream(stream);
- SecureContentHandler secure = new SecureContentHandler(handler, count);
-
- // Parse the document
+ TemporaryFiles tmp = new TemporaryFiles();
try {
- super.parse(count, secure, metadata, context);
- } catch (SAXException e) {
- // Convert zip bomb exceptions to TikaExceptions
- secure.throwIfCauseOf(e);
- throw e;
+ TikaInputStream tis = TikaInputStream.get(stream, tmp);
+
+ // Automatically detect the MIME type of the document
+ MediaType type = detector.detect(tis, metadata);
+ metadata.set(Metadata.CONTENT_TYPE, type.toString());
+
+ // TIKA-216: Zip bomb prevention
+ SecureContentHandler sch = new SecureContentHandler(handler, tis);
+ try {
+ // Parse the document
+ super.parse(tis, sch, metadata, context);
+ } catch (SAXException e) {
+ // Convert zip bomb exceptions to TikaExceptions
+ sch.throwIfCauseOf(e);
+ throw e;
+ }
+ } finally {
+ tmp.dispose();
}
}
Modified: tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java?rev=1124385&r1=1124384&r2=1124385&view=diff
==============================================================================
--- tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java (original)
+++ tika/trunk/tika-core/src/main/java/org/apache/tika/sax/SecureContentHandler.java Wed May 18 19:11:56 2011
@@ -17,7 +17,7 @@
package org.apache.tika.sax;
import org.apache.tika.exception.TikaException;
-import org.apache.tika.io.CountingInputStream;
+import org.apache.tika.io.TikaInputStream;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -38,7 +38,7 @@ public class SecureContentHandler extend
/**
* The input stream that Tika is parsing.
*/
- private final CountingInputStream stream;
+ private final TikaInputStream stream;
/**
* Number of output characters that Tika has produced so far.
@@ -62,11 +62,10 @@ public class SecureContentHandler extend
* the given counting input stream.
*
* @param handler the content handler to be decorated
- * @param stream the input stream to be parsed, wrapped into
- * a {@link CountingInputStream} decorator
+ * @param stream the input stream to be parsed
*/
public SecureContentHandler(
- ContentHandler handler, CountingInputStream stream) {
+ ContentHandler handler, TikaInputStream stream) {
super(handler);
this.stream = stream;
}
@@ -141,7 +140,7 @@ public class SecureContentHandler extend
private void advance(int length) throws SAXException {
characterCount += length;
if (characterCount > threshold
- && characterCount > stream.getByteCount() * ratio) {
+ && characterCount > stream.getPosition() * ratio) {
throw new SecureSAXException();
}
}
@@ -169,7 +168,7 @@ public class SecureContentHandler extend
public SecureSAXException() {
super("Suspected zip bomb: "
- + stream.getByteCount() + " input bytes produced "
+ + stream.getPosition() + " input bytes produced "
+ characterCount + " output characters");
}
Modified: tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java
URL: http://svn.apache.org/viewvc/tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java?rev=1124385&r1=1124384&r2=1124385&view=diff
==============================================================================
--- tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java (original)
+++ tika/trunk/tika-core/src/test/java/org/apache/tika/sax/SecureContentHandlerTest.java Wed May 18 19:11:56 2011
@@ -18,13 +18,13 @@ package org.apache.tika.sax;
import java.io.IOException;
-import org.apache.tika.io.CountingInputStream;
+import junit.framework.TestCase;
+
import org.apache.tika.io.NullInputStream;
+import org.apache.tika.io.TikaInputStream;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
-import junit.framework.TestCase;
-
/**
* Tests for the {@link SecureContentHandler} class.
*/
@@ -32,12 +32,12 @@ public class SecureContentHandlerTest ex
private static final int MANY_BYTES = 2000000;
- private CountingInputStream stream;
+ private TikaInputStream stream;
private SecureContentHandler handler;
protected void setUp() {
- stream = new CountingInputStream(new NullInputStream(MANY_BYTES));
+ stream = TikaInputStream.get(new NullInputStream(MANY_BYTES));
handler = new SecureContentHandler(new DefaultHandler(), stream);
}