You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2023/02/17 16:32:58 UTC

[tika] branch TIKA-3976 created (now e6c389c54)

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a change to branch TIKA-3976
in repository https://gitbox.apache.org/repos/asf/tika.git


      at e6c389c54 TIKA-3976 -- allow users to turn off exception on zero-byte files

This branch includes the following new commits:

     new e6c389c54 TIKA-3976 -- allow users to turn off exception on zero-byte files

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  Any revisions
listed as "add" were already present in the repository and have only
been added to this reference.



[tika] 01/01: TIKA-3976 -- allow users to turn off exception on zero-byte files

Posted by ta...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch TIKA-3976
in repository https://gitbox.apache.org/repos/asf/tika.git

commit e6c389c5452ed7792c1fff5dcfad9de527d32833
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 17 11:32:40 2023 -0500

    TIKA-3976 -- allow users to turn off exception on zero-byte files
---
 .../org/apache/tika/parser/AutoDetectParser.java   | 10 ++++++----
 .../apache/tika/parser/AutoDetectParserConfig.java | 13 +++++++++++-
 .../src/test/java/org/apache/tika/TikaTest.java    | 18 ++++++++++-------
 .../tika/parser/AutoDetectParserConfigTest.java    | 23 +++++++++++++++++++++-
 .../test/resources/configs/tika-config-digests.xml |  1 +
 5 files changed, 52 insertions(+), 13 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
index 12c0e82ae..491ad572e 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParser.java
@@ -177,11 +177,13 @@ public class AutoDetectParser extends CompositeParser {
             }
             //check for zero-byte inputstream
             if (tis.getOpenContainer() == null) {
-                tis.mark(1);
-                if (tis.read() == -1) {
-                    throw new ZeroByteFileException("InputStream must have > 0 bytes");
+                if (autoDetectParserConfig.getThrowOnZeroBytes()) {
+                    tis.mark(1);
+                    if (tis.read() == -1) {
+                        throw new ZeroByteFileException("InputStream must have > 0 bytes");
+                    }
+                    tis.reset();
                 }
-                tis.reset();
             }
             handler = decorateHandler(handler, metadata, context, autoDetectParserConfig);
             // TIKA-216: Zip bomb prevention
diff --git a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
index 215b0bc32..d5a1567e1 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/AutoDetectParserConfig.java
@@ -100,6 +100,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
 
     private DigestingParser.DigesterFactory digesterFactory = null;
 
+    private boolean throwOnZeroBytes = true;
+
     /**
      * Creates a SecureContentHandlerConfig using the passed in parameters.
      *
@@ -198,6 +200,14 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
         return this.digesterFactory;
     }
 
+    public void setThrowOnZeroBytes(boolean throwOnZeroBytes) {
+        this.throwOnZeroBytes = throwOnZeroBytes;
+    }
+
+    public boolean getThrowOnZeroBytes() {
+        return throwOnZeroBytes;
+    }
+
     @Override
     public String toString() {
         return "AutoDetectParserConfig{" + "spoolToDisk=" + spoolToDisk + ", outputThreshold=" +
@@ -206,7 +216,8 @@ public class AutoDetectParserConfig extends ConfigBase implements Serializable {
                 maximumPackageEntryDepth + ", metadataWriteFilterFactory=" +
                 metadataWriteFilterFactory + ", embeddedDocumentExtractorFactory=" +
                 embeddedDocumentExtractorFactory + ", contentHandlerDecoratorFactory=" +
-                contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory + '}';
+                contentHandlerDecoratorFactory + ", digesterFactory=" + digesterFactory +
+                ", throwOnZeroBytes=" + throwOnZeroBytes + '}';
     }
 }
 
diff --git a/tika-core/src/test/java/org/apache/tika/TikaTest.java b/tika-core/src/test/java/org/apache/tika/TikaTest.java
index fa112ca4c..a00d7b2b0 100644
--- a/tika-core/src/test/java/org/apache/tika/TikaTest.java
+++ b/tika-core/src/test/java/org/apache/tika/TikaTest.java
@@ -342,29 +342,33 @@ public abstract class TikaTest {
 
     protected List<Metadata> getRecursiveMetadata(Path path, ParseContext context,
                                                   boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, new Metadata(), context,
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, AUTO_DETECT_PARSER, metadata, context,
                     suppressException);
         }
     }
 
     protected List<Metadata> getRecursiveMetadata(Path path, Parser parser,
                                                   boolean suppressException) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(path)) {
-            return getRecursiveMetadata(tis, parser, new Metadata(), new ParseContext(),
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(path, metadata)) {
+            return getRecursiveMetadata(tis, parser, metadata, new ParseContext(),
                     suppressException);
         }
     }
 
     protected List<Metadata> getRecursiveMetadata(Path p, boolean suppressException)
             throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(p)) {
-            return getRecursiveMetadata(tis, new Metadata(), new ParseContext(), suppressException);
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(p, metadata)) {
+            return getRecursiveMetadata(tis, metadata, new ParseContext(), suppressException);
         }
     }
 
     protected List<Metadata> getRecursiveMetadata(Path filePath) throws Exception {
-        try (TikaInputStream tis = TikaInputStream.get(filePath)) {
+        Metadata metadata = new Metadata();
+        try (TikaInputStream tis = TikaInputStream.get(filePath, metadata)) {
             return getRecursiveMetadata(tis, true);
         }
     }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
index 2a5dbf2b9..7ef747157 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserConfigTest.java
@@ -19,6 +19,8 @@ package org.apache.tika.parser;
 import static org.junit.jupiter.api.Assertions.assertEquals;
 
 import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
 import java.util.List;
 
 import org.junit.jupiter.api.Test;
@@ -104,7 +106,7 @@ public class AutoDetectParserConfigTest extends TikaTest {
         //test to make sure that the decorator is only applied once for
         //legacy (e.g. not RecursiveParserWrapperHandler) parsing
         TikaConfig tikaConfig = null;
-        try (InputStream is = OOXMLParserTest.class.getResourceAsStream(
+        try (InputStream is = AutoDetectParserConfigTest.class.getResourceAsStream(
                 "/configs/tika-config-digests.xml")) {
             tikaConfig = new TikaConfig(is);
         }
@@ -138,4 +140,23 @@ public class AutoDetectParserConfigTest extends TikaTest {
         assertEquals("org.apache.tika.parser.EmptyParser",
                 metadataList.get(0).get("X-TIKA:Parsed-By"));
     }
+
+    @Test
+    public void testContainerZeroBytes() throws Exception {
+        Path tmp = Files.createTempFile("tika-test", "");
+        try {
+            TikaConfig tikaConfig = null;
+            try (InputStream is = AutoDetectParserConfigTest.class.getResourceAsStream(
+                    "/configs/tika-config-digests.xml")) {
+                tikaConfig = new TikaConfig(is);
+            }
+            Parser p = new AutoDetectParser(tikaConfig);
+            List<Metadata> metadataList = getRecursiveMetadata(tmp, p, true);
+            assertEquals("d41d8cd98f00b204e9800998ecf8427e",
+                    metadataList.get(0).get("X-TIKA:digest:MD5"));
+            assertEquals("0", metadataList.get(0).get(Metadata.CONTENT_LENGTH));
+        } finally {
+            Files.delete(tmp);
+        }
+    }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
index 0ec913d50..c1fbb7b48 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/configs/tika-config-digests.xml
@@ -27,5 +27,6 @@
       <markLimit>100000</markLimit>
       <algorithmString>sha256:32,md5</algorithmString>
     </digesterFactory>
+    <throwOnZeroBytes>false</throwOnZeroBytes>
   </autoDetectParserConfig>
 </properties>