You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2020/04/01 16:02:51 UTC

[tika] branch master updated: TIKA-3081 -- convert TikaInputStream's skip to the equivalent of skipFully

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/master by this push:
     new aaa9f40  TIKA-3081 -- convert TikaInputStream's skip to the equivalent of skipFully
aaa9f40 is described below

commit aaa9f40e3c8119f1a155e3f1eea5c2ffe7f4f26f
Author: tallison <ta...@apache.org>
AuthorDate: Wed Apr 1 12:02:15 2020 -0400

    TIKA-3081 -- convert TikaInputStream's skip to the equivalent of skipFully
---
 .../java/org/apache/tika/io/TikaInputStream.java   | 24 ++++++++++++++++++++--
 .../tika/parser/microsoft/onenote/OneNotePtr.java  |  6 +++++-
 2 files changed, 27 insertions(+), 3 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
index c995270..3997f9e 100644
--- a/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
+++ b/tika-core/src/main/java/org/apache/tika/io/TikaInputStream.java
@@ -60,11 +60,19 @@ import org.apache.tika.parser.Parser;
  * associated with a TikaInputStream should first use the
  * {@link #get(InputStream)} factory method to cast or wrap a given
  * {@link InputStream} into a TikaInputStream instance.
+ * <p>
+ * TikaInputStream includes a few safety features to protect against parsers
+ * that may fail to check for an EOF or may incorrectly rely on the unreliable
+ * value returned from {@link FileInputStream#skip}.  These parser failures
+ * can lead to infinite loops.  We strongly encourage the use of
+ * TikaInputStream.
  *
  * @since Apache Tika 0.8
  */
 public class TikaInputStream extends TaggedInputStream {
 
+    private static final int MAX_CONSECUTIVE_EOFS = 1000;
+
     /**
      * Checks whether the given stream is a TikaInputStream instance.
      * The given stream can be <code>null</code>, in which case the return
@@ -731,9 +739,21 @@ public class TikaInputStream extends TaggedInputStream {
         return position;
     }
 
+    /**
+     * This relies on {@link IOUtils#skip(InputStream, long)} to ensure
+     * that the alleged bytes skipped were actually skipped.
+     *
+     * @param ln the number of bytes to skip
+     * @return the number of bytes skipped
+     * @throws IOException if the number of bytes requested to be skipped does not match the number of bytes skipped
+     *      or if there's an IOException during the read.
+     */
     @Override
     public long skip(long ln) throws IOException {
-        long n = super.skip(ln);
+        long n = IOUtils.skip(super.in, ln);
+        if (n != ln) {
+            throw new IOException("tried to skip "+ln + " but actually skipped: "+n);
+        }
         position += n;
         return n;
     }
@@ -777,7 +797,7 @@ public class TikaInputStream extends TaggedInputStream {
             position += n;
         } else {
             consecutiveEOFs++;
-            if (consecutiveEOFs > 1000) {
+            if (consecutiveEOFs > MAX_CONSECUTIVE_EOFS) {
                 throw new IOException("Read too many -1 (EOFs); there could be an infinite loop." +
                         "If you think your file is not corrupt, please open an issue on Tika's JIRA");
             }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
index c3fb150..85b20e8 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/onenote/OneNotePtr.java
@@ -337,7 +337,12 @@ class OneNotePtr {
             // + 4 bytes for the FileNode header
             CheckedFileNodePushBack pushBack = new CheckedFileNodePushBack(data);
             try {
+                long initialOffset = offset;
                 FileNode fileNode = deserializeFileNode(data.children.get(data.children.size() - 1), curPath);
+                if (initialOffset == offset) {
+                    //nothing read; avoid an infinite loop
+                    break;
+                }
                 if (fileNode.id == FndStructureConstants.ChunkTerminatorFND || fileNode.id == 0) {
                     terminated = true;
                     break;
@@ -678,7 +683,6 @@ class OneNotePtr {
         end = backup.end;
 
         if (reserved != 1) {
-            System.exit(1);
             throw new TikaException("RESERVED_NONZERO");
         }