You are viewing a plain text version of this content. The canonical, hyperlinked version is available in the commits@tika.apache.org mailing-list archive.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/02/11 17:02:18 UTC

[tika] branch main updated: TIKA-3679 -- prefer commons-io IOUtils skipFully and readFully to trigger EOF automatically where possible.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new e83195e  TIKA-3679 -- prefer commons-io IOUtils skipFully and readFully to trigger EOF automatically where possible.
e83195e is described below

commit e83195ea0a401872451038f92c4e9cc62fad5273
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 11 11:55:51 2022 -0500

    TIKA-3679 -- prefer commons-io IOUtils skipFully and readFully to trigger EOF automatically where possible.
---
 .../java/org/apache/tika/parser/dwg/DWGParser.java |  9 ++---
 .../java/org/apache/tika/parser/prt/PRTParser.java |  2 +-
 .../microsoft/MSEmbeddedStreamTranslator.java      |  2 +-
 .../apache/tika/parser/microsoft/OfficeParser.java |  2 +-
 .../parser/microsoft/rtf/RTFObjDataParser.java     |  4 +-
 .../apache/tika/parser/hwp/HwpStreamReader.java    | 46 +++++-----------------
 .../apache/tika/parser/hwp/HwpTextExtractorV5.java | 13 ++----
 7 files changed, 22 insertions(+), 56 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 8c2f087..385418c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -21,7 +21,7 @@ import java.io.InputStream;
 import java.util.Collections;
 import java.util.Set;
 
-import org.apache.poi.util.IOUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.poi.util.StringUtil;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
@@ -279,11 +279,8 @@ public class DWGParser extends AbstractParser {
         if (offsetToSection == 0) {
             return false;
         }
-        long skipped = IOUtils.skipFully(stream, toSkip);
-        if (skipped != toSkip) {
-            throw new TikaException("Failed to skip: " + toSkip +
-                    " bytes; skipped: " + skipped);
-        }
+        IOUtils.skipFully(stream, toSkip);
+
         return true;
     }
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index ecb7261..f17e693 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -24,7 +24,7 @@ import java.io.UnsupportedEncodingException;
 import java.util.Collections;
 import java.util.Set;
 
-import org.apache.poi.util.IOUtils;
+import org.apache.commons.io.IOUtils;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
index 33f739e..2d51a86 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
@@ -21,6 +21,7 @@ import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.poi.poifs.filesystem.DirectoryEntry;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
@@ -28,7 +29,6 @@ import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.IOUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index b02e3b5..3bd22da 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -28,6 +28,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.Set;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.hdgf.extractor.VisioTextExtractor;
 import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
@@ -38,7 +39,6 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
 import org.apache.poi.poifs.macros.VBAMacroReader;
-import org.apache.poi.util.IOUtils;
 import org.apache.poi.util.LocaleUtil;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
index 9eb17d8..c4b3a28 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
@@ -28,6 +28,7 @@ import java.util.Locale;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.poi.poifs.filesystem.DirectoryNode;
 import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
@@ -36,7 +37,6 @@ import org.apache.poi.poifs.filesystem.FileMagic;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.IOUtils;
 
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.exception.TikaMemoryLimitException;
@@ -65,7 +65,7 @@ class RTFObjDataParser {
     /**
      * Parses the embedded object/pict string
      *
-     * @param bytes actual bytes (already converted from the
+     * @param is actual bytes (already converted from the
      *              hex pair string stored in the embedded object data into actual bytes or read
      *              as raw binary bytes)
      * @return a SimpleRTFEmbObj or null
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
index 76bef38..76fd648 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
@@ -20,11 +20,11 @@ import java.io.EOFException;
 import java.io.IOException;
 import java.io.InputStream;
 
-import org.apache.poi.util.IOUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.poi.util.LittleEndian;
 
 public class HwpStreamReader {
-    private byte[] skipBuffer = new byte[4096];
+
     private InputStream input;
     private byte[] buf;
 
@@ -41,12 +41,7 @@ public class HwpStreamReader {
      * @throws IOException
      */
     public short uint8() throws IOException {
-        int read = IOUtils.readFully(input, buf, 0, 1);
-
-        if (read == -1) {
-            return -1;
-        }
-
+        IOUtils.readFully(input, buf, 0, 1);
         return LittleEndian.getUByte(buf);
     }
 
@@ -57,15 +52,7 @@ public class HwpStreamReader {
      * @throws IOException
      */
     public int uint16() throws IOException {
-        int read = IOUtils.readFully(input, buf, 0, 2);
-
-        if (read == -1) {
-            return -1;
-        }
-
-        if (read < 2) {
-            throw new EOFException();
-        }
+        IOUtils.readFully(input, buf, 0, 2);
         return LittleEndian.getUShort(buf);
     }
 
@@ -81,11 +68,8 @@ public class HwpStreamReader {
             throw new IllegalArgumentException();
         }
         byte[] buf = new byte[i * 2];
-        int read = IOUtils.readFully(input, buf, 0, i * 2);
+        IOUtils.readFully(input, buf, 0, i * 2);
 
-        if (read != i * 2) {
-            throw new EOFException();
-        }
         int[] uints = new int[i];
         for (int ii = 0; ii < i; ii++) {
             uints[ii] = LittleEndian.getUShort(buf, ii * 2);
@@ -101,7 +85,9 @@ public class HwpStreamReader {
      * @throws IOException
      */
     public long uint32() throws IOException {
-        int read = IOUtils.readFully(input, buf, 0, 4);
+        //uint32 is used to try to read the next record.
+        //if nothing is read, we should not throw an EOF, we should return -1
+        int read = org.apache.poi.util.IOUtils.readFully(input, buf, 0, 4);
 
         if (read == -1) {
             return -1;
@@ -114,19 +100,7 @@ public class HwpStreamReader {
         return LittleEndian.getUInt(buf);
     }
 
-    /**
-     * ensure skip of n byte
-     *
-     * @param n
-     * @throws IOException
-     */
-    public void ensureSkip(long n) throws IOException {
-        //Leaving this for anyone who can figure out why this doesn't
-        //work.  See HwpV5ParserTest#testMultiThreadedSkipFully
-        //long skipped = org.apache.tika.io.IOUtils.skip(input, n);
-        long skipped = org.apache.tika.io.IOUtils.skip(input, n, skipBuffer);
-        if (skipped != n) {
-            throw new EOFException();
-        }
+    public void skipFully(long toSkip) throws IOException {
+        IOUtils.skipFully(input, toSkip);
     }
 }
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index fc1c169..aaab29d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -16,7 +16,6 @@
  */
 package org.apache.tika.parser.hwp;
 
-import java.io.EOFException;
 import java.io.FileNotFoundException;
 import java.io.IOException;
 import java.io.InputStream;
@@ -36,6 +35,7 @@ import javax.crypto.CipherInputStream;
 import javax.crypto.NoSuchPaddingException;
 import javax.crypto.spec.SecretKeySpec;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.input.CloseShieldInputStream;
 import org.apache.poi.hpsf.NoPropertySetStreamException;
 import org.apache.poi.hpsf.Property;
@@ -46,7 +46,6 @@ import org.apache.poi.poifs.filesystem.DocumentEntry;
 import org.apache.poi.poifs.filesystem.DocumentInputStream;
 import org.apache.poi.poifs.filesystem.Entry;
 import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.IOUtils;
 import org.apache.poi.util.LittleEndian;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -318,13 +317,9 @@ public class HwpTextExtractorV5 implements Serializable {
     private Key readKey(InputStream input) throws IOException {
         byte[] data = new byte[260];
 
-        if (IOUtils.readFully(input, data, 0, 4) != 4) { // TAG,
-            throw new EOFException();
-        }
+        IOUtils.readFully(input, data, 0, 4);
 
-        if (IOUtils.readFully(input, data, 0, 256) != 256) {
-            throw new EOFException();
-        }
+        IOUtils.readFully(input, data, 0, 256);
 
         SRand srand = new SRand(LittleEndian.getInt(data));
         byte xor = 0;
@@ -390,7 +385,7 @@ public class HwpTextExtractorV5 implements Serializable {
                     xhtml.endElement("p");
                 }
             } else {
-                reader.ensureSkip(tag.length);
+                reader.skipFully(tag.length);
             }
         }
     }