You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/02/11 17:02:18 UTC
[tika] branch main updated: TIKA-3679 -- prefer commons-io IOUtils skipFully and readFully to trigger EOF automatically where possible.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new e83195e TIKA-3679 -- prefer commons-io IOUtils skipFully and readFully to trigger EOF automatically where possible.
e83195e is described below
commit e83195ea0a401872451038f92c4e9cc62fad5273
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 11 11:55:51 2022 -0500
TIKA-3679 -- prefer commons-io IOUtils skipFully and readFully to trigger EOF automatically where possible.
---
.../java/org/apache/tika/parser/dwg/DWGParser.java | 9 ++---
.../java/org/apache/tika/parser/prt/PRTParser.java | 2 +-
.../microsoft/MSEmbeddedStreamTranslator.java | 2 +-
.../apache/tika/parser/microsoft/OfficeParser.java | 2 +-
.../parser/microsoft/rtf/RTFObjDataParser.java | 4 +-
.../apache/tika/parser/hwp/HwpStreamReader.java | 46 +++++-----------------
.../apache/tika/parser/hwp/HwpTextExtractorV5.java | 13 ++----
7 files changed, 22 insertions(+), 56 deletions(-)
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 8c2f087..385418c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -21,7 +21,7 @@ import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
-import org.apache.poi.util.IOUtils;
+import org.apache.commons.io.IOUtils;
import org.apache.poi.util.StringUtil;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
@@ -279,11 +279,8 @@ public class DWGParser extends AbstractParser {
if (offsetToSection == 0) {
return false;
}
- long skipped = IOUtils.skipFully(stream, toSkip);
- if (skipped != toSkip) {
- throw new TikaException("Failed to skip: " + toSkip +
- " bytes; skipped: " + skipped);
- }
+ IOUtils.skipFully(stream, toSkip);
+
return true;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index ecb7261..f17e693 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -24,7 +24,7 @@ import java.io.UnsupportedEncodingException;
import java.util.Collections;
import java.util.Set;
-import org.apache.poi.util.IOUtils;
+import org.apache.commons.io.IOUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
index 33f739e..2d51a86 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/extractor/microsoft/MSEmbeddedStreamTranslator.java
@@ -21,6 +21,7 @@ import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
+import org.apache.commons.io.IOUtils;
import org.apache.poi.poifs.filesystem.DirectoryEntry;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
@@ -28,7 +29,6 @@ import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.IOUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
index b02e3b5..3bd22da 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OfficeParser.java
@@ -28,6 +28,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.Set;
+import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.hdgf.extractor.VisioTextExtractor;
import org.apache.poi.hpbf.extractor.PublisherTextExtractor;
@@ -38,7 +39,6 @@ import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
import org.apache.poi.poifs.macros.VBAMacroReader;
-import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LocaleUtil;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
index 9eb17d8..c4b3a28 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/RTFObjDataParser.java
@@ -28,6 +28,7 @@ import java.util.Locale;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
import org.apache.poi.poifs.filesystem.DirectoryNode;
import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
@@ -36,7 +37,6 @@ import org.apache.poi.poifs.filesystem.FileMagic;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
@@ -65,7 +65,7 @@ class RTFObjDataParser {
/**
* Parses the embedded object/pict string
*
- * @param bytes actual bytes (already converted from the
+ * @param is actual bytes (already converted from the
* hex pair string stored in the embedded object data into actual bytes or read
* as raw binary bytes)
* @return a SimpleRTFEmbObj or null
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
index 76bef38..76fd648 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpStreamReader.java
@@ -20,11 +20,11 @@ import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
-import org.apache.poi.util.IOUtils;
+import org.apache.commons.io.IOUtils;
import org.apache.poi.util.LittleEndian;
public class HwpStreamReader {
- private byte[] skipBuffer = new byte[4096];
+
private InputStream input;
private byte[] buf;
@@ -41,12 +41,7 @@ public class HwpStreamReader {
* @throws IOException
*/
public short uint8() throws IOException {
- int read = IOUtils.readFully(input, buf, 0, 1);
-
- if (read == -1) {
- return -1;
- }
-
+ IOUtils.readFully(input, buf, 0, 1);
return LittleEndian.getUByte(buf);
}
@@ -57,15 +52,7 @@ public class HwpStreamReader {
* @throws IOException
*/
public int uint16() throws IOException {
- int read = IOUtils.readFully(input, buf, 0, 2);
-
- if (read == -1) {
- return -1;
- }
-
- if (read < 2) {
- throw new EOFException();
- }
+ IOUtils.readFully(input, buf, 0, 2);
return LittleEndian.getUShort(buf);
}
@@ -81,11 +68,8 @@ public class HwpStreamReader {
throw new IllegalArgumentException();
}
byte[] buf = new byte[i * 2];
- int read = IOUtils.readFully(input, buf, 0, i * 2);
+ IOUtils.readFully(input, buf, 0, i * 2);
- if (read != i * 2) {
- throw new EOFException();
- }
int[] uints = new int[i];
for (int ii = 0; ii < i; ii++) {
uints[ii] = LittleEndian.getUShort(buf, ii * 2);
@@ -101,7 +85,9 @@ public class HwpStreamReader {
* @throws IOException
*/
public long uint32() throws IOException {
- int read = IOUtils.readFully(input, buf, 0, 4);
+ //uint32 is used to try to read the next record.
+ //if nothing is read, we should not throw an EOF, we should return -1
+ int read = org.apache.poi.util.IOUtils.readFully(input, buf, 0, 4);
if (read == -1) {
return -1;
@@ -114,19 +100,7 @@ public class HwpStreamReader {
return LittleEndian.getUInt(buf);
}
- /**
- * ensure skip of n byte
- *
- * @param n
- * @throws IOException
- */
- public void ensureSkip(long n) throws IOException {
- //Leaving this for anyone who can figure out why this doesn't
- //work. See HwpV5ParserTest#testMultiThreadedSkipFully
- //long skipped = org.apache.tika.io.IOUtils.skip(input, n);
- long skipped = org.apache.tika.io.IOUtils.skip(input, n, skipBuffer);
- if (skipped != n) {
- throw new EOFException();
- }
+ public void skipFully(long toSkip) throws IOException {
+ IOUtils.skipFully(input, toSkip);
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index fc1c169..aaab29d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -16,7 +16,6 @@
*/
package org.apache.tika.parser.hwp;
-import java.io.EOFException;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
@@ -36,6 +35,7 @@ import javax.crypto.CipherInputStream;
import javax.crypto.NoSuchPaddingException;
import javax.crypto.spec.SecretKeySpec;
+import org.apache.commons.io.IOUtils;
import org.apache.commons.io.input.CloseShieldInputStream;
import org.apache.poi.hpsf.NoPropertySetStreamException;
import org.apache.poi.hpsf.Property;
@@ -46,7 +46,6 @@ import org.apache.poi.poifs.filesystem.DocumentEntry;
import org.apache.poi.poifs.filesystem.DocumentInputStream;
import org.apache.poi.poifs.filesystem.Entry;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
-import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LittleEndian;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
@@ -318,13 +317,9 @@ public class HwpTextExtractorV5 implements Serializable {
private Key readKey(InputStream input) throws IOException {
byte[] data = new byte[260];
- if (IOUtils.readFully(input, data, 0, 4) != 4) { // TAG,
- throw new EOFException();
- }
+ IOUtils.readFully(input, data, 0, 4);
- if (IOUtils.readFully(input, data, 0, 256) != 256) {
- throw new EOFException();
- }
+ IOUtils.readFully(input, data, 0, 256);
SRand srand = new SRand(LittleEndian.getInt(data));
byte xor = 0;
@@ -390,7 +385,7 @@ public class HwpTextExtractorV5 implements Serializable {
xhtml.endElement("p");
}
} else {
- reader.ensureSkip(tag.length);
+ reader.skipFully(tag.length);
}
}
}