You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/27 16:55:07 UTC
tika git commit: TIKA-2023 -- clean up RTFParser to use EndianUtils
and IOUtils.readFully
Repository: tika
Updated Branches:
refs/heads/2.x b14b47e76 -> 5bc597dc8
TIKA-2023 -- clean up RTFParser to use EndianUtils and IOUtils.readFully
Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5bc597dc
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5bc597dc
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5bc597dc
Branch: refs/heads/2.x
Commit: 5bc597dc8d39f3248d849912b20b4f864f854a84
Parents: b14b47e
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 27 12:55:00 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 27 12:55:00 2016 -0400
----------------------------------------------------------------------
.../java/org/apache/tika/io/EndianUtils.java | 20 ++++++++++
.../org/apache/tika/io/EndianUtilsTest.java | 19 +++++++++
.../tika/parser/rtf/RTFEmbObjHandler.java | 7 +---
.../tika/parser/rtf/RTFObjDataParser.java | 41 ++++++++------------
.../apache/tika/parser/rtf/TextExtractor.java | 11 +-----
5 files changed, 60 insertions(+), 38 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 1e33986..3416f55 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -70,6 +70,26 @@ public class EndianUtils {
return (ch1 << 8) + (ch2 << 0);
}
+
+ /**
+ * Get a LE unsigned int value from an InputStream
+ *
+ * @param stream the InputStream from which the int is to be read
+ * @return the unsigned int (32-bit) value, widened to a long so it is never negative
+ * @exception IOException will be propagated back to the caller
+ * @exception BufferUnderrunException if the stream cannot provide enough bytes
+ */
+ public static long readUIntLE(InputStream stream) throws IOException, BufferUnderrunException {
+ int ch1 = stream.read();
+ int ch2 = stream.read();
+ int ch3 = stream.read();
+ int ch4 = stream.read();
+ if ((ch1 | ch2 | ch3 | ch4) < 0) {
+ throw new BufferUnderrunException();
+ }
+ return ((ch4 << 24) + (ch3<<16) + (ch2 << 8) + (ch1 << 0)) & 0xFFFFFFFFL;
+ }
+
/**
* Get a LE int value from an InputStream
*
http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
index 7f10cdd..8f481c3 100644
--- a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
@@ -18,6 +18,7 @@
package org.apache.tika.io;
import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
import java.io.ByteArrayInputStream;
@@ -37,4 +38,22 @@ public class EndianUtilsTest {
data = new byte[] { (byte)0xac, (byte)0xbe, 0x17 };
assertEquals((long)728855, EndianUtils.readUE7(new ByteArrayInputStream(data)));
}
+
+ @Test
+ public void testReadUIntLE() throws Exception {
+ byte[] data = new byte[] {(byte)0x08, (byte)0x00, (byte)0x00, (byte)0x00 };
+ assertEquals((long) 8, EndianUtils.readUIntLE(new ByteArrayInputStream(data)));
+ // all four bytes are 0xFF: the result must be 4294967295, not a negative number
+ data = new byte[] {(byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF };
+ assertEquals(4294967295L, EndianUtils.readUIntLE(new ByteArrayInputStream(data)));
+ // only three bytes supplied: the fourth read hits end of stream
+ data = new byte[] {(byte)0xFF, (byte)0xFF, (byte)0xFF };
+ try {
+ EndianUtils.readUIntLE(new ByteArrayInputStream(data));
+ fail("Should have thrown exception");
+ } catch (EndianUtils.BufferUnderrunException e) {
+ // expected: the stream ran out before four bytes could be read
+ }
+
+ }
}
http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
index 183df62..1334906 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.detect.Detector;
import org.apache.tika.exception.TikaException;
@@ -157,11 +158,7 @@ class RTFEmbObjHandler {
}
byte[] bytes = new byte[len];
- int bytesRead = is.read(bytes);
- if (bytesRead < len) {
- throw new TikaException("unexpected end of file: need " + len +
- " bytes of binary data, found " + (len - bytesRead));
- }
+ IOUtils.readFully(is, bytes);
os.write(bytes);
}
http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index cc9d62f..147d2e8 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -36,6 +36,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
import org.apache.poi.poifs.filesystem.Ole10Native;
import org.apache.poi.poifs.filesystem.Ole10NativeException;
import org.apache.poi.util.IOUtils;
+import org.apache.tika.io.EndianUtils;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.RTFMetadata;
@@ -48,10 +49,6 @@ import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
*/
class RTFObjDataParser {
- private final static int[] INT_LE_POWS = new int[]{
- 1, 256, 65536, 16777216
- };
-
private final static String WIN_ASCII = "WINDOWS-1252";
/**
@@ -195,7 +192,12 @@ class RTFObjDataParser {
//should we add this to the metadata?
readAnsiString(is); //iconFilePath
- readUShort(is); //iconIndex
+ try {
+ //iconIndex
+ EndianUtils.readUShortBE(is);
+ } catch (EndianUtils.BufferUnderrunException e) {
+ throw new IOException(e);
+ }
int type = readUShort(is); //type
//1 is link, 3 is embedded object
@@ -209,7 +211,7 @@ class RTFObjDataParser {
String ansiFilePath = readAnsiString(is); //filePath
long bytesLen = readUInt(is);
byte[] objBytes = initByteArray(bytesLen);
- is.read(objBytes);
+ IOUtils.readFully(is, objBytes);
StringBuilder unicodeFilePath = new StringBuilder();
try {
@@ -248,24 +250,19 @@ class RTFObjDataParser {
private int readUShort(InputStream is) throws IOException {
- int lo = is.read();
- int hi = is.read() * 256;
- if (lo == -1 || hi == -1) {
- throw new IOException("Hit end of stream before reading little endian unsigned short.");
+ try {
+ return EndianUtils.readUShortLE(is);
+ } catch (EndianUtils.BufferUnderrunException e) {
+ throw new IOException(e);
}
- return hi + lo;
}
private long readUInt(InputStream is) throws IOException {
- long sum = 0;
- for (int i = 0; i < 4; i++) {
- int v = is.read();
- if (v == -1) {
- throw new IOException("Hit end of stream before finishing little endian unsigned int.");
- }
- sum += v * (long) INT_LE_POWS[i];
+ try {
+ return EndianUtils.readUIntLE(is);
+ } catch (EndianUtils.BufferUnderrunException e) {
+ throw new IOException(e);
}
- return sum;
}
private String readAnsiString(InputStream is) throws IOException {
@@ -296,11 +293,7 @@ class RTFObjDataParser {
private byte[] readBytes(InputStream is, long len) throws IOException {
//initByteArray tests for "reading of too many bytes"
byte[] bytes = initByteArray(len);
- int read = is.read(bytes);
- if (read != len) {
- throw new IOException("Hit end of stream before reading all bytes");
- }
-
+ IOUtils.readFully(is, bytes);
return bytes;
}
http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index ee959f2..cf92406 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -33,6 +33,7 @@ import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
+import org.apache.commons.io.IOUtils;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -952,15 +953,7 @@ final class TextExtractor {
embObjHandler.reset();
}
} else {
- int bytesToRead = param;
- byte[] tmpArray = new byte[Math.min(1024, bytesToRead)];
- while (bytesToRead > 0) {
- int r = in.read(tmpArray, 0, Math.min(bytesToRead, tmpArray.length));
- if (r < 0) {
- throw new TikaException("unexpected end of file: need " + param + " bytes of binary data, found " + (param - bytesToRead));
- }
- bytesToRead -= r;
- }
+ IOUtils.skipFully(in, param);
}
} else {
// log some warning?