You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2016/06/27 16:55:07 UTC

tika git commit: TIKA-2023 -- clean up RTFParser to use EndianUtils and IOUtils.readFully

Repository: tika
Updated Branches:
  refs/heads/2.x b14b47e76 -> 5bc597dc8


TIKA-2023 -- clean up RTFParser to use EndianUtils and IOUtils.readFully


Project: http://git-wip-us.apache.org/repos/asf/tika/repo
Commit: http://git-wip-us.apache.org/repos/asf/tika/commit/5bc597dc
Tree: http://git-wip-us.apache.org/repos/asf/tika/tree/5bc597dc
Diff: http://git-wip-us.apache.org/repos/asf/tika/diff/5bc597dc

Branch: refs/heads/2.x
Commit: 5bc597dc8d39f3248d849912b20b4f864f854a84
Parents: b14b47e
Author: tballison <ta...@mitre.org>
Authored: Mon Jun 27 12:55:00 2016 -0400
Committer: tballison <ta...@mitre.org>
Committed: Mon Jun 27 12:55:00 2016 -0400

----------------------------------------------------------------------
 .../java/org/apache/tika/io/EndianUtils.java    | 20 ++++++++++
 .../org/apache/tika/io/EndianUtilsTest.java     | 19 +++++++++
 .../tika/parser/rtf/RTFEmbObjHandler.java       |  7 +---
 .../tika/parser/rtf/RTFObjDataParser.java       | 41 ++++++++------------
 .../apache/tika/parser/rtf/TextExtractor.java   | 11 +-----
 5 files changed, 60 insertions(+), 38 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
----------------------------------------------------------------------
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 1e33986..3416f55 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -70,6 +70,26 @@ public class EndianUtils {
       return (ch1 << 8) + (ch2 << 0);
    }
 
+
+   /**
+    * Get a LE unsigned int (32-bit) value from an InputStream, widened to a long
+    *
+    * @param  stream the InputStream from which the four bytes are to be read
+    * @return                              the unsigned int (32-bit) value, in the range 0 to 4294967295
+    * @exception  IOException              will be propagated back to the caller
+    * @exception  BufferUnderrunException  if the stream ends before four bytes have been read
+    */
+   public static long readUIntLE(InputStream stream) throws IOException, BufferUnderrunException {
+      int ch1 = stream.read();
+      int ch2 = stream.read();
+      int ch3 = stream.read();
+      int ch4 = stream.read();
+      if ((ch1 | ch2 | ch3 | ch4) < 0) {
+         throw new BufferUnderrunException();
+      }
+      return ((ch4 << 24) + (ch3<<16) + (ch2 << 8) + (ch1 << 0)) & 0x00FFFFFFFFl;
+   }
+
    /**
     * Get a LE int value from an InputStream
     *

http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
----------------------------------------------------------------------
diff --git a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
index 7f10cdd..8f481c3 100644
--- a/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
+++ b/tika-core/src/test/java/org/apache/tika/io/EndianUtilsTest.java
@@ -18,6 +18,7 @@
 package org.apache.tika.io;
 
 import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.fail;
 
 import java.io.ByteArrayInputStream;
 
@@ -37,4 +38,22 @@ public class EndianUtilsTest {
         data = new byte[] { (byte)0xac, (byte)0xbe, 0x17 };
         assertEquals((long)728855, EndianUtils.readUE7(new ByteArrayInputStream(data)));
     }
+
+    @Test
+    public void testReadUIntLE() throws Exception {
+        byte[] data = new byte[] {(byte)0x08, (byte)0x00, (byte)0x00, (byte)0x00 };
+        assertEquals((long) 8, EndianUtils.readUIntLE(new ByteArrayInputStream(data)));
+
+        data = new byte[] {(byte)0xFF, (byte)0xFF, (byte)0xFF, (byte)0xFF };
+        assertEquals(4294967295L, EndianUtils.readUIntLE(new ByteArrayInputStream(data)));
+
+        data = new byte[] {(byte)0xFF, (byte)0xFF, (byte)0xFF  };
+        try {
+            EndianUtils.readUIntLE(new ByteArrayInputStream(data));
+            fail("Should have thrown exception");
+        } catch (EndianUtils.BufferUnderrunException e) {
+            // expected: data holds only three bytes, one short of a 32-bit value
+        }
+
+    }
 }

http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
index 183df62..1334906 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFEmbObjHandler.java
@@ -22,6 +22,7 @@ import java.io.InputStream;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.detect.Detector;
 import org.apache.tika.exception.TikaException;
@@ -157,11 +158,7 @@ class RTFEmbObjHandler {
         }
 
         byte[] bytes = new byte[len];
-        int bytesRead = is.read(bytes);
-        if (bytesRead < len) {
-            throw new TikaException("unexpected end of file: need " + len +
-                    " bytes of binary data, found " + (len - bytesRead));
-        }
+        IOUtils.readFully(is, bytes);
         os.write(bytes);
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
index cc9d62f..147d2e8 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/RTFObjDataParser.java
@@ -36,6 +36,7 @@ import org.apache.poi.poifs.filesystem.NPOIFSFileSystem;
 import org.apache.poi.poifs.filesystem.Ole10Native;
 import org.apache.poi.poifs.filesystem.Ole10NativeException;
 import org.apache.poi.util.IOUtils;
+import org.apache.tika.io.EndianUtils;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.RTFMetadata;
@@ -48,10 +49,6 @@ import org.apache.tika.parser.microsoft.OfficeParser.POIFSDocumentType;
  */
 class RTFObjDataParser {
 
-    private final static int[] INT_LE_POWS = new int[]{
-            1, 256, 65536, 16777216
-    };
-
     private final static String WIN_ASCII = "WINDOWS-1252";
 
     /**
@@ -195,7 +192,12 @@ class RTFObjDataParser {
 
         //should we add this to the metadata?
         readAnsiString(is); //iconFilePath
-        readUShort(is); //iconIndex
+        try {
+            //iconIndex
+            EndianUtils.readUShortBE(is);
+        } catch (EndianUtils.BufferUnderrunException e) {
+            throw new IOException(e);
+        }
         int type = readUShort(is); //type
 
         //1 is link, 3 is embedded object
@@ -209,7 +211,7 @@ class RTFObjDataParser {
         String ansiFilePath = readAnsiString(is); //filePath
         long bytesLen = readUInt(is);
         byte[] objBytes = initByteArray(bytesLen);
-        is.read(objBytes);
+        IOUtils.readFully(is, objBytes);
         StringBuilder unicodeFilePath = new StringBuilder();
 
         try {
@@ -248,24 +250,19 @@ class RTFObjDataParser {
 
 
     private int readUShort(InputStream is) throws IOException {
-        int lo = is.read();
-        int hi = is.read() * 256;
-        if (lo == -1 || hi == -1) {
-            throw new IOException("Hit end of stream before reading little endian unsigned short.");
+        try {
+            return EndianUtils.readUShortLE(is);
+        } catch (EndianUtils.BufferUnderrunException e) {
+            throw new IOException(e);
         }
-        return hi + lo;
     }
 
     private long readUInt(InputStream is) throws IOException {
-        long sum = 0;
-        for (int i = 0; i < 4; i++) {
-            int v = is.read();
-            if (v == -1) {
-                throw new IOException("Hit end of stream before finishing little endian unsigned int.");
-            }
-            sum += v * (long) INT_LE_POWS[i];
+        try {
+            return EndianUtils.readUIntLE(is);
+        } catch (EndianUtils.BufferUnderrunException e) {
+            throw new IOException(e);
         }
-        return sum;
     }
 
     private String readAnsiString(InputStream is) throws IOException {
@@ -296,11 +293,7 @@ class RTFObjDataParser {
     private byte[] readBytes(InputStream is, long len) throws IOException {
         //initByteArray tests for "reading of too many bytes"
         byte[] bytes = initByteArray(len);
-        int read = is.read(bytes);
-        if (read != len) {
-            throw new IOException("Hit end of stream before reading all bytes");
-        }
-
+        IOUtils.readFully(is, bytes);
         return bytes;
     }
 

http://git-wip-us.apache.org/repos/asf/tika/blob/5bc597dc/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
----------------------------------------------------------------------
diff --git a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index ee959f2..cf92406 100644
--- a/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parser-modules/tika-parser-office-module/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -33,6 +33,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.TimeZone;
 
+import org.apache.commons.io.IOUtils;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Office;
@@ -952,15 +953,7 @@ final class TextExtractor {
                         embObjHandler.reset();
                     }
                 } else {
-                    int bytesToRead = param;
-                    byte[] tmpArray = new byte[Math.min(1024, bytesToRead)];
-                    while (bytesToRead > 0) {
-                        int r = in.read(tmpArray, 0, Math.min(bytesToRead, tmpArray.length));
-                        if (r < 0) {
-                            throw new TikaException("unexpected end of file: need " + param + " bytes of binary data, found " + (param - bytesToRead));
-                        }
-                        bytesToRead -= r;
-                    }
+                    IOUtils.skipFully(in, param);
                 }
             } else {
                 // log some warning?