You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/02/11 14:41:51 UTC

[tika] branch branch_1x updated: TIKA-3677 -- remove sanity check language from the 1.x repo

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new ab709a5  TIKA-3677 -- remove sanity check language from the 1.x repo
ab709a5 is described below

commit ab709a5299be867c0e603116491faaa6546ed889
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 11 09:41:33 2022 -0500

    TIKA-3677 -- remove sanity check language from the 1.x repo
---
 .../src/main/java/org/apache/tika/cli/TikaCLI.java |   2 +-
 .../test/java/org/apache/tika/cli/TikaCLITest.java |   2 +-
 .../main/java/org/apache/tika/io/EndianUtils.java  |   4 +-
 .../chm/accessor/ChmDirectoryListingSet.java       |   1 -
 .../apache/tika/parser/chm/lzx/ChmLzxBlock.java    |  10 +-
 .../org/apache/tika/parser/chm/lzx/ChmSection.java |  14 +-
 .../java/org/apache/tika/parser/dbf/DBFCell.java   |   2 +
 .../java/org/apache/tika/parser/dwg/DWGParser.java |  16 +-
 .../tika/parser/executable/ExecutableParser.java   |   2 +-
 .../apache/tika/parser/hwp/HwpTextExtractorV5.java |  15 +-
 .../org/apache/tika/parser/image/BPGParser.java    |  28 ++-
 .../org/apache/tika/parser/image/ICNSParser.java   |  11 +-
 .../parser/microsoft/AbstractPOIFSExtractor.java   | 217 +++++++++++++--------
 .../tika/parser/microsoft/MSOwnerFileParser.java   |   3 +-
 .../tika/parser/microsoft/OutlookExtractor.java    |   2 +-
 .../java/org/apache/tika/parser/prt/PRTParser.java |   4 +-
 .../org/apache/tika/parser/rtf/TextExtractor.java  |   2 +-
 .../org/apache/tika/parser/video/FLVParser.java    |   9 +-
 .../org/apache/tika/parser/pdf/PDFParserTest.java  |   2 +-
 19 files changed, 226 insertions(+), 120 deletions(-)

diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index ce7e65a..79868b4 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -855,7 +855,7 @@ public class TikaCLI {
         Set<String> tikaLacking = new TreeSet<String>();
         Set<String> tikaNoMagic = new TreeSet<String>();
         
-        // Sanity check
+        // Plausibility check
         File dir = new File(magicDir);
         if ((new File(dir, "elf")).exists() &&
             (new File(dir, "mime")).exists() &&
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index ede3e95..8864fcc 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -290,7 +290,7 @@ public class TikaCLITest {
         String[] expectedChildren = new String[]{
                 "MBD002B040A.cdx",
                 "file4.png",
-                "MBD002B0FA6_file5.bin",
+                "MBD002B0FA6.bin",
                 "MBD00262FE3.txt",
                 "file0.emf"
         };
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 96ba1ae..3b1bc89 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -219,7 +219,9 @@ public class EndianUtils {
     public static long readUE7(InputStream stream) throws IOException {
         int i;
         long v = 0;
-        while ((i = stream.read()) >= 0) {
+        final int max = 6;
+        int read = 0;
+        while ((i = stream.read()) >= 0 && read++ < max) {
             v = v << 7;
             if ((i & 128) == 128) {
                 // Continues
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
index 5b3d3f6..f22031a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
@@ -141,7 +141,6 @@ public class ChmDirectoryListingSet {
             byte[] dir_chunk = null;
             Set<Integer> processed = new HashSet<>();
             for (int i = startPmgl; i>=0; ) {
-                dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
                 int start = i * (int) chmItspHeader.getBlock_len() + dir_offset;
                 dir_chunk = ChmCommons
                         .copyOfRange(getData(), start,
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
index b5ea37a..040c7af 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.chm.lzx;
 import java.math.BigInteger;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
 import org.apache.tika.parser.chm.core.ChmCommons;
 import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
 import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
@@ -36,6 +37,9 @@ import org.apache.tika.parser.chm.exception.ChmParsingException;
  * 
  */
 public class ChmLzxBlock {
+
+    private static int MAX_CONTENT_SIZE = 50 * 1024 * 1024;
+
     private int block_number;
     private long block_length;
     private ChmLzxState state;
@@ -856,7 +860,11 @@ public class ChmLzxBlock {
                 start, getContent().length) : new byte[1];
     }
 
-    private void setContent(int contentLength) {
+    private void setContent(int contentLength) throws TikaMemoryLimitException {
+        if (contentLength > MAX_CONTENT_SIZE) {
+            throw new TikaMemoryLimitException("content length (" + contentLength +
+                    " bytes) is > MAX_CONTENT_SIZE");
+        }
         this.content = new byte[contentLength];
     }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
index 77f9b3a..05bea7c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
@@ -20,6 +20,7 @@ import java.math.BigInteger;
 import java.util.Arrays;
 
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
 import org.apache.tika.parser.chm.core.ChmCommons;
 
 public class ChmSection {
@@ -94,8 +95,11 @@ public class ChmSection {
     public byte[] getPrevContent() {
         return prevcontent;
     }
-    
-    public BigInteger getBigInteger(int i) {
+
+    public BigInteger getBigInteger(int i) throws TikaException {
+        if (i > 8) {
+            throw new TikaMemoryLimitException("Big integer can't be > 8");
+        }
         if (getData() == null)
             return BigInteger.ZERO;
         if (getData().length - getSwath() < i)
@@ -115,16 +119,16 @@ public class ChmSection {
             byteval[i] = (byte) c[i];
         return byteval;
     }
-
+/*
     public BigInteger unmarshalUlong() {
         return getBigInteger(8);
     }
 
     public long unmarshalUInt() {
         return getBigInteger(4).longValue();
-    }
+    }*/
 
-    public int unmarshalInt() {
+    public int unmarshalInt() throws TikaException {
         return getBigInteger(4).intValue();
     }
 
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
index fb13ef8..5e7ba0f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
@@ -42,6 +42,7 @@ class DBFCell {
     DBFCell(DBFColumnHeader.ColType colType, int fieldLength, int decimalCount) {
         this.colType = colType;
         this.decimalCount = decimalCount;
+        //field length is limit-checked in DBFFileHeader
         this.bytes = new byte[fieldLength];
     }
 
@@ -78,6 +79,7 @@ class DBFCell {
      * @return copy of bytes that were read on the last read
      */
     byte[] getBytes() {
+        //bytesReadLast is effectively limit checked by DBFFileHeader
         byte[] ret = new byte[bytesReadLast];
         System.arraycopy(bytes, 0, ret, 0, bytesReadLast);
         return ret;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 2cfb53d..1f0859a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -276,7 +276,7 @@ public class DWGParser extends AbstractParser {
         // The offset is stored in the header from 0x20 onwards
         long offsetToSection = EndianUtils.getLongLE(header, 0x20);
         
-        // Sanity check the offset. Some files seem to use a different format,
+        // Check the offset. Some files seem to use a different format,
         //  and the offset isn't available at 0x20. Until we can work out how
         //  to find the offset in those files, skip them if detected
         if (offsetToSection > 0xa00000l) {
@@ -284,15 +284,15 @@ public class DWGParser extends AbstractParser {
            offsetToSection = 0;
         }
         
-        // Work out how far to skip, and sanity check
+        // Work out how far to skip, and check plausibility
         long toSkip = offsetToSection - header.length;
         if(offsetToSection == 0){
             return false;
-        }        
-        while (toSkip > 0) {
-            byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
-            IOUtils.readFully(stream, skip);
-            toSkip -= skip.length;
+        }
+        long skipped = IOUtils.skipFully(stream, toSkip);
+        if (skipped != toSkip) {
+            throw new TikaException("Failed to skip: " + toSkip +
+                    " bytes; skipped: " + skipped);
         }
         return true;
     }
@@ -339,7 +339,7 @@ public class DWGParser extends AbstractParser {
           // We should now have the count
           int count = EndianUtils.readUShortLE(stream);
           
-          // Sanity check it
+          // Plausibility check
           if(count > 0 && count < 0x7f) {
              // Looks plausible
              return count;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
index b962e42..9720970 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
@@ -97,7 +97,7 @@ public class ExecutableParser extends AbstractParser implements MachineMetadata
        // Grab the PE header offset
        int peOffset = LittleEndian.readInt(stream);
        
-       // Sanity check - while it may go anywhere, it's normally in the first few kb
+       // Plausibility check: while it may go anywhere, it's normally in the first few kb
        if (peOffset > 4096 || peOffset < 0x3f) return;
        
        // Skip the rest of the MS-DOS stub (if PE), until we reach what should
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index 48b8c02..6abf929 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -51,6 +51,7 @@ import org.apache.poi.util.IOUtils;
 import org.apache.poi.util.LittleEndian;
 import org.apache.tika.exception.EncryptedDocumentException;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
 import org.apache.tika.exception.UnsupportedFormatException;
 import org.apache.tika.io.CloseShieldInputStream;
 import org.apache.tika.metadata.Metadata;
@@ -72,6 +73,8 @@ public class HwpTextExtractorV5 implements Serializable {
 
     private static final int HWPTAG_BEGIN = 0x010;
 
+    private static final int MAX_TAG_LENGTH = 50 * 1024 * 1024;
+
     private static final int I = 1; // INLINE
     private static final int C = 2; // CONTROL
     private static final int X = 3; // EXTENDED
@@ -249,7 +252,8 @@ public class HwpTextExtractorV5 implements Serializable {
      * @throws SAXException
      */
     private void parseBodyText(FileHeader header, DirectoryNode root,
-                               XHTMLContentHandler xhtml) throws IOException, SAXException {
+                               XHTMLContentHandler xhtml)
+            throws IOException, SAXException, TikaMemoryLimitException {
         // read BodyText
         Entry bodyText = root.getEntry("BodyText");
         if (bodyText == null || !bodyText.isDirectoryEntry()) {
@@ -289,7 +293,8 @@ public class HwpTextExtractorV5 implements Serializable {
      * @throws IOException
      */
     private void parseViewText(FileHeader header, DirectoryNode root,
-                               XHTMLContentHandler xhtml) throws IOException {
+                               XHTMLContentHandler xhtml)
+            throws IOException, TikaMemoryLimitException {
         // read BodyText
         Entry bodyText = root.getEntry("ViewText");
         if (bodyText == null || !bodyText.isDirectoryEntry()) {
@@ -383,7 +388,7 @@ public class HwpTextExtractorV5 implements Serializable {
      * @throws SAXException
      */
     private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
-            throws IOException, SAXException {
+            throws IOException, SAXException, TikaMemoryLimitException {
         StringBuilder buf = new StringBuilder();
         TagInfo tag = new TagInfo();
 
@@ -395,6 +400,9 @@ public class HwpTextExtractorV5 implements Serializable {
                 if (tag.length % 2 != 0) {
                     throw new IOException("Invalid block size");
                 }
+                if (tag.length > MAX_TAG_LENGTH) {
+                    throw new TikaMemoryLimitException("Tags must be smaller than " + MAX_TAG_LENGTH);
+                }
                 buf.setLength(0);
                 writeParaText(reader, tag.length, buf);
 
@@ -422,6 +430,7 @@ public class HwpTextExtractorV5 implements Serializable {
      */
     private void writeParaText(HwpStreamReader reader, long datasize,
                                StringBuilder buf) throws IOException {
+        //datasize is bounds checked before calling writeParaText
         int[] chars = reader.uint16((int) (datasize / 2));
 
         for (int index = 0; index < chars.length; index++) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
index 2a48a55..2d0e0d3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
@@ -24,7 +24,10 @@ import java.util.HashSet;
 import java.util.Set;
 
 import org.apache.poi.util.IOUtils;
+
+import org.apache.tika.config.Field;
 import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
 import org.apache.tika.io.EndianUtils;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.Photoshop;
@@ -37,7 +40,7 @@ import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 
 /**
- * Parser for the Better Portable Graphics )BPG) File Format.
+ * Parser for the Better Portable Graphics (BPG) File Format.
  * <p/>
  * Documentation on the file format is available from
  * http://bellard.org/bpg/bpg_spec.txt
@@ -48,10 +51,17 @@ public class BPGParser extends AbstractParser {
     protected static final int EXTENSION_TAG_XMP = 3;
     protected static final int EXTENSION_TAG_THUMBNAIL = 4;
     private static final long serialVersionUID = -161736541253892772L;
+
+    //50 MB -- throw TikaMemoryLimitException if xmp or exif is allegedly longer than this
+    private static final int DEFAULT_MAX_RECORD_LENGTH = 50 * 1024 * 1024;
+
     private static final Set<MediaType> SUPPORTED_TYPES =
             Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
                     MediaType.image("x-bpg"), MediaType.image("bpg"))));
 
+    private int maxRecordLength = DEFAULT_MAX_RECORD_LENGTH;
+
+
     public Set<MediaType> getSupportedTypes(ParseContext context) {
         return SUPPORTED_TYPES;
     }
@@ -144,6 +154,12 @@ public class BPGParser extends AbstractParser {
             while (extensionsDataSeen < extensionDataLength) {
                 int extensionType = (int) EndianUtils.readUE7(stream);
                 int extensionLength = (int) EndianUtils.readUE7(stream);
+                if (extensionLength > maxRecordLength) {
+                    throw new TikaMemoryLimitException("extension length (" +
+                            extensionLength + " bytes) is greater than 'maxRecordLength' (" +
+                            maxRecordLength + " bytes).  If this file is not corrupt, " +
+                            "consider bumping the maxRecordLength via tika-config.xml");
+                }
                 switch (extensionType) {
                     case EXTENSION_TAG_EXIF:
                         metadataExtractor.parseRawExif(stream, extensionLength, true);
@@ -152,7 +168,10 @@ public class BPGParser extends AbstractParser {
                         handleXMP(stream, extensionLength, metadataExtractor);
                         break;
                     default:
-                        stream.skip(extensionLength);
+                        long skipped = IOUtils.skipFully(stream, extensionLength);
+                        if (skipped != extensionLength) {
+                            throw new TikaException("failed to skip required amount");
+                        }
                 }
                 extensionsDataSeen += extensionLength;
             }
@@ -168,6 +187,11 @@ public class BPGParser extends AbstractParser {
         xhtml.endDocument();
     }
 
+    @Field
+    public void setMaxRecordLength(int maxRecordLength) {
+        this.maxRecordLength = maxRecordLength;
+    }
+
     protected void handleXMP(InputStream stream, int xmpLength,
                              ImageMetadataExtractor extractor) throws IOException, TikaException, SAXException {
         byte[] xmp = new byte[xmpLength];
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ICNSParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ICNSParser.java
index 47d89ab..8e27adf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ICNSParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ICNSParser.java
@@ -62,9 +62,16 @@ public class ICNSParser extends AbstractParser {
         int image_length = java.nio.ByteBuffer.wrap(header).getInt();
         if (image_length > MAX_IMAGE_LENGTH_BYTES) {
             throw new TikaMemoryLimitException(image_length, MAX_IMAGE_LENGTH_BYTES);
+        } else if (image_length < 0) {
+            throw new TikaException("image length must be >= 0");
+        }
+        //image_length includes the initial 8 bytes.
+        int actualImageLength = image_length - 8;
+        byte[] full_file = new byte[actualImageLength];
+        long read = IOUtils.readFully(stream, full_file);
+        if (read != actualImageLength) {
+            throw new IOException("file not fully read from stream");
         }
-        byte[] full_file = new byte[image_length];
-        IOUtils.readFully(stream, full_file);
         ArrayList<ICNSType> icons = new ArrayList<>();
         ArrayList<ICNSType> icon_masks = new ArrayList<>();
         byte[] tempByteArray = new byte[4];
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index c9c409d..3f97249 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -149,28 +149,37 @@ abstract class AbstractPOIFSExtractor {
     /**
      * Handle an office document that's embedded at the POIFS level
      */
-    protected void handleEmbeddedOfficeDoc(
-            DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml)
+    protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName,
+                                           XHTMLContentHandler xhtml)
             throws IOException, SAXException, TikaException {
 
+
         // Is it an embedded OLE2 document, or an embedded OOXML document?
+        //first try for ooxml
+        Entry ooxml = dir.hasEntry("Package") ? dir.getEntry("Package") :
+                (dir.hasEntry("package") ? dir.getEntry("package") : null);
 
-        if (dir.hasEntry("Package")) {
+        if (ooxml != null) {
             // It's OOXML (has a ZipFile):
-            Entry ooxml = dir.getEntry("Package");
+            Metadata metadata = new Metadata();
+            metadata.set(Metadata.CONTENT_LENGTH,
+                    Integer.toString(((DocumentEntry)ooxml).getSize()));
+            try (TikaInputStream stream = TikaInputStream
+                    .get(new DocumentInputStream((DocumentEntry) ooxml))) {
 
-            try (TikaInputStream stream = TikaInputStream.get(
-                    new DocumentInputStream((DocumentEntry) ooxml))) {
-                ZipContainerDetector detector = new ZipContainerDetector();
+                Detector detector = new ZipContainerDetector();
                 MediaType type = null;
                 try {
-                    //if there's a stream error while detecting...
-                    type = detector.detect(stream, new Metadata());
+                    type = detector.detect(stream, metadata);
+                } catch (SecurityException e) {
+                    throw e;
                 } catch (Exception e) {
+                    //if there's a stream error while detecting, give up
                     EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
                     return;
                 }
-                handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
+                handleEmbeddedResource(stream, metadata,null, dir.getName(), dir.getStorageClsid(),
+                        type.toString(), xhtml, true);
                 return;
             }
         }
@@ -181,91 +190,131 @@ abstract class AbstractPOIFSExtractor {
         Metadata metadata = new Metadata();
         metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
         if (dir.getStorageClsid() != null) {
-            metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
+            metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID,
+                    dir.getStorageClsid().toString());
         }
         POIFSDocumentType type = POIFSDocumentType.detectType(dir);
-        TikaInputStream embedded = null;
         String rName = (resourceName == null) ? dir.getName() : resourceName;
+        if (type == POIFSDocumentType.OLE10_NATIVE) {
+            handleOLENative(dir, type, rName, metadata, xhtml);
+        } else if (type == POIFSDocumentType.COMP_OBJ) {
+            handleCompObj(dir, type, rName, metadata, xhtml);
+        } else {
+            metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+            metadata.set(Metadata.RESOURCE_NAME_KEY,
+                    rName + '.' + type.getExtension());
+            parseEmbedded(dir, xhtml, metadata);
+        }
+    }
+
+    private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rName,
+                               Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, SAXException {
+        //TODO: figure out if the equivalent of OLE 1.0's
+        //getCommand() and getFileName() exist for OLE 2.0 to populate
+        //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+
+        // Grab the contents and process
+        DocumentEntry contentsEntry;
         try {
-            if (type == POIFSDocumentType.OLE10_NATIVE) {
-                try {
-                    // Try to un-wrap the OLE10Native record:
-                    Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
-                    if (ole.getLabel() != null) {
-                        metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
-                    }
-                    if (ole.getCommand() != null) {
-                        metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
-                    }
-                    if (ole.getFileName() != null) {
-                        metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
-                    }
-                    byte[] data = ole.getDataBuffer();
-                    embedded = TikaInputStream.get(data);
-                } catch (Ole10NativeException ex) {
-                    // Not a valid OLE10Native record, skip it
-                } catch (Exception e) {
-                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
-                    return;
-                }
-            } else if (type == POIFSDocumentType.COMP_OBJ) {
-                try {
-                    //TODO: figure out if the equivalent of OLE 1.0's
-                    //getCommand() and getFileName() exist for OLE 2.0 to populate
-                    //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+            contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
+        } catch (FileNotFoundException fnfe1) {
+            try {
+                contentsEntry = (DocumentEntry) dir.getEntry("Contents");
+            } catch (FileNotFoundException fnfe2) {
+                EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe2, parentMetadata);
+                return;
+            }
+        }
+        int length = contentsEntry.getSize();
+        DocumentInputStream inp = null;
+        try {
+            inp = new DocumentInputStream(contentsEntry);
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+            return;
+        }
+        try (TikaInputStream tis = TikaInputStream.get(inp)) {
+            // Try to work out what it is
+            MediaType mediaType = getDetector().detect(tis, metadata);
+            String extension = type.getExtension();
+            try {
+                MimeType mimeType = getMimeTypes().forName(mediaType.toString());
+                extension = mimeType.getExtension();
+            } catch (MimeTypeException mte) {
+                // No details on this type are known
+            }
 
-                    // Grab the contents and process
-                    DocumentEntry contentsEntry;
-                    try {
-                        contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
-                    } catch (FileNotFoundException ioe) {
-                        contentsEntry = (DocumentEntry) dir.getEntry("Contents");
-                    }
-                    DocumentInputStream inp = new DocumentInputStream(contentsEntry);
-                    byte[] contents = new byte[contentsEntry.getSize()];
-                    inp.readFully(contents);
-                    embedded = TikaInputStream.get(contents);
+            // Record what we can do about it
+            metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
+            metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
+            metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
+            parseEmbedded(dir, tis, xhtml, metadata);
+        } finally {
+            inp.close();
+        }
+    }
 
-                    // Try to work out what it is
-                    MediaType mediaType = getDetector().detect(embedded, new Metadata());
-                    String extension = type.getExtension();
-                    try {
-                        MimeType mimeType = getMimeTypes().forName(mediaType.toString());
-                        extension = mimeType.getExtension();
-                    } catch (MimeTypeException mte) {
-                        // No details on this type are known
-                    }
 
-                    // Record what we can do about it
-                    metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
-                    metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
-                } catch (Exception e) {
-                    EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
-                    return;
-                }
+    private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName,
+                                 Metadata metadata, XHTMLContentHandler xhtml)
+            throws IOException, SAXException {
+        byte[] data = null;
+        try {
+            // Try to un-wrap the OLE10Native record:
+            Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
+            if (ole.getLabel() != null) {
+                metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
             } else {
-                metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
-                metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
+                metadata.add(Metadata.RESOURCE_NAME_KEY, rName);
             }
-
-            // Should we parse it?
-            if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
-                if (embedded == null) {
-                    // Make a TikaInputStream that just
-                    // passes the root directory of the
-                    // embedded document, and is otherwise
-                    // empty (byte[0]):
-                    embedded = TikaInputStream.get(new byte[0]);
-                    embedded.setOpenContainer(dir);
-                }
-                embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true);
+            if (ole.getCommand() != null) {
+                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
             }
-        } catch (IOException e) {
-            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
-        } finally {
-            if (embedded != null) {
-                embedded.close();
+            if (ole.getFileName() != null) {
+                metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+            }
+            data = ole.getDataBuffer();
+            metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
+        } catch (Ole10NativeException ex) {
+            // Not a valid OLE10Native record, skip it
+        } catch (SecurityException e) {
+            throw e;
+        } catch (Exception e) {
+            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+            return;
+        }
+        try (TikaInputStream tis = TikaInputStream.get(data)) {
+            parseEmbedded(dir, tis, xhtml, metadata);
+        }
+    }
+
+    private void parseEmbedded(DirectoryEntry dir, TikaInputStream tis, XHTMLContentHandler xhtml,
+                               Metadata metadata) throws IOException, SAXException {
+        if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
+            return;
+        }
+        if (dir.getStorageClsid() != null) {
+            metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID,
+                    dir.getStorageClsid().toString());
+        }
+        embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
+    }
+
+    private void parseEmbedded(DirectoryEntry dir, XHTMLContentHandler xhtml, Metadata metadata)
+            throws IOException, SAXException {
+        if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
+            return;
+        }
+        try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
+            tis.setOpenContainer(dir);
+            if (dir.getStorageClsid() != null) {
+                metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID,
+                        dir.getStorageClsid().toString());
             }
+            embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
         }
     }
 }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
index df0cc73..301778b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -66,7 +66,7 @@ public class MSOwnerFileParser extends AbstractParser {
         byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH];
         IOUtils.readFully(stream, asciiNameBytes);
         int asciiNameLength = (int)asciiNameBytes[0];//don't need to convert to unsigned int because it can't be that long
-        //sanity check name length
+        //check name length
         if (asciiNameLength < 0) {
             throw new TikaException("ascii name length must be >= 0");
         } else if (asciiNameLength > ASCII_CHUNK_LENGTH) {
@@ -79,6 +79,7 @@ public class MSOwnerFileParser extends AbstractParser {
         int unicodeCharLength = stream.read();
         if (asciiNameLength == unicodeCharLength) {
             stream.read();//zero after the char length
+            //unicodeCharLength equals asciiNameLength here, so it has already been bounds checked
             byte[] unicodeBytes = new byte[unicodeCharLength * 2];
             IOUtils.readFully(stream, unicodeBytes);
             String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index cf6f51d..33b7fbf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -587,7 +587,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
      */
     private void guess7BitEncoding(MAPIMessage msg) {
         Chunks mainChunks = msg.getMainChunks();
-        //sanity check
+        //getMainChunks() can return null; there is nothing to inspect in that case
         if (mainChunks == null) {
             return;
         }
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index ddb45f6..f20a737 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -144,7 +144,7 @@ public class PRTParser extends AbstractParser {
        
        int length = EndianUtils.readUShortLE(stream);
        if(length <= MAX_SANE_TEXT_LENGTH) {
-          // Length sanity check passed
+          // Length check passed
           handleText(length, stream, xhtml);
        }
     }
@@ -170,7 +170,7 @@ public class PRTParser extends AbstractParser {
           IOUtils.readFully(stream, b2);
           int length = EndianUtils.getUShortLE(b2);
           if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
-             // Length sanity check passed
+             // Length check passed
              handleText(length, stream, xhtml);
           } else {
              // Was probably something else
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index 4c062f2..4b37ba6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -948,7 +948,7 @@ final class TextExtractor {
                 } else if (equals("listtemplateid")) {
                     currentList.templateID = param;
                 } else if (equals("levelnfc") || equals("levelnfcn")) {
-                    //sanity check to make sure list information isn't corrupt
+                    //check to make sure list information isn't corrupt
                     if (listTableLevel > -1 &&
                             listTableLevel < currentList.numberType.length) {
                         currentList.numberType[listTableLevel] = param;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java
index 947b694..bfca9a7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java
@@ -83,9 +83,10 @@ public class FLVParser extends AbstractParser {
     }
 
     private int readUInt24(DataInputStream input) throws IOException {
-        int uint = input.read()<<16;
-        uint += input.read()<<8;
-        uint += input.read(); 
+        //readUnsignedByte() throws EOFException at end of stream, unlike read(), which returns -1
+        int uint = input.readUnsignedByte()<<16;
+        uint += input.readUnsignedByte()<<8;
+        uint += input.readUnsignedByte();
         return uint;
     }
 
@@ -209,7 +210,7 @@ public class FLVParser extends AbstractParser {
                 break;
             }
 
-            int datalen = readUInt24(datainput); //body length
+            final int datalen = readUInt24(datainput); //body length
             readUInt32(datainput); // timestamp
             readUInt24(datainput); // streamid
 
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 1952939..4fc79d2 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1307,7 +1307,7 @@ public class PDFParserTest extends TikaTest {
         context.set(PDFParserConfig.class, config);
 
         List<Metadata> metadataList = getRecursiveMetadata("testPDF_childAttachments.pdf", context);
-        //sanity check
+        //plausibility check
         assertEquals(5, metadataList.size());
         //inlined jpeg metadata
         Metadata jpegMetadata = metadataList.get(1);