You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/02/11 14:41:51 UTC
[tika] branch branch_1x updated: TIKA-3677 -- remove sanity check language from the 1.x repo
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new ab709a5 TIKA-3677 -- remove sanity check language from the 1.x repo
ab709a5 is described below
commit ab709a5299be867c0e603116491faaa6546ed889
Author: tallison <ta...@apache.org>
AuthorDate: Fri Feb 11 09:41:33 2022 -0500
TIKA-3677 -- remove sanity check language from the 1.x repo
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../test/java/org/apache/tika/cli/TikaCLITest.java | 2 +-
.../main/java/org/apache/tika/io/EndianUtils.java | 4 +-
.../chm/accessor/ChmDirectoryListingSet.java | 1 -
.../apache/tika/parser/chm/lzx/ChmLzxBlock.java | 10 +-
.../org/apache/tika/parser/chm/lzx/ChmSection.java | 14 +-
.../java/org/apache/tika/parser/dbf/DBFCell.java | 2 +
.../java/org/apache/tika/parser/dwg/DWGParser.java | 16 +-
.../tika/parser/executable/ExecutableParser.java | 2 +-
.../apache/tika/parser/hwp/HwpTextExtractorV5.java | 15 +-
.../org/apache/tika/parser/image/BPGParser.java | 28 ++-
.../org/apache/tika/parser/image/ICNSParser.java | 11 +-
.../parser/microsoft/AbstractPOIFSExtractor.java | 217 +++++++++++++--------
.../tika/parser/microsoft/MSOwnerFileParser.java | 3 +-
.../tika/parser/microsoft/OutlookExtractor.java | 2 +-
.../java/org/apache/tika/parser/prt/PRTParser.java | 4 +-
.../org/apache/tika/parser/rtf/TextExtractor.java | 2 +-
.../org/apache/tika/parser/video/FLVParser.java | 9 +-
.../org/apache/tika/parser/pdf/PDFParserTest.java | 2 +-
19 files changed, 226 insertions(+), 120 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index ce7e65a..79868b4 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -855,7 +855,7 @@ public class TikaCLI {
Set<String> tikaLacking = new TreeSet<String>();
Set<String> tikaNoMagic = new TreeSet<String>();
- // Sanity check
+ // Plausibility check
File dir = new File(magicDir);
if ((new File(dir, "elf")).exists() &&
(new File(dir, "mime")).exists() &&
diff --git a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
index ede3e95..8864fcc 100644
--- a/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
+++ b/tika-app/src/test/java/org/apache/tika/cli/TikaCLITest.java
@@ -290,7 +290,7 @@ public class TikaCLITest {
String[] expectedChildren = new String[]{
"MBD002B040A.cdx",
"file4.png",
- "MBD002B0FA6_file5.bin",
+ "MBD002B0FA6.bin",
"MBD00262FE3.txt",
"file0.emf"
};
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 96ba1ae..3b1bc89 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -219,7 +219,9 @@ public class EndianUtils {
public static long readUE7(InputStream stream) throws IOException {
int i;
long v = 0;
- while ((i = stream.read()) >= 0) {
+ final int max = 6;
+ int read = 0;
+ while ((i = stream.read()) >= 0 && read++ < max) {
v = v << 7;
if ((i & 128) == 128) {
// Continues
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
index 5b3d3f6..f22031a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/accessor/ChmDirectoryListingSet.java
@@ -141,7 +141,6 @@ public class ChmDirectoryListingSet {
byte[] dir_chunk = null;
Set<Integer> processed = new HashSet<>();
for (int i = startPmgl; i>=0; ) {
- dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
int start = i * (int) chmItspHeader.getBlock_len() + dir_offset;
dir_chunk = ChmCommons
.copyOfRange(getData(), start,
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
index b5ea37a..040c7af 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmLzxBlock.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.chm.lzx;
import java.math.BigInteger;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.parser.chm.core.ChmCommons;
import org.apache.tika.parser.chm.core.ChmCommons.IntelState;
import org.apache.tika.parser.chm.core.ChmCommons.LzxState;
@@ -36,6 +37,9 @@ import org.apache.tika.parser.chm.exception.ChmParsingException;
*
*/
public class ChmLzxBlock {
+
+ private static int MAX_CONTENT_SIZE = 50 * 1024 * 1024;
+
private int block_number;
private long block_length;
private ChmLzxState state;
@@ -856,7 +860,11 @@ public class ChmLzxBlock {
start, getContent().length) : new byte[1];
}
- private void setContent(int contentLength) {
+ private void setContent(int contentLength) throws TikaMemoryLimitException {
+ if (contentLength > MAX_CONTENT_SIZE) {
+ throw new TikaMemoryLimitException("content length (" + contentLength +
+ " bytes) is > MAX_CONTENT_SIZE");
+ }
this.content = new byte[contentLength];
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
index 77f9b3a..05bea7c 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/chm/lzx/ChmSection.java
@@ -20,6 +20,7 @@ import java.math.BigInteger;
import java.util.Arrays;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.parser.chm.core.ChmCommons;
public class ChmSection {
@@ -94,8 +95,11 @@ public class ChmSection {
public byte[] getPrevContent() {
return prevcontent;
}
-
- public BigInteger getBigInteger(int i) {
+
+ public BigInteger getBigInteger(int i) throws TikaException {
+ if (i > 8) {
+ throw new TikaMemoryLimitException("Big integer can't be > 8");
+ }
if (getData() == null)
return BigInteger.ZERO;
if (getData().length - getSwath() < i)
@@ -115,16 +119,16 @@ public class ChmSection {
byteval[i] = (byte) c[i];
return byteval;
}
-
+/*
public BigInteger unmarshalUlong() {
return getBigInteger(8);
}
public long unmarshalUInt() {
return getBigInteger(4).longValue();
- }
+ }*/
- public int unmarshalInt() {
+ public int unmarshalInt() throws TikaException {
return getBigInteger(4).intValue();
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
index fb13ef8..5e7ba0f 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
@@ -42,6 +42,7 @@ class DBFCell {
DBFCell(DBFColumnHeader.ColType colType, int fieldLength, int decimalCount) {
this.colType = colType;
this.decimalCount = decimalCount;
+ //field length is limit-checked in DBFFileHeader
this.bytes = new byte[fieldLength];
}
@@ -78,6 +79,7 @@ class DBFCell {
* @return copy of bytes that were read on the last read
*/
byte[] getBytes() {
+ //bytesReadLast is effectively limit checked by DBFFileHeader
byte[] ret = new byte[bytesReadLast];
System.arraycopy(bytes, 0, ret, 0, bytesReadLast);
return ret;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 2cfb53d..1f0859a 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -276,7 +276,7 @@ public class DWGParser extends AbstractParser {
// The offset is stored in the header from 0x20 onwards
long offsetToSection = EndianUtils.getLongLE(header, 0x20);
- // Sanity check the offset. Some files seem to use a different format,
+ // Check the offset. Some files seem to use a different format,
// and the offset isn't available at 0x20. Until we can work out how
// to find the offset in those files, skip them if detected
if (offsetToSection > 0xa00000l) {
@@ -284,15 +284,15 @@ public class DWGParser extends AbstractParser {
offsetToSection = 0;
}
- // Work out how far to skip, and sanity check
+ // Work out how far to skip, and check plausibility
long toSkip = offsetToSection - header.length;
if(offsetToSection == 0){
return false;
- }
- while (toSkip > 0) {
- byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
- IOUtils.readFully(stream, skip);
- toSkip -= skip.length;
+ }
+ long skipped = IOUtils.skipFully(stream, toSkip);
+ if (skipped != toSkip) {
+ throw new TikaException("Failed to skip: " + toSkip +
+ " bytes; skipped: " + skipped);
}
return true;
}
@@ -339,7 +339,7 @@ public class DWGParser extends AbstractParser {
// We should now have the count
int count = EndianUtils.readUShortLE(stream);
- // Sanity check it
+ // Plausibility check
if(count > 0 && count < 0x7f) {
// Looks plausible
return count;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
index b962e42..9720970 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
@@ -97,7 +97,7 @@ public class ExecutableParser extends AbstractParser implements MachineMetadata
// Grab the PE header offset
int peOffset = LittleEndian.readInt(stream);
- // Sanity check - while it may go anywhere, it's normally in the first few kb
+ // Plausibility check: while it may go anywhere, it's normally in the first few kb
if (peOffset > 4096 || peOffset < 0x3f) return;
// Skip the rest of the MS-DOS stub (if PE), until we reach what should
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index 48b8c02..6abf929 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -51,6 +51,7 @@ import org.apache.poi.util.IOUtils;
import org.apache.poi.util.LittleEndian;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.io.CloseShieldInputStream;
import org.apache.tika.metadata.Metadata;
@@ -72,6 +73,8 @@ public class HwpTextExtractorV5 implements Serializable {
private static final int HWPTAG_BEGIN = 0x010;
+ private static final int MAX_TAG_LENGTH = 50 * 1024 * 1024;
+
private static final int I = 1; // INLINE
private static final int C = 2; // CONTROL
private static final int X = 3; // EXTENDED
@@ -249,7 +252,8 @@ public class HwpTextExtractorV5 implements Serializable {
* @throws SAXException
*/
private void parseBodyText(FileHeader header, DirectoryNode root,
- XHTMLContentHandler xhtml) throws IOException, SAXException {
+ XHTMLContentHandler xhtml)
+ throws IOException, SAXException, TikaMemoryLimitException {
// read BodyText
Entry bodyText = root.getEntry("BodyText");
if (bodyText == null || !bodyText.isDirectoryEntry()) {
@@ -289,7 +293,8 @@ public class HwpTextExtractorV5 implements Serializable {
* @throws IOException
*/
private void parseViewText(FileHeader header, DirectoryNode root,
- XHTMLContentHandler xhtml) throws IOException {
+ XHTMLContentHandler xhtml)
+ throws IOException, TikaMemoryLimitException {
// read BodyText
Entry bodyText = root.getEntry("ViewText");
if (bodyText == null || !bodyText.isDirectoryEntry()) {
@@ -383,7 +388,7 @@ public class HwpTextExtractorV5 implements Serializable {
* @throws SAXException
*/
private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
- throws IOException, SAXException {
+ throws IOException, SAXException, TikaMemoryLimitException {
StringBuilder buf = new StringBuilder();
TagInfo tag = new TagInfo();
@@ -395,6 +400,9 @@ public class HwpTextExtractorV5 implements Serializable {
if (tag.length % 2 != 0) {
throw new IOException("Invalid block size");
}
+ if (tag.length > MAX_TAG_LENGTH) {
+ throw new TikaMemoryLimitException("Tags must be smaller than " + MAX_TAG_LENGTH);
+ }
buf.setLength(0);
writeParaText(reader, tag.length, buf);
@@ -422,6 +430,7 @@ public class HwpTextExtractorV5 implements Serializable {
*/
private void writeParaText(HwpStreamReader reader, long datasize,
StringBuilder buf) throws IOException {
+ //datasize is bounds checked before calling writeParaText
int[] chars = reader.uint16((int) (datasize / 2));
for (int index = 0; index < chars.length; index++) {
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
index 2a48a55..2d0e0d3 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/BPGParser.java
@@ -24,7 +24,10 @@ import java.util.HashSet;
import java.util.Set;
import org.apache.poi.util.IOUtils;
+
+import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Photoshop;
@@ -37,7 +40,7 @@ import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
/**
- * Parser for the Better Portable Graphics )BPG) File Format.
+ * Parser for the Better Portable Graphics (BPG) File Format.
* <p/>
* Documentation on the file format is available from
* http://bellard.org/bpg/bpg_spec.txt
@@ -48,10 +51,17 @@ public class BPGParser extends AbstractParser {
protected static final int EXTENSION_TAG_XMP = 3;
protected static final int EXTENSION_TAG_THUMBNAIL = 4;
private static final long serialVersionUID = -161736541253892772L;
+
+ //50 MB -- throw TikaMemoryLimitException if xmp or exif is allegedly longer than this
+ private static final int DEFAULT_MAX_RECORD_LENGTH = 50 * 1024 * 1024;
+
private static final Set<MediaType> SUPPORTED_TYPES =
Collections.unmodifiableSet(new HashSet<MediaType>(Arrays.asList(
MediaType.image("x-bpg"), MediaType.image("bpg"))));
+ private int maxRecordLength = DEFAULT_MAX_RECORD_LENGTH;
+
+
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
}
@@ -144,6 +154,12 @@ public class BPGParser extends AbstractParser {
while (extensionsDataSeen < extensionDataLength) {
int extensionType = (int) EndianUtils.readUE7(stream);
int extensionLength = (int) EndianUtils.readUE7(stream);
+ if (extensionLength > maxRecordLength) {
+ throw new TikaMemoryLimitException("extension length (" +
+ extensionLength + " bytes) is greater than 'maxRecordLength' (" +
+ maxRecordLength + " bytes). If this file is not corrupt, " +
+ "consider bumping the maxRecordLength via tika-config.xml");
+ }
switch (extensionType) {
case EXTENSION_TAG_EXIF:
metadataExtractor.parseRawExif(stream, extensionLength, true);
@@ -152,7 +168,10 @@ public class BPGParser extends AbstractParser {
handleXMP(stream, extensionLength, metadataExtractor);
break;
default:
- stream.skip(extensionLength);
+ long skipped = IOUtils.skipFully(stream, extensionLength);
+ if (skipped != extensionLength) {
+ throw new TikaException("failed to skip required amount");
+ }
}
extensionsDataSeen += extensionLength;
}
@@ -168,6 +187,11 @@ public class BPGParser extends AbstractParser {
xhtml.endDocument();
}
+ @Field
+ public void setMaxRecordLength(int maxRecordLength) {
+ this.maxRecordLength = maxRecordLength;
+ }
+
protected void handleXMP(InputStream stream, int xmpLength,
ImageMetadataExtractor extractor) throws IOException, TikaException, SAXException {
byte[] xmp = new byte[xmpLength];
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/image/ICNSParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/image/ICNSParser.java
index 47d89ab..8e27adf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/image/ICNSParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/image/ICNSParser.java
@@ -62,9 +62,16 @@ public class ICNSParser extends AbstractParser {
int image_length = java.nio.ByteBuffer.wrap(header).getInt();
if (image_length > MAX_IMAGE_LENGTH_BYTES) {
throw new TikaMemoryLimitException(image_length, MAX_IMAGE_LENGTH_BYTES);
+ } else if (image_length < 0) {
+ throw new TikaException("image length must be >= 0");
+ }
+ //image_length includes the initial 8 bytes.
+ int actualImageLength = image_length - 8;
+ byte[] full_file = new byte[actualImageLength];
+ long read = IOUtils.readFully(stream, full_file);
+ if (read != actualImageLength) {
+ throw new IOException("file not fully read from stream");
}
- byte[] full_file = new byte[image_length];
- IOUtils.readFully(stream, full_file);
ArrayList<ICNSType> icons = new ArrayList<>();
ArrayList<ICNSType> icon_masks = new ArrayList<>();
byte[] tempByteArray = new byte[4];
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index c9c409d..3f97249 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -149,28 +149,37 @@ abstract class AbstractPOIFSExtractor {
/**
* Handle an office document that's embedded at the POIFS level
*/
- protected void handleEmbeddedOfficeDoc(
- DirectoryEntry dir, String resourceName, XHTMLContentHandler xhtml)
+ protected void handleEmbeddedOfficeDoc(DirectoryEntry dir, String resourceName,
+ XHTMLContentHandler xhtml)
throws IOException, SAXException, TikaException {
+
// Is it an embedded OLE2 document, or an embedded OOXML document?
+ //first try for ooxml
+ Entry ooxml = dir.hasEntry("Package") ? dir.getEntry("Package") :
+ (dir.hasEntry("package") ? dir.getEntry("package") : null);
- if (dir.hasEntry("Package")) {
+ if (ooxml != null) {
// It's OOXML (has a ZipFile):
- Entry ooxml = dir.getEntry("Package");
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_LENGTH,
+ Integer.toString(((DocumentEntry)ooxml).getSize()));
+ try (TikaInputStream stream = TikaInputStream
+ .get(new DocumentInputStream((DocumentEntry) ooxml))) {
- try (TikaInputStream stream = TikaInputStream.get(
- new DocumentInputStream((DocumentEntry) ooxml))) {
- ZipContainerDetector detector = new ZipContainerDetector();
+ Detector detector = new ZipContainerDetector();
MediaType type = null;
try {
- //if there's a stream error while detecting...
- type = detector.detect(stream, new Metadata());
+ type = detector.detect(stream, metadata);
+ } catch (SecurityException e) {
+ throw e;
} catch (Exception e) {
+ //if there's a stream error while detecting, give up
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
return;
}
- handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(), type.toString(), xhtml, true);
+ handleEmbeddedResource(stream, metadata,null, dir.getName(), dir.getStorageClsid(),
+ type.toString(), xhtml, true);
return;
}
}
@@ -181,91 +190,131 @@ abstract class AbstractPOIFSExtractor {
Metadata metadata = new Metadata();
metadata.set(Metadata.EMBEDDED_RELATIONSHIP_ID, dir.getName());
if (dir.getStorageClsid() != null) {
- metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID, dir.getStorageClsid().toString());
+ metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID,
+ dir.getStorageClsid().toString());
}
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
- TikaInputStream embedded = null;
String rName = (resourceName == null) ? dir.getName() : resourceName;
+ if (type == POIFSDocumentType.OLE10_NATIVE) {
+ handleOLENative(dir, type, rName, metadata, xhtml);
+ } else if (type == POIFSDocumentType.COMP_OBJ) {
+ handleCompObj(dir, type, rName, metadata, xhtml);
+ } else {
+ metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+ metadata.set(Metadata.RESOURCE_NAME_KEY,
+ rName + '.' + type.getExtension());
+ parseEmbedded(dir, xhtml, metadata);
+ }
+ }
+
+ private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rName,
+ Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, SAXException {
+ //TODO: figure out if the equivalent of OLE 1.0's
+ //getCommand() and getFileName() exist for OLE 2.0 to populate
+ //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+
+ // Grab the contents and process
+ DocumentEntry contentsEntry;
try {
- if (type == POIFSDocumentType.OLE10_NATIVE) {
- try {
- // Try to un-wrap the OLE10Native record:
- Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
- if (ole.getLabel() != null) {
- metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
- }
- if (ole.getCommand() != null) {
- metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
- }
- if (ole.getFileName() != null) {
- metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
- }
- byte[] data = ole.getDataBuffer();
- embedded = TikaInputStream.get(data);
- } catch (Ole10NativeException ex) {
- // Not a valid OLE10Native record, skip it
- } catch (Exception e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
- return;
- }
- } else if (type == POIFSDocumentType.COMP_OBJ) {
- try {
- //TODO: figure out if the equivalent of OLE 1.0's
- //getCommand() and getFileName() exist for OLE 2.0 to populate
- //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+ contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
+ } catch (FileNotFoundException fnfe1) {
+ try {
+ contentsEntry = (DocumentEntry) dir.getEntry("Contents");
+ } catch (FileNotFoundException fnfe2) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe2, parentMetadata);
+ return;
+ }
+ }
+ int length = contentsEntry.getSize();
+ DocumentInputStream inp = null;
+ try {
+ inp = new DocumentInputStream(contentsEntry);
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+ return;
+ }
+ try (TikaInputStream tis = TikaInputStream.get(inp)) {
+ // Try to work out what it is
+ MediaType mediaType = getDetector().detect(tis, metadata);
+ String extension = type.getExtension();
+ try {
+ MimeType mimeType = getMimeTypes().forName(mediaType.toString());
+ extension = mimeType.getExtension();
+ } catch (MimeTypeException mte) {
+ // No details on this type are known
+ }
- // Grab the contents and process
- DocumentEntry contentsEntry;
- try {
- contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
- } catch (FileNotFoundException ioe) {
- contentsEntry = (DocumentEntry) dir.getEntry("Contents");
- }
- DocumentInputStream inp = new DocumentInputStream(contentsEntry);
- byte[] contents = new byte[contentsEntry.getSize()];
- inp.readFully(contents);
- embedded = TikaInputStream.get(contents);
+ // Record what we can do about it
+ metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
+ metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
+ parseEmbedded(dir, tis, xhtml, metadata);
+ } finally {
+ inp.close();
+ }
+ }
- // Try to work out what it is
- MediaType mediaType = getDetector().detect(embedded, new Metadata());
- String extension = type.getExtension();
- try {
- MimeType mimeType = getMimeTypes().forName(mediaType.toString());
- extension = mimeType.getExtension();
- } catch (MimeTypeException mte) {
- // No details on this type are known
- }
- // Record what we can do about it
- metadata.set(Metadata.CONTENT_TYPE, mediaType.getType().toString());
- metadata.set(Metadata.RESOURCE_NAME_KEY, rName + extension);
- } catch (Exception e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
- return;
- }
+ private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName,
+ Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, SAXException {
+ byte[] data = null;
+ try {
+ // Try to un-wrap the OLE10Native record:
+ Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
+ if (ole.getLabel() != null) {
+ metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
} else {
- metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
- metadata.set(Metadata.RESOURCE_NAME_KEY, rName + '.' + type.getExtension());
+ metadata.add(Metadata.RESOURCE_NAME_KEY, rName);
}
-
- // Should we parse it?
- if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
- if (embedded == null) {
- // Make a TikaInputStream that just
- // passes the root directory of the
- // embedded document, and is otherwise
- // empty (byte[0]):
- embedded = TikaInputStream.get(new byte[0]);
- embedded.setOpenContainer(dir);
- }
- embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true);
+ if (ole.getCommand() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
}
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
- } finally {
- if (embedded != null) {
- embedded.close();
+ if (ole.getFileName() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+ }
+ data = ole.getDataBuffer();
+ metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
+ } catch (Ole10NativeException ex) {
+ // Not a valid OLE10Native record, skip it
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+ return;
+ }
+ try (TikaInputStream tis = TikaInputStream.get(data)) {
+ parseEmbedded(dir, tis, xhtml, metadata);
+ }
+ }
+
+ private void parseEmbedded(DirectoryEntry dir, TikaInputStream tis, XHTMLContentHandler xhtml,
+ Metadata metadata) throws IOException, SAXException {
+ if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
+ return;
+ }
+ if (dir.getStorageClsid() != null) {
+ metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID,
+ dir.getStorageClsid().toString());
+ }
+ embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
+ }
+
+ private void parseEmbedded(DirectoryEntry dir, XHTMLContentHandler xhtml, Metadata metadata)
+ throws IOException, SAXException {
+ if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
+ return;
+ }
+ try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
+ tis.setOpenContainer(dir);
+ if (dir.getStorageClsid() != null) {
+ metadata.set(Metadata.EMBEDDED_STORAGE_CLASS_ID,
+ dir.getStorageClsid().toString());
}
+ embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
}
}
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
index df0cc73..301778b 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -66,7 +66,7 @@ public class MSOwnerFileParser extends AbstractParser {
byte[] asciiNameBytes = new byte[ASCII_CHUNK_LENGTH];
IOUtils.readFully(stream, asciiNameBytes);
int asciiNameLength = (int)asciiNameBytes[0];//don't need to convert to unsigned int because it can't be that long
- //sanity check name length
+ //check name length
if (asciiNameLength < 0) {
throw new TikaException("ascii name length must be >= 0");
} else if (asciiNameLength > ASCII_CHUNK_LENGTH) {
@@ -79,6 +79,7 @@ public class MSOwnerFileParser extends AbstractParser {
int unicodeCharLength = stream.read();
if (asciiNameLength == unicodeCharLength) {
stream.read();//zero after the char length
+ //this is effectively bounds checked by asciiNameLength
byte[] unicodeBytes = new byte[unicodeCharLength * 2];
IOUtils.readFully(stream, unicodeBytes);
String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index cf6f51d..33b7fbf 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -587,7 +587,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
*/
private void guess7BitEncoding(MAPIMessage msg) {
Chunks mainChunks = msg.getMainChunks();
- //sanity check
+ //can be null...¯\_(ツ)_/¯
if (mainChunks == null) {
return;
}
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index ddb45f6..f20a737 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -144,7 +144,7 @@ public class PRTParser extends AbstractParser {
int length = EndianUtils.readUShortLE(stream);
if(length <= MAX_SANE_TEXT_LENGTH) {
- // Length sanity check passed
+ // Length check passed
handleText(length, stream, xhtml);
}
}
@@ -170,7 +170,7 @@ public class PRTParser extends AbstractParser {
IOUtils.readFully(stream, b2);
int length = EndianUtils.getUShortLE(b2);
if(length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
- // Length sanity check passed
+ // Length check passed
handleText(length, stream, xhtml);
} else {
// Was probably something else
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
index 4c062f2..4b37ba6 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/rtf/TextExtractor.java
@@ -948,7 +948,7 @@ final class TextExtractor {
} else if (equals("listtemplateid")) {
currentList.templateID = param;
} else if (equals("levelnfc") || equals("levelnfcn")) {
- //sanity check to make sure list information isn't corrupt
+ //check to make sure list information isn't corrupt
if (listTableLevel > -1 &&
listTableLevel < currentList.numberType.length) {
currentList.numberType[listTableLevel] = param;
diff --git a/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java b/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java
index 947b694..bfca9a7 100644
--- a/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java
+++ b/tika-parsers/src/main/java/org/apache/tika/parser/video/FLVParser.java
@@ -83,9 +83,10 @@ public class FLVParser extends AbstractParser {
}
private int readUInt24(DataInputStream input) throws IOException {
- int uint = input.read()<<16;
- uint += input.read()<<8;
- uint += input.read();
+ //readUnsignedByte throws EOFException at end of stream (read() would just return -1)
+ int uint = input.readUnsignedByte()<<16;
+ uint += input.readUnsignedByte()<<8;
+ uint += input.readUnsignedByte();
return uint;
}
@@ -209,7 +210,7 @@ public class FLVParser extends AbstractParser {
break;
}
- int datalen = readUInt24(datainput); //body length
+ final int datalen = readUInt24(datainput); //body length
readUInt32(datainput); // timestamp
readUInt24(datainput); // streamid
diff --git a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
index 1952939..4fc79d2 100644
--- a/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
+++ b/tika-parsers/src/test/java/org/apache/tika/parser/pdf/PDFParserTest.java
@@ -1307,7 +1307,7 @@ public class PDFParserTest extends TikaTest {
context.set(PDFParserConfig.class, config);
List<Metadata> metadataList = getRecursiveMetadata("testPDF_childAttachments.pdf", context);
- //sanity check
+ //plausibility check
assertEquals(5, metadataList.size());
//inlined jpeg metadata
Metadata jpegMetadata = metadataList.get(1);