You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/02/10 17:56:22 UTC
[tika] branch main updated: TIKA-3677 -- remove sanity check language and other cleanups
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 9a144ab TIKA-3677 -- remove sanity check language and other cleanups
9a144ab is described below
commit 9a144ab1666a41d469b3318025dc9a695be3ef70
Author: tallison <ta...@apache.org>
AuthorDate: Thu Feb 10 12:50:53 2022 -0500
TIKA-3677 -- remove sanity check language and other cleanups
---
.../src/main/java/org/apache/tika/cli/TikaCLI.java | 2 +-
.../main/java/org/apache/tika/io/EndianUtils.java | 4 +-
.../org/apache/tika/parser/video/FLVParser.java | 9 +-
.../java/org/apache/tika/parser/dwg/DWGParser.java | 14 +-
.../java/org/apache/tika/parser/prt/PRTParser.java | 12 +-
.../tika/parser/executable/ExecutableParser.java | 2 +-
.../org/apache/tika/parser/image/BPGParser.java | 30 ++-
.../org/apache/tika/parser/image/ICNSParser.java | 2 +
.../org/apache/tika/parser/image/PSDParser.java | 3 +-
.../parser/microsoft/AbstractPOIFSExtractor.java | 204 ++++++++++++---------
.../tika/parser/microsoft/MSOwnerFileParser.java | 2 +
.../tika/parser/microsoft/OutlookExtractor.java | 2 +-
.../tika/parser/microsoft/chm/ChmCommons.java | 7 +-
.../microsoft/chm/ChmDirectoryListingSet.java | 1 -
.../tika/parser/microsoft/chm/ChmLzxBlock.java | 9 +-
.../tika/parser/microsoft/chm/ChmSection.java | 14 +-
.../tika/parser/microsoft/rtf/TextExtractor.java | 2 +-
.../java/org/apache/tika/parser/dbf/DBFCell.java | 2 +
.../apache/tika/parser/hwp/HwpTextExtractorV5.java | 12 +-
19 files changed, 215 insertions(+), 118 deletions(-)
diff --git a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
index d548cb8..4da57a4 100644
--- a/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
+++ b/tika-app/src/main/java/org/apache/tika/cli/TikaCLI.java
@@ -884,7 +884,7 @@ public class TikaCLI {
Set<String> tikaLacking = new TreeSet<>();
Set<String> tikaNoMagic = new TreeSet<>();
- // Sanity check
+ // Plausibility check
File dir = new File(magicDir);
if ((new File(dir, "elf")).exists() && (new File(dir, "mime")).exists() &&
(new File(dir, "vorbis")).exists()) {
diff --git a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
index 0400f40..c09eadc 100644
--- a/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
+++ b/tika-core/src/main/java/org/apache/tika/io/EndianUtils.java
@@ -211,7 +211,9 @@ public class EndianUtils {
public static long readUE7(InputStream stream) throws IOException {
int i;
long v = 0;
- while ((i = stream.read()) >= 0) {
+ int max = 6;
+ int read = 0;
+ while ((i = stream.read()) >= 0 && read++ < max) {
v = v << 7;
if ((i & 128) == 128) {
// Continues
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
index c36d4ac..cbacdfb 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-audiovideo-module/src/main/java/org/apache/tika/parser/video/FLVParser.java
@@ -84,9 +84,10 @@ public class FLVParser extends AbstractParser {
}
private int readUInt24(DataInputStream input) throws IOException {
- int uint = input.read() << 16;
- uint += input.read() << 8;
- uint += input.read();
+ //readUnsignedByte ensures EOFException
+ int uint = input.readUnsignedByte() << 16;
+ uint += input.readUnsignedByte() << 8;
+ uint += input.readUnsignedByte();
return uint;
}
@@ -206,7 +207,7 @@ public class FLVParser extends AbstractParser {
break;
}
- int datalen = readUInt24(datainput); //body length
+ final int datalen = readUInt24(datainput); //body length
readUInt32(datainput); // timestamp
readUInt24(datainput); // streamid
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
index 7a988ca..8c2f087 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/dwg/DWGParser.java
@@ -266,7 +266,7 @@ public class DWGParser extends AbstractParser {
// The offset is stored in the header from 0x20 onwards
long offsetToSection = EndianUtils.getLongLE(header, 0x20);
- // Sanity check the offset. Some files seem to use a different format,
+ // Bounds check the offset. Some files seem to use a different format,
// and the offset isn't available at 0x20. Until we can work out how
// to find the offset in those files, skip them if detected
if (offsetToSection > 0xa00000l) {
@@ -274,15 +274,15 @@ public class DWGParser extends AbstractParser {
offsetToSection = 0;
}
- // Work out how far to skip, and sanity check
+ // Work out how far to skip, and bounds check
long toSkip = offsetToSection - header.length;
if (offsetToSection == 0) {
return false;
}
- while (toSkip > 0) {
- byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
- IOUtils.readFully(stream, skip);
- toSkip -= skip.length;
+ long skipped = IOUtils.skipFully(stream, toSkip);
+ if (skipped != toSkip) {
+ throw new TikaException("Failed to skip: " + toSkip +
+ " bytes; skipped: " + skipped);
}
return true;
}
@@ -329,7 +329,7 @@ public class DWGParser extends AbstractParser {
// We should now have the count
int count = EndianUtils.readUShortLE(stream);
- // Sanity check it
+ // Plausibility check it
if (count > 0 && count < 0x7f) {
// Looks plausible
return count;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
index dafe6f1..ecb7261 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-cad-module/src/main/java/org/apache/tika/parser/prt/PRTParser.java
@@ -55,7 +55,7 @@ public class PRTParser extends AbstractParser {
* How long do we allow a text run to claim to be, before we
* decide we're confused and it's not really text after all?
*/
- private static final int MAX_SANE_TEXT_LENGTH = 0x0800;
+ private static final int MAX_TEXT_LENGTH = 0x0800;
public Set<MediaType> getSupportedTypes(ParseContext context) {
return SUPPORTED_TYPES;
@@ -145,8 +145,8 @@ public class PRTParser extends AbstractParser {
}
int length = EndianUtils.readUShortLE(stream);
- if (length <= MAX_SANE_TEXT_LENGTH) {
- // Length sanity check passed
+ if (length <= MAX_TEXT_LENGTH) {
+ // Length check passed
handleText(length, stream, xhtml);
}
}
@@ -170,15 +170,15 @@ public class PRTParser extends AbstractParser {
byte[] b2 = new byte[2];
IOUtils.readFully(stream, b2);
int length = EndianUtils.getUShortLE(b2);
- if (length > 1 && length <= MAX_SANE_TEXT_LENGTH) {
- // Length sanity check passed
+ if (length > 1 && length <= MAX_TEXT_LENGTH) {
+ // Length check passed
handleText(length, stream, xhtml);
} else {
// Was probably something else
l5.record(b2[0]);
l5.record(b2[1]);
}
- } else if (maybeLength > 0 && maybeLength < MAX_SANE_TEXT_LENGTH) {
+ } else if (maybeLength > 0 && maybeLength < MAX_TEXT_LENGTH) {
// Looks like it's straight into the text
handleText(maybeLength, stream, xhtml);
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
index 9658d62..0d94bab 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-code-module/src/main/java/org/apache/tika/parser/executable/ExecutableParser.java
@@ -98,7 +98,7 @@ public class ExecutableParser extends AbstractParser implements MachineMetadata
// Grab the PE header offset
int peOffset = EndianUtils.readIntLE(stream);
- // Sanity check - while it may go anywhere, it's normally in the first few kb
+ // Reasonability check - while it may go anywhere, it's normally in the first few kb
if (peOffset > 4096 || peOffset < 0x3f) {
return;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java
index beb1d1d..a154796 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/BPGParser.java
@@ -27,7 +27,9 @@ import org.apache.commons.io.IOUtils;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
+import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Photoshop;
@@ -46,6 +48,10 @@ public class BPGParser extends AbstractImageParser {
protected static final int EXTENSION_TAG_ICC_PROFILE = 2;
protected static final int EXTENSION_TAG_XMP = 3;
protected static final int EXTENSION_TAG_THUMBNAIL = 4;
+
+ //50 MB -- throw TikaMemoryLimitException if xmp or exif is allegedly longer than this
+ private static final int DEFAULT_MAX_RECORD_LENGTH = 50 * 1024 * 1024;
+
private static final long serialVersionUID = -161736541253892772L;
private static final Set<MediaType> SUPPORTED_TYPES = Collections.unmodifiableSet(
new HashSet<>(
@@ -55,6 +61,8 @@ public class BPGParser extends AbstractImageParser {
return SUPPORTED_TYPES;
}
+ private int maxRecordLength = DEFAULT_MAX_RECORD_LENGTH;
+
@Override
void extractMetadata(InputStream stream, ContentHandler contentHandler, Metadata metadata,
ParseContext parseContext)
@@ -145,6 +153,12 @@ public class BPGParser extends AbstractImageParser {
while (extensionsDataSeen < extensionDataLength) {
int extensionType = (int) EndianUtils.readUE7(stream);
int extensionLength = (int) EndianUtils.readUE7(stream);
+ if (extensionLength > maxRecordLength) {
+ throw new TikaMemoryLimitException("extension length (" +
+ extensionLength + " bytes) is greater than 'maxRecordLength' (" +
+ maxRecordLength + " bytes). If this file is not corrupt, " +
+ "consider bumping the maxRecordLength via tika-config.xml");
+ }
switch (extensionType) {
case EXTENSION_TAG_EXIF:
metadataExtractor.parseRawExif(stream, extensionLength, true);
@@ -153,7 +167,7 @@ public class BPGParser extends AbstractImageParser {
handleXMP(stream, extensionLength, metadataExtractor);
break;
default:
- stream.skip(extensionLength);
+ IOUtils.skipFully(stream, extensionLength);
}
extensionsDataSeen += extensionLength;
}
@@ -163,8 +177,22 @@ public class BPGParser extends AbstractImageParser {
// We can't do anything with these parts
}
+ @Field
+ public void setMaxRecordLength(int maxRecordLength) {
+ this.maxRecordLength = maxRecordLength;
+ }
+
protected void handleXMP(InputStream stream, int xmpLength, ImageMetadataExtractor extractor)
throws IOException, TikaException, SAXException {
+ if (xmpLength < 0) {
+ throw new TikaException("xmp length must be >= 0");
+ }
+ if (xmpLength > maxRecordLength) {
+ throw new TikaMemoryLimitException("xmplength (" + xmpLength + " bytes) is larger than maxXMPLength (" +
+ maxRecordLength + "). Consider setting maxXMPLength to a greater value for " +
+ "this parser via" +
+ " tika-config.xml if this file is not corrupt.");
+ }
byte[] xmp = new byte[xmpLength];
IOUtils.readFully(stream, xmp);
extractor.parseRawXMP(xmp);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ICNSParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ICNSParser.java
index 9b32b42..7423eed 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ICNSParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/ICNSParser.java
@@ -65,6 +65,8 @@ public class ICNSParser extends AbstractParser {
image_length -= 8;//for the bytes read so far
if (image_length > MAX_IMAGE_LENGTH_BYTES) {
throw new TikaMemoryLimitException(image_length, MAX_IMAGE_LENGTH_BYTES);
+ } else if (image_length < 0) {
+ throw new TikaException("image length must be >= 0");
}
byte[] full_file = new byte[image_length];
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
index 997b6e3..957a26e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-image-module/src/main/java/org/apache/tika/parser/image/PSDParser.java
@@ -32,6 +32,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.config.Field;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Photoshop;
@@ -234,7 +235,7 @@ public class PSDParser extends AbstractParser {
// Do we have use for the data segment?
if (captureData(id)) {
if (dataLen > maxDataLengthBytes) {
- throw new TikaException(
+ throw new TikaMemoryLimitException(
"data length must be < " + maxDataLengthBytes + ": " + dataLen);
}
data = new byte[dataLen];
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
index 47571a6..f5fc6d8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/AbstractPOIFSExtractor.java
@@ -118,7 +118,6 @@ abstract class AbstractPOIFSExtractor {
throws IOException, SAXException, TikaException {
try {
-
if (filename != null) {
embeddedMetadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, filename);
}
@@ -164,18 +163,24 @@ abstract class AbstractPOIFSExtractor {
if (ooxml != null) {
// It's OOXML (has a ZipFile):
+ Metadata metadata = new Metadata();
+ metadata.set(Metadata.CONTENT_LENGTH,
+ Integer.toString(((DocumentEntry)ooxml).getSize()));
try (TikaInputStream stream = TikaInputStream
.get(new DocumentInputStream((DocumentEntry) ooxml))) {
+
Detector detector = new DefaultZipContainerDetector();
MediaType type = null;
try {
- //if there's a stream error while detecting...
- type = detector.detect(stream, new Metadata());
+ type = detector.detect(stream, metadata);
+ } catch (SecurityException e) {
+ throw e;
} catch (Exception e) {
+ //if there's a stream error while detecting, give up
EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
return;
}
- handleEmbeddedResource(stream, null, dir.getName(), dir.getStorageClsid(),
+ handleEmbeddedResource(stream, metadata,null, dir.getName(), dir.getStorageClsid(),
type.toString(), xhtml, true);
return;
}
@@ -191,94 +196,127 @@ abstract class AbstractPOIFSExtractor {
dir.getStorageClsid().toString());
}
POIFSDocumentType type = POIFSDocumentType.detectType(dir);
- TikaInputStream embedded = null;
String rName = (resourceName == null) ? dir.getName() : resourceName;
+ if (type == POIFSDocumentType.OLE10_NATIVE) {
+ handleOLENative(dir, type, rName, metadata, xhtml);
+ } else if (type == POIFSDocumentType.COMP_OBJ) {
+ handleCompObj(dir, type, rName, metadata, xhtml);
+ } else {
+ metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
+ rName + '.' + type.getExtension());
+ parseEmbedded(dir, xhtml, metadata);
+ }
+ }
+
+ private void handleCompObj(DirectoryEntry dir, POIFSDocumentType type, String rName,
+ Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, SAXException {
+ //TODO: figure out if the equivalent of OLE 1.0's
+ //getCommand() and getFileName() exist for OLE 2.0 to populate
+ //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+
+ // Grab the contents and process
+ DocumentEntry contentsEntry;
try {
- if (type == POIFSDocumentType.OLE10_NATIVE) {
- try {
- // Try to un-wrap the OLE10Native record:
- Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
- if (ole.getLabel() != null) {
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
- rName + '/' + ole.getLabel());
- } else {
- metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, rName);
- }
- if (ole.getCommand() != null) {
- metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
- }
- if (ole.getFileName() != null) {
- metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
- }
- byte[] data = ole.getDataBuffer();
- embedded = TikaInputStream.get(data);
- } catch (Ole10NativeException ex) {
- // Not a valid OLE10Native record, skip it
- } catch (Exception e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
- return;
- }
- } else if (type == POIFSDocumentType.COMP_OBJ) {
- try {
- //TODO: figure out if the equivalent of OLE 1.0's
- //getCommand() and getFileName() exist for OLE 2.0 to populate
- //TikaCoreProperties.ORIGINAL_RESOURCE_NAME
+ contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
+ } catch (FileNotFoundException fnfe1) {
+ try {
+ contentsEntry = (DocumentEntry) dir.getEntry("Contents");
+ } catch (FileNotFoundException fnfe2) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(fnfe2, parentMetadata);
+ return;
+ }
+ }
+ int length = contentsEntry.getSize();
+ DocumentInputStream inp = null;
+ try {
+ inp = new DocumentInputStream(contentsEntry);
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+ return;
+ }
+ try (TikaInputStream tis = TikaInputStream.get(inp)) {
+ // Try to work out what it is
+ MediaType mediaType = getDetector().detect(tis, metadata);
+ String extension = type.getExtension();
+ try {
+ MimeType mimeType = getMimeTypes().forName(mediaType.toString());
+ extension = mimeType.getExtension();
+ } catch (MimeTypeException mte) {
+ // No details on this type are known
+ }
- // Grab the contents and process
- DocumentEntry contentsEntry;
- try {
- contentsEntry = (DocumentEntry) dir.getEntry("CONTENTS");
- } catch (FileNotFoundException ioe) {
- contentsEntry = (DocumentEntry) dir.getEntry("Contents");
- }
- byte[] contents;
- try (DocumentInputStream inp = new DocumentInputStream(contentsEntry)) {
- contents = new byte[contentsEntry.getSize()];
- inp.readFully(contents);
- }
- embedded = TikaInputStream.get(contents);
+ // Record what we can do about it
+ metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + extension);
+ metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(length));
+ parseEmbedded(dir, tis, xhtml, metadata);
+ } finally {
+ inp.close();
+ }
+ }
- // Try to work out what it is
- MediaType mediaType = getDetector().detect(embedded, new Metadata());
- String extension = type.getExtension();
- try {
- MimeType mimeType = getMimeTypes().forName(mediaType.toString());
- extension = mimeType.getExtension();
- } catch (MimeTypeException mte) {
- // No details on this type are known
- }
- // Record what we can do about it
- metadata.set(Metadata.CONTENT_TYPE, mediaType.getType());
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + extension);
- } catch (Exception e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
- return;
- }
+ private void handleOLENative(DirectoryEntry dir, POIFSDocumentType type, String rName,
+ Metadata metadata, XHTMLContentHandler xhtml)
+ throws IOException, SAXException {
+ byte[] data = null;
+ try {
+ // Try to un-wrap the OLE10Native record:
+ Ole10Native ole = Ole10Native.createFromEmbeddedOleObject((DirectoryNode) dir);
+ if (ole.getLabel() != null) {
+ metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY, rName + '/' + ole.getLabel());
} else {
- metadata.set(Metadata.CONTENT_TYPE, type.getType().toString());
- metadata.set(TikaCoreProperties.RESOURCE_NAME_KEY,
- rName + '.' + type.getExtension());
+ metadata.add(TikaCoreProperties.RESOURCE_NAME_KEY, rName);
}
-
- // Should we parse it?
- if (embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
- if (embedded == null) {
- // Make a TikaInputStream that just
- // passes the root directory of the
- // embedded document, and is otherwise
- // empty (byte[0]):
- embedded = TikaInputStream.get(new byte[0]);
- embedded.setOpenContainer(dir);
- }
- embeddedDocumentUtil.parseEmbedded(embedded, xhtml, metadata, true);
+ if (ole.getCommand() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getCommand());
}
- } catch (IOException e) {
- EmbeddedDocumentUtil.recordEmbeddedStreamException(e, metadata);
- } finally {
- if (embedded != null) {
- embedded.close();
+ if (ole.getFileName() != null) {
+ metadata.add(TikaCoreProperties.ORIGINAL_RESOURCE_NAME, ole.getFileName());
+ }
+ data = ole.getDataBuffer();
+ metadata.set(Metadata.CONTENT_LENGTH, Integer.toString(data.length));
+ } catch (Ole10NativeException ex) {
+ // Not a valid OLE10Native record, skip it
+ } catch (SecurityException e) {
+ throw e;
+ } catch (Exception e) {
+ EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
+ return;
+ }
+ try (TikaInputStream tis = TikaInputStream.get(data)) {
+ parseEmbedded(dir, tis, xhtml, metadata);
+ }
+ }
+
+ private void parseEmbedded(DirectoryEntry dir, TikaInputStream tis, XHTMLContentHandler xhtml,
+ Metadata metadata) throws IOException, SAXException {
+ if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
+ return;
+ }
+ if (dir.getStorageClsid() != null) {
+ metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
+ dir.getStorageClsid().toString());
+ }
+ embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
+ }
+
+ private void parseEmbedded(DirectoryEntry dir, XHTMLContentHandler xhtml, Metadata metadata)
+ throws IOException, SAXException {
+ if (!embeddedDocumentUtil.shouldParseEmbedded(metadata)) {
+ return;
+ }
+ try (TikaInputStream tis = TikaInputStream.get(new byte[0])) {
+ tis.setOpenContainer(dir);
+ if (dir.getStorageClsid() != null) {
+ metadata.set(TikaCoreProperties.EMBEDDED_STORAGE_CLASS_ID,
+ dir.getStorageClsid().toString());
}
+ embeddedDocumentUtil.parseEmbedded(tis, xhtml, metadata, true);
}
}
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
index 6bc57ea..81ba429 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/MSOwnerFileParser.java
@@ -42,6 +42,7 @@ public class MSOwnerFileParser extends AbstractParser {
private static final int ASCII_CHUNK_LENGTH = 54;
private static final MediaType MEDIA_TYPE = MediaType.application("x-ms-owner");
+ private static final int MAX_STRING_LENGTH = 10 * 1024 * 1024;
/**
* Serial version UID
*/
@@ -79,6 +80,7 @@ public class MSOwnerFileParser extends AbstractParser {
int unicodeCharLength = stream.read();
if (asciiNameLength == unicodeCharLength) {
stream.read();//zero after the char length
+ //this is effectively bounds checked by asciiNameLength
byte[] unicodeBytes = new byte[unicodeCharLength * 2];
IOUtils.readFully(stream, unicodeBytes);
String unicodeName = new String(unicodeBytes, StandardCharsets.UTF_16LE);
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
index fd884de..99462d8 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/OutlookExtractor.java
@@ -547,7 +547,7 @@ public class OutlookExtractor extends AbstractPOIFSExtractor {
*/
private void guess7BitEncoding(MAPIMessage msg) {
Chunks mainChunks = msg.getMainChunks();
- //sanity check
+ //null check
if (mainChunks == null) {
return;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
index 62773d7..4af06e4 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmCommons.java
@@ -306,9 +306,7 @@ public class ChmCommons {
if (newLength < 0) {
throw new IllegalArgumentException(from + " > " + to);
}
- if (to > original.length) {
- throw new TikaException("can't copy beyond array length");
- }
+
byte[] copy = new byte[newLength];
System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength));
return copy;
@@ -324,6 +322,9 @@ public class ChmCommons {
if (to < 0) {
throw new IllegalArgumentException(to + " should be > 0");
}
+ if (to > original.length) {
+ throw new IllegalArgumentException("can't copy beyond array length");
+ }
}
/*
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmDirectoryListingSet.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmDirectoryListingSet.java
index 87538c4..ddca6a9 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmDirectoryListingSet.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmDirectoryListingSet.java
@@ -147,7 +147,6 @@ public class ChmDirectoryListingSet {
byte[] dir_chunk = null;
Set<Integer> processed = new HashSet<>();
for (int i = startPmgl; i >= 0; ) {
- dir_chunk = new byte[(int) chmItspHeader.getBlock_len()];
int start = i * (int) chmItspHeader.getBlock_len() + dir_offset;
dir_chunk = ChmCommons
.copyOfRange(getData(), start, start + (int) chmItspHeader.getBlock_len());
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmLzxBlock.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmLzxBlock.java
index 3029cb6..d1cd17e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmLzxBlock.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmLzxBlock.java
@@ -19,6 +19,7 @@ package org.apache.tika.parser.microsoft.chm;
import java.math.BigInteger;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.parser.microsoft.chm.ChmCommons.IntelState;
import org.apache.tika.parser.microsoft.chm.ChmCommons.LzxState;
@@ -32,6 +33,8 @@ import org.apache.tika.parser.microsoft.chm.ChmCommons.LzxState;
* such types.
*/
public class ChmLzxBlock {
+
+ private static int MAX_CONTENT_SIZE = 50 * 1024 * 1024;
private int block_number;
private long block_length;
private ChmLzxState state;
@@ -832,7 +835,11 @@ public class ChmLzxBlock {
return content;
}
- private void setContent(int contentLength) {
+ private void setContent(int contentLength) throws TikaException {
+ if (contentLength > MAX_CONTENT_SIZE) {
+ throw new TikaMemoryLimitException("content length (" + contentLength +
+ " bytes) is > MAX_CONTENT_SIZE");
+ }
this.content = new byte[contentLength];
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmSection.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmSection.java
index 3fc88cf..6c2961c 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmSection.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/chm/ChmSection.java
@@ -20,6 +20,7 @@ import java.math.BigInteger;
import java.util.Arrays;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
public class ChmSection {
final private byte[] data;
@@ -105,7 +106,10 @@ public class ChmSection {
return prevcontent;
}
- public BigInteger getBigInteger(int i) {
+ public BigInteger getBigInteger(int i) throws TikaException {
+ if (i > 8) {
+ throw new TikaMemoryLimitException("Big integer can't be > 8");
+ }
if (getData() == null) {
return BigInteger.ZERO;
}
@@ -128,6 +132,7 @@ public class ChmSection {
return byteval;
}
+ /*
public BigInteger unmarshalUlong() {
return getBigInteger(8);
}
@@ -139,8 +144,11 @@ public class ChmSection {
public int unmarshalInt() {
return getBigInteger(4).intValue();
}
-
- public byte[] unmarshalBytes(int i) {
+*/
+ public byte[] unmarshalBytes(int i) throws TikaException {
+ if (i > 8) {
+ throw new TikaMemoryLimitException("Must be <= 8");
+ }
if (i == 0) {
return new byte[1];
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
index bdb727f..0bec3c5 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-microsoft-module/src/main/java/org/apache/tika/parser/microsoft/rtf/TextExtractor.java
@@ -949,7 +949,7 @@ final class TextExtractor {
} else if (equals("listtemplateid")) {
currentList.templateID = param;
} else if (equals("levelnfc") || equals("levelnfcn")) {
- //sanity check to make sure list information isn't corrupt
+ //check to make sure list information isn't corrupt
if (listTableLevel > -1 && listTableLevel < currentList.numberType.length) {
currentList.numberType[listTableLevel] = param;
}
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFCell.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
index 21fc981..870e7d0 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/dbf/DBFCell.java
@@ -43,6 +43,7 @@ class DBFCell {
DBFCell(DBFColumnHeader.ColType colType, int fieldLength, int decimalCount) {
this.colType = colType;
this.decimalCount = decimalCount;
+ //field length is limit-checked in DBFFileHeader
this.bytes = new byte[fieldLength];
}
@@ -78,6 +79,7 @@ class DBFCell {
* @return copy of bytes that were read on the last read
*/
byte[] getBytes() {
+ //bytesReadLast is effectively limit checked by DBFFileHeader
byte[] ret = new byte[bytesReadLast];
System.arraycopy(bytes, 0, ret, 0, bytesReadLast);
return ret;
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
index 32c7869..fc1c169 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-miscoffice-module/src/main/java/org/apache/tika/parser/hwp/HwpTextExtractorV5.java
@@ -54,6 +54,7 @@ import org.xml.sax.SAXException;
import org.apache.tika.exception.EncryptedDocumentException;
import org.apache.tika.exception.TikaException;
+import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.exception.UnsupportedFormatException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Office;
@@ -66,6 +67,7 @@ public class HwpTextExtractorV5 implements Serializable {
private static final byte[] HWP_V5_SIGNATURE =
"HWP Document File".getBytes(StandardCharsets.US_ASCII);
private static final int HWPTAG_BEGIN = 0x010;
+ private static final int MAX_TAG_LENGTH = 50 * 1024 * 1024;
private static final int I = 1; // INLINE
private static final int C = 2; // CONTROL
private static final int X = 3; // EXTENDED
@@ -240,7 +242,7 @@ public class HwpTextExtractorV5 implements Serializable {
* @throws SAXException
*/
private void parseBodyText(FileHeader header, DirectoryNode root, XHTMLContentHandler xhtml)
- throws IOException, SAXException {
+ throws IOException, SAXException, TikaException {
// read BodyText
Entry bodyText = root.getEntry("BodyText");
if (bodyText == null || !bodyText.isDirectoryEntry()) {
@@ -278,7 +280,7 @@ public class HwpTextExtractorV5 implements Serializable {
* @throws IOException
*/
private void parseViewText(FileHeader header, DirectoryNode root, XHTMLContentHandler xhtml)
- throws IOException {
+ throws IOException, TikaException {
// read BodyText
Entry bodyText = root.getEntry("ViewText");
if (bodyText == null || !bodyText.isDirectoryEntry()) {
@@ -361,7 +363,7 @@ public class HwpTextExtractorV5 implements Serializable {
* @throws SAXException
*/
private void parse(HwpStreamReader reader, XHTMLContentHandler xhtml)
- throws IOException, SAXException {
+ throws IOException, TikaException, SAXException {
StringBuilder buf = new StringBuilder();
TagInfo tag = new TagInfo();
@@ -374,6 +376,9 @@ public class HwpTextExtractorV5 implements Serializable {
if (tag.length % 2 != 0) {
throw new IOException("Invalid block size");
}
+ if (tag.length > MAX_TAG_LENGTH) {
+ throw new TikaMemoryLimitException("Tags must be smaller than " + MAX_TAG_LENGTH);
+ }
buf.setLength(0);
writeParaText(reader, tag.length, buf);
@@ -401,6 +406,7 @@ public class HwpTextExtractorV5 implements Serializable {
*/
private void writeParaText(HwpStreamReader reader, long datasize, StringBuilder buf)
throws IOException {
+ //datasize is bounds checked before calling writeParaText
int[] chars = reader.uint16((int) (datasize / 2));
for (int index = 0; index < chars.length; index++) {