You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/06/08 20:33:30 UTC
[tika] branch main updated: TIKA-3788 -- Record embedded file exceptions in the container file's metadata.
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 6f2ef64a5 TIKA-3788 -- Record embedded file exceptions in the container file's metadata.
6f2ef64a5 is described below
commit 6f2ef64a582328fb13198c97d51205b4d469424e
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jun 8 16:33:17 2022 -0400
TIKA-3788 -- Record embedded file exceptions in the container file's metadata.
---
CHANGES.txt | 3 +++
.../ParsingEmbeddedDocumentExtractor.java | 26 +++++++++++++--------
.../apache/tika/metadata/TikaCoreProperties.java | 4 ++--
.../java/org/apache/tika/parser/ParseRecord.java | 21 ++++++++++++++---
.../apache/tika/parser/AutoDetectParserTest.java | 10 ++++++++
.../test-documents/mock/null_pointer.xml.gz | Bin 0 -> 651 bytes
6 files changed, 49 insertions(+), 15 deletions(-)
diff --git a/CHANGES.txt b/CHANGES.txt
index cb76c07f2..edb709cdf 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
Release 2.4.1 - ???
+ * Record embedded file exceptions in the container
+ file's metadata (TIKA-3788).
+
* Allow continuation of parsing after write limit has
been reached (TIKA-3787).
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index f09963f9a..bdfd028f5 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -37,6 +37,7 @@ import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.parser.DelegatingParser;
import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ParseRecord;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.EmbeddedContentHandler;
@@ -97,26 +98,23 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
// Use the delegate parser to parse this entry
try (TemporaryResources tmp = new TemporaryResources()) {
- final TikaInputStream newStream = TikaInputStream.get(
- new CloseShieldInputStream(stream), tmp);
+ final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
if (stream instanceof TikaInputStream) {
final Object container = ((TikaInputStream) stream).getOpenContainer();
if (container != null) {
newStream.setOpenContainer(container);
}
}
- DELEGATING_PARSER.parse(
- newStream,
- new EmbeddedContentHandler(new BodyContentHandler(handler)),
- metadata, context);
+ DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new BodyContentHandler(handler)),
+ metadata, context);
} catch (EncryptedDocumentException ede) {
- // TODO: can we log a warning that we lack the password?
- // For now, just skip the content
+ recordException(ede, context);
} catch (CorruptedFileException e) {
+ //necessary to stop the parse to avoid infinite loops
+ //on corrupt sqlite3 files
throw new IOException(e);
} catch (TikaException e) {
- // TODO: can we log a warning somehow?
- // Could not parse the entry, just skip the content
+ recordException(e, context);
}
if (outputHtml) {
@@ -124,6 +122,14 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
}
}
+ private void recordException(Exception e, ParseContext context) {
+ ParseRecord record = context.get(ParseRecord.class);
+ if (record == null) {
+ return;
+ }
+ record.addException(e);
+ }
+
public Parser getDelegatingParser() {
return DELEGATING_PARSER;
}
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index c4035ea31..79d80d658 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -74,11 +74,11 @@ public interface TikaCoreProperties {
//exception in an embedded file
Property EMBEDDED_EXCEPTION =
- Property.internalText(TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+ Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
//warning while parsing in an embedded file
Property EMBEDDED_WARNING =
- Property.internalText(TIKA_META_EXCEPTION_PREFIX + "embedded_warning");
+ Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning");
Property WRITE_LIMIT_REACHED =
Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
index 081c01920..7b5a58908 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
@@ -27,6 +27,15 @@ import java.util.Set;
* after the parse by the {@link CompositeParser}.
*/
public class ParseRecord {
+
+ //hard limits so that specially crafted files
+ //don't cause an OOM
+ private static final int MAX_PARSERS = 100;
+
+ private static final int MAX_EXCEPTIONS = 100;
+
+ private static final int MAX_WARNINGS = 100;
+
private int depth = 0;
private final Set<String> parsers = new LinkedHashSet<>();
@@ -53,15 +62,21 @@ public class ParseRecord {
}
void addParserClass(String parserClass) {
- parsers.add(parserClass);
+ if (parsers.size() < MAX_PARSERS) {
+ parsers.add(parserClass);
+ }
}
public void addException(Exception e) {
- exceptions.add(e);
+ if (exceptions.size() < MAX_EXCEPTIONS) {
+ exceptions.add(e);
+ }
}
public void addWarning(String msg) {
- warnings.add(msg);
+ if (warnings.size() < MAX_WARNINGS) {
+ warnings.add(msg);
+ }
}
public void setWriteLimitReached(boolean writeLimitReached) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index ec3598d8d..98e99040d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -449,6 +449,16 @@ public class AutoDetectParserTest extends TikaTest {
assertNotContained("embed_4", txt);
}
+ @Test
+ public void testEmbeddedNPE() throws Exception {
+ Metadata metadata = new Metadata();
+ getXML("mock/null_pointer.xml.gz",
+ AUTO_DETECT_PARSER, metadata);
+ String embExString = metadata.get(TikaCoreProperties.EMBEDDED_EXCEPTION);
+ assertContains("another null pointer", embExString);
+
+ }
+
//This is not the complete/correct way to look for parsers within another parser
//However, it is good enough for this unit test for now.
private Parser find(CompositeParser parser, Class clazz) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/mock/null_pointer.xml.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/mock/null_pointer.xml.gz
new file mode 100644
index 000000000..b99d76587
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/mock/null_pointer.xml.gz differ