You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/06/08 20:33:30 UTC

[tika] branch main updated: TIKA-3788 -- Record embedded file exceptions in the container file's metadata.

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 6f2ef64a5 TIKA-3788 -- Record embedded file exceptions in the container file's metadata.
6f2ef64a5 is described below

commit 6f2ef64a582328fb13198c97d51205b4d469424e
Author: tallison <ta...@apache.org>
AuthorDate: Wed Jun 8 16:33:17 2022 -0400

    TIKA-3788 -- Record embedded file exceptions in the container file's metadata.
---
 CHANGES.txt                                        |   3 +++
 .../ParsingEmbeddedDocumentExtractor.java          |  26 +++++++++++++--------
 .../apache/tika/metadata/TikaCoreProperties.java   |   4 ++--
 .../java/org/apache/tika/parser/ParseRecord.java   |  21 ++++++++++++++---
 .../apache/tika/parser/AutoDetectParserTest.java   |  10 ++++++++
 .../test-documents/mock/null_pointer.xml.gz        | Bin 0 -> 651 bytes
 6 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index cb76c07f2..edb709cdf 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,5 +1,8 @@
 Release 2.4.1 - ???
 
+   * Record embedded file exceptions in the container
+     file's metadata (TIKA-3788).
+
    * Allow continuation of parsing after write limit has
      been reached (TIKA-3787).
 
diff --git a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
index f09963f9a..bdfd028f5 100644
--- a/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
+++ b/tika-core/src/main/java/org/apache/tika/extractor/ParsingEmbeddedDocumentExtractor.java
@@ -37,6 +37,7 @@ import org.apache.tika.metadata.Metadata;
 import org.apache.tika.metadata.TikaCoreProperties;
 import org.apache.tika.parser.DelegatingParser;
 import org.apache.tika.parser.ParseContext;
+import org.apache.tika.parser.ParseRecord;
 import org.apache.tika.parser.Parser;
 import org.apache.tika.sax.BodyContentHandler;
 import org.apache.tika.sax.EmbeddedContentHandler;
@@ -97,26 +98,23 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
 
         // Use the delegate parser to parse this entry
         try (TemporaryResources tmp = new TemporaryResources()) {
-            final TikaInputStream newStream = TikaInputStream.get(
-                    new CloseShieldInputStream(stream), tmp);
+            final TikaInputStream newStream = TikaInputStream.get(new CloseShieldInputStream(stream), tmp);
             if (stream instanceof TikaInputStream) {
                 final Object container = ((TikaInputStream) stream).getOpenContainer();
                 if (container != null) {
                     newStream.setOpenContainer(container);
                 }
             }
-            DELEGATING_PARSER.parse(
-                                    newStream,
-                                    new EmbeddedContentHandler(new BodyContentHandler(handler)),
-                                    metadata, context);
+            DELEGATING_PARSER.parse(newStream, new EmbeddedContentHandler(new BodyContentHandler(handler)),
+                    metadata, context);
         } catch (EncryptedDocumentException ede) {
-            // TODO: can we log a warning that we lack the password?
-            // For now, just skip the content
+            recordException(ede, context);
         } catch (CorruptedFileException e) {
+            //necessary to stop the parse to avoid infinite loops
+            //on corrupt sqlite3 files
             throw new IOException(e);
         } catch (TikaException e) {
-            // TODO: can we log a warning somehow?
-            // Could not parse the entry, just skip the content
+            recordException(e, context);
         }
 
         if (outputHtml) {
@@ -124,6 +122,14 @@ public class ParsingEmbeddedDocumentExtractor implements EmbeddedDocumentExtract
         }
     }
 
+    private void recordException(Exception e, ParseContext context) {
+        ParseRecord record = context.get(ParseRecord.class);
+        if (record == null) {
+            return;
+        }
+        record.addException(e);
+    }
+
     public Parser getDelegatingParser() {
         return DELEGATING_PARSER;
     }
diff --git a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
index c4035ea31..79d80d658 100644
--- a/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
+++ b/tika-core/src/main/java/org/apache/tika/metadata/TikaCoreProperties.java
@@ -74,11 +74,11 @@ public interface TikaCoreProperties {
 
     //exception in an embedded file
     Property EMBEDDED_EXCEPTION =
-            Property.internalText(TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
+            Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_exception");
 
     //warning while parsing in an embedded file
     Property EMBEDDED_WARNING =
-            Property.internalText(TIKA_META_EXCEPTION_PREFIX + "embedded_warning");
+            Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_warning");
 
     Property WRITE_LIMIT_REACHED =
             Property.internalBoolean(TIKA_META_EXCEPTION_PREFIX + "write_limit_reached");
diff --git a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
index 081c01920..7b5a58908 100644
--- a/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
+++ b/tika-core/src/main/java/org/apache/tika/parser/ParseRecord.java
@@ -27,6 +27,15 @@ import java.util.Set;
  * after the parse by the {@link CompositeParser}.
  */
 public class ParseRecord {
+
+    //hard limits so that specially crafted files
+    //don't cause an OOM
+    private static int MAX_PARSERS = 100;
+
+    private static final int MAX_EXCEPTIONS = 100;
+
+    private static final int MAX_WARNINGS = 100;
+
     private int depth = 0;
     private final Set<String> parsers = new LinkedHashSet<>();
 
@@ -53,15 +62,21 @@ public class ParseRecord {
     }
 
     void addParserClass(String parserClass) {
-        parsers.add(parserClass);
+        if (parsers.size() < MAX_PARSERS) {
+            parsers.add(parserClass);
+        }
     }
 
     public void addException(Exception e) {
-        exceptions.add(e);
+        if (exceptions.size() < MAX_EXCEPTIONS) {
+            exceptions.add(e);
+        }
     }
 
     public void addWarning(String msg) {
-        warnings.add(msg);
+        if (warnings.size() < MAX_WARNINGS) {
+            warnings.add(msg);
+        }
     }
 
     public void setWriteLimitReached(boolean writeLimitReached) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
index ec3598d8d..98e99040d 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/AutoDetectParserTest.java
@@ -449,6 +449,16 @@ public class AutoDetectParserTest extends TikaTest {
         assertNotContained("embed_4", txt);
     }
 
+    @Test
+    public void testEmbeddedNPE() throws Exception {
+        Metadata metadata = new Metadata();
+        getXML("mock/null_pointer.xml.gz",
+                AUTO_DETECT_PARSER, metadata);
+        String embExString = metadata.get(TikaCoreProperties.EMBEDDED_EXCEPTION);
+        assertContains("another null pointer", embExString);
+
+    }
+
     //This is not the complete/correct way to look for parsers within another parser
     //However, it is good enough for this unit test for now.
     private Parser find(CompositeParser parser, Class clazz) {
diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/mock/null_pointer.xml.gz b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/mock/null_pointer.xml.gz
new file mode 100644
index 000000000..b99d76587
Binary files /dev/null and b/tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/resources/test-documents/mock/null_pointer.xml.gz differ