You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/08/05 14:15:21 UTC

[tika] branch main updated: add bounds to storage of types and subtypes

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 19fb7bf5d add bounds to storage of types and subtypes
19fb7bf5d is described below

commit 19fb7bf5d42641eff6575fa805ffb906a3e95401
Author: tallison <ta...@apache.org>
AuthorDate: Fri Aug 5 10:15:13 2022 -0400

    add bounds to storage of types and subtypes
---
 .../apache/tika/parser/pdf/AbstractPDF2XHTML.java  | 24 ++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
index 7e7ef16ea..95dd6565e 100644
--- a/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
+++ b/tika-parsers/tika-parsers-standard/tika-parsers-standard-modules/tika-parser-pdf-module/src/main/java/org/apache/tika/parser/pdf/AbstractPDF2XHTML.java
@@ -139,6 +139,10 @@ class AbstractPDF2XHTML extends PDFTextStripper {
      */
     private final static int MAX_RECURSION_DEPTH = 100;
     private final static int MAX_BOOKMARK_ITEMS = 10000;
+
+    //This is used for both types and subtypes.
+    //These can be unbounded.  We need to limit the number we store.
+    private final static int MAX_ANNOTATION_TYPES = 100;
     private static final String THREE_D = "3D";
     private static final COSName THREE_DD = COSName.getPDFName("3DD");
     private static final String NULL_STRING = "null";
@@ -644,16 +648,20 @@ class AbstractPDF2XHTML extends PDFTextStripper {
         try {
             for (PDAnnotation annotation : page.getAnnotations()) {
                 String annotationName = annotation.getAnnotationName();
-                if (annotationName != null) {
-                    annotationTypes.add(annotationName);
-                } else {
-                    annotationTypes.add(NULL_STRING);
+                if (annotationTypes.size() < MAX_ANNOTATION_TYPES) {
+                    if (annotationName != null) {
+                        annotationTypes.add(annotationName);
+                    } else {
+                        annotationTypes.add(NULL_STRING);
+                    }
                 }
                 String annotationSubtype = annotation.getSubtype();
-                if (annotationSubtype != null) {
-                    annotationSubtypes.add(annotationSubtype);
-                } else {
-                    annotationSubtypes.add(NULL_STRING);
+                if (annotationSubtypes.size() < MAX_ANNOTATION_TYPES) {
+                    if (annotationSubtype != null) {
+                        annotationSubtypes.add(annotationSubtype);
+                    } else {
+                        annotationSubtypes.add(NULL_STRING);
+                    }
                 }
                 if (annotation instanceof PDAnnotationFileAttachment) {
                     PDAnnotationFileAttachment fann = (PDAnnotationFileAttachment) annotation;