You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/31 17:07:52 UTC

[tika] branch main updated: TIKA-3781 -- set bounds on StandardsExtractingContentHandler's buffer

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/main by this push:
     new 3f0078639 TIKA-3781 -- set bounds on StandardsExtractingContentHandler's buffer
3f0078639 is described below

commit 3f0078639e9b15de7f5f8293df9222fdc1505fe0
Author: tallison <ta...@apache.org>
AuthorDate: Tue May 31 13:07:39 2022 -0400

    TIKA-3781 -- set bounds on StandardsExtractingContentHandler's buffer
---
 .../tika/sax/StandardsExtractingContentHandler.java  | 20 ++++++++++++++++++++
 .../main/java/org/apache/tika/sax/StandardsText.java | 11 +++++++----
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
index ee248be5e..006034a01 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
@@ -29,11 +29,16 @@ import org.apache.tika.metadata.Metadata;
 /**
  * StandardsExtractingContentHandler is a Content Handler used to extract
  * standard references while parsing.
+ * <p>
+ * This handler relies on complex regular expressions which can be slow on some types of
+ * input data.
  */
 public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
     public static final String STANDARD_REFERENCES = "standard_references";
     private final Metadata metadata;
     private final StringBuilder stringBuilder;
+
+    private int maxBufferLength = 100000;
     private double threshold = 0;
 
     /**
@@ -89,6 +94,10 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
     @Override
     public void characters(char[] ch, int start, int length) throws SAXException {
         try {
+            if (maxBufferLength > -1) {
+                int remaining = maxBufferLength - stringBuilder.length();
+                length = remaining > length ? length : remaining;
+            }
             String text = new String(Arrays.copyOfRange(ch, start, start + length));
             stringBuilder.append(text);
             super.characters(ch, start, length);
@@ -110,4 +119,15 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
             metadata.add(STANDARD_REFERENCES, standardReference.toString());
         }
     }
+
+
+    /**
+     * The maximum number of characters to buffer in memory when checking for standards.
+     *
+     * If this is unbounded (a negative value), the complex regular expressions can take a
+     * long time to process some types of data.  Only increase this limit with great caution.
+     */
+    public void setMaxBufferLength(int maxBufferLength) {
+        this.maxBufferLength = maxBufferLength;
+    }
 }
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
index 686832dcc..697eedee9 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
@@ -53,7 +53,8 @@ import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
 public class StandardsText {
     // Regular expression to match uppercase headers
     private static final String REGEX_HEADER =
-            "(\\d{1,10}\\.(\\d{1,10}\\.?){0,10})\\p{Blank}+([A-Z]{1,256}(\\s[A-Z]+){0,256}){5,}";
+            "(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0," +
+                    "256}+){5,10}+";
 
     // Regular expression to match the "APPLICABLE DOCUMENTS" and equivalent
     // sections
@@ -62,7 +63,8 @@ public class StandardsText {
 
     // Regular expression to match the alphanumeric identifier of the standard
     private static final String REGEX_IDENTIFIER =
-            "(?<identifier>([0-9]{3,}|([A-Z]+(-|_|\\.)?[0-9]{2,}))((-|_|\\.)?[A-Z0-9]+)*)";
+            "(?<identifier>([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)" +
+                    "?[A-Z0-9]{1,64}+){0,64}+)";
 
     // Regular expression to match the standard organization
     private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex();
@@ -73,8 +75,9 @@ public class StandardsText {
 
     // Regular expression to match a string that is supposed to be a standard
     // reference
-    private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w+)" +
-            "\\)?((\\s?(?<separator>\\/)\\s?)(\\w+\\s)*\\(?" + "(?<secondOrganization>[A-Z]\\w+)" +
+    private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w{1,64}+)" +
+            "\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,64}+\\s)*\\(?" + "(?<secondOrganization>[A-Z" +
+            "]\\w{1,64}+)" +
             "\\)?)?" + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER;
 
     // Regular expression to match the standard organization within a string