You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/31 17:07:52 UTC
[tika] branch main updated: TIKA-3781 -- set bounds on StandardsExtractingContentHandler's buffer
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/main by this push:
new 3f0078639 TIKA-3781 -- set bounds on StandardsExtractingContentHandler's buffer
3f0078639 is described below
commit 3f0078639e9b15de7f5f8293df9222fdc1505fe0
Author: tallison <ta...@apache.org>
AuthorDate: Tue May 31 13:07:39 2022 -0400
TIKA-3781 -- set bounds on StandardsExtractingContentHandler's buffer
---
.../tika/sax/StandardsExtractingContentHandler.java | 20 ++++++++++++++++++++
.../main/java/org/apache/tika/sax/StandardsText.java | 11 +++++++----
2 files changed, 27 insertions(+), 4 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
index ee248be5e..006034a01 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
@@ -29,11 +29,16 @@ import org.apache.tika.metadata.Metadata;
/**
* StandardsExtractingContentHandler is a Content Handler used to extract
* standard references while parsing.
+ * <p>
+ * This handler relies on complex regular expressions which can be slow on some types of
+ * input data.
*/
public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
public static final String STANDARD_REFERENCES = "standard_references";
private final Metadata metadata;
private final StringBuilder stringBuilder;
+
+ private int maxBufferLength = 100000;
private double threshold = 0;
/**
@@ -89,6 +94,10 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
try {
+ if (maxBufferLength > -1) {
+ int remaining = maxBufferLength - stringBuilder.length();
+ length = remaining > length ? length : remaining;
+ }
String text = new String(Arrays.copyOfRange(ch, start, start + length));
stringBuilder.append(text);
super.characters(ch, start, length);
@@ -110,4 +119,15 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
metadata.add(STANDARD_REFERENCES, standardReference.toString());
}
}
+
+
+ /**
+ * Sets the maximum number of characters to store in memory when checking for standards.
+ * The default is 100000; a negative value removes the limit.
+ * If this is unbounded, the complex regular expressions can take a long time
+ * to process some types of data. Only increase this limit with great caution.
+ */
+ public void setMaxBufferLength(int maxBufferLength) {
+ this.maxBufferLength = maxBufferLength;
+ }
}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
index 686832dcc..697eedee9 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
@@ -53,7 +53,8 @@ import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
public class StandardsText {
// Regular expression to match uppercase headers
private static final String REGEX_HEADER =
- "(\\d{1,10}\\.(\\d{1,10}\\.?){0,10})\\p{Blank}+([A-Z]{1,256}(\\s[A-Z]+){0,256}){5,}";
+ "(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0," +
+ "256}+){5,10}+";
// Regular expression to match the "APPLICABLE DOCUMENTS" and equivalent
// sections
@@ -62,7 +63,8 @@ public class StandardsText {
// Regular expression to match the alphanumeric identifier of the standard
private static final String REGEX_IDENTIFIER =
- "(?<identifier>([0-9]{3,}|([A-Z]+(-|_|\\.)?[0-9]{2,}))((-|_|\\.)?[A-Z0-9]+)*)";
+ "(?<identifier>([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)" +
+ "?[A-Z0-9]{1,64}+){0,64}+)";
// Regular expression to match the standard organization
private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex();
@@ -73,8 +75,9 @@ public class StandardsText {
// Regular expression to match a string that is supposed to be a standard
// reference
- private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w+)" +
- "\\)?((\\s?(?<separator>\\/)\\s?)(\\w+\\s)*\\(?" + "(?<secondOrganization>[A-Z]\\w+)" +
+ private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w{1,64}+)" +
+ "\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,64}+\\s)*\\(?" + "(?<secondOrganization>[A-Z" +
+ "]\\w{1,64}+)" +
"\\)?)?" + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER;
// Regular expression to match the standard organization within a string