You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/31 17:16:01 UTC
[tika] branch branch_1x updated: TIKA-3781 -- add limits to buffer in StandardsExtractingContentHandler
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new 22f763a3f TIKA-3781 -- add limits to buffer in StandardsExtractingContentHandler
22f763a3f is described below
commit 22f763a3f14f9a47e46212a74b2a5d4339de6ab5
Author: tallison <ta...@apache.org>
AuthorDate: Tue May 31 13:15:54 2022 -0400
TIKA-3781 -- add limits to buffer in StandardsExtractingContentHandler
---
.../sax/StandardsExtractingContentHandler.java | 51 +++++++----
.../java/org/apache/tika/sax/StandardsText.java | 98 +++++++++++-----------
2 files changed, 84 insertions(+), 65 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
index 5d633004f..6be84b8c7 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
@@ -20,29 +20,32 @@ package org.apache.tika.sax;
import java.util.Arrays;
import java.util.List;
-import org.apache.tika.metadata.Metadata;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.DefaultHandler;
+import org.apache.tika.metadata.Metadata;
+
/**
* StandardsExtractingContentHandler is a Content Handler used to extract
* standard references while parsing.
- *
+ * <p>
+ * This handler relies on complex regular expressions which can be slow on some types of
+ * input data.
*/
public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
public static final String STANDARD_REFERENCES = "standard_references";
- private Metadata metadata;
- private StringBuilder stringBuilder;
+ private final Metadata metadata;
+ private final StringBuilder stringBuilder;
+
+ private int maxBufferLength = 100000;
private double threshold = 0;
/**
* Creates a decorator for the given SAX event handler and Metadata object.
- *
- * @param handler
- * SAX event handler to be decorated.
- * @param metadata
- * {@link Metadata} object.
+ *
+ * @param handler SAX event handler to be decorated.
+ * @param metadata {@link Metadata} object.
*/
public StandardsExtractingContentHandler(ContentHandler handler, Metadata metadata) {
super(handler);
@@ -64,9 +67,9 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
/**
* Gets the threshold to be used for selecting the standard references found
* within the text based on their score.
- *
+ *
* @return the threshold to be used for selecting the standard references
- * found within the text based on their score.
+ * found within the text based on their score.
*/
public double getThreshold() {
return threshold;
@@ -74,9 +77,8 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
/**
* Sets the score to be used as threshold.
- *
- * @param score
- * the score to be used as threshold.
+ *
+ * @param score the score to be used as threshold.
*/
public void setThreshold(double score) {
this.threshold = score;
@@ -92,6 +94,10 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
@Override
public void characters(char[] ch, int start, int length) throws SAXException {
try {
+ if (maxBufferLength > -1) {
+ int remaining = maxBufferLength - stringBuilder.length();
+ length = remaining > length ? length : remaining;
+ }
String text = new String(Arrays.copyOfRange(ch, start, start + length));
stringBuilder.append(text);
super.characters(ch, start, length);
@@ -107,10 +113,21 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
@Override
public void endDocument() throws SAXException {
super.endDocument();
- List<StandardReference> standards = StandardsText.extractStandardReferences(stringBuilder.toString(),
- threshold);
+ List<StandardReference> standards =
+ StandardsText.extractStandardReferences(stringBuilder.toString(), threshold);
for (StandardReference standardReference : standards) {
metadata.add(STANDARD_REFERENCES, standardReference.toString());
}
}
-}
\ No newline at end of file
+
+
+ /**
+ * The number of characters to store in memory for checking for standards.
+ *
+ * If this is unbounded, the complex regular expressions can take a long time
+ * to process some types of data. Only increase this limit with great caution.
+ */
+ public void setMaxBufferLength(int maxBufferLength) {
+ this.maxBufferLength = maxBufferLength;
+ }
+}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
index b4109d9dc..b9806fc5f 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
@@ -30,7 +30,7 @@ import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
/**
* StandardText relies on regular expressions to extract standard references
* from text.
- *
+ *
* <p>
* This class helps to find the standard references from text by performing the
* following steps:
@@ -42,7 +42,7 @@ import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
* <li>each potential standard reference starts with score equal to 0.25;</li>
* <li>increases by 0.25 the score of references which include the name of a
* known standard organization ({@link StandardOrganizations});</li>
- * <li>increases by 0.25 the score of references which include the word
+ * <li>increases by 0.25 the score of references which include the word
* Publication or Standard;</li>
* <li>increases by 0.25 the score of references which have been found within
* "Applicable Documents" and equivalent sections;</li>
@@ -53,15 +53,18 @@ import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
public class StandardsText {
// Regular expression to match uppercase headers
private static final String REGEX_HEADER =
- "(\\d{1,10}\\.(\\d{1,10}\\.?){0,10})\\p{Blank}+([A-Z]{1,256}(\\s[A-Z]+){0,256}){5,}";
+ "(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0," +
+ "256}+){5,10}+";
// Regular expression to match the "APPLICABLE DOCUMENTS" and equivalent
// sections
- private static final String REGEX_APPLICABLE_DOCUMENTS = "(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)";
+ private static final String REGEX_APPLICABLE_DOCUMENTS =
+ "(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)";
// Regular expression to match the alphanumeric identifier of the standard
- private static final String REGEX_IDENTIFIER = "(?<identifier>([0-9]{3,20}|([A-Z]+(-|_|\\.)" +
- "?[0-9]{2,20}))((-|_|\\.)?[A-Z0-9]+){0,10})";
+ private static final String REGEX_IDENTIFIER =
+ "(?<identifier>([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)" +
+ "?[A-Z0-9]{1,64}+){0,64}+)";
// Regular expression to match the standard organization
private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex();
@@ -72,45 +75,42 @@ public class StandardsText {
// Regular expression to match a string that is supposed to be a standard
// reference
- private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w{1,100})"
- + "\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,100}\\s)*\\(?" + "(?<secondOrganization>[A" +
- "-Z]\\w{1,100})" + "\\)?)?"
- + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER;
+ private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w{1,64}+)" +
+ "\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,64}+\\s)*\\(?" + "(?<secondOrganization>[A-Z" +
+ "]\\w{1,64}+)" +
+ "\\)?)?" + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER;
// Regular expression to match the standard organization within a string
// that is supposed to be a standard reference
- private static final String REGEX_STANDARD = ".*" + REGEX_ORGANIZATION + ".+" + REGEX_ORGANIZATION + "?.*";
+ private static final String REGEX_STANDARD =
+ ".*" + REGEX_ORGANIZATION + ".+" + REGEX_ORGANIZATION + "?.*";
/**
* Extracts the standard references found within the given text.
- *
- * @param text
- * the text from which the standard references are extracted.
- * @param threshold
- * the lower bound limit to be used in order to select only the
- * standard references with score greater than or equal to the
- * threshold. For instance, using a threshold of 0.75 means that
- * only the patterns with score greater than or equal to 0.75
- * will be returned.
+ *
+ * @param text the text from which the standard references are extracted.
+ * @param threshold the lower bound limit to be used in order to select only the
+ * standard references with score greater than or equal to the
+ * threshold. For instance, using a threshold of 0.75 means that
+ * only the patterns with score greater than or equal to 0.75
+ * will be returned.
* @return the list of standard references extracted from the given text.
*/
- public static ArrayList<StandardReference> extractStandardReferences(String text, double threshold) {
+ public static ArrayList<StandardReference> extractStandardReferences(String text,
+ double threshold) {
Map<Integer, String> headers = findHeaders(text);
- ArrayList<StandardReference> standardReferences = findStandards(text, headers, threshold);
-
- return standardReferences;
+ return findStandards(text, headers, threshold);
}
/**
* This method helps to find the headers within the given text.
- *
- * @param text
- * the text from which the headers are extracted.
+ *
+ * @param text the text from which the headers are extracted.
* @return the list of headers found within the given text.
*/
private static Map<Integer, String> findHeaders(String text) {
- Map<Integer, String> headers = new TreeMap<Integer, String>();
+ Map<Integer, String> headers = new TreeMap<>();
Pattern pattern = Pattern.compile(REGEX_HEADER);
Matcher matcher = pattern.matcher(text);
@@ -124,20 +124,18 @@ public class StandardsText {
/**
* This method helps to find the standard references within the given text.
- *
- * @param text
- * the text from which the standards references are extracted.
- * @param headers
- * the list of headers found within the given text.
- * @param threshold
- * the lower bound limit to be used in order to select only the
- * standard references with score greater than or equal to the
- * threshold.
+ *
+ * @param text the text from which the standards references are extracted.
+ * @param headers the list of headers found within the given text.
+ * @param threshold the lower bound limit to be used in order to select only the
+ * standard references with score greater than or equal to the
+ * threshold.
* @return the list of standard references extracted from the given text.
*/
- private static ArrayList<StandardReference> findStandards(String text, Map<Integer, String> headers,
- double threshold) {
- ArrayList<StandardReference> standards = new ArrayList<StandardReference>();
+ private static ArrayList<StandardReference> findStandards(String text,
+ Map<Integer, String> headers,
+ double threshold) {
+ ArrayList<StandardReference> standards = new ArrayList<>();
double score = 0;
Pattern pattern = Pattern.compile(REGEX_FALLBACK);
@@ -146,15 +144,18 @@ public class StandardsText {
while (matcher.find()) {
StandardReferenceBuilder builder = new StandardReference.StandardReferenceBuilder(
matcher.group("mainOrganization"), matcher.group("identifier"))
- .setSecondOrganization(matcher.group("separator"), matcher.group("secondOrganization"));
+ .setSecondOrganization(matcher.group("separator"),
+ matcher.group("secondOrganization"));
score = 0.25;
- // increases by 0.25 the score of references which include the name of a known standard organization
+ // increases by 0.25 the score of references which include the name of a known
+ // standard organization
if (matcher.group().matches(REGEX_STANDARD)) {
score += 0.25;
}
-
- // increases by 0.25 the score of references which include the word "Publication" or "Standard"
+
+ // increases by 0.25 the score of references which include the word "Publication" or
+ // "Standard"
if (matcher.group().matches(".*" + REGEX_STANDARD_TYPE + ".*")) {
score += 0.25;
}
@@ -172,14 +173,15 @@ public class StandardsText {
}
String header = headers.get(startHeader);
-
- // increases by 0.25 the score of references which have been found within "Applicable Documents" and equivalent sections
+
+ // increases by 0.25 the score of references which have been found within "Applicable
+ // Documents" and equivalent sections
if (header != null && headers.get(startHeader).matches(REGEX_APPLICABLE_DOCUMENTS)) {
score += 0.25;
}
builder.setScore(score);
-
+
if (score >= threshold) {
standards.add(builder.build());
}
@@ -187,4 +189,4 @@ public class StandardsText {
return standards;
}
-}
\ No newline at end of file
+}