You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/31 17:16:01 UTC

[tika] branch branch_1x updated: TIKA-3781 -- add limits to buffer in StandardsExtractingContentHandler

This is an automated email from the ASF dual-hosted git repository.

tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git


The following commit(s) were added to refs/heads/branch_1x by this push:
     new 22f763a3f TIKA-3781 -- add limits to buffer in StandardsExtractingContentHandler
22f763a3f is described below

commit 22f763a3f14f9a47e46212a74b2a5d4339de6ab5
Author: tallison <ta...@apache.org>
AuthorDate: Tue May 31 13:15:54 2022 -0400

    TIKA-3781 -- add limits to buffer in StandardsExtractingContentHandler
---
 .../sax/StandardsExtractingContentHandler.java     | 51 +++++++----
 .../java/org/apache/tika/sax/StandardsText.java    | 98 +++++++++++-----------
 2 files changed, 84 insertions(+), 65 deletions(-)

diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
index 5d633004f..6be84b8c7 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsExtractingContentHandler.java
@@ -20,29 +20,32 @@ package org.apache.tika.sax;
 import java.util.Arrays;
 import java.util.List;
 
-import org.apache.tika.metadata.Metadata;
 import org.xml.sax.ContentHandler;
 import org.xml.sax.SAXException;
 import org.xml.sax.helpers.DefaultHandler;
 
+import org.apache.tika.metadata.Metadata;
+
 /**
  * StandardsExtractingContentHandler is a Content Handler used to extract
  * standard references while parsing.
- *
+ * <p>
+ * This handler relies on complex regular expressions which can be slow on some types of
+ * input data.
  */
 public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
 	public static final String STANDARD_REFERENCES = "standard_references";
-	private Metadata metadata;
-	private StringBuilder stringBuilder;
+	private final Metadata metadata;
+	private final StringBuilder stringBuilder;
+
+	private int maxBufferLength = 100000;
 	private double threshold = 0;
 
 	/**
 	 * Creates a decorator for the given SAX event handler and Metadata object.
-	 * 
-	 * @param handler
-	 *            SAX event handler to be decorated.
-	 * @param metadata
-	 *            {@link Metadata} object.
+	 *
+	 * @param handler  SAX event handler to be decorated.
+	 * @param metadata {@link Metadata} object.
 	 */
 	public StandardsExtractingContentHandler(ContentHandler handler, Metadata metadata) {
 		super(handler);
@@ -64,9 +67,9 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
 	/**
 	 * Gets the threshold to be used for selecting the standard references found
 	 * within the text based on their score.
-	 * 
+	 *
 	 * @return the threshold to be used for selecting the standard references
-	 *         found within the text based on their score.
+	 * found within the text based on their score.
 	 */
 	public double getThreshold() {
 		return threshold;
@@ -74,9 +77,8 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
 
 	/**
 	 * Sets the score to be used as threshold.
-	 * 
-	 * @param score
-	 *            the score to be used as threshold.
+	 *
+	 * @param score the score to be used as threshold.
 	 */
 	public void setThreshold(double score) {
 		this.threshold = score;
@@ -92,6 +94,10 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
 	@Override
 	public void characters(char[] ch, int start, int length) throws SAXException {
 		try {
+			if (maxBufferLength > -1) {
+				int remaining = maxBufferLength - stringBuilder.length();
+				length = remaining > length ? length : remaining;
+			}
 			String text = new String(Arrays.copyOfRange(ch, start, start + length));
 			stringBuilder.append(text);
 			super.characters(ch, start, length);
@@ -107,10 +113,21 @@ public class StandardsExtractingContentHandler extends ContentHandlerDecorator {
 	@Override
 	public void endDocument() throws SAXException {
 		super.endDocument();
-		List<StandardReference> standards = StandardsText.extractStandardReferences(stringBuilder.toString(),
-				threshold);
+		List<StandardReference> standards =
+				StandardsText.extractStandardReferences(stringBuilder.toString(), threshold);
 		for (StandardReference standardReference : standards) {
 			metadata.add(STANDARD_REFERENCES, standardReference.toString());
 		}
 	}
-}
\ No newline at end of file
+
+
+	/**
+	 * Sets the maximum number of characters to buffer in memory when checking for standards.
+	 * <p>
+	 * If this is unbounded (a negative value), the complex regular expressions can take a long
+	 * time to process some types of data.  Only increase this limit with great caution.
+	 */
+	public void setMaxBufferLength(int maxBufferLength) {
+		this.maxBufferLength = maxBufferLength;
+	}
+}
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
index b4109d9dc..b9806fc5f 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
@@ -30,7 +30,7 @@ import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
 /**
  * StandardText relies on regular expressions to extract standard references
  * from text.
- * 
+ *
  * <p>
  * This class helps to find the standard references from text by performing the
  * following steps:
@@ -42,7 +42,7 @@ import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
  * <li>each potential standard reference starts with score equal to 0.25;</li>
  * <li>increases by 0.25 the score of references which include the name of a
  * known standard organization ({@link StandardOrganizations});</li>
- * <li>increases by 0.25 the score of references which include the word 
+ * <li>increases by 0.25 the score of references which include the word
  * Publication or Standard;</li>
  * <li>increases by 0.25 the score of references which have been found within
  * "Applicable Documents" and equivalent sections;</li>
@@ -53,15 +53,18 @@ import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
 public class StandardsText {
 	// Regular expression to match uppercase headers
 	private static final String REGEX_HEADER =
-			"(\\d{1,10}\\.(\\d{1,10}\\.?){0,10})\\p{Blank}+([A-Z]{1,256}(\\s[A-Z]+){0,256}){5,}";
+			"(\\d{1,10}+\\.(\\d{1,10}+\\.?){0,10}+)\\p{Blank}+([A-Z]{1,64}+(\\s[A-Z]{1,64}+){0," +
+					"256}+){5,10}+";
 
 	// Regular expression to match the "APPLICABLE DOCUMENTS" and equivalent
 	// sections
-	private static final String REGEX_APPLICABLE_DOCUMENTS = "(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)";
+	private static final String REGEX_APPLICABLE_DOCUMENTS =
+			"(?i:.*APPLICABLE\\sDOCUMENTS|REFERENCE|STANDARD|REQUIREMENT|GUIDELINE|COMPLIANCE.*)";
 
 	// Regular expression to match the alphanumeric identifier of the standard
-	private static final String REGEX_IDENTIFIER = "(?<identifier>([0-9]{3,20}|([A-Z]+(-|_|\\.)" +
-			"?[0-9]{2,20}))((-|_|\\.)?[A-Z0-9]+){0,10})";
+	private static final String REGEX_IDENTIFIER =
+			"(?<identifier>([0-9]{3,64}+|([A-Z]{1,64}+(-|_|\\.)?[0-9]{2,64}+))((-|_|\\.)" +
+					"?[A-Z0-9]{1,64}+){0,64}+)";
 
 	// Regular expression to match the standard organization
 	private static final String REGEX_ORGANIZATION = StandardOrganizations.getOrganzationsRegex();
@@ -72,45 +75,42 @@ public class StandardsText {
 
 	// Regular expression to match a string that is supposed to be a standard
 	// reference
-	private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w{1,100})"
-			+ "\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,100}\\s)*\\(?" + "(?<secondOrganization>[A" +
-			"-Z]\\w{1,100})" + "\\)?)?"
-			+ REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER;
+	private static final String REGEX_FALLBACK = "\\(?" + "(?<mainOrganization>[A-Z]\\w{1,64}+)" +
+			"\\)?((\\s?(?<separator>\\/)\\s?)(\\w{1,64}+\\s)*\\(?" + "(?<secondOrganization>[A-Z" +
+			"]\\w{1,64}+)" +
+			"\\)?)?" + REGEX_STANDARD_TYPE + "?" + "(-|\\s)?" + REGEX_IDENTIFIER;
 
 	// Regular expression to match the standard organization within a string
 	// that is supposed to be a standard reference
-	private static final String REGEX_STANDARD = ".*" + REGEX_ORGANIZATION + ".+" + REGEX_ORGANIZATION + "?.*";
+	private static final String REGEX_STANDARD =
+			".*" + REGEX_ORGANIZATION + ".+" + REGEX_ORGANIZATION + "?.*";
 
 	/**
 	 * Extracts the standard references found within the given text.
-	 * 
-	 * @param text
-	 *            the text from which the standard references are extracted.
-	 * @param threshold
-	 *            the lower bound limit to be used in order to select only the
-	 *            standard references with score greater than or equal to the
-	 *            threshold. For instance, using a threshold of 0.75 means that
-	 *            only the patterns with score greater than or equal to 0.75
-	 *            will be returned.
+	 *
+	 * @param text      the text from which the standard references are extracted.
+	 * @param threshold the lower bound limit to be used in order to select only the
+	 *                  standard references with score greater than or equal to the
+	 *                  threshold. For instance, using a threshold of 0.75 means that
+	 *                  only the patterns with score greater than or equal to 0.75
+	 *                  will be returned.
 	 * @return the list of standard references extracted from the given text.
 	 */
-	public static ArrayList<StandardReference> extractStandardReferences(String text, double threshold) {
+	public static ArrayList<StandardReference> extractStandardReferences(String text,
+																		 double threshold) {
 		Map<Integer, String> headers = findHeaders(text);
 
-		ArrayList<StandardReference> standardReferences = findStandards(text, headers, threshold);
-
-		return standardReferences;
+		return findStandards(text, headers, threshold);
 	}
 
 	/**
 	 * This method helps to find the headers within the given text.
-	 * 
-	 * @param text
-	 *            the text from which the headers are extracted.
+	 *
+	 * @param text the text from which the headers are extracted.
 	 * @return the list of headers found within the given text.
 	 */
 	private static Map<Integer, String> findHeaders(String text) {
-		Map<Integer, String> headers = new TreeMap<Integer, String>();
+		Map<Integer, String> headers = new TreeMap<>();
 
 		Pattern pattern = Pattern.compile(REGEX_HEADER);
 		Matcher matcher = pattern.matcher(text);
@@ -124,20 +124,18 @@ public class StandardsText {
 
 	/**
 	 * This method helps to find the standard references within the given text.
-	 * 
-	 * @param text
-	 *            the text from which the standards references are extracted.
-	 * @param headers
-	 *            the list of headers found within the given text.
-	 * @param threshold
-	 *            the lower bound limit to be used in order to select only the
-	 *            standard references with score greater than or equal to the
-	 *            threshold.
+	 *
+	 * @param text      the text from which the standards references are extracted.
+	 * @param headers   the list of headers found within the given text.
+	 * @param threshold the lower bound limit to be used in order to select only the
+	 *                  standard references with score greater than or equal to the
+	 *                  threshold.
 	 * @return the list of standard references extracted from the given text.
 	 */
-	private static ArrayList<StandardReference> findStandards(String text, Map<Integer, String> headers,
-			double threshold) {
-		ArrayList<StandardReference> standards = new ArrayList<StandardReference>();
+	private static ArrayList<StandardReference> findStandards(String text,
+															  Map<Integer, String> headers,
+															  double threshold) {
+		ArrayList<StandardReference> standards = new ArrayList<>();
 		double score = 0;
 
 		Pattern pattern = Pattern.compile(REGEX_FALLBACK);
@@ -146,15 +144,18 @@ public class StandardsText {
 		while (matcher.find()) {
 			StandardReferenceBuilder builder = new StandardReference.StandardReferenceBuilder(
 					matcher.group("mainOrganization"), matcher.group("identifier"))
-							.setSecondOrganization(matcher.group("separator"), matcher.group("secondOrganization"));
+					.setSecondOrganization(matcher.group("separator"),
+							matcher.group("secondOrganization"));
 			score = 0.25;
 
-			// increases by 0.25 the score of references which include the name of a known standard organization
+			// increases by 0.25 the score of references which include the name of a known
+			// standard organization
 			if (matcher.group().matches(REGEX_STANDARD)) {
 				score += 0.25;
 			}
-			
-			// increases by 0.25 the score of references which include the word "Publication" or "Standard"
+
+			// increases by 0.25 the score of references which include the word "Publication" or
+			// "Standard"
 			if (matcher.group().matches(".*" + REGEX_STANDARD_TYPE + ".*")) {
 				score += 0.25;
 			}
@@ -172,14 +173,15 @@ public class StandardsText {
 			}
 
 			String header = headers.get(startHeader);
-			
-			// increases by 0.25 the score of references which have been found within "Applicable Documents" and equivalent sections
+
+			// increases by 0.25 the score of references which have been found within "Applicable
+			// Documents" and equivalent sections
 			if (header != null && headers.get(startHeader).matches(REGEX_APPLICABLE_DOCUMENTS)) {
 				score += 0.25;
 			}
 
 			builder.setScore(score);
-			
+
 			if (score >= threshold) {
 				standards.add(builder.build());
 			}
@@ -187,4 +189,4 @@ public class StandardsText {
 
 		return standards;
 	}
-}
\ No newline at end of file
+}