You are viewing a plain-text version of this content. The canonical link for it is not preserved in this plain-text copy.
Posted to commits@tika.apache.org by ta...@apache.org on 2022/05/17 17:19:07 UTC
[tika] branch branch_1x updated: improve regex in StandardsText
This is an automated email from the ASF dual-hosted git repository.
tallison pushed a commit to branch branch_1x
in repository https://gitbox.apache.org/repos/asf/tika.git
The following commit(s) were added to refs/heads/branch_1x by this push:
new a36711610 improve regex in StandardsText
a36711610 is described below
commit a36711610fa1f6f5ba0f594803415af795e0b265
Author: tballison <ta...@apache.org>
AuthorDate: Tue May 17 13:18:56 2022 -0400
improve regex in StandardsText
---
tika-core/src/main/java/org/apache/tika/sax/StandardsText.java | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
index 3921e064e..bdba930a7 100644
--- a/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
+++ b/tika-core/src/main/java/org/apache/tika/sax/StandardsText.java
@@ -49,11 +49,11 @@ import org.apache.tika.sax.StandardReference.StandardReferenceBuilder;
* <li>returns the standard references along with scores.</li>
* </ol>
* </p>
- *
*/
public class StandardsText {
// Regular expression to match uppercase headers
- private static final String REGEX_HEADER = "(\\d+\\.(\\d+\\.?)*)\\p{Blank}+([A-Z]+(\\s[A-Z]+)*){5,}";
+ private static final String REGEX_HEADER =
+ "(\\d{1,10}\\.(\\d{1,10}\\.?){0,10})\\p{Blank}+([A-Z]{1,256}(\\s[A-Z]+){0,256}){5,}";
// Regular expression to match the "APPLICABLE DOCUMENTS" and equivalent
// sections