You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by rz...@apache.org on 2023/01/14 19:02:10 UTC
[opennlp] branch main updated: OPENNLP-1441 Check and possibly replace usage of String.replaceAll(...) in code-base
This is an automated email from the ASF dual-hosted git repository.
rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new eaf06012 OPENNLP-1441 Check and possibly replace usage of String.replaceAll(...) in code-base
eaf06012 is described below
commit eaf060121a01bcb45bbbdb9f2ea32d026a666126
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Wed Jan 11 19:17:12 2023 +0100
OPENNLP-1441 Check and possibly replace usage of String.replaceAll(...) in code-base
- reworks existing classes that call `String.replaceAll(..)`, which can hurt performance: every call internally compiles a new regex `Pattern` instance, which is inefficient when it happens inside loops
- extracts regular expressions that were used locally within methods into pre-compiled `Pattern` constants
Note: see also: https://medium.com/javarevisited/micro-optimizations-in-java-string-replaceall-c6d0edf2ef6
---
.../java/opennlp/dl/namefinder/NameFinderDL.java | 20 ++++++++++++--------
.../opennlp/tools/formats/ad/ADPOSSampleStream.java | 5 ++++-
.../opennlp/tools/formats/ad/ADSentenceStream.java | 6 ++++--
.../tools/formats/masc/MascPennTagParser.java | 3 +--
.../opennlp/tools/tokenize/WordpieceTokenizer.java | 4 +++-
.../java/opennlp/uima/normalizer/NumberUtil.java | 5 ++++-
6 files changed, 28 insertions(+), 15 deletions(-)
diff --git a/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java b/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
index 319a2074..1049353e 100644
--- a/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
+++ b/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
@@ -57,6 +57,8 @@ public class NameFinderDL implements TokenNameFinder {
public static final String B_PER = "B-PER";
public static final String SEPARATOR = "[SEP]";
+ private static final String CHARS_TO_REPLACE = "##";
+
protected final OrtSession session;
private final SentenceDetector sentenceDetector;
@@ -137,6 +139,8 @@ public class NameFinderDL implements TokenNameFinder {
// spans we can get the next one instead of the first one each time.
int characterStart = 0;
+ final String[] toks = tokens.getTokens();
+
// We are looping over the vector for each word,
// finding the index of the array that has the maximum value,
// and then finding the token classification that corresponds to that index.
@@ -156,7 +160,7 @@ public class NameFinderDL implements TokenNameFinder {
String spanText;
// Find the end index of the span in the array (where the label is not I-PER).
- final SpanEnd spanEnd = findSpanEnd(v, x, ids2Labels, tokens.getTokens());
+ final SpanEnd spanEnd = findSpanEnd(v, x, ids2Labels, toks);
// If the end is -1 it means this is a single-span token.
// If the end is != -1 it means this is a multi-span token.
@@ -172,12 +176,12 @@ public class NameFinderDL implements TokenNameFinder {
for (int i = x; i <= end; i++) {
// If the next token starts with ##, combine it with this token.
- if (tokens.getTokens()[i + 1].startsWith("##")) {
+ if (toks[i + 1].startsWith(CHARS_TO_REPLACE)) {
- sb.append(tokens.getTokens()[i] + tokens.getTokens()[i + 1].replaceAll("##", ""));
+ sb.append(toks[i]).append(toks[i + 1].replace(CHARS_TO_REPLACE, ""));
// Append a space unless the next (next) token starts with ##.
- if (!tokens.getTokens()[i + 2].startsWith("##")) {
+ if (!toks[i + 2].startsWith(CHARS_TO_REPLACE)) {
sb.append(" ");
}
@@ -186,10 +190,10 @@ public class NameFinderDL implements TokenNameFinder {
} else {
- sb.append(tokens.getTokens()[i].replaceAll("##", ""));
+ sb.append(toks[i].replace(CHARS_TO_REPLACE, ""));
// Append a space unless the next token is a period.
- if (!".".equals(tokens.getTokens()[i + 1])) {
+ if (!".".equals(toks[i + 1])) {
sb.append(" ");
}
@@ -204,13 +208,13 @@ public class NameFinderDL implements TokenNameFinder {
} else {
// This is a single-token span so there is nothing else to do except grab the token.
- spanText = tokens.getTokens()[x];
+ spanText = toks[x];
}
if (!SEPARATOR.equals(spanText)) {
- spanText = spanText.replaceAll("##", "");
+ spanText = spanText.replace(CHARS_TO_REPLACE, "");
// This ignores other potential matches in the same sentence
// by only taking the first occurrence.
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
index c2cb08c8..742e27e6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
+import java.util.regex.Pattern;
import opennlp.tools.commons.Internal;
import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
@@ -39,6 +40,8 @@ import opennlp.tools.util.PlainTextByLineStream;
@Internal
public class ADPOSSampleStream implements ObjectStream<POSSample> {
+ private static final Pattern WHITESPACES_PATTERN = Pattern.compile("\\s+");
+
private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;
private final boolean expandME;
private final boolean isIncludeFeatures;
@@ -115,7 +118,7 @@ public class ADPOSSampleStream implements ObjectStream<POSSample> {
if (isIncludeFeatures && leaf.getMorphologicalTag() != null) {
tag += " " + leaf.getMorphologicalTag();
}
- tag = tag.replaceAll("\\s+", "=");
+ tag = WHITESPACES_PATTERN.matcher(tag).replaceAll("=");
if (tag == null)
tag = lexeme;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
index 42244381..577a3a6c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
@@ -96,6 +96,8 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
private static final Pattern BIZARRE_LEAF_PATTERN = Pattern
.compile("^([=-]*)([^:=]+=[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)");
private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("^(=*)(\\W+)$");
+ private static final Pattern PUNCTUATION_DOT_PATTERN = Pattern.compile("\\»\\s+\\.");
+ private static final Pattern PUNCTUATION_COMMA_PATTERN = Pattern.compile("\\»\\s+\\,");
private String text,meta;
@@ -238,8 +240,8 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
}
private String fixPunctuation(String text) {
- text = text.replaceAll("\\»\\s+\\.", "».");
- text = text.replaceAll("\\»\\s+\\,", "»,");
+ text = PUNCTUATION_DOT_PATTERN.matcher(text).replaceAll("».");
+ text = PUNCTUATION_COMMA_PATTERN.matcher(text).replaceAll("»,");
return text;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
index 3356df32..5a423eda 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
@@ -65,8 +65,7 @@ public class MascPennTagParser extends DefaultHandler {
}
String[] targets = attributes.getValue("targets")
- .replaceAll("seg-r", "")
- .split(" ");
+ .replace("seg-r", "").split(" ");
int[] regions = new int[targets.length];
for (int i = 0; i < targets.length; i++) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
index 79752a48..1cf9aa0c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
@@ -20,6 +20,7 @@ package opennlp.tools.tokenize;
import java.util.LinkedList;
import java.util.List;
import java.util.Set;
+import java.util.regex.Pattern;
import opennlp.tools.util.Span;
@@ -44,6 +45,7 @@ import opennlp.tools.util.Span;
*/
public class WordpieceTokenizer implements Tokenizer {
+ private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}+");
private static final String CLASSIFICATION_TOKEN = "[CLS]";
private static final String SEPARATOR_TOKEN = "[SEP]";
private static final String UNKNOWN_TOKEN = "[UNK]";
@@ -86,7 +88,7 @@ public class WordpieceTokenizer implements Tokenizer {
tokens.add(CLASSIFICATION_TOKEN);
// Put spaces around punctuation.
- final String spacedPunctuation = text.replaceAll("\\p{Punct}+", " $0 ");
+ final String spacedPunctuation = PUNCTUATION_PATTERN.matcher(text).replaceAll(" $0 ");
// Split based on whitespace.
final String[] split = WhitespaceTokenizer.INSTANCE.tokenize(spacedPunctuation);
diff --git a/opennlp-uima/src/main/java/opennlp/uima/normalizer/NumberUtil.java b/opennlp-uima/src/main/java/opennlp/uima/normalizer/NumberUtil.java
index 157d8cb6..6adf9030 100644
--- a/opennlp-uima/src/main/java/opennlp/uima/normalizer/NumberUtil.java
+++ b/opennlp-uima/src/main/java/opennlp/uima/normalizer/NumberUtil.java
@@ -20,12 +20,15 @@ package opennlp.uima.normalizer;
import java.text.NumberFormat;
import java.text.ParseException;
import java.util.Locale;
+import java.util.regex.Pattern;
/**
* Provides methods to parse numbers which occur in natural language texts.
*/
public final class NumberUtil {
+ private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
+
/**
* Checks if the language is supported.
*
@@ -69,7 +72,7 @@ public final class NumberUtil {
Locale locale = new Locale(languageCode);
NumberFormat numberFormat = NumberFormat.getInstance(locale);
- number = number.replaceAll("\\s", "");
+ number = WHITESPACE_PATTERN.matcher(number).replaceAll("");
return numberFormat.parse(number);
}
}