You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by rz...@apache.org on 2023/01/14 19:02:10 UTC
[opennlp] branch main updated: OPENNLP-1441 Check and possibly replace usage of String.replaceAll(...) in code-base

This is an automated email from the ASF dual-hosted git repository.

rzo1 pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/main by this push:
     new eaf06012 OPENNLP-1441 Check and possibly replace usage of String.replaceAll(...) in code-base
eaf06012 is described below

commit eaf060121a01bcb45bbbdb9f2ea32d026a666126
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Wed Jan 11 19:17:12 2023 +0100

    OPENNLP-1441 Check and possibly replace usage of String.replaceAll(...) in code-base
    
    - reworks existing classes that call `String.replaceAll(..)`, which might impact performance: each call internally compiles a new regex `Pattern` instance, which is inefficient when done inside loops
    - extracts regular expressions that were used locally within methods into pre-compiled `Pattern` constants
    
    Note: see also https://medium.com/javarevisited/micro-optimizations-in-java-string-replaceall-c6d0edf2ef6
---
 .../java/opennlp/dl/namefinder/NameFinderDL.java     | 20 ++++++++++++--------
 .../opennlp/tools/formats/ad/ADPOSSampleStream.java  |  5 ++++-
 .../opennlp/tools/formats/ad/ADSentenceStream.java   |  6 ++++--
 .../tools/formats/masc/MascPennTagParser.java        |  3 +--
 .../opennlp/tools/tokenize/WordpieceTokenizer.java   |  4 +++-
 .../java/opennlp/uima/normalizer/NumberUtil.java     |  5 ++++-
 6 files changed, 28 insertions(+), 15 deletions(-)

diff --git a/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java b/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
index 319a2074..1049353e 100644
--- a/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
+++ b/opennlp-dl/src/main/java/opennlp/dl/namefinder/NameFinderDL.java
@@ -57,6 +57,8 @@ public class NameFinderDL implements TokenNameFinder {
   public static final String B_PER = "B-PER";
   public static final String SEPARATOR = "[SEP]";
 
+  private static final String CHARS_TO_REPLACE = "##";
+
   protected final OrtSession session;
 
   private final SentenceDetector sentenceDetector;
@@ -137,6 +139,8 @@ public class NameFinderDL implements TokenNameFinder {
           // spans we can get the next one instead of the first one each time.
           int characterStart = 0;
 
+          final String[] toks = tokens.getTokens();
+
           // We are looping over the vector for each word,
           // finding the index of the array that has the maximum value,
           // and then finding the token classification that corresponds to that index.
@@ -156,7 +160,7 @@ public class NameFinderDL implements TokenNameFinder {
               String spanText;
 
               // Find the end index of the span in the array (where the label is not I-PER).
-              final SpanEnd spanEnd = findSpanEnd(v, x, ids2Labels, tokens.getTokens());
+              final SpanEnd spanEnd = findSpanEnd(v, x, ids2Labels, toks);
 
               // If the end is -1 it means this is a single-span token.
               // If the end is != -1 it means this is a multi-span token.
@@ -172,12 +176,12 @@ public class NameFinderDL implements TokenNameFinder {
                 for (int i = x; i <= end; i++) {
 
                   // If the next token starts with ##, combine it with this token.
-                  if (tokens.getTokens()[i + 1].startsWith("##")) {
+                  if (toks[i + 1].startsWith(CHARS_TO_REPLACE)) {
 
-                    sb.append(tokens.getTokens()[i] + tokens.getTokens()[i + 1].replaceAll("##", ""));
+                    sb.append(toks[i]).append(toks[i + 1].replace(CHARS_TO_REPLACE, ""));
 
                     // Append a space unless the next (next) token starts with ##.
-                    if (!tokens.getTokens()[i + 2].startsWith("##")) {
+                    if (!toks[i + 2].startsWith(CHARS_TO_REPLACE)) {
                       sb.append(" ");
                     }
 
@@ -186,10 +190,10 @@ public class NameFinderDL implements TokenNameFinder {
 
                   } else {
 
-                    sb.append(tokens.getTokens()[i].replaceAll("##", ""));
+                    sb.append(toks[i].replace(CHARS_TO_REPLACE, ""));
 
                     // Append a space unless the next token is a period.
-                    if (!".".equals(tokens.getTokens()[i + 1])) {
+                    if (!".".equals(toks[i + 1])) {
                       sb.append(" ");
                     }
 
@@ -204,13 +208,13 @@ public class NameFinderDL implements TokenNameFinder {
               } else {
 
                 // This is a single-token span so there is nothing else to do except grab the token.
-                spanText = tokens.getTokens()[x];
+                spanText = toks[x];
 
               }
 
               if (!SEPARATOR.equals(spanText)) {
 
-                spanText = spanText.replaceAll("##", "");
+                spanText = spanText.replace(CHARS_TO_REPLACE, "");
 
                 // This ignores other potential matches in the same sentence
                 // by only taking the first occurrence.
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
index c2cb08c8..742e27e6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
@@ -21,6 +21,7 @@ import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;
 import java.util.StringTokenizer;
+import java.util.regex.Pattern;
 
 import opennlp.tools.commons.Internal;
 import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
@@ -39,6 +40,8 @@ import opennlp.tools.util.PlainTextByLineStream;
 @Internal
 public class ADPOSSampleStream implements ObjectStream<POSSample> {
 
+  private static final Pattern WHITESPACES_PATTERN = Pattern.compile("\\s+");
+
   private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;
   private final boolean expandME;
   private final boolean isIncludeFeatures;
@@ -115,7 +118,7 @@ public class ADPOSSampleStream implements ObjectStream<POSSample> {
       if (isIncludeFeatures && leaf.getMorphologicalTag() != null) {
         tag += " " + leaf.getMorphologicalTag();
       }
-      tag = tag.replaceAll("\\s+", "=");
+      tag = WHITESPACES_PATTERN.matcher(tag).replaceAll("=");
 
       if (tag == null)
         tag = lexeme;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
index 42244381..577a3a6c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
@@ -96,6 +96,8 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
     private static final Pattern BIZARRE_LEAF_PATTERN = Pattern
         .compile("^([=-]*)([^:=]+=[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)");
     private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("^(=*)(\\W+)$");
+    private static final Pattern PUNCTUATION_DOT_PATTERN = Pattern.compile("\\»\\s+\\.");
+    private static final Pattern PUNCTUATION_COMMA_PATTERN = Pattern.compile("\\»\\s+\\,");
 
     private String text,meta;
 
@@ -238,8 +240,8 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
     }
 
     private String fixPunctuation(String text) {
-      text = text.replaceAll("\\»\\s+\\.", "».");
-      text = text.replaceAll("\\»\\s+\\,", "»,");
+      text = PUNCTUATION_DOT_PATTERN.matcher(text).replaceAll("».");
+      text = PUNCTUATION_COMMA_PATTERN.matcher(text).replaceAll("»,");
       return text;
     }
 
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
index 3356df32..5a423eda 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/MascPennTagParser.java
@@ -65,8 +65,7 @@ public class MascPennTagParser extends DefaultHandler {
         }
 
         String[] targets = attributes.getValue("targets")
-            .replaceAll("seg-r", "")
-            .split(" ");
+            .replace("seg-r", "").split(" ");
 
         int[] regions = new int[targets.length];
         for (int i = 0; i < targets.length; i++) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
index 79752a48..1cf9aa0c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WordpieceTokenizer.java
@@ -20,6 +20,7 @@ package opennlp.tools.tokenize;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
+import java.util.regex.Pattern;
 
 import opennlp.tools.util.Span;
 
@@ -44,6 +45,7 @@ import opennlp.tools.util.Span;
  */
 public class WordpieceTokenizer implements Tokenizer {
 
+  private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("\\p{Punct}+");
   private static final String CLASSIFICATION_TOKEN = "[CLS]";
   private static final String SEPARATOR_TOKEN = "[SEP]";
   private static final String UNKNOWN_TOKEN = "[UNK]";
@@ -86,7 +88,7 @@ public class WordpieceTokenizer implements Tokenizer {
     tokens.add(CLASSIFICATION_TOKEN);
 
     // Put spaces around punctuation.
-    final String spacedPunctuation = text.replaceAll("\\p{Punct}+", " $0 ");
+    final String spacedPunctuation = PUNCTUATION_PATTERN.matcher(text).replaceAll(" $0 ");
 
     // Split based on whitespace.
     final String[] split = WhitespaceTokenizer.INSTANCE.tokenize(spacedPunctuation);
diff --git a/opennlp-uima/src/main/java/opennlp/uima/normalizer/NumberUtil.java b/opennlp-uima/src/main/java/opennlp/uima/normalizer/NumberUtil.java
index 157d8cb6..6adf9030 100644
--- a/opennlp-uima/src/main/java/opennlp/uima/normalizer/NumberUtil.java
+++ b/opennlp-uima/src/main/java/opennlp/uima/normalizer/NumberUtil.java
@@ -20,12 +20,15 @@ package opennlp.uima.normalizer;
 import java.text.NumberFormat;
 import java.text.ParseException;
 import java.util.Locale;
+import java.util.regex.Pattern;
 
 /**
  * Provides methods to parse numbers which occur in natural language texts.
  */
 public final class NumberUtil {
 
+  private final static Pattern WHITESPACE_PATTERN = Pattern.compile("\\s");
+
   /**
    * Checks if the language is supported.
    *
@@ -69,7 +72,7 @@ public final class NumberUtil {
 
     Locale locale = new Locale(languageCode);
     NumberFormat numberFormat = NumberFormat.getInstance(locale);
-    number = number.replaceAll("\\s", "");
+    number = WHITESPACE_PATTERN.matcher(number).replaceAll("");
     return numberFormat.parse(number);
   }
 }