You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/03/02 06:09:51 UTC

[opennlp] branch main updated: OPENNLP-141 Tokenizers alphanumeric optimization only recognizes a-z as alpha chars (#506)

This is an automated email from the ASF dual-hosted git repository.

mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/main by this push:
     new a69184c1 OPENNLP-141 Tokenizers alphanumeric optimization only recognizes a-z as alpha chars (#506)
a69184c1 is described below

commit a69184c1a8689ce804d24bce697ab5c0eba1c4cd
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Thu Mar 2 07:09:45 2023 +0100

    OPENNLP-141 Tokenizers alphanumeric optimization only recognizes a-z as alpha chars (#506)
---
 .../java/opennlp/tools/tokenize/TokenizerME.java   |  7 ----
 .../java/opennlp/tools/tokenize/lang/Factory.java  | 39 +++++++++++++++++-----
 .../tools/tokenize/TokenizerFactoryTest.java       | 13 ++++----
 3 files changed, 37 insertions(+), 22 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index a76f3a8c..88f73dde 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -85,13 +85,6 @@ public class TokenizerME extends AbstractTokenizer {
    */
   public static final String NO_SPLIT = "F";
 
-  /**
-   * Alpha-Numeric Pattern
-   * @deprecated As of release 1.5.2, replaced by {@link Factory#getAlphanumeric(String)}
-   */
-  @Deprecated
-  public static final Pattern alphaNumeric = Pattern.compile(Factory.DEFAULT_ALPHANUMERIC);
-
   private final Pattern alphanumeric;
 
   /*
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
index 171613a8..9ec267a7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
@@ -25,24 +25,45 @@ import opennlp.tools.tokenize.TokenContextGenerator;
 
 public class Factory {
 
-  public static final String DEFAULT_ALPHANUMERIC = "^[A-Za-z0-9]+$";
+  public static final Pattern DEFAULT_ALPHANUMERIC = Pattern.compile("^[A-Za-z0-9]+$");
+
+  private static final Pattern PORTUGUESE = Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
+  private static final Pattern FRENCH = Pattern.compile("^[a-zA-Z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+$");
+
+  // For reference: https://www.sttmedia.com/characterfrequency-dutch
+  private static final Pattern DUTCH = Pattern.compile("^[A-Za-z0-9äöüëèéïijÄÖÜËÉÈÏIJ]+$");
+  private static final Pattern GERMAN = Pattern.compile("^[A-Za-z0-9äöüÄÖÜß]+$");
 
   /**
-   * Gets the alphanumeric pattern for the language. Please save the value
-   * locally because this call is expensive.
+   * Gets the alphanumeric pattern for a language.
    *
-   * @param languageCode The language code. If {@code null}, or unknown,
-   *                     the default pattern will be returned.
-   * @return The alphanumeric pattern for the language or the default pattern.
+   * @param languageCode The ISO 639-1 or three-letter ISO 639-2/3 code. If {@code null},
+   *                     or unknown, the {@link #DEFAULT_ALPHANUMERIC} pattern will be returned.
+   * @return The alphanumeric {@link Pattern} for the language, or the default pattern.
    */
   public Pattern getAlphanumeric(String languageCode) {
+    // For reference: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
     if ("pt".equals(languageCode) || "por".equals(languageCode)) {
-      return Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
+      return PORTUGUESE;
     }
-
-    return Pattern.compile(DEFAULT_ALPHANUMERIC);
+    if ("fr".equals(languageCode) || "fre".equals(languageCode) || "fra".equals(languageCode)) {
+      return FRENCH;
+    }
+    if ("nl".equals(languageCode) || "nld".equals(languageCode) || "dut".equals(languageCode)) {
+      return DUTCH;
+    }
+    if ("de".equals(languageCode) || "deu".equals(languageCode) || "ger".equals(languageCode)) {
+      return GERMAN;
+    }
+    return DEFAULT_ALPHANUMERIC;
   }
 
+  /**
+   * Initializes a customized {@link TokenContextGenerator} via a set of {@code abbreviations}.
+   *
+   * @param languageCode The ISO_639-1 code to be used.
+   * @param abbreviations The abbreviations to be used for the new instance.
+   */
   public TokenContextGenerator createTokenContextGenerator(String languageCode, Set<String> abbreviations) {
     return new DefaultTokenContextGenerator(abbreviations);
   }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
index e759c854..3a958229 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
@@ -42,8 +42,7 @@ import opennlp.tools.util.TrainingParameters;
  */
 public class TokenizerFactoryTest {
 
-  private static ObjectStream<TokenSample> createSampleStream()
-      throws IOException {
+  private static ObjectStream<TokenSample> createSampleStream() throws IOException {
     InputStreamFactory in = new ResourceAsStreamFactory(
         TokenizerFactoryTest.class, "/opennlp/tools/tokenize/token.train");
 
@@ -74,7 +73,8 @@ public class TokenizerFactoryTest {
     Assertions.assertNotNull(factory.getAbbreviationDictionary());
     Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
 
-    Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern());
+    String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();
+    Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
     Assertions.assertEquals(lang, factory.getLanguageCode());
     Assertions.assertEquals(lang, model.getLanguage());
     Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
@@ -89,7 +89,7 @@ public class TokenizerFactoryTest {
     Assertions.assertNotNull(factory.getAbbreviationDictionary());
     Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
 
-    Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern());
+    Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
     Assertions.assertEquals(lang, factory.getLanguageCode());
     Assertions.assertEquals(lang, model.getLanguage());
     Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
@@ -107,7 +107,8 @@ public class TokenizerFactoryTest {
     Assertions.assertNull(factory.getAbbreviationDictionary());
     Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
 
-    Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern());
+    String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();
+    Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
     Assertions.assertEquals(lang, factory.getLanguageCode());
     Assertions.assertEquals(lang, model.getLanguage());
     Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
@@ -122,7 +123,7 @@ public class TokenizerFactoryTest {
     Assertions.assertNull(factory.getAbbreviationDictionary());
     Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
 
-    Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern());
+    Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
     Assertions.assertEquals(lang, factory.getLanguageCode());
     Assertions.assertEquals(lang, model.getLanguage());
     Assertions.assertFalse(factory.isUseAlphaNumericOptimization());