You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2023/03/02 06:09:51 UTC
[opennlp] branch main updated: OPENNLP-141 Tokenizers alphanumeric optimization only recognizes a-z as alpha chars (#506)
This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new a69184c1 OPENNLP-141 Tokenizers alphanumeric optimization only recognizes a-z as alpha chars (#506)
a69184c1 is described below
commit a69184c1a8689ce804d24bce697ab5c0eba1c4cd
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Thu Mar 2 07:09:45 2023 +0100
OPENNLP-141 Tokenizers alphanumeric optimization only recognizes a-z as alpha chars (#506)
---
.../java/opennlp/tools/tokenize/TokenizerME.java | 7 ----
.../java/opennlp/tools/tokenize/lang/Factory.java | 39 +++++++++++++++++-----
.../tools/tokenize/TokenizerFactoryTest.java | 13 ++++----
3 files changed, 37 insertions(+), 22 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index a76f3a8c..88f73dde 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -85,13 +85,6 @@ public class TokenizerME extends AbstractTokenizer {
*/
public static final String NO_SPLIT = "F";
- /**
- * Alpha-Numeric Pattern
- * @deprecated As of release 1.5.2, replaced by {@link Factory#getAlphanumeric(String)}
- */
- @Deprecated
- public static final Pattern alphaNumeric = Pattern.compile(Factory.DEFAULT_ALPHANUMERIC);
-
private final Pattern alphanumeric;
/*
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
index 171613a8..9ec267a7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/lang/Factory.java
@@ -25,24 +25,45 @@ import opennlp.tools.tokenize.TokenContextGenerator;
public class Factory {
- public static final String DEFAULT_ALPHANUMERIC = "^[A-Za-z0-9]+$";
+ public static final Pattern DEFAULT_ALPHANUMERIC = Pattern.compile("^[A-Za-z0-9]+$");
+
+ private static final Pattern PORTUGUESE = Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
+ private static final Pattern FRENCH = Pattern.compile("^[a-zA-Z0-9àâäèéêëîïôœùûüÿçÀÂÄÈÉÊËÎÏÔŒÙÛÜŸÇ]+$");
+
+ // For reference: https://www.sttmedia.com/characterfrequency-dutch
+ private static final Pattern DUTCH = Pattern.compile("^[A-Za-z0-9äöüëèéïĳÄÖÜËÉÈÏĲ]+$");
+ private static final Pattern GERMAN = Pattern.compile("^[A-Za-z0-9äöüÄÖÜß]+$");
/**
- * Gets the alphanumeric pattern for the language. Please save the value
- * locally because this call is expensive.
+ * Gets the alphanumeric pattern for a language.
*
- * @param languageCode The language code. If {@code null}, or unknown,
- * the default pattern will be returned.
- * @return The alphanumeric pattern for the language or the default pattern.
+ * @param languageCode The ISO 639-1 or ISO 639-2 code. If {@code null}, or unknown, the
+ * {@link #DEFAULT_ALPHANUMERIC} pattern will be returned.
+ * @return The alphanumeric {@link Pattern} for the language, or the default pattern.
*/
public Pattern getAlphanumeric(String languageCode) {
+ // For reference: https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes
if ("pt".equals(languageCode) || "por".equals(languageCode)) {
- return Pattern.compile("^[0-9a-záãâàéêíóõôúüçA-ZÁÃÂÀÉÊÍÓÕÔÚÜÇ]+$");
+ return PORTUGUESE;
}
-
- return Pattern.compile(DEFAULT_ALPHANUMERIC);
+ if ("fr".equals(languageCode) || "fre".equals(languageCode) || "fra".equals(languageCode)) {
+ return FRENCH;
+ }
+ if ("nl".equals(languageCode) || "nld".equals(languageCode) || "dut".equals(languageCode)) {
+ return DUTCH;
+ }
+ if ("de".equals(languageCode) || "deu".equals(languageCode) || "ger".equals(languageCode)) {
+ return GERMAN;
+ }
+ return DEFAULT_ALPHANUMERIC;
}
+ /**
+ * Initializes a customized {@link TokenContextGenerator} via a set of {@code abbreviations}.
+ *
+ * @param languageCode The ISO_639-1 code to be used.
+ * @param abbreviations The abbreviations to be used for the new instance.
+ */
public TokenContextGenerator createTokenContextGenerator(String languageCode, Set<String> abbreviations) {
return new DefaultTokenContextGenerator(abbreviations);
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
index e759c854..3a958229 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerFactoryTest.java
@@ -42,8 +42,7 @@ import opennlp.tools.util.TrainingParameters;
*/
public class TokenizerFactoryTest {
- private static ObjectStream<TokenSample> createSampleStream()
- throws IOException {
+ private static ObjectStream<TokenSample> createSampleStream() throws IOException {
InputStreamFactory in = new ResourceAsStreamFactory(
TokenizerFactoryTest.class, "/opennlp/tools/tokenize/token.train");
@@ -74,7 +73,8 @@ public class TokenizerFactoryTest {
Assertions.assertNotNull(factory.getAbbreviationDictionary());
Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
- Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern());
+ String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();
+ Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
@@ -89,7 +89,7 @@ public class TokenizerFactoryTest {
Assertions.assertNotNull(factory.getAbbreviationDictionary());
Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
- Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern());
+ Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
@@ -107,7 +107,8 @@ public class TokenizerFactoryTest {
Assertions.assertNull(factory.getAbbreviationDictionary());
Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
- Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern());
+ String defaultPattern = Factory.DEFAULT_ALPHANUMERIC.pattern();
+ Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());
@@ -122,7 +123,7 @@ public class TokenizerFactoryTest {
Assertions.assertNull(factory.getAbbreviationDictionary());
Assertions.assertTrue(factory.getContextGenerator() instanceof DefaultTokenContextGenerator);
- Assertions.assertEquals(Factory.DEFAULT_ALPHANUMERIC, factory.getAlphaNumericPattern().pattern());
+ Assertions.assertEquals(defaultPattern, factory.getAlphaNumericPattern().pattern());
Assertions.assertEquals(lang, factory.getLanguageCode());
Assertions.assertEquals(lang, model.getLanguage());
Assertions.assertFalse(factory.isUseAlphaNumericOptimization());