You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/04/02 19:25:08 UTC
[opennlp] branch master updated: OPENNLP-1185: Tokenizers should be able to output a new line token (#337)
This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new d34404a OPENNLP-1185: Tokenizers should be able to output a new line token (#337)
d34404a is described below
commit d34404aadabb06ee3a76efdb6f5e449eae2ab8ad
Author: Rudolf Schneider <po...@rudolf-schneider.de>
AuthorDate: Sat Apr 2 21:25:03 2022 +0200
OPENNLP-1185: Tokenizers should be able to output a new line token (#337)
---
.../opennlp/tools/tokenize/SimpleTokenizer.java | 14 +++++++
.../java/opennlp/tools/tokenize/TokenizerME.java | 9 ++++-
.../tools/tokenize/WhitespaceTokenizer.java | 17 ++++++++-
.../tools/tokenize/SimpleTokenizerTest.java | 39 ++++++++++++++++++++
.../opennlp/tools/tokenize/TokenizerMETest.java | 43 +++++++++++++++++++++-
.../tools/tokenize/WhitespaceTokenizerTest.java | 39 ++++++++++++++++++++
6 files changed, 157 insertions(+), 4 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
index b855c3b..08e2991 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
@@ -28,6 +28,8 @@ import opennlp.tools.util.StringUtil;
*/
public class SimpleTokenizer extends AbstractTokenizer {
+ private boolean keepNewLines = false;
+
static class CharacterEnum {
static final CharacterEnum WHITESPACE = new CharacterEnum("whitespace");
static final CharacterEnum ALPHABETIC = new CharacterEnum("alphabetic");
@@ -93,6 +95,10 @@ public class SimpleTokenizer extends AbstractTokenizer {
start = ci;
}
}
+ if (keepNewLines && isLineSeparator(c)) { // emit the line-separator character as its own one-char token
+ tokens.add(new Span(ci, ci + 1)); // the separator sits at index ci; do not assume start == ci here
+ start = ci + 1; // the next token can begin no earlier than the following character
+ }
state = charType;
pc = c;
}
@@ -101,4 +107,12 @@ public class SimpleTokenizer extends AbstractTokenizer {
}
return tokens.toArray(new Span[tokens.size()]);
}
+
+ private boolean isLineSeparator(char character) { // true when character is LF ('\n') or CR ('\r')
+ return character == '\n' || character == '\r'; // char literals; Character.LINE_SEPARATOR/LETTER_NUMBER are category codes
+ }
+
+ public void setKeepNewLines(boolean keepNewLines) { // true: each line-separator char is added as its own token
+ this.keepNewLines = keepNewLines; // field defaults to false; NOTE(review): tests call this on SimpleTokenizer.INSTANCE
+ }
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index 9ecdf13..541f502 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -118,6 +118,7 @@ public class TokenizerME extends AbstractTokenizer {
private List<Double> tokProbs;
private List<Span> newTokens;
+ private boolean keepNewLines = false;
/**
* Initializes the tokenizer by downloading a default model.
@@ -188,7 +189,9 @@ public class TokenizerME extends AbstractTokenizer {
* @return A span array containing individual tokens as elements.
*/
public Span[] tokenizePos(String d) {
- Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(d);
+ WhitespaceTokenizer whitespaceTokenizer = WhitespaceTokenizer.INSTANCE;
+ whitespaceTokenizer.setKeepNewLines(keepNewLines);
+ Span[] tokens = whitespaceTokenizer.tokenizePos(d);
newTokens.clear();
tokProbs.clear();
for (Span s : tokens) {
@@ -268,4 +271,8 @@ public class TokenizerME extends AbstractTokenizer {
public boolean useAlphaNumericOptimization() {
return useAlphaNumericOptimization;
}
+
+ public void setKeepNewLines(boolean keepNewLines) { // enables/disables new-line tokens for this tokenizer
+ this.keepNewLines = keepNewLines; // tokenizePos passes this value to WhitespaceTokenizer.INSTANCE.setKeepNewLines
+ }
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
index 2346d62..e00c30d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
@@ -36,6 +36,7 @@ public class WhitespaceTokenizer extends AbstractTokenizer {
* {@link WhitespaceTokenizer}.
*/
public static final WhitespaceTokenizer INSTANCE = new WhitespaceTokenizer();
+ private boolean keepNewLines = false;
/**
* Use the {@link WhitespaceTokenizer#INSTANCE} field to retrieve an instance.
@@ -57,8 +58,12 @@ public class WhitespaceTokenizer extends AbstractTokenizer {
inTok = false;
tokStart = -1;
}
- }
- else {
+ if (keepNewLines && isLineSeparator(d.charAt(i))) {
+ tokStart = i;
+ tokens.add(new Span(tokStart, tokStart + 1));
+ tokStart = -1;
+ }
+ } else {
if (!inTok) {
tokStart = i;
inTok = true;
@@ -72,4 +77,12 @@ public class WhitespaceTokenizer extends AbstractTokenizer {
return tokens.toArray(new Span[tokens.size()]);
}
+
+ private boolean isLineSeparator(char character) { // true when character is LF ('\n') or CR ('\r')
+ return character == '\n' || character == '\r'; // char literals; Character.LINE_SEPARATOR/LETTER_NUMBER are category codes
+ }
+
+ public void setKeepNewLines(boolean keepNewLines) { // true: each line-separator char is added as its own token
+ this.keepNewLines = keepNewLines; // field defaults to false; NOTE: INSTANCE is a public static final object, so callers share this flag
+ }
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
index d51c905..e633c5d 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
@@ -89,4 +89,43 @@ public class SimpleTokenizerTest {
Assert.assertTrue(")".equals(tokenizedText[4]));
Assert.assertTrue(tokenizedText.length == 5);
}
+
+ @Test
+ public void testTokenizationOfStringWithUnixNewLineTokens() { // "\n" must come back as its own token
+ SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+ tokenizer.setKeepNewLines(true); // flips state on the shared INSTANCE; undone in finally
+ try {
+ Assert.assertEquals(2, tokenizer.tokenize("a\n").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n"}, tokenizer.tokenize("a\n"));
+ Assert.assertEquals(3, tokenizer.tokenize("a\nb").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n", "b"}, tokenizer.tokenize("a\nb"));
+ Assert.assertEquals(4, tokenizer.tokenize("a\n\n b").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b"}, tokenizer.tokenize("a\n\n b"));
+ Assert.assertEquals(7, tokenizer.tokenize("a\n\n b\n\n c").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b", "\n", "\n", "c"},
+ tokenizer.tokenize("a\n\n b\n\n c"));
+ } finally {
+ tokenizer.setKeepNewLines(false); // restore the default so later tests see an unmodified singleton
+ }
+ }
+
+ @Test
+ public void testTokenizationOfStringWithWindowsNewLineTokens() { // "\r" and "\n" must each be a token
+ SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+ tokenizer.setKeepNewLines(true); // flips state on the shared INSTANCE; undone in finally
+ try {
+ Assert.assertEquals(3, tokenizer.tokenize("a\r\n").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n"}, tokenizer.tokenize("a\r\n"));
+ Assert.assertEquals(4, tokenizer.tokenize("a\r\nb").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "b"}, tokenizer.tokenize("a\r\nb"));
+ Assert.assertEquals(6, tokenizer.tokenize("a\r\n\r\n b").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b"}, tokenizer
+ .tokenize("a\r\n\r\n b"));
+ Assert.assertEquals(11, tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b", "\r", "\n", "\r", "\n", "c"},
+ tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c"));
+ } finally {
+ tokenizer.setKeepNewLines(false); // restore the default so later tests see an unmodified singleton
+ }
+ }
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
index e541f3d..9535009 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
@@ -102,5 +102,46 @@ public class TokenizerMETest {
TokenizerME.train(samples, TokenizerFactory.create(null, "eng", null, true, null), mlParams);
}
-
+
+ @Test
+ public void testNewLineAwareTokenization() throws IOException { // "\n" must come back as its own token
+ TokenizerModel model = TokenizerTestUtil.createMaxentTokenModel();
+ TokenizerME tokenizer = new TokenizerME(model);
+ tokenizer.setKeepNewLines(true); // NOTE(review): tokenizePos copies this flag onto WhitespaceTokenizer.INSTANCE; not reset here
+
+ Assert.assertEquals(2, tokenizer.tokenize("a\n").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n"}, tokenizer.tokenize("a\n"));
+
+ Assert.assertEquals(3, tokenizer.tokenize("a\nb").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n", "b"}, tokenizer.tokenize("a\nb"));
+
+ Assert.assertEquals(4, tokenizer.tokenize("a\n\n b").length); // the space after the newlines yields no token
+ Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b"}, tokenizer.tokenize("a\n\n b"));
+
+ Assert.assertEquals(7, tokenizer.tokenize("a\n\n b\n\n c").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b", "\n", "\n", "c"},
+ tokenizer.tokenize("a\n\n b\n\n c"));
+ }
+
+ @Test
+ public void testTokenizationOfStringWithWindowsNewLineTokens() throws IOException { // "\r" and "\n" are separate tokens
+ TokenizerModel model = TokenizerTestUtil.createMaxentTokenModel();
+ TokenizerME tokenizer = new TokenizerME(model);
+ tokenizer.setKeepNewLines(true); // NOTE(review): tokenizePos copies this flag onto WhitespaceTokenizer.INSTANCE; not reset here
+
+ Assert.assertEquals(3, tokenizer.tokenize("a\r\n").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n"}, tokenizer.tokenize("a\r\n"));
+
+ Assert.assertEquals(4, tokenizer.tokenize("a\r\nb").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "b"}, tokenizer.tokenize("a\r\nb"));
+
+ Assert.assertEquals(6, tokenizer.tokenize("a\r\n\r\n b").length); // the space after the newlines yields no token
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b"}, tokenizer
+ .tokenize("a\r\n\r\n b"));
+
+ Assert.assertEquals(11, tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b", "\r", "\n", "\r", "\n", "c"},
+ tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c"));
+ }
+
}
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/WhitespaceTokenizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/WhitespaceTokenizerTest.java
index c6eba8b..ed91c0f 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/WhitespaceTokenizerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/WhitespaceTokenizerTest.java
@@ -59,4 +59,43 @@ public class WhitespaceTokenizerTest {
Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize(" ").length); // tab
Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize(" ").length);
}
+
+ @Test
+ public void testTokenizationOfStringWithUnixNewLineTokens() { // "\n" must come back as its own token
+ WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+ tokenizer.setKeepNewLines(true); // flips state on the shared INSTANCE; undone in finally
+ try {
+ Assert.assertEquals(2, tokenizer.tokenize("a\n").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n"}, tokenizer.tokenize("a\n"));
+ Assert.assertEquals(3, tokenizer.tokenize("a\nb").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n", "b"}, tokenizer.tokenize("a\nb"));
+ Assert.assertEquals(4, tokenizer.tokenize("a\n\n b").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b"}, tokenizer.tokenize("a\n\n b"));
+ Assert.assertEquals(7, tokenizer.tokenize("a\n\n b\n\n c").length);
+ Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b", "\n", "\n", "c"},
+ tokenizer.tokenize("a\n\n b\n\n c"));
+ } finally {
+ tokenizer.setKeepNewLines(false); // restore the default so later tests see an unmodified singleton
+ }
+ }
+
+ @Test
+ public void testTokenizationOfStringWithWindowsNewLineTokens() { // "\r" and "\n" must each be a token
+ WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+ tokenizer.setKeepNewLines(true); // flips state on the shared INSTANCE; undone in finally
+ try {
+ Assert.assertEquals(3, tokenizer.tokenize("a\r\n").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n"}, tokenizer.tokenize("a\r\n"));
+ Assert.assertEquals(4, tokenizer.tokenize("a\r\nb").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "b"}, tokenizer.tokenize("a\r\nb"));
+ Assert.assertEquals(6, tokenizer.tokenize("a\r\n\r\n b").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b"}, tokenizer
+ .tokenize("a\r\n\r\n b"));
+ Assert.assertEquals(11, tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c").length);
+ Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b", "\r", "\n", "\r", "\n", "c"},
+ tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c"));
+ } finally {
+ tokenizer.setKeepNewLines(false); // restore the default so later tests see an unmodified singleton
+ }
+ }
}