You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/04/02 19:25:08 UTC

[opennlp] branch master updated: OPENNLP-1185: Tokenizers should be able to output a new line token (#337)

This is an automated email from the ASF dual-hosted git repository.

jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new d34404a  OPENNLP-1185: Tokenizers should be able to output a new line token (#337)
d34404a is described below

commit d34404aadabb06ee3a76efdb6f5e449eae2ab8ad
Author: Rudolf Schneider <po...@rudolf-schneider.de>
AuthorDate: Sat Apr 2 21:25:03 2022 +0200

    OPENNLP-1185: Tokenizers should be able to output a new line token (#337)
---
 .../opennlp/tools/tokenize/SimpleTokenizer.java    | 14 +++++++
 .../java/opennlp/tools/tokenize/TokenizerME.java   |  9 ++++-
 .../tools/tokenize/WhitespaceTokenizer.java        | 17 ++++++++-
 .../tools/tokenize/SimpleTokenizerTest.java        | 39 ++++++++++++++++++++
 .../opennlp/tools/tokenize/TokenizerMETest.java    | 43 +++++++++++++++++++++-
 .../tools/tokenize/WhitespaceTokenizerTest.java    | 39 ++++++++++++++++++++
 6 files changed, 157 insertions(+), 4 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
index b855c3b..08e2991 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/SimpleTokenizer.java
@@ -28,6 +28,8 @@ import opennlp.tools.util.StringUtil;
  */
 public class SimpleTokenizer extends AbstractTokenizer {
 
+  private boolean keepNewLines = false;
+
   static class CharacterEnum {
     static final CharacterEnum WHITESPACE = new CharacterEnum("whitespace");
     static final CharacterEnum ALPHABETIC = new CharacterEnum("alphabetic");
@@ -93,6 +95,10 @@ public class SimpleTokenizer extends AbstractTokenizer {
           start = ci;
         }
       }
+      if (keepNewLines && isLineSeparator(c)) {
+        tokens.add(new Span(start, start + 1));
+        start = start + 1;
+      }
       state = charType;
       pc = c;
     }
@@ -101,4 +107,12 @@ public class SimpleTokenizer extends AbstractTokenizer {
     }
     return tokens.toArray(new Span[tokens.size()]);
   }
+
+  private boolean isLineSeparator(char character) { // true only for LF ('\n') or CR ('\r')
+    return character == '\n' || character == '\r'; // Character.LINE_SEPARATOR is a category id, not a char
+  }
+
+  public void setKeepNewLines(boolean keepNewLines) {
+    this.keepNewLines = keepNewLines; // when true, each line-separator char is added as a one-char Span
+  }
 }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index 9ecdf13..541f502 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -118,6 +118,7 @@ public class TokenizerME extends AbstractTokenizer {
   private List<Double> tokProbs;
 
   private List<Span> newTokens;
+  private boolean keepNewLines = false;
 
   /**
    * Initializes the tokenizer by downloading a default model.
@@ -188,7 +189,9 @@ public class TokenizerME extends AbstractTokenizer {
    * @return   A span array containing individual tokens as elements.
    */
   public Span[] tokenizePos(String d) {
-    Span[] tokens = WhitespaceTokenizer.INSTANCE.tokenizePos(d);
+    WhitespaceTokenizer whitespaceTokenizer = WhitespaceTokenizer.INSTANCE;
+    whitespaceTokenizer.setKeepNewLines(keepNewLines);
+    Span[] tokens = whitespaceTokenizer.tokenizePos(d);
     newTokens.clear();
     tokProbs.clear();
     for (Span s : tokens) {
@@ -268,4 +271,8 @@ public class TokenizerME extends AbstractTokenizer {
   public boolean useAlphaNumericOptimization() {
     return useAlphaNumericOptimization;
   }
+
+  public void setKeepNewLines(boolean keepNewLines) {
+    this.keepNewLines = keepNewLines; // applied to WhitespaceTokenizer.INSTANCE on each tokenizePos call
+  }
 }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
index 2346d62..e00c30d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/WhitespaceTokenizer.java
@@ -36,6 +36,7 @@ public class WhitespaceTokenizer extends AbstractTokenizer {
    * {@link WhitespaceTokenizer}.
    */
   public static final WhitespaceTokenizer INSTANCE = new WhitespaceTokenizer();
+  private boolean keepNewLines = false;
 
   /**
    * Use the {@link WhitespaceTokenizer#INSTANCE} field to retrieve an instance.
@@ -57,8 +58,12 @@ public class WhitespaceTokenizer extends AbstractTokenizer {
           inTok = false;
           tokStart = -1;
         }
-      }
-      else {
+        if (keepNewLines && isLineSeparator(d.charAt(i))) {
+          tokStart = i;
+          tokens.add(new Span(tokStart, tokStart + 1));
+          tokStart = -1;
+        }
+      } else {
         if (!inTok) {
           tokStart = i;
           inTok = true;
@@ -72,4 +77,12 @@ public class WhitespaceTokenizer extends AbstractTokenizer {
 
     return tokens.toArray(new Span[tokens.size()]);
   }
+
+  private boolean isLineSeparator(char character) { // true only for LF ('\n') or CR ('\r')
+    return character == '\n' || character == '\r'; // Character.LINE_SEPARATOR is a category id, not a char
+  }
+
+  public void setKeepNewLines(boolean keepNewLines) {
+    this.keepNewLines = keepNewLines; // when true, each line-separator char is added as a one-char Span
+  }
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
index d51c905..e633c5d 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/SimpleTokenizerTest.java
@@ -89,4 +89,43 @@ public class SimpleTokenizerTest {
     Assert.assertTrue(")".equals(tokenizedText[4]));
     Assert.assertTrue(tokenizedText.length == 5);
   }
+
+  @Test
+  public void testTokenizationOfStringWithUnixNewLineTokens() {
+    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+    tokenizer.setKeepNewLines(true); // each '\n' is now expected back as its own token
+    try {
+      Assert.assertEquals(2, tokenizer.tokenize("a\n").length);
+      Assert.assertArrayEquals(new String[] {"a", "\n"}, tokenizer.tokenize("a\n"));
+      Assert.assertEquals(3, tokenizer.tokenize("a\nb").length);
+      Assert.assertArrayEquals(new String[] {"a", "\n", "b"}, tokenizer.tokenize("a\nb"));
+      Assert.assertEquals(4, tokenizer.tokenize("a\n\n b").length);
+      Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b"}, tokenizer.tokenize("a\n\n b"));
+      Assert.assertEquals(7, tokenizer.tokenize("a\n\n b\n\n c").length);
+      Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b", "\n", "\n", "c"},
+                               tokenizer.tokenize("a\n\n b\n\n c"));
+    } finally {
+      tokenizer.setKeepNewLines(false); // INSTANCE is shared: restore the default for other tests
+    }
+  }
+
+  @Test
+  public void testTokenizationOfStringWithWindowsNewLineTokens() {
+    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+    tokenizer.setKeepNewLines(true); // '\r' and '\n' are each expected back as separate tokens
+    try {
+      Assert.assertEquals(3, tokenizer.tokenize("a\r\n").length);
+      Assert.assertArrayEquals(new String[] {"a", "\r", "\n"}, tokenizer.tokenize("a\r\n"));
+      Assert.assertEquals(4, tokenizer.tokenize("a\r\nb").length);
+      Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "b"}, tokenizer.tokenize("a\r\nb"));
+      Assert.assertEquals(6, tokenizer.tokenize("a\r\n\r\n b").length);
+      Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b"}, tokenizer
+          .tokenize("a\r\n\r\n b"));
+      Assert.assertEquals(11, tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c").length);
+      Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b", "\r", "\n", "\r", "\n", "c"},
+                               tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c"));
+    } finally {
+      tokenizer.setKeepNewLines(false); // INSTANCE is shared: restore the default for other tests
+    }
+  }
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
index e541f3d..9535009 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
@@ -102,5 +102,46 @@ public class TokenizerMETest {
     TokenizerME.train(samples, TokenizerFactory.create(null, "eng", null, true, null), mlParams);
 
   }
-  
+
+  @Test
+  public void testNewLineAwareTokenization() throws IOException {
+    TokenizerModel model = TokenizerTestUtil.createMaxentTokenModel();
+    TokenizerME tokenizer = new TokenizerME(model);
+    tokenizer.setKeepNewLines(true);
+
+    Assert.assertEquals(2, tokenizer.tokenize("a\n").length);
+    Assert.assertArrayEquals(new String[] {"a", "\n"}, tokenizer.tokenize("a\n"));
+    
+    Assert.assertEquals(3, tokenizer.tokenize("a\nb").length);
+    Assert.assertArrayEquals(new String[] {"a", "\n", "b"}, tokenizer.tokenize("a\nb"));
+    
+    Assert.assertEquals(4, tokenizer.tokenize("a\n\n b").length);
+    Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b"}, tokenizer.tokenize("a\n\n b"));
+    
+    Assert.assertEquals(7, tokenizer.tokenize("a\n\n b\n\n c").length);
+    Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b", "\n", "\n", "c"},
+                             tokenizer.tokenize("a\n\n b\n\n c"));
+  }
+
+  @Test
+  public void testTokenizationOfStringWithWindowsNewLineTokens() throws IOException {
+    TokenizerModel model = TokenizerTestUtil.createMaxentTokenModel();
+    TokenizerME tokenizer = new TokenizerME(model);
+    tokenizer.setKeepNewLines(true);
+
+    Assert.assertEquals(3, tokenizer.tokenize("a\r\n").length);
+    Assert.assertArrayEquals(new String[] {"a", "\r", "\n"}, tokenizer.tokenize("a\r\n"));
+
+    Assert.assertEquals(4, tokenizer.tokenize("a\r\nb").length);
+    Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "b"}, tokenizer.tokenize("a\r\nb"));
+
+    Assert.assertEquals(6, tokenizer.tokenize("a\r\n\r\n b").length);
+    Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b"}, tokenizer
+        .tokenize("a\r\n\r\n b"));
+
+    Assert.assertEquals(11, tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c").length);
+    Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b", "\r", "\n", "\r", "\n", "c"},
+                             tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c"));
+  }
+
 }
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/WhitespaceTokenizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/WhitespaceTokenizerTest.java
index c6eba8b..ed91c0f 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/WhitespaceTokenizerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/WhitespaceTokenizerTest.java
@@ -59,4 +59,43 @@ public class WhitespaceTokenizerTest {
     Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize(" ").length); // tab
     Assert.assertEquals(0, WhitespaceTokenizer.INSTANCE.tokenize("     ").length);
   }
+
+  @Test
+  public void testTokenizationOfStringWithUnixNewLineTokens() {
+    WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+    tokenizer.setKeepNewLines(true); // each '\n' is now expected back as its own token
+    try {
+      Assert.assertEquals(2, tokenizer.tokenize("a\n").length);
+      Assert.assertArrayEquals(new String[] {"a", "\n"}, tokenizer.tokenize("a\n"));
+      Assert.assertEquals(3, tokenizer.tokenize("a\nb").length);
+      Assert.assertArrayEquals(new String[] {"a", "\n", "b"}, tokenizer.tokenize("a\nb"));
+      Assert.assertEquals(4, tokenizer.tokenize("a\n\n b").length);
+      Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b"}, tokenizer.tokenize("a\n\n b"));
+      Assert.assertEquals(7, tokenizer.tokenize("a\n\n b\n\n c").length);
+      Assert.assertArrayEquals(new String[] {"a", "\n", "\n", "b", "\n", "\n", "c"},
+                               tokenizer.tokenize("a\n\n b\n\n c"));
+    } finally {
+      tokenizer.setKeepNewLines(false); // INSTANCE is shared: restore the default for other tests
+    }
+  }
+
+  @Test
+  public void testTokenizationOfStringWithWindowsNewLineTokens() {
+    WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+    tokenizer.setKeepNewLines(true); // '\r' and '\n' are each expected back as separate tokens
+    try {
+      Assert.assertEquals(3, tokenizer.tokenize("a\r\n").length);
+      Assert.assertArrayEquals(new String[] {"a", "\r", "\n"}, tokenizer.tokenize("a\r\n"));
+      Assert.assertEquals(4, tokenizer.tokenize("a\r\nb").length);
+      Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "b"}, tokenizer.tokenize("a\r\nb"));
+      Assert.assertEquals(6, tokenizer.tokenize("a\r\n\r\n b").length);
+      Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b"}, tokenizer
+          .tokenize("a\r\n\r\n b"));
+      Assert.assertEquals(11, tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c").length);
+      Assert.assertArrayEquals(new String[] {"a", "\r", "\n", "\r", "\n", "b", "\r", "\n", "\r", "\n", "c"},
+                               tokenizer.tokenize("a\r\n\r\n b\r\n\r\n c"));
+    } finally {
+      tokenizer.setKeepNewLines(false); // INSTANCE is shared: restore the default for other tests
+    }
+  }
 }