You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ko...@apache.org on 2018/10/03 01:02:11 UTC

[opennlp] branch master updated: OPENNLP-1214: use hash to avoid linear search in DefaultEndOfSentenceScanner and DefaultSDContextGenerator (#329)

This is an automated email from the ASF dual-hosted git repository.

koji pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new 51cbde6  OPENNLP-1214: use hash to avoid linear search in DefaultEndOfSentenceScanner and DefaultSDContextGenerator (#329)
51cbde6 is described below

commit 51cbde659383c8931525a51cf5066092826082ae
Author: Koji Sekiguchi <ko...@rondhuit.com>
AuthorDate: Wed Oct 3 10:02:01 2018 +0900

    OPENNLP-1214: use hash to avoid linear search in DefaultEndOfSentenceScanner and DefaultSDContextGenerator (#329)
---
 .../sentdetect/DefaultEndOfSentenceScanner.java    | 26 +++++++---
 .../sentdetect/DefaultSDContextGenerator.java      | 26 +++++-----
 .../tools/sentdetect/EndOfSentenceScanner.java     |  8 +++
 .../sentdetect/DefaultSDContextGeneratorTest.java  | 59 ++++++++++++++++++++++
 4 files changed, 97 insertions(+), 22 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
index 75d0ec0..2b8c0be 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
@@ -19,7 +19,9 @@
 package opennlp.tools.sentdetect;
 
 import java.util.ArrayList;
+import java.util.HashSet;
 import java.util.List;
+import java.util.Set;
 
 /**
  * Default implementation of the {@link EndOfSentenceScanner}.
@@ -28,7 +30,9 @@ import java.util.List;
  */
 public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
 
-  private char[] eosCharacters;
+  private Set<Character> eosCharacters;
+  @Deprecated
+  private char[] eosChars;
 
   /**
    * Initializes the current instance.
@@ -36,7 +40,11 @@ public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
    * @param eosCharacters
    */
   public DefaultEndOfSentenceScanner(char[] eosCharacters) {
-    this.eosCharacters = eosCharacters;
+    this.eosCharacters = new HashSet<>();
+    for (char eosChar: eosCharacters) {
+      this.eosCharacters.add(eosChar);
+    }
+    this.eosChars = eosCharacters;
   }
 
   public List<Integer> getPositions(String s) {
@@ -49,19 +57,21 @@ public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
 
   public List<Integer> getPositions(char[] cbuf) {
     List<Integer> l = new ArrayList<>();
-    char[] eosCharacters = getEndOfSentenceCharacters();
     for (int i = 0; i < cbuf.length; i++) {
-      for (char eosCharacter : eosCharacters) {
-        if (cbuf[i] == eosCharacter) {
-          l.add(i);
-          break;
-        }
+      if (eosCharacters.contains(cbuf[i])) {
+        l.add(i);
       }
     }
     return l;
   }
 
+  @Deprecated
   public char[] getEndOfSentenceCharacters() {
+    return eosChars;
+  }
+
+  @Override
+  public Set<Character> getEOSCharacters() {
     return eosCharacters;
   }
 }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
index a29119b..8c2822b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
@@ -19,6 +19,7 @@ package opennlp.tools.sentdetect;
 
 import java.util.ArrayList;
 import java.util.Collections;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
@@ -42,7 +43,7 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
 
   private Set<String> inducedAbbreviations;
 
-  private char[] eosCharacters;
+  private Set<Character> eosCharacters;
 
   /**
    * Creates a new <code>SDContextGenerator</code> instance with
@@ -66,7 +67,10 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
    */
   public DefaultSDContextGenerator(Set<String> inducedAbbreviations, char[] eosCharacters) {
     this.inducedAbbreviations = inducedAbbreviations;
-    this.eosCharacters = eosCharacters;
+    this.eosCharacters = new HashSet<>();
+    for (char eosChar: eosCharacters) {
+      this.eosCharacters.add(eosChar);
+    }
     buf = new StringBuffer();
     collectFeats = new ArrayList<>();
   }
@@ -121,12 +125,9 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
     int c = position;
     { ///assign prefix, stop if you run into a period though otherwise stop at space
       while (--c > prefixStart) {
-        for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) {
-          if (sb.charAt(c) == eosCharacters[eci]) {
-            prefixStart = c;
-            c++; // this gets us out of while loop.
-            break;
-          }
+        if (eosCharacters.contains(sb.charAt(c))) {
+          prefixStart = c;
+          c++; // this gets us out of while loop.
         }
       }
       prefix = String.valueOf(sb.subSequence(prefixStart, position)).trim();
@@ -138,12 +139,9 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
     {
       c = position;
       while (++c < suffixEnd) {
-        for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) {
-          if (sb.charAt(c) == eosCharacters[eci]) {
-            suffixEnd = c;
-            c--; // this gets us out of while loop.
-            break;
-          }
+        if (eosCharacters.contains(sb.charAt(c))) {
+          suffixEnd = c;
+          c--; // this gets us out of while loop.
         }
       }
     }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
index b48ad3f..7963e37 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
@@ -18,6 +18,7 @@
 package opennlp.tools.sentdetect;
 
 import java.util.List;
+import java.util.Set;
 
 /**
  * Scans Strings, StringBuffers, and char[] arrays for the offsets of
@@ -34,9 +35,16 @@ public interface EndOfSentenceScanner {
    * Returns an array of character which can indicate the end of a sentence.
    * @return an array of character which can indicate the end of a sentence.
    */
+  @Deprecated
   char[] getEndOfSentenceCharacters();
 
   /**
+   * Returns the set of characters that can indicate the end of a sentence.
+   * @return the set of characters that can indicate the end of a sentence.
+   */
+  Set<Character> getEOSCharacters();
+
+  /**
    * The receiver scans the specified string for sentence ending characters and
    * returns their offsets.
    *
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java
new file mode 100644
index 0000000..f010498
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.sentdetect.lang.Factory;
+
+public class DefaultSDContextGeneratorTest {
+
+  @Test
+  public void testGetContext() throws Exception {
+    SDContextGenerator sdContextGenerator =
+        new DefaultSDContextGenerator(Collections.<String>emptySet(), Factory.defaultEosCharacters);
+
+    String[] context = sdContextGenerator.getContext(
+        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 2);
+    Assert.assertArrayEquals("sn/eos=./x=Mr/2/xcap/v=/s=/n=Smith/ncap".split("/"), context);
+
+    context = sdContextGenerator.getContext(
+        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 29);
+    Assert.assertArrayEquals("sn/eos=./x=Inc/3/xcap/v=RONDHUIT/vcap/s=/n=as".split("/"), context);
+  }
+
+  @Test
+  public void testGetContextWithAbbreviations() throws Exception {
+    SDContextGenerator sdContextGenerator =
+        new DefaultSDContextGenerator(new HashSet<>(Arrays.asList("Mr./Inc.".split("/"))),
+            Factory.defaultEosCharacters);
+
+    String[] context = sdContextGenerator.getContext(
+        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 2);
+    Assert.assertArrayEquals("sn/eos=./x=Mr/2/xcap/xabbrev/v=/s=/n=Smith/ncap".split("/"), context);
+
+    context = sdContextGenerator.getContext(
+        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 29);
+    Assert.assertArrayEquals("sn/eos=./x=Inc/3/xcap/xabbrev/v=RONDHUIT/vcap/s=/n=as".split("/"), context);
+  }
+}