You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ko...@apache.org on 2018/10/15 07:02:57 UTC

[opennlp] 01/01: Revert "OPENNLP-1214: use hash to avoid linear search in DefaultEndOfSentenceScanner and DefaultSDContextGenerator (#329)"

This is an automated email from the ASF dual-hosted git repository.

koji pushed a commit to branch revert-329-OPENNLP-1214
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit bf0d13b4b0c5304e1134ac295a9c1cb3767c3dcc
Author: Koji Sekiguchi <ko...@rondhuit.com>
AuthorDate: Mon Oct 15 16:02:50 2018 +0900

    Revert "OPENNLP-1214: use hash to avoid linear search in DefaultEndOfSentenceScanner and DefaultSDContextGenerator (#329)"
    
    This reverts commit 51cbde659383c8931525a51cf5066092826082ae.
---
 .../sentdetect/DefaultEndOfSentenceScanner.java    | 26 +++-------
 .../sentdetect/DefaultSDContextGenerator.java      | 26 +++++-----
 .../tools/sentdetect/EndOfSentenceScanner.java     |  8 ---
 .../sentdetect/DefaultSDContextGeneratorTest.java  | 59 ----------------------
 4 files changed, 22 insertions(+), 97 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
index 2b8c0be..75d0ec0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
@@ -19,9 +19,7 @@
 package opennlp.tools.sentdetect;
 
 import java.util.ArrayList;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Set;
 
 /**
  * Default implementation of the {@link EndOfSentenceScanner}.
@@ -30,9 +28,7 @@ import java.util.Set;
  */
 public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
 
-  private Set<Character> eosCharacters;
-  @Deprecated
-  private char[] eosChars;
+  private char[] eosCharacters;
 
   /**
    * Initializes the current instance.
@@ -40,11 +36,7 @@ public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
    * @param eosCharacters
    */
   public DefaultEndOfSentenceScanner(char[] eosCharacters) {
-    this.eosCharacters = new HashSet<>();
-    for (char eosChar: eosCharacters) {
-      this.eosCharacters.add(eosChar);
-    }
-    this.eosChars = eosCharacters;
+    this.eosCharacters = eosCharacters;
   }
 
   public List<Integer> getPositions(String s) {
@@ -57,21 +49,19 @@ public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
 
   public List<Integer> getPositions(char[] cbuf) {
     List<Integer> l = new ArrayList<>();
+    char[] eosCharacters = getEndOfSentenceCharacters();
     for (int i = 0; i < cbuf.length; i++) {
-      if (eosCharacters.contains(cbuf[i])) {
-        l.add(i);
+      for (char eosCharacter : eosCharacters) {
+        if (cbuf[i] == eosCharacter) {
+          l.add(i);
+          break;
+        }
       }
     }
     return l;
   }
 
-  @Deprecated
   public char[] getEndOfSentenceCharacters() {
-    return eosChars;
-  }
-
-  @Override
-  public Set<Character> getEOSCharacters() {
     return eosCharacters;
   }
 }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
index 8c2822b..a29119b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
@@ -19,7 +19,6 @@ package opennlp.tools.sentdetect;
 
 import java.util.ArrayList;
 import java.util.Collections;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
 
@@ -43,7 +42,7 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
 
   private Set<String> inducedAbbreviations;
 
-  private Set<Character> eosCharacters;
+  private char[] eosCharacters;
 
   /**
    * Creates a new <code>SDContextGenerator</code> instance with
@@ -67,10 +66,7 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
    */
   public DefaultSDContextGenerator(Set<String> inducedAbbreviations, char[] eosCharacters) {
     this.inducedAbbreviations = inducedAbbreviations;
-    this.eosCharacters = new HashSet<>();
-    for (char eosChar: eosCharacters) {
-      this.eosCharacters.add(eosChar);
-    }
+    this.eosCharacters = eosCharacters;
     buf = new StringBuffer();
     collectFeats = new ArrayList<>();
   }
@@ -125,9 +121,12 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
     int c = position;
     { ///assign prefix, stop if you run into a period though otherwise stop at space
       while (--c > prefixStart) {
-        if (eosCharacters.contains(sb.charAt(c))) {
-          prefixStart = c;
-          c++; // this gets us out of while loop.
+        for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) {
+          if (sb.charAt(c) == eosCharacters[eci]) {
+            prefixStart = c;
+            c++; // this gets us out of while loop.
+            break;
+          }
         }
       }
       prefix = String.valueOf(sb.subSequence(prefixStart, position)).trim();
@@ -139,9 +138,12 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
     {
       c = position;
       while (++c < suffixEnd) {
-        if (eosCharacters.contains(sb.charAt(c))) {
-          suffixEnd = c;
-          c--; // this gets us out of while loop.
+        for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) {
+          if (sb.charAt(c) == eosCharacters[eci]) {
+            suffixEnd = c;
+            c--; // this gets us out of while loop.
+            break;
+          }
         }
       }
     }
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
index 7963e37..b48ad3f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
@@ -18,7 +18,6 @@
 package opennlp.tools.sentdetect;
 
 import java.util.List;
-import java.util.Set;
 
 /**
  * Scans Strings, StringBuffers, and char[] arrays for the offsets of
@@ -35,16 +34,9 @@ public interface EndOfSentenceScanner {
    * Returns an array of character which can indicate the end of a sentence.
    * @return an array of character which can indicate the end of a sentence.
    */
-  @Deprecated
   char[] getEndOfSentenceCharacters();
 
   /**
-   * Returns a set of character which can indicate the end of a sentence.
-   * @return a set of character which can indicate the end of a sentence.
-   */
-  Set<Character> getEOSCharacters();
-
-  /**
    * The receiver scans the specified string for sentence ending characters and
    * returns their offsets.
    *
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java
deleted file mode 100644
index f010498..0000000
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.sentdetect;
-
-import java.util.Arrays;
-import java.util.Collections;
-import java.util.HashSet;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-import opennlp.tools.sentdetect.lang.Factory;
-
-public class DefaultSDContextGeneratorTest {
-
-  @Test
-  public void testGetContext() throws Exception {
-    SDContextGenerator sdContextGenerator =
-        new DefaultSDContextGenerator(Collections.<String>emptySet(), Factory.defaultEosCharacters);
-
-    String[] context = sdContextGenerator.getContext(
-        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 2);
-    Assert.assertArrayEquals("sn/eos=./x=Mr/2/xcap/v=/s=/n=Smith/ncap".split("/"), context);
-
-    context = sdContextGenerator.getContext(
-        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 29);
-    Assert.assertArrayEquals("sn/eos=./x=Inc/3/xcap/v=RONDHUIT/vcap/s=/n=as".split("/"), context);
-  }
-
-  @Test
-  public void testGetContextWithAbbreviations() throws Exception {
-    SDContextGenerator sdContextGenerator =
-        new DefaultSDContextGenerator(new HashSet<>(Arrays.asList("Mr./Inc.".split("/"))),
-            Factory.defaultEosCharacters);
-
-    String[] context = sdContextGenerator.getContext(
-        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 2);
-    Assert.assertArrayEquals("sn/eos=./x=Mr/2/xcap/xabbrev/v=/s=/n=Smith/ncap".split("/"), context);
-
-    context = sdContextGenerator.getContext(
-        "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 29);
-    Assert.assertArrayEquals("sn/eos=./x=Inc/3/xcap/xabbrev/v=RONDHUIT/vcap/s=/n=as".split("/"), context);
-  }
-}