You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ko...@apache.org on 2018/10/03 01:02:11 UTC
[opennlp] branch master updated: OPENNLP-1214: use hash to avoid
linear search in DefaultEndOfSentenceScanner and DefaultSDContextGenerator
(#329)
This is an automated email from the ASF dual-hosted git repository.
koji pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 51cbde6 OPENNLP-1214: use hash to avoid linear search in DefaultEndOfSentenceScanner and DefaultSDContextGenerator (#329)
51cbde6 is described below
commit 51cbde659383c8931525a51cf5066092826082ae
Author: Koji Sekiguchi <ko...@rondhuit.com>
AuthorDate: Wed Oct 3 10:02:01 2018 +0900
OPENNLP-1214: use hash to avoid linear search in DefaultEndOfSentenceScanner and DefaultSDContextGenerator (#329)
---
.../sentdetect/DefaultEndOfSentenceScanner.java | 26 +++++++---
.../sentdetect/DefaultSDContextGenerator.java | 26 +++++-----
.../tools/sentdetect/EndOfSentenceScanner.java | 8 +++
.../sentdetect/DefaultSDContextGeneratorTest.java | 59 ++++++++++++++++++++++
4 files changed, 97 insertions(+), 22 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
index 75d0ec0..2b8c0be 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultEndOfSentenceScanner.java
@@ -19,7 +19,9 @@
package opennlp.tools.sentdetect;
import java.util.ArrayList;
+import java.util.HashSet;
import java.util.List;
+import java.util.Set;
/**
* Default implementation of the {@link EndOfSentenceScanner}.
@@ -28,7 +30,9 @@ import java.util.List;
*/
public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
- private char[] eosCharacters;
+ private Set<Character> eosCharacters;
+ @Deprecated
+ private char[] eosChars;
/**
* Initializes the current instance.
@@ -36,7 +40,11 @@ public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
* @param eosCharacters
*/
public DefaultEndOfSentenceScanner(char[] eosCharacters) {
- this.eosCharacters = eosCharacters;
+ this.eosCharacters = new HashSet<>();
+ for (char eosChar: eosCharacters) {
+ this.eosCharacters.add(eosChar);
+ }
+ this.eosChars = eosCharacters;
}
public List<Integer> getPositions(String s) {
@@ -49,19 +57,21 @@ public class DefaultEndOfSentenceScanner implements EndOfSentenceScanner {
public List<Integer> getPositions(char[] cbuf) {
List<Integer> l = new ArrayList<>();
- char[] eosCharacters = getEndOfSentenceCharacters();
for (int i = 0; i < cbuf.length; i++) {
- for (char eosCharacter : eosCharacters) {
- if (cbuf[i] == eosCharacter) {
- l.add(i);
- break;
- }
+ if (eosCharacters.contains(cbuf[i])) {
+ l.add(i);
}
}
return l;
}
+ @Deprecated
public char[] getEndOfSentenceCharacters() {
+ return eosChars;
+ }
+
+ @Override
+ public Set<Character> getEOSCharacters() {
return eosCharacters;
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
index a29119b..8c2822b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/DefaultSDContextGenerator.java
@@ -19,6 +19,7 @@ package opennlp.tools.sentdetect;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.HashSet;
import java.util.List;
import java.util.Set;
@@ -42,7 +43,7 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
private Set<String> inducedAbbreviations;
- private char[] eosCharacters;
+ private Set<Character> eosCharacters;
/**
* Creates a new <code>SDContextGenerator</code> instance with
@@ -66,7 +67,10 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
*/
public DefaultSDContextGenerator(Set<String> inducedAbbreviations, char[] eosCharacters) {
this.inducedAbbreviations = inducedAbbreviations;
- this.eosCharacters = eosCharacters;
+ this.eosCharacters = new HashSet<>();
+ for (char eosChar: eosCharacters) {
+ this.eosCharacters.add(eosChar);
+ }
buf = new StringBuffer();
collectFeats = new ArrayList<>();
}
@@ -121,12 +125,9 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
int c = position;
{ ///assign prefix, stop if you run into a period though otherwise stop at space
while (--c > prefixStart) {
- for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) {
- if (sb.charAt(c) == eosCharacters[eci]) {
- prefixStart = c;
- c++; // this gets us out of while loop.
- break;
- }
+ if (eosCharacters.contains(sb.charAt(c))) {
+ prefixStart = c;
+ c++; // this gets us out of while loop.
}
}
prefix = String.valueOf(sb.subSequence(prefixStart, position)).trim();
@@ -138,12 +139,9 @@ public class DefaultSDContextGenerator implements SDContextGenerator {
{
c = position;
while (++c < suffixEnd) {
- for (int eci = 0, ecl = eosCharacters.length; eci < ecl; eci++) {
- if (sb.charAt(c) == eosCharacters[eci]) {
- suffixEnd = c;
- c--; // this gets us out of while loop.
- break;
- }
+ if (eosCharacters.contains(sb.charAt(c))) {
+ suffixEnd = c;
+ c--; // this gets us out of while loop.
}
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
index b48ad3f..7963e37 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/EndOfSentenceScanner.java
@@ -18,6 +18,7 @@
package opennlp.tools.sentdetect;
import java.util.List;
+import java.util.Set;
/**
* Scans Strings, StringBuffers, and char[] arrays for the offsets of
@@ -34,9 +35,16 @@ public interface EndOfSentenceScanner {
* Returns an array of character which can indicate the end of a sentence.
* @return an array of character which can indicate the end of a sentence.
*/
+ @Deprecated
char[] getEndOfSentenceCharacters();
/**
+ * Returns a set of characters that can indicate the end of a sentence.
+ * @return a set of characters that can indicate the end of a sentence.
+ */
+ Set<Character> getEOSCharacters();
+
+ /**
* The receiver scans the specified string for sentence ending characters and
* returns their offsets.
*
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java
new file mode 100644
index 0000000..f010498
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/DefaultSDContextGeneratorTest.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.sentdetect;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.sentdetect.lang.Factory;
+
+public class DefaultSDContextGeneratorTest {
+
+ @Test
+ public void testGetContext() throws Exception {
+ SDContextGenerator sdContextGenerator =
+ new DefaultSDContextGenerator(Collections.<String>emptySet(), Factory.defaultEosCharacters);
+
+ String[] context = sdContextGenerator.getContext(
+ "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 2);
+ Assert.assertArrayEquals("sn/eos=./x=Mr/2/xcap/v=/s=/n=Smith/ncap".split("/"), context);
+
+ context = sdContextGenerator.getContext(
+ "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 29);
+ Assert.assertArrayEquals("sn/eos=./x=Inc/3/xcap/v=RONDHUIT/vcap/s=/n=as".split("/"), context);
+ }
+
+ @Test
+ public void testGetContextWithAbbreviations() throws Exception {
+ SDContextGenerator sdContextGenerator =
+ new DefaultSDContextGenerator(new HashSet<>(Arrays.asList("Mr./Inc.".split("/"))),
+ Factory.defaultEosCharacters);
+
+ String[] context = sdContextGenerator.getContext(
+ "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 2);
+ Assert.assertArrayEquals("sn/eos=./x=Mr/2/xcap/xabbrev/v=/s=/n=Smith/ncap".split("/"), context);
+
+ context = sdContextGenerator.getContext(
+ "Mr. Smith joined RONDHUIT Inc. as a manager of sales department.", 29);
+ Assert.assertArrayEquals("sn/eos=./x=Inc/3/xcap/xabbrev/v=RONDHUIT/vcap/s=/n=as".split("/"), context);
+ }
+}