You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2018/04/30 14:03:40 UTC
[opennlp] branch OPENNLP-1194 updated (1b1f4ea -> 108e975)
This is an automated email from the ASF dual-hosted git repository.
colen pushed a change to branch OPENNLP-1194
in repository https://gitbox.apache.org/repos/asf/opennlp.git.
omit 1b1f4ea OPENNLP-1194: Adds type name filter to BratDocumentParser
new 108e975 OPENNLP-1194: Adds type name filter to BratDocumentParser
This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version. This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:
 * -- * -- B -- O -- O -- O   (1b1f4ea)
            \
             N -- N -- N   refs/heads/OPENNLP-1194 (108e975)
You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.
Any revisions marked "omit" are not gone; other references still
refer to them. Any revisions marked "discard" are gone forever.
The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
.../tools/formats/brat/BratNameSampleStreamFactory.java | 1 -
.../opennlp/tools/formats/brat/BratNameSampleStreamTest.java | 10 ++++++++--
2 files changed, 8 insertions(+), 3 deletions(-)
--
To stop receiving notification emails like this one, please contact
colen@apache.org.
[opennlp] 01/01: OPENNLP-1194: Adds type name filter to BratDocumentParser
Posted by co...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
colen pushed a commit to branch OPENNLP-1194
in repository https://gitbox.apache.org/repos/asf/opennlp.git
commit 108e975dc1b9909ba5355583d0d4f3ca4a9ce2b9
Author: William D C M SILVA <co...@apache.org>
AuthorDate: Fri Apr 20 14:07:29 2018 -0300
OPENNLP-1194: Adds type name filter to BratDocumentParser
---
.../tools/formats/brat/BratDocumentParser.java | 21 ++++-
.../tools/formats/brat/BratNameSampleStream.java | 52 +++++++++++-
.../formats/brat/BratNameSampleStreamFactory.java | 16 +++-
.../formats/brat/BratNameSampleStreamTest.java | 98 ++++++++++++++++++++++
.../formats/brat/voa-with-entities-overlapping.ann | 21 +++++
.../formats/brat/voa-with-entities-overlapping.txt | 8 ++
6 files changed, 209 insertions(+), 7 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
index 24ba887..a3899ff 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
@@ -34,10 +34,20 @@ public class BratDocumentParser {
private SentenceDetector sentDetector;
private Tokenizer tokenizer;
+ private final Set<String> nameTypes;
public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer) {
+ this(sentenceDetector, tokenizer, null);
+ }
+
+ public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer,
+ Set<String> nameTypes) {
+ if (nameTypes != null && nameTypes.size() == 0) {
+ throw new IllegalArgumentException("nameTypes should be null or have one or more elements");
+ }
this.sentDetector = sentenceDetector;
this.tokenizer = tokenizer;
+ this.nameTypes = nameTypes;
}
public List<NameSample> parse(BratDocument sample) {
@@ -49,7 +59,7 @@ public class BratDocumentParser {
Map<Integer, Span> coveredIndexes = new HashMap<>();
for (BratAnnotation ann : sample.getAnnotations()) {
- if (ann instanceof SpanAnnotation) {
+ if (isSpanAnnotation(ann)) {
entityIdSet.add(ann.getId());
Span span = ((SpanAnnotation) ann).getSpan();
@@ -109,7 +119,7 @@ public class BratDocumentParser {
for (BratAnnotation ann : sample.getAnnotations()) {
- if (ann instanceof SpanAnnotation) {
+ if (isSpanAnnotation(ann)) {
SpanAnnotation entity = (SpanAnnotation) ann;
Span entitySpan = entity.getSpan();
@@ -145,5 +155,12 @@ public class BratDocumentParser {
return samples;
}
+
+ private boolean isSpanAnnotation(BratAnnotation ann) {
+ if (ann instanceof SpanAnnotation && (nameTypes == null || nameTypes.contains(ann.getType()))) {
+ return true;
+ }
+ return false;
+ }
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
index cc066ad..d799b4c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
@@ -19,6 +19,7 @@ package opennlp.tools.formats.brat;
import java.io.IOException;
import java.util.List;
+import java.util.Set;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.sentdetect.SentenceDetector;
@@ -36,19 +37,62 @@ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, Na
private final BratDocumentParser parser;
+ /**
+ * Creates a new {@link BratNameSampleStream}.
+ * @param sentDetector a {@link SentenceDetector} instance
+ * @param tokenizer a {@link Tokenizer} instance
+ * @param samples a {@link BratDocument} {@link ObjectStream}
+ */
public BratNameSampleStream(SentenceDetector sentDetector,
- Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
+ Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
super(samples);
- this.parser = new BratDocumentParser(sentDetector, tokenizer);
+ this.parser = new BratDocumentParser(sentDetector, tokenizer, null);
}
+ /**
+ * Creates a new {@link BratNameSampleStream}.
+ * @param sentModel a {@link SentenceModel} model
+ * @param tokenModel a {@link TokenizerModel} model
+ * @param samples a {@link BratDocument} {@link ObjectStream}
+ */
public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
- ObjectStream<BratDocument> samples) {
+ ObjectStream<BratDocument> samples) {
super(samples);
// TODO: We can pass in custom validators here ...
- this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel));
+ this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
+ new TokenizerME(tokenModel), null);
+ }
+
+ /**
+ * Creates a new {@link BratNameSampleStream}.
+ * @param sentDetector a {@link SentenceDetector} instance
+ * @param tokenizer a {@link Tokenizer} instance
+ * @param samples a {@link BratDocument} {@link ObjectStream}
+ * @param nameTypes the name types to use or null if all name types
+ */
+ public BratNameSampleStream(SentenceDetector sentDetector,
+ Tokenizer tokenizer, ObjectStream<BratDocument> samples, Set<String> nameTypes) {
+ super(samples);
+
+ this.parser = new BratDocumentParser(sentDetector, tokenizer, nameTypes);
+ }
+
+ /**
+ * Creates a new {@link BratNameSampleStream}.
+ * @param sentModel a {@link SentenceModel} model
+ * @param tokenModel a {@link TokenizerModel} model
+ * @param samples a {@link BratDocument} {@link ObjectStream}
+ * @param nameTypes the name types to use or null if all name types
+ */
+ public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
+ ObjectStream<BratDocument> samples, Set<String> nameTypes) {
+ super(samples);
+
+ // TODO: We can pass in custom validators here ...
+ this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
+ new TokenizerME(tokenModel), nameTypes);
}
@Override
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
index a94f5c9..338e3cb 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
@@ -19,6 +19,9 @@ package opennlp.tools.formats.brat;
import java.io.File;
import java.io.IOException;
+import java.util.Arrays;
+import java.util.Set;
+import java.util.stream.Collectors;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
@@ -63,6 +66,9 @@ public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<Nam
@OptionalParameter(defaultValue = "false")
Boolean getRecursive();
+ @ParameterDescription(valueName = "names")
+ @OptionalParameter
+ String getNameTypes();
}
protected BratNameSampleStreamFactory() {
@@ -148,7 +154,15 @@ public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<Nam
}
}
- return new BratNameSampleStream(sentDetector, tokenizer, samples);
+ Set<String> nameTypes = null;
+ if (params.getNameTypes() != null) {
+ String[] nameTypesArr = params.getNameTypes().split(",");
+ if (nameTypesArr != null && nameTypesArr.length > 0) {
+ nameTypes = Arrays.stream(nameTypesArr).map(String::trim).collect(Collectors.toSet());
+ }
+ }
+
+ return new BratNameSampleStream(sentDetector, tokenizer, samples, nameTypes);
}
public static void registerFactory() {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java
new file mode 100644
index 0000000..0f13682
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.ObjectStream;
+
+public class BratNameSampleStreamTest {
+
+ private BratNameSampleStream createNameSampleWith(String nameContainsFilter,
+ Set<String> nameTypes) throws IOException {
+ Map<String, String> typeToClassMap = new HashMap<>();
+ BratAnnotationStreamTest.addEntityTypes(typeToClassMap);
+ AnnotationConfiguration config = new AnnotationConfiguration(typeToClassMap);
+
+ File dir = new File(this.getClass().getResource("/opennlp/tools/formats/brat/").getFile());
+ FileFilter fileFilter = pathname -> pathname.getName().contains(nameContainsFilter);
+
+ ObjectStream<BratDocument> bratDocumentStream = new BratDocumentStream(config, dir,
+ false, fileFilter);
+
+ return new BratNameSampleStream(new NewlineSentenceDetector(),
+ WhitespaceTokenizer.INSTANCE, bratDocumentStream, nameTypes);
+ }
+
+ @Test
+ public void readNoOverlap() throws IOException {
+ BratNameSampleStream stream = createNameSampleWith("-entities.",
+ null);
+ int count = 0;
+ NameSample sample = stream.read();
+ while (sample != null) {
+ count++;
+ sample = stream.read();
+ }
+
+ Assert.assertEquals(8, count);
+ }
+
+ @Test(expected = RuntimeException.class)
+ public void readOverlapFail() throws IOException {
+ BratNameSampleStream stream = createNameSampleWith("overlapping",
+ null);
+
+ NameSample sample = stream.read();
+ while (sample != null) {
+ sample = stream.read();
+ }
+ }
+
+ @Test(expected = IllegalArgumentException.class)
+ public void emptySample() throws IOException {
+ createNameSampleWith("overlapping",
+ Collections.emptySet());
+ }
+
+ @Test
+ public void readOverlapFilter() throws IOException {
+ BratNameSampleStream stream = createNameSampleWith("overlapping",
+ Collections.singleton("Person"));
+ int count = 0;
+ NameSample sample = stream.read();
+ while (sample != null) {
+ count++;
+ sample = stream.read();
+ }
+
+ Assert.assertEquals(8, count);
+ }
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann
new file mode 100644
index 0000000..b16f176
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann
@@ -0,0 +1,21 @@
+T1 Person 281 286 Obama
+T2 Person 21 33 Barack Obama
+T3 Location 51 62 South Korea
+T4 Location 151 162 North Korea
+T5 Location 231 236 China
+T6 Location 243 254 South Korea
+T7 Location 322 333 North Korea
+T8 Date 257 266 Wednesday
+T9 Location 386 397 North Korea
+T10 Person 586 591 Obama
+T11 Date 843 860 Wednesday evening
+T12 Location 889 901 South Korean
+T13 Person 913 928 Lee Myung - bak
+T14 Date 931 939 Thursday
+T15 Location 978 989 South Korea
+T16 Location 1000 1013 United States
+T17 Person 1121 1126 Obama
+T18 Location 1168 1177 Pyongyang
+T19 Person 1168 1177 Pyongyang
+#1 AnnotatorNotes T2 President Obama was the 44th U.S. president
+#2 AnnotatorNotes T3 The capital of South Korea is Seoul
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt
new file mode 100644
index 0000000..9b2d544
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt
@@ -0,0 +1,8 @@
+ U . S . President Barack Obama has arrived in South Korea , where he is expected to show solidarity with the country ' s president in demanding North Korea move toward ending its nuclear weapons programs .
+As he departed China for South Korea Wednesday , President Obama took another opportunity to urge North Korea to reach an agreement on its nuclear weapons .
+" North Korea has a choice .
+It can continue down the path of confrontation and provocation that has led to less security , less prosperity and more isolation from the global community , " President Obama said .
+" Or it can choose to become a full member of the international community , which will give a better life to its people by living up to international obligations and foregoing nuclear weapons . "
+The president landed at a U . S . air base Wednesday evening , and is to hold talks with South Korean President Lee Myung - bak Thursday here in the South Korean capital .
+ South Korea and the United States are trying to coax the North back to six - nation talks aimed at ending its nuclear weapons .
+President Obama has indicated he will send an envoy to Pyongyang before the end of the year for one - on - one discussions , but only in the context of restarting the multinational process .
--
To stop receiving notification emails like this one, please contact
colen@apache.org.