You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2018/04/30 15:12:49 UTC
[opennlp] branch master updated: OPENNLP-1194: Adds type name filter to BratDocumentParser
This is an automated email from the ASF dual-hosted git repository.
colen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new ce8d0f7 OPENNLP-1194: Adds type name filter to BratDocumentParser
ce8d0f7 is described below
commit ce8d0f7a47f7caba8ab937389c59b68b20f6994a
Author: William D C M SILVA <co...@apache.org>
AuthorDate: Fri Apr 20 14:07:29 2018 -0300
OPENNLP-1194: Adds type name filter to BratDocumentParser
---
.../tools/formats/brat/BratDocumentParser.java | 21 ++++-
.../tools/formats/brat/BratNameSampleStream.java | 52 +++++++++++-
.../formats/brat/BratNameSampleStreamFactory.java | 16 +++-
.../formats/brat/BratNameSampleStreamTest.java | 98 ++++++++++++++++++++++
.../formats/brat/voa-with-entities-overlapping.ann | 21 +++++
.../formats/brat/voa-with-entities-overlapping.txt | 8 ++
6 files changed, 209 insertions(+), 7 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
index 24ba887..a3899ff 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
@@ -34,10 +34,20 @@ public class BratDocumentParser {
private SentenceDetector sentDetector;
private Tokenizer tokenizer;
+ private final Set<String> nameTypes;
+ /**
+ * Creates a parser without a name type filter: delegates to the three-argument
+ * constructor with {@code null} as the name type set.
+ *
+ * @param sentenceDetector the sentence detector stored by this parser
+ * @param tokenizer the tokenizer stored by this parser
+ */
public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer) {
+ this(sentenceDetector, tokenizer, null);
+ }
+
+  /**
+   * Creates a parser that only treats span annotations of the given types as names.
+   *
+   * @param sentenceDetector the sentence detector stored by this parser
+   * @param tokenizer the tokenizer stored by this parser
+   * @param nameTypes the annotation types to keep, or {@code null} to keep all types
+   * @throws IllegalArgumentException if {@code nameTypes} is non-null but empty
+   */
+  public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer,
+                            Set<String> nameTypes) {
+    // An empty filter would reject every annotation, so it is refused up front.
+    boolean emptyFilter = nameTypes != null && nameTypes.isEmpty();
+    if (emptyFilter) {
+      throw new IllegalArgumentException("nameTypes should be null or have one or more elements");
+    }
     this.sentDetector = sentenceDetector;
     this.tokenizer = tokenizer;
+    this.nameTypes = nameTypes;
   }
public List<NameSample> parse(BratDocument sample) {
@@ -49,7 +59,7 @@ public class BratDocumentParser {
Map<Integer, Span> coveredIndexes = new HashMap<>();
for (BratAnnotation ann : sample.getAnnotations()) {
- if (ann instanceof SpanAnnotation) {
+ if (isSpanAnnotation(ann)) {
entityIdSet.add(ann.getId());
Span span = ((SpanAnnotation) ann).getSpan();
@@ -109,7 +119,7 @@ public class BratDocumentParser {
for (BratAnnotation ann : sample.getAnnotations()) {
- if (ann instanceof SpanAnnotation) {
+ if (isSpanAnnotation(ann)) {
SpanAnnotation entity = (SpanAnnotation) ann;
Span entitySpan = entity.getSpan();
@@ -145,5 +155,12 @@ public class BratDocumentParser {
return samples;
}
+
+  /**
+   * Checks whether an annotation should be handled as a name span.
+   *
+   * @param ann the annotation to check
+   * @return {@code true} if {@code ann} is a {@link SpanAnnotation} and either no name
+   *     type filter is set or the filter contains the annotation's type
+   */
+  private boolean isSpanAnnotation(BratAnnotation ann) {
+    if (!(ann instanceof SpanAnnotation)) {
+      return false;
+    }
+    // A null filter means every span annotation type is accepted.
+    return nameTypes == null || nameTypes.contains(ann.getType());
+  }
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
index cc066ad..d799b4c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
@@ -19,6 +19,7 @@ package opennlp.tools.formats.brat;
import java.io.IOException;
import java.util.List;
+import java.util.Set;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.sentdetect.SentenceDetector;
@@ -36,19 +37,62 @@ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, Na
private final BratDocumentParser parser;
+ /**
+ * Creates a new {@link BratNameSampleStream}.
+ * @param sentDetector a {@link SentenceDetector} instance
+ * @param tokenizer a {@link Tokenizer} instance
+ * @param samples a {@link BratDocument} {@link ObjectStream}
+ */
public BratNameSampleStream(SentenceDetector sentDetector,
- Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
+ Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
super(samples);
- this.parser = new BratDocumentParser(sentDetector, tokenizer);
+ this.parser = new BratDocumentParser(sentDetector, tokenizer, null);
}
+ /**
+ * Creates a new {@link BratNameSampleStream}.
+ * @param sentModel a {@link SentenceModel} model
+ * @param tokenModel a {@link TokenizerModel} model
+ * @param samples a {@link BratDocument} {@link ObjectStream}
+ */
public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
- ObjectStream<BratDocument> samples) {
+ ObjectStream<BratDocument> samples) {
super(samples);
// TODO: We can pass in custom validators here ...
- this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel));
+ this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
+ new TokenizerME(tokenModel), null);
+ }
+
+ /**
+ * Creates a new {@link BratNameSampleStream} that passes a name type filter to
+ * its {@link BratDocumentParser}.
+ * @param sentDetector a {@link SentenceDetector} instance
+ * @param tokenizer a {@link Tokenizer} instance
+ * @param samples a {@link BratDocument} {@link ObjectStream}
+ * @param nameTypes the name types to use, or {@code null} to use all name types
+ * @throws IllegalArgumentException if {@code nameTypes} is non-null but empty
+ */
+ public BratNameSampleStream(SentenceDetector sentDetector,
+ Tokenizer tokenizer, ObjectStream<BratDocument> samples, Set<String> nameTypes) {
+ super(samples);
+
+ this.parser = new BratDocumentParser(sentDetector, tokenizer, nameTypes);
+ }
+
+ /**
+ * Creates a new {@link BratNameSampleStream} that builds its sentence detector and
+ * tokenizer from the given models and passes a name type filter to its
+ * {@link BratDocumentParser}.
+ * @param sentModel a {@link SentenceModel} model
+ * @param tokenModel a {@link TokenizerModel} model
+ * @param samples a {@link BratDocument} {@link ObjectStream}
+ * @param nameTypes the name types to use, or {@code null} to use all name types
+ * @throws IllegalArgumentException if {@code nameTypes} is non-null but empty
+ */
+ public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
+ ObjectStream<BratDocument> samples, Set<String> nameTypes) {
+ super(samples);
+
+ // TODO: We can pass in custom validators here ...
+ this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
+ new TokenizerME(tokenModel), nameTypes);
@Override
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
index a94f5c9..87a0c61 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
@@ -19,6 +19,9 @@ package opennlp.tools.formats.brat;
import java.io.File;
import java.io.IOException;
+import java.util.Arrays;
+import java.util.Set;
+import java.util.stream.Collectors;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
@@ -63,6 +66,9 @@ public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<Nam
@OptionalParameter(defaultValue = "false")
Boolean getRecursive();
+ @ParameterDescription(valueName = "names")
+ @OptionalParameter
+ String getNameTypes();
}
protected BratNameSampleStreamFactory() {
@@ -148,7 +154,15 @@ public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<Nam
}
}
- return new BratNameSampleStream(sentDetector, tokenizer, samples);
+    // Parse the optional comma-separated list of name types into a filter set.
+    // null means "no filter": every annotation type is used.
+    Set<String> nameTypes = null;
+    String nameTypesParam = params.getNameTypes();
+    if (nameTypesParam != null) {
+      // Drop blank entries (e.g. from "Person,,Location", " " or an empty argument) so
+      // the filter never contains "", which would silently match no annotation type.
+      Set<String> parsedTypes = Arrays.stream(nameTypesParam.split(","))
+          .map(String::trim)
+          .filter(type -> !type.isEmpty())
+          .collect(Collectors.toSet());
+      // An argument without any usable type is treated like an absent one, which also
+      // avoids handing an empty set to BratNameSampleStream.
+      if (!parsedTypes.isEmpty()) {
+        nameTypes = parsedTypes;
+      }
+    }
+
+ return new BratNameSampleStream(sentDetector, tokenizer, samples, nameTypes);
}
public static void registerFactory() {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java
new file mode 100644
index 0000000..0f13682
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Tests {@link BratNameSampleStream} with and without a name type filter, using the
+ * brat documents found under the {@code /opennlp/tools/formats/brat/} test resources.
+ */
+public class BratNameSampleStreamTest {
+
+  /**
+   * Builds a name sample stream over the brat test documents.
+   *
+   * @param nameContainsFilter only files whose name contains this text are read
+   * @param nameTypes the name types handed to the stream, or {@code null} for no filter
+   * @return the stream; the caller is responsible for closing it
+   */
+  private BratNameSampleStream createNameSampleWith(String nameContainsFilter,
+      Set<String> nameTypes) throws IOException {
+    Map<String, String> typeToClassMap = new HashMap<>();
+    BratAnnotationStreamTest.addEntityTypes(typeToClassMap);
+    AnnotationConfiguration config = new AnnotationConfiguration(typeToClassMap);
+
+    File dir = new File(this.getClass().getResource("/opennlp/tools/formats/brat/").getFile());
+    FileFilter fileFilter = pathname -> pathname.getName().contains(nameContainsFilter);
+
+    ObjectStream<BratDocument> bratDocumentStream = new BratDocumentStream(config, dir,
+        false, fileFilter);
+
+    return new BratNameSampleStream(new NewlineSentenceDetector(),
+        WhitespaceTokenizer.INSTANCE, bratDocumentStream, nameTypes);
+  }
+
+  /**
+   * Reads the stream until it returns {@code null} and counts the samples read.
+   *
+   * @param stream the stream to drain
+   * @return the number of non-null samples returned by {@code read()}
+   */
+  private static int countSamples(BratNameSampleStream stream) throws IOException {
+    int count = 0;
+    NameSample sample = stream.read();
+    while (sample != null) {
+      count++;
+      sample = stream.read();
+    }
+    return count;
+  }
+
+  @Test
+  public void readNoOverlap() throws IOException {
+    // try-with-resources closes the stream even when the assertion fails.
+    try (BratNameSampleStream stream = createNameSampleWith("-entities.", null)) {
+      Assert.assertEquals(8, countSamples(stream));
+    }
+  }
+
+  @Test(expected = RuntimeException.class)
+  public void readOverlapFail() throws IOException {
+    // Without a filter, reading the overlapping document is expected to throw.
+    try (BratNameSampleStream stream = createNameSampleWith("overlapping", null)) {
+      countSamples(stream);
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void emptySample() throws IOException {
+    // An empty name type set must be rejected at construction time.
+    createNameSampleWith("overlapping",
+        Collections.emptySet());
+  }
+
+  @Test
+  public void readOverlapFilter() throws IOException {
+    // Restricting to "Person" is expected to make the overlapping document readable.
+    try (BratNameSampleStream stream = createNameSampleWith("overlapping",
+        Collections.singleton("Person"))) {
+      Assert.assertEquals(8, countSamples(stream));
+    }
+  }
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann
new file mode 100644
index 0000000..b16f176
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann
@@ -0,0 +1,21 @@
+T1 Person 281 286 Obama
+T2 Person 21 33 Barack Obama
+T3 Location 51 62 South Korea
+T4 Location 151 162 North Korea
+T5 Location 231 236 China
+T6 Location 243 254 South Korea
+T7 Location 322 333 North Korea
+T8 Date 257 266 Wednesday
+T9 Location 386 397 North Korea
+T10 Person 586 591 Obama
+T11 Date 843 860 Wednesday evening
+T12 Location 889 901 South Korean
+T13 Person 913 928 Lee Myung - bak
+T14 Date 931 939 Thursday
+T15 Location 978 989 South Korea
+T16 Location 1000 1013 United States
+T17 Person 1121 1126 Obama
+T18 Location 1168 1177 Pyongyang
+T19 Person 1168 1177 Pyongyang
+#1 AnnotatorNotes T2 President Obama was the 44th U.S. president
+#2 AnnotatorNotes T3 The capital of South Korea is Seoul
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt
new file mode 100644
index 0000000..9b2d544
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt
@@ -0,0 +1,8 @@
+ U . S . President Barack Obama has arrived in South Korea , where he is expected to show solidarity with the country ' s president in demanding North Korea move toward ending its nuclear weapons programs .
+As he departed China for South Korea Wednesday , President Obama took another opportunity to urge North Korea to reach an agreement on its nuclear weapons .
+" North Korea has a choice .
+It can continue down the path of confrontation and provocation that has led to less security , less prosperity and more isolation from the global community , " President Obama said .
+" Or it can choose to become a full member of the international community , which will give a better life to its people by living up to international obligations and foregoing nuclear weapons . "
+The president landed at a U . S . air base Wednesday evening , and is to hold talks with South Korean President Lee Myung - bak Thursday here in the South Korean capital .
+ South Korea and the United States are trying to coax the North back to six - nation talks aimed at ending its nuclear weapons .
+President Obama has indicated he will send an envoy to Pyongyang before the end of the year for one - on - one discussions , but only in the context of restarting the multinational process .
--
To stop receiving notification emails like this one, please contact
colen@apache.org.