You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2018/04/30 15:12:49 UTC

[opennlp] branch master updated: OPENNLP-1194: Adds type name filter to BratDocumentParser

This is an automated email from the ASF dual-hosted git repository.

colen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new ce8d0f7  OPENNLP-1194: Adds type name filter to BratDocumentParser
ce8d0f7 is described below

commit ce8d0f7a47f7caba8ab937389c59b68b20f6994a
Author: William D C M SILVA <co...@apache.org>
AuthorDate: Fri Apr 20 14:07:29 2018 -0300

    OPENNLP-1194: Adds type name filter to BratDocumentParser
---
 .../tools/formats/brat/BratDocumentParser.java     | 21 ++++-
 .../tools/formats/brat/BratNameSampleStream.java   | 52 +++++++++++-
 .../formats/brat/BratNameSampleStreamFactory.java  | 16 +++-
 .../formats/brat/BratNameSampleStreamTest.java     | 98 ++++++++++++++++++++++
 .../formats/brat/voa-with-entities-overlapping.ann | 21 +++++
 .../formats/brat/voa-with-entities-overlapping.txt |  8 ++
 6 files changed, 209 insertions(+), 7 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
index 24ba887..a3899ff 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
@@ -34,10 +34,20 @@ public class BratDocumentParser {
 
   private SentenceDetector sentDetector;
   private Tokenizer tokenizer;
+  private final Set<String> nameTypes;
 
   public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer) {
+    this(sentenceDetector, tokenizer, null);
+  }
+
+  public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer,
+                            Set<String> nameTypes) {
+    if (nameTypes != null && nameTypes.size() == 0) {
+      throw new IllegalArgumentException("nameTypes should be null or have one or more elements");
+    }
     this.sentDetector = sentenceDetector;
     this.tokenizer = tokenizer;
+    this.nameTypes = nameTypes;
   }
 
   public List<NameSample> parse(BratDocument sample) {
@@ -49,7 +59,7 @@ public class BratDocumentParser {
     Map<Integer, Span> coveredIndexes = new HashMap<>();
 
     for (BratAnnotation ann : sample.getAnnotations()) {
-      if (ann instanceof SpanAnnotation) {
+      if (isSpanAnnotation(ann)) {
         entityIdSet.add(ann.getId());
 
         Span span = ((SpanAnnotation) ann).getSpan();
@@ -109,7 +119,7 @@ public class BratDocumentParser {
 
       for (BratAnnotation ann : sample.getAnnotations()) {
 
-        if (ann instanceof SpanAnnotation) {
+        if (isSpanAnnotation(ann)) {
           SpanAnnotation entity = (SpanAnnotation) ann;
 
           Span entitySpan = entity.getSpan();
@@ -145,5 +155,12 @@ public class BratDocumentParser {
 
     return samples;
   }
+
+  private boolean isSpanAnnotation(BratAnnotation ann) {
+    if (ann instanceof SpanAnnotation && (nameTypes == null || nameTypes.contains(ann.getType()))) {
+      return true;
+    }
+    return false;
+  }
 }
 
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
index cc066ad..d799b4c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
@@ -19,6 +19,7 @@ package opennlp.tools.formats.brat;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Set;
 
 import opennlp.tools.namefind.NameSample;
 import opennlp.tools.sentdetect.SentenceDetector;
@@ -36,19 +37,62 @@ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, Na
 
   private final BratDocumentParser parser;
 
+  /**
+   * Creates a new {@link BratNameSampleStream}.
+   * @param sentDetector a {@link SentenceDetector} instance
+   * @param tokenizer a {@link Tokenizer} instance
+   * @param samples a {@link BratDocument} {@link ObjectStream}
+   */
   public BratNameSampleStream(SentenceDetector sentDetector,
-      Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
+                              Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
     super(samples);
 
-    this.parser = new BratDocumentParser(sentDetector, tokenizer);
+    this.parser = new BratDocumentParser(sentDetector, tokenizer, null);
   }
 
+  /**
+   * Creates a new {@link BratNameSampleStream}.
+   * @param sentModel a {@link SentenceModel} model
+   * @param tokenModel a {@link TokenizerModel} model
+   * @param samples a {@link BratDocument} {@link ObjectStream}
+   */
   public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
-      ObjectStream<BratDocument> samples) {
+                              ObjectStream<BratDocument> samples) {
     super(samples);
 
     // TODO: We can pass in custom validators here ...
-    this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel)); 
+    this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
+        new TokenizerME(tokenModel), null);
+  }
+
+  /**
+   * Creates a new {@link BratNameSampleStream}.
+   * @param sentDetector a {@link SentenceDetector} instance
+   * @param tokenizer a {@link Tokenizer} instance
+   * @param samples a {@link BratDocument} {@link ObjectStream}
+   * @param nameTypes the name types to use, or null to use all name types
+   */
+  public BratNameSampleStream(SentenceDetector sentDetector,
+      Tokenizer tokenizer, ObjectStream<BratDocument> samples, Set<String> nameTypes) {
+    super(samples);
+
+    this.parser = new BratDocumentParser(sentDetector, tokenizer, nameTypes);
+  }
+
+  /**
+   * Creates a new {@link BratNameSampleStream}.
+   * @param sentModel a {@link SentenceModel} model
+   * @param tokenModel a {@link TokenizerModel} model
+   * @param samples a {@link BratDocument} {@link ObjectStream}
+   * @param nameTypes the name types to use, or null to use all name types
+   */
+  public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
+      ObjectStream<BratDocument> samples, Set<String> nameTypes) {
+    super(samples);
+
+    // TODO: We can pass in custom validators here ...
+    this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
+        new TokenizerME(tokenModel), nameTypes);
   }
 
   @Override
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
index a94f5c9..87a0c61 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
@@ -19,6 +19,9 @@ package opennlp.tools.formats.brat;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.Set;
+import java.util.stream.Collectors;
 
 import opennlp.tools.cmdline.ArgumentParser;
 import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
@@ -63,6 +66,9 @@ public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<Nam
     @OptionalParameter(defaultValue = "false")
     Boolean getRecursive();
 
+    @ParameterDescription(valueName = "names")
+    @OptionalParameter
+    String getNameTypes();
   }
 
   protected BratNameSampleStreamFactory() {
@@ -148,7 +154,15 @@ public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<Nam
       }
     }
 
-    return new BratNameSampleStream(sentDetector, tokenizer, samples);
+    Set<String> nameTypes = null;
+    if (params.getNameTypes() != null) {
+      String[] nameTypesArr = params.getNameTypes().split(",");
+      if (nameTypesArr.length > 0) {
+        nameTypes = Arrays.stream(nameTypesArr).map(String::trim).collect(Collectors.toSet());
+      }
+    }
+
+    return new BratNameSampleStream(sentDetector, tokenizer, samples, nameTypes);
   }
 
   public static void registerFactory() {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java
new file mode 100644
index 0000000..0f13682
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.ObjectStream;
+
+public class BratNameSampleStreamTest {
+
+  private BratNameSampleStream createNameSampleWith(String nameContainsFilter,
+                                                    Set<String> nameTypes) throws IOException {
+    Map<String, String> typeToClassMap = new HashMap<>();
+    BratAnnotationStreamTest.addEntityTypes(typeToClassMap);
+    AnnotationConfiguration config = new AnnotationConfiguration(typeToClassMap);
+
+    File dir = new File(this.getClass().getResource("/opennlp/tools/formats/brat/").getFile());
+    FileFilter fileFilter = pathname -> pathname.getName().contains(nameContainsFilter);
+
+    ObjectStream<BratDocument> bratDocumentStream = new BratDocumentStream(config, dir,
+        false, fileFilter);
+
+    return new BratNameSampleStream(new NewlineSentenceDetector(),
+        WhitespaceTokenizer.INSTANCE, bratDocumentStream, nameTypes);
+  }
+
+  @Test
+  public void readNoOverlap() throws IOException {
+    BratNameSampleStream stream = createNameSampleWith("-entities.",
+        null);
+    int count = 0;
+    NameSample sample = stream.read();
+    while (sample != null) {
+      count++;
+      sample = stream.read();
+    }
+
+    Assert.assertEquals(8, count);
+  }
+
+  @Test(expected = RuntimeException.class)
+  public void readOverlapFail() throws IOException {
+    BratNameSampleStream stream = createNameSampleWith("overlapping",
+        null);
+
+    NameSample sample = stream.read();
+    while (sample != null) {
+      sample = stream.read();
+    }
+  }
+
+  @Test(expected = IllegalArgumentException.class)
+  public void emptySample() throws IOException {
+    createNameSampleWith("overlapping",
+        Collections.emptySet());
+  }
+
+  @Test
+  public void readOverlapFilter() throws IOException {
+    BratNameSampleStream stream = createNameSampleWith("overlapping",
+        Collections.singleton("Person"));
+    int count = 0;
+    NameSample sample = stream.read();
+    while (sample != null) {
+      count++;
+      sample = stream.read();
+    }
+
+    Assert.assertEquals(8, count);
+  }
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann
new file mode 100644
index 0000000..b16f176
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann
@@ -0,0 +1,21 @@
+T1	Person 281 286	Obama
+T2	Person 21 33	Barack Obama
+T3	Location 51 62	South Korea
+T4	Location 151 162	North Korea
+T5	Location 231 236	China
+T6	Location 243 254	South Korea
+T7	Location 322 333	North Korea
+T8	Date 257 266	Wednesday
+T9	Location 386 397	North Korea
+T10	Person 586 591	Obama
+T11	Date 843 860	Wednesday evening
+T12	Location 889 901	South Korean
+T13	Person 913 928	Lee Myung - bak
+T14	Date 931 939	Thursday
+T15	Location 978 989	South Korea
+T16	Location 1000 1013	United States
+T17	Person 1121 1126	Obama
+T18	Location 1168 1177	Pyongyang
+T19	Person 1168 1177	Pyongyang
+#1	AnnotatorNotes	T2	President Obama was the 44th U.S. president
+#2	AnnotatorNotes	T3	The capital of South Korea is Seoul
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt
new file mode 100644
index 0000000..9b2d544
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt
@@ -0,0 +1,8 @@
+ U . S .  President  Barack Obama  has arrived in  South Korea  , where he is expected to show solidarity with the country ' s president in demanding  North Korea  move toward ending its nuclear weapons programs . 
+As he departed  China  for  South Korea   Wednesday  , President  Obama  took another opportunity to urge  North Korea  to reach an agreement on its nuclear weapons . 
+"  North Korea  has a choice . 
+It can continue down the path of confrontation and provocation that has led to less security , less prosperity and more isolation from the global community , " President  Obama  said . 
+" Or it can choose to become a full member of the international community , which will give a better life to its people by living up to international obligations and foregoing nuclear weapons . " 
+The president landed at a  U . S .  air base Wednesday evening , and is to hold talks with South Korean President  Lee Myung - bak   Thursday  here in the South Korean capital . 
+ South Korea  and the  United States  are trying to coax the North back to six - nation talks aimed at ending its nuclear weapons . 
+President  Obama  has indicated he will send an envoy to  Pyongyang  before the end of the year for one - on - one discussions , but only in the context of restarting the multinational process . 

-- 
To stop receiving notification emails like this one, please contact
colen@apache.org.