You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2018/04/30 13:38:32 UTC

[opennlp] branch OPENNLP-1194 updated (223c1a9 -> 1b1f4ea)

This is an automated email from the ASF dual-hosted git repository.

colen pushed a change to branch OPENNLP-1194
in repository https://gitbox.apache.org/repos/asf/opennlp.git.


    omit 223c1a9  OPENNLP-1194: Adds type name filter to BratDocumentParser
     new 1b1f4ea  OPENNLP-1194: Adds type name filter to BratDocumentParser

This update added new revisions after undoing existing revisions.
That is to say, some revisions that were in the old version of the
branch are not in the new version.  This situation occurs
when a user --force pushes a change and generates a repository
containing something like this:

 * -- * -- B -- O -- O -- O   (223c1a9)
            \
             N -- N -- N   refs/heads/OPENNLP-1194 (1b1f4ea)

You should already have received notification emails for all of the O
revisions, and so the following emails describe only the N revisions
from the common base, B.

Any revisions marked "omit" are not gone; other references still
refer to them.  Any revisions marked "discard" are gone forever.

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 .../tools/formats/brat/BratDocumentParser.java     | 10 ++++--
 .../tools/formats/brat/BratNameSampleStream.java   | 42 ++++++++++++++++++++++
 .../formats/brat/BratNameSampleStreamFactory.java  |  3 +-
 3 files changed, 52 insertions(+), 3 deletions(-)

-- 
To stop receiving notification emails like this one, please contact
colen@apache.org.

[opennlp] 01/01: OPENNLP-1194: Adds type name filter to BratDocumentParser

Posted by co...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

colen pushed a commit to branch OPENNLP-1194
in repository https://gitbox.apache.org/repos/asf/opennlp.git

commit 1b1f4ea121e16693e5a3c4b4026fe5e3b424de0b
Author: William D C M SILVA <co...@apache.org>
AuthorDate: Fri Apr 20 14:07:29 2018 -0300

    OPENNLP-1194: Adds type name filter to BratDocumentParser
---
 .../tools/formats/brat/BratDocumentParser.java     | 21 ++++-
 .../tools/formats/brat/BratNameSampleStream.java   | 52 +++++++++++-
 .../formats/brat/BratNameSampleStreamFactory.java  | 17 +++-
 .../formats/brat/BratNameSampleStreamTest.java     | 92 ++++++++++++++++++++++
 .../formats/brat/voa-with-entities-overlapping.ann | 21 +++++
 .../formats/brat/voa-with-entities-overlapping.txt |  8 ++
 6 files changed, 204 insertions(+), 7 deletions(-)

diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
index 24ba887..a3899ff 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
@@ -34,10 +34,20 @@ public class BratDocumentParser {
 
   private SentenceDetector sentDetector;
   private Tokenizer tokenizer;
+  private final Set<String> nameTypes;
 
   public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer) {
+    this(sentenceDetector, tokenizer, null);
+  }
+
+  public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer,
+                            Set<String> nameTypes) {
+    if (nameTypes != null && nameTypes.size() == 0) {
+      throw new IllegalArgumentException("nameTypes should be null or have one or more elements");
+    }
     this.sentDetector = sentenceDetector;
     this.tokenizer = tokenizer;
+    this.nameTypes = nameTypes;
   }
 
   public List<NameSample> parse(BratDocument sample) {
@@ -49,7 +59,7 @@ public class BratDocumentParser {
     Map<Integer, Span> coveredIndexes = new HashMap<>();
 
     for (BratAnnotation ann : sample.getAnnotations()) {
-      if (ann instanceof SpanAnnotation) {
+      if (isSpanAnnotation(ann)) {
         entityIdSet.add(ann.getId());
 
         Span span = ((SpanAnnotation) ann).getSpan();
@@ -109,7 +119,7 @@ public class BratDocumentParser {
 
       for (BratAnnotation ann : sample.getAnnotations()) {
 
-        if (ann instanceof SpanAnnotation) {
+        if (isSpanAnnotation(ann)) {
           SpanAnnotation entity = (SpanAnnotation) ann;
 
           Span entitySpan = entity.getSpan();
@@ -145,5 +155,12 @@ public class BratDocumentParser {
 
     return samples;
   }
+
+  private boolean isSpanAnnotation(BratAnnotation ann) {
+    if (ann instanceof SpanAnnotation && (nameTypes == null || nameTypes.contains(ann.getType()))) {
+      return true;
+    }
+    return false;
+  }
 }
 
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
index cc066ad..d799b4c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
@@ -19,6 +19,7 @@ package opennlp.tools.formats.brat;
 
 import java.io.IOException;
 import java.util.List;
+import java.util.Set;
 
 import opennlp.tools.namefind.NameSample;
 import opennlp.tools.sentdetect.SentenceDetector;
@@ -36,19 +37,62 @@ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, Na
 
   private final BratDocumentParser parser;
 
+  /**
+   * Creates a new {@link BratNameSampleStream}.
+   * @param sentDetector a {@link SentenceDetector} instance
+   * @param tokenizer a {@link Tokenizer} instance
+   * @param samples a {@link BratDocument} {@link ObjectStream}
+   */
   public BratNameSampleStream(SentenceDetector sentDetector,
-      Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
+                              Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
     super(samples);
 
-    this.parser = new BratDocumentParser(sentDetector, tokenizer);
+    this.parser = new BratDocumentParser(sentDetector, tokenizer, null);
   }
 
+  /**
+   * Creates a new {@link BratNameSampleStream}.
+   * @param sentModel a {@link SentenceModel} model
+   * @param tokenModel a {@link TokenizerModel} model
+   * @param samples a {@link BratDocument} {@link ObjectStream}
+   */
   public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
-      ObjectStream<BratDocument> samples) {
+                              ObjectStream<BratDocument> samples) {
     super(samples);
 
     // TODO: We can pass in custom validators here ...
-    this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel)); 
+    this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
+        new TokenizerME(tokenModel), null);
+  }
+
+  /**
+   * Creates a new {@link BratNameSampleStream}.
+   * @param sentDetector a {@link SentenceDetector} instance
+   * @param tokenizer a {@link Tokenizer} instance
+   * @param samples a {@link BratDocument} {@link ObjectStream}
+   * @param nameTypes the name types to use, or null to use all name types
+   */
+  public BratNameSampleStream(SentenceDetector sentDetector,
+      Tokenizer tokenizer, ObjectStream<BratDocument> samples, Set<String> nameTypes) {
+    super(samples);
+
+    this.parser = new BratDocumentParser(sentDetector, tokenizer, nameTypes);
+  }
+
+  /**
+   * Creates a new {@link BratNameSampleStream}.
+   * @param sentModel a {@link SentenceModel} model
+   * @param tokenModel a {@link TokenizerModel} model
+   * @param samples a {@link BratDocument} {@link ObjectStream}
+   * @param nameTypes the name types to use, or null to use all name types
+   */
+  public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
+      ObjectStream<BratDocument> samples, Set<String> nameTypes) {
+    super(samples);
+
+    // TODO: We can pass in custom validators here ...
+    this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel),
+        new TokenizerME(tokenModel), nameTypes);
   }
 
   @Override
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
index a94f5c9..aa3a173 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
@@ -19,6 +19,10 @@ package opennlp.tools.formats.brat;
 
 import java.io.File;
 import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+import java.util.stream.Collectors;
 
 import opennlp.tools.cmdline.ArgumentParser;
 import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
@@ -63,6 +67,9 @@ public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<Nam
     @OptionalParameter(defaultValue = "false")
     Boolean getRecursive();
 
+    @ParameterDescription(valueName = "names")
+    @OptionalParameter
+    String getNameTypes();
   }
 
   protected BratNameSampleStreamFactory() {
@@ -148,7 +155,15 @@ public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<Nam
       }
     }
 
-    return new BratNameSampleStream(sentDetector, tokenizer, samples);
+    Set<String> nameTypes = null;
+    if (params.getNameTypes() != null) {
+      String[] nameTypesArr = params.getNameTypes().split(",");
+      if (nameTypesArr != null && nameTypesArr.length > 0) {
+        nameTypes = Arrays.stream(nameTypesArr).map(String::trim).collect(Collectors.toSet());
+      }
+    }
+
+    return new BratNameSampleStream(sentDetector, tokenizer, samples, nameTypes);
   }
 
   public static void registerFactory() {
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java
new file mode 100644
index 0000000..1aa56dc
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/brat/BratNameSampleStreamTest.java
@@ -0,0 +1,92 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.ObjectStream;
+
+public class BratNameSampleStreamTest {
+
+  private BratNameSampleStream createNameSampleWith(String nameContainsFilter,
+                                                    Set<String> nameTypes) throws IOException {
+    Map<String, String> typeToClassMap = new HashMap<>();
+    BratAnnotationStreamTest.addEntityTypes(typeToClassMap);
+    AnnotationConfiguration config = new AnnotationConfiguration(typeToClassMap);
+
+    File dir = new File(this.getClass().getResource("/opennlp/tools/formats/brat/").getFile());
+    FileFilter fileFilter = pathname -> pathname.getName().contains(nameContainsFilter);
+
+    ObjectStream<BratDocument> bratDocumentStream = new BratDocumentStream(config, dir,
+        false, fileFilter);
+
+    return new BratNameSampleStream(new NewlineSentenceDetector(),
+        WhitespaceTokenizer.INSTANCE, bratDocumentStream, nameTypes);
+  }
+
+  @Test
+  public void readNoOverlap() throws IOException {
+    BratNameSampleStream stream = createNameSampleWith("-entities.",
+        Collections.emptySet());
+    int count = 0;
+    NameSample sample = stream.read();
+    while (sample != null) {
+      count++;
+      sample = stream.read();
+    }
+
+    Assert.assertEquals(8, count);
+  }
+
+  @Test(expected = RuntimeException.class)
+  public void readOverlapFail() throws IOException {
+    BratNameSampleStream stream = createNameSampleWith("overlapping",
+        Collections.emptySet());
+
+    NameSample sample = stream.read();
+    while (sample != null) {
+      sample = stream.read();
+    }
+  }
+
+  @Test
+  public void readOverlapFilter() throws IOException {
+    BratNameSampleStream stream = createNameSampleWith("overlapping",
+        Collections.singleton("Person"));
+    int count = 0;
+    NameSample sample = stream.read();
+    while (sample != null) {
+      count++;
+      sample = stream.read();
+    }
+
+    Assert.assertEquals(8, count);
+  }
+}
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann
new file mode 100644
index 0000000..b16f176
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.ann
@@ -0,0 +1,21 @@
+T1	Person 281 286	Obama
+T2	Person 21 33	Barack Obama
+T3	Location 51 62	South Korea
+T4	Location 151 162	North Korea
+T5	Location 231 236	China
+T6	Location 243 254	South Korea
+T7	Location 322 333	North Korea
+T8	Date 257 266	Wednesday
+T9	Location 386 397	North Korea
+T10	Person 586 591	Obama
+T11	Date 843 860	Wednesday evening
+T12	Location 889 901	South Korean
+T13	Person 913 928	Lee Myung - bak
+T14	Date 931 939	Thursday
+T15	Location 978 989	South Korea
+T16	Location 1000 1013	United States
+T17	Person 1121 1126	Obama
+T18	Location 1168 1177	Pyongyang
+T19	Person 1168 1177	Pyongyang
+#1	AnnotatorNotes	T2	President Obama was the 44th U.S. president
+#2	AnnotatorNotes	T3	The capital of South Korea is Seoul
\ No newline at end of file
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt
new file mode 100644
index 0000000..9b2d544
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brat/voa-with-entities-overlapping.txt
@@ -0,0 +1,8 @@
+ U . S .  President  Barack Obama  has arrived in  South Korea  , where he is expected to show solidarity with the country ' s president in demanding  North Korea  move toward ending its nuclear weapons programs . 
+As he departed  China  for  South Korea   Wednesday  , President  Obama  took another opportunity to urge  North Korea  to reach an agreement on its nuclear weapons . 
+"  North Korea  has a choice . 
+It can continue down the path of confrontation and provocation that has led to less security , less prosperity and more isolation from the global community , " President  Obama  said . 
+" Or it can choose to become a full member of the international community , which will give a better life to its people by living up to international obligations and foregoing nuclear weapons . " 
+The president landed at a  U . S .  air base Wednesday evening , and is to hold talks with South Korean President  Lee Myung - bak   Thursday  here in the South Korean capital . 
+ South Korea  and the  United States  are trying to coax the North back to six - nation talks aimed at ending its nuclear weapons . 
+President  Obama  has indicated he will send an envoy to  Pyongyang  before the end of the year for one - on - one discussions , but only in the context of restarting the multinational process . 

-- 
To stop receiving notification emails like this one, please contact
colen@apache.org.