You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/06 10:09:58 UTC
[14/21] opennlp git commit: OPENNLP-1079 Added BratDocumentParser.
Closed Annotation stream in BratDocument
OPENNLP-1079 Added BratDocumentParser. Closed Annotation stream in
BratDocument
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/e9728694
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/e9728694
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/e9728694
Branch: refs/heads/LangDetect
Commit: e972869486f85c3424875a443eb04bda2eeb6bd3
Parents: 1aa5432
Author: Daniel Russ <dr...@mail.nih.gov>
Authored: Thu May 25 14:57:27 2017 -0400
Committer: Daniel Russ <dr...@mail.nih.gov>
Committed: Thu May 25 14:59:45 2017 -0400
----------------------------------------------------------------------
.../tools/formats/brat/BratDocument.java | 1 +
.../tools/formats/brat/BratDocumentParser.java | 149 +++++++++++++++++++
.../formats/brat/BratNameSampleStream.java | 120 +--------------
3 files changed, 154 insertions(+), 116 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/e9728694/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
index 1b9aee2..51723be 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
@@ -91,6 +91,7 @@ public class BratDocument {
while ((ann = annStream.read()) != null) {
annotations.add(ann);
}
+ annStream.close();
return new BratDocument(config, id, text.toString(), annotations);
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/e9728694/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
new file mode 100644
index 0000000..24ba887
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+public class BratDocumentParser {
+
+ private SentenceDetector sentDetector;
+ private Tokenizer tokenizer;
+
+ public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer) {
+ this.sentDetector = sentenceDetector;
+ this.tokenizer = tokenizer;
+ }
+
+ public List<NameSample> parse(BratDocument sample) {
+ // Note: Some entities might not match sentence boundaries.
+ // To be able to print a warning, a set of entity ids is maintained
+ // to check whether all entities have been used up after the matching is done.
+
+ Set<String> entityIdSet = new HashSet<>();
+ Map<Integer, Span> coveredIndexes = new HashMap<>();
+
+ for (BratAnnotation ann : sample.getAnnotations()) {
+ if (ann instanceof SpanAnnotation) {
+ entityIdSet.add(ann.getId());
+
+ Span span = ((SpanAnnotation) ann).getSpan();
+ for (int i = span.getStart(); i < span.getEnd(); i++) {
+ coveredIndexes.put(i, span);
+ }
+ }
+ }
+
+ List<Span> sentences = new ArrayList<>();
+ for (Span sentence : sentDetector.sentPosDetect(sample.getText())) {
+ Span conflictingName = coveredIndexes.get(sentence.getStart());
+
+ if (sentences.size() > 0 && conflictingName != null &&
+ conflictingName.getStart() < sentence.getStart()) {
+ Span lastSentence = sentences.remove(sentences.size() - 1); // sentence starts inside an entity: merge it into the previous one
+ sentences.add(new Span(lastSentence.getStart(), sentence.getEnd()));
+
+ System.out.println("Correcting sentence segmentation in document " +
+ sample.getId());
+ }
+ else {
+ sentences.add(sentence);
+ }
+ }
+
+ // TODO: Token breaks should be enforced on name span boundaries, either by
+ // a) just splitting tokens, or
+ // b) implementing a custom token split validator which can be injected into the Tokenizer
+
+ // Currently we are missing all [note left unfinished; presumably: entities not matching token boundaries, which are dropped below]
+
+ List<NameSample> samples = new ArrayList<>(sentences.size());
+
+ for (Span sentence : sentences) {
+
+ String sentenceText = sentence.getCoveredText(
+ sample.getText()).toString();
+
+ Span[] tokens = tokenizer.tokenizePos(sentenceText);
+
+ // Note:
+ // A token begin index and a token end index can be identical, but map to
+ // different tokens. To distinguish between the two, begin indexes are
+ // stored with a negative sign, and end indexes are stored with a positive sign
+ // in the tokenIndexMap.
+ // The tokenIndexMap maps to the sentence-local token index.
+
+ Map<Integer, Integer> tokenIndexMap = new HashMap<>();
+
+ for (int i = 0; i < tokens.length; i++) {
+ tokenIndexMap.put(-(sentence.getStart() + tokens[i].getStart()), i);
+ tokenIndexMap.put(sentence.getStart() + tokens[i].getEnd(), i + 1);
+ }
+
+ List<Span> names = new ArrayList<>();
+
+ for (BratAnnotation ann : sample.getAnnotations()) {
+
+ if (ann instanceof SpanAnnotation) {
+ SpanAnnotation entity = (SpanAnnotation) ann;
+
+ Span entitySpan = entity.getSpan();
+
+ if (sentence.contains(entitySpan)) {
+ entityIdSet.remove(ann.getId()); // matched to a sentence, so no segmentation warning for this id
+
+ entitySpan = entitySpan.trim(sample.getText());
+
+ Integer nameBeginIndex = tokenIndexMap.get(-entitySpan.getStart());
+ Integer nameEndIndex = tokenIndexMap.get(entitySpan.getEnd());
+
+ if (nameBeginIndex != null && nameEndIndex != null) {
+ names.add(new Span(nameBeginIndex, nameEndIndex, entity.getType()));
+ }
+ else {
+ System.err.println("Dropped entity " + entity.getId() + " ("
+ + entitySpan.getCoveredText(sample.getText()) + ") " + " in document "
+ + sample.getId() + ", it is not matching tokenization!");
+ }
+ }
+ }
+ }
+
+ samples.add(new NameSample(sample.getId(), Span.spansToStrings(tokens, sentenceText),
+ names.toArray(new Span[names.size()]), null, samples.size() == 0));
+ }
+
+ for (String id : entityIdSet) {
+ System.err.println("Dropped entity " + id + " in document " +
+ sample.getId() + ", is not matching sentence segmentation!");
+ }
+
+ return samples;
+ }
+}
+
http://git-wip-us.apache.org/repos/asf/opennlp/blob/e9728694/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
index 569f450..cc066ad 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
@@ -18,12 +18,7 @@
package opennlp.tools.formats.brat;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
import java.util.List;
-import java.util.Map;
-import java.util.Set;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.sentdetect.SentenceDetector;
@@ -33,22 +28,19 @@ import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.Span;
/**
* Generates Name Sample objects for a Brat Document object.
*/
public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, NameSample> {
- private SentenceDetector sentDetector;
- private Tokenizer tokenizer;
+ private final BratDocumentParser parser;
public BratNameSampleStream(SentenceDetector sentDetector,
Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
super(samples);
- this.sentDetector = sentDetector;
- this.tokenizer = tokenizer;
+ this.parser = new BratDocumentParser(sentDetector, tokenizer);
}
public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
@@ -56,115 +48,11 @@ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, Na
super(samples);
// TODO: We can pass in custom validators here ...
- this.sentDetector = new SentenceDetectorME(sentModel);
- this.tokenizer = new TokenizerME(tokenModel);
+ this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel));
}
@Override
protected List<NameSample> read(BratDocument sample) throws IOException {
-
- // Note: Some entities might not match sentence boundaries,
- // to be able to print warning a set of entities id must be maintained
- // to check if all entities have been used up after the matching is done
-
- Set<String> entityIdSet = new HashSet<>();
- Map<Integer, Span> coveredIndexes = new HashMap<>();
-
- for (BratAnnotation ann : sample.getAnnotations()) {
- if (ann instanceof SpanAnnotation) {
- entityIdSet.add(ann.getId());
-
- Span span = ((SpanAnnotation) ann).getSpan();
- for (int i = span.getStart(); i < span.getEnd(); i++) {
- coveredIndexes.put(i, span);
- }
- }
- }
-
- List<Span> sentences = new ArrayList<>();
- for (Span sentence : sentDetector.sentPosDetect(sample.getText())) {
- Span conflictingName = coveredIndexes.get(sentence.getStart());
-
- if (sentences.size() > 0 && conflictingName != null &&
- conflictingName.getStart() < sentence.getStart()) {
- Span lastSentence = sentences.remove(sentences.size() - 1);
- sentences.add(new Span(lastSentence.getStart(), sentence.getEnd()));
-
- System.out.println("Correcting sentence segmentation in document " +
- sample.getId());
- }
- else {
- sentences.add(sentence);
- }
- }
-
- // TODO: Token breaks should be enforced on name span boundaries
- // a) Just split tokens
- // b) Implement a custom token split validator which can be injected into the Tokenizer
-
- // Currently we are missing all
-
- List<NameSample> samples = new ArrayList<>(sentences.size());
-
- for (Span sentence : sentences) {
-
- String sentenceText = sentence.getCoveredText(
- sample.getText()).toString();
-
- Span[] tokens = tokenizer.tokenizePos(sentenceText);
-
- // Note:
- // A begin and end token index can be identical, but map to different
- // tokens, to distinguish between between the two begin indexes are
- // stored with a negative sign, and end indexes are stored with a positive sign
- // in the tokenIndexMap.
- // The tokenIndexMap maps to the sentence local token index.
-
- Map<Integer, Integer> tokenIndexMap = new HashMap<>();
-
- for (int i = 0; i < tokens.length; i++) {
- tokenIndexMap.put(-(sentence.getStart() + tokens[i].getStart()), i);
- tokenIndexMap.put(sentence.getStart() + tokens[i].getEnd(), i + 1);
- }
-
- List<Span> names = new ArrayList<>();
-
- for (BratAnnotation ann : sample.getAnnotations()) {
-
- if (ann instanceof SpanAnnotation) {
- SpanAnnotation entity = (SpanAnnotation) ann;
-
- Span entitySpan = entity.getSpan();
-
- if (sentence.contains(entitySpan)) {
- entityIdSet.remove(ann.getId());
-
- entitySpan = entitySpan.trim(sample.getText());
-
- Integer nameBeginIndex = tokenIndexMap.get(-entitySpan.getStart());
- Integer nameEndIndex = tokenIndexMap.get(entitySpan.getEnd());
-
- if (nameBeginIndex != null && nameEndIndex != null) {
- names.add(new Span(nameBeginIndex, nameEndIndex, entity.getType()));
- }
- else {
- System.err.println("Dropped entity " + entity.getId() + " ("
- + entitySpan.getCoveredText(sample.getText()) + ") " + " in document "
- + sample.getId() + ", it is not matching tokenization!");
- }
- }
- }
- }
-
- samples.add(new NameSample(sample.getId(), Span.spansToStrings(tokens, sentenceText),
- names.toArray(new Span[names.size()]), null, samples.size() == 0));
- }
-
- for (String id : entityIdSet) {
- System.err.println("Dropped entity " + id + " in document " +
- sample.getId() + ", is not matching sentence segmentation!");
- }
-
- return samples;
+ return parser.parse(sample);
}
}