You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2013/05/17 14:11:56 UTC
svn commit: r1483759 -
/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/
Author: joern
Date: Fri May 17 12:11:55 2013
New Revision: 1483759
URL: http://svn.apache.org/r1483759
Log:
OPENNLP-560 Initial check in of brat format parsing code for the name finder.
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java (with props)
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java Fri May 17 12:11:55 2013
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+public class AnnotationConfiguration {
+
+ public static final String SPAN_TYPE = "Span";
+ public static final String ENTITY_TYPE = "Entity";
+ public static final String RELATION_TYPE = "Relation";
+
+ private final Map<String, String> typeToClassMap;
+
+ public AnnotationConfiguration(Map<String, String> typeToClassMap) {
+
+ this.typeToClassMap = Collections.unmodifiableMap(
+ new HashMap<String, String>(typeToClassMap));
+ }
+
+ public String getTypeClass(String type) {
+ return typeToClassMap.get(type);
+ }
+
+ // TODO: Add a parser for the brat configuration file!
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java Fri May 17 12:11:55 2013
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+public abstract class BratAnnotation {
+
+ private final String id;
+ private final String type;
+
+ protected BratAnnotation(String id, String type) {
+ this.id = id;
+ this.type =type;
+ }
+
+ public String getId() {
+ return id;
+ }
+
+ public String getType() {
+ return type;
+ }
+
+ @Override
+ public String toString() {
+ return id + " " + type;
+ }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java Fri May 17 12:11:55 2013
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Reads the annotations from the brat .ann annotation file.
+ * <p>
+ * Every line is split on whitespace. The second token is the brat type, which the
+ * {@link AnnotationConfiguration} maps to a type class; the type class selects the
+ * parser that turns the line into a {@link BratAnnotation}.
+ */
+public class BratAnnotationStream implements ObjectStream<BratAnnotation> {
+
+  /**
+   * Base class of the line parsers, one parser is registered per type class.
+   */
+  static abstract class BratAnnotationParser {
+
+    static final int ID_OFFSET = 0;
+    static final int TYPE_OFFSET = 1;
+
+    /**
+     * Converts the whitespace separated fields of one line into an annotation.
+     *
+     * @param values the fields of the line
+     * @return the parsed annotation
+     * @throws IOException if the fields cannot be parsed
+     */
+    abstract BratAnnotation parse(String values[]) throws IOException;
+
+    /**
+     * Parses an integer field and reports a malformed number as format error.
+     *
+     * @param intString the field to parse
+     * @return the parsed value
+     * @throws InvalidFormatException if the field is not a valid integer
+     */
+    protected int parseInt(String intString) throws InvalidFormatException {
+      try {
+        return Integer.parseInt(intString);
+      }
+      catch (NumberFormatException e) {
+        throw new InvalidFormatException(e);
+      }
+    }
+  }
+
+  /**
+   * Parses lines which describe a text span: id, type, begin offset, end offset
+   * and at least one more field.
+   */
+  static class SpanAnnotationParser extends BratAnnotationParser {
+
+    private static final int BEGIN_OFFSET = 2;
+    private static final int END_OFFSET = 3;
+
+    @Override
+    BratAnnotation parse(String[] values) throws IOException {
+
+      if (values.length <= 4) {
+        throw new InvalidFormatException("Line must have at least 5 fields");
+      }
+
+      String type = values[BratAnnotationParser.TYPE_OFFSET];
+
+      // Tokens which contain a ';' are skipped (presumably the fragment boundaries
+      // of discontinuous spans), the first token without one is taken as end offset.
+      int endOffset = -1;
+
+      for (int i = END_OFFSET; i < values.length; i++) {
+        if (!values[i].contains(";")) {
+          endOffset = parseInt(values[i]);
+          break;
+        }
+      }
+
+      // Report a missing (or negative) end offset as format error instead of
+      // passing the -1 placeholder on to the Span.
+      if (endOffset < 0) {
+        throw new InvalidFormatException("Failed to find a valid end offset for annotation: " +
+            values[BratAnnotationParser.ID_OFFSET]);
+      }
+
+      // The covered text is not extracted from the line, it is left empty
+      return new SpanAnnotation(values[BratAnnotationParser.ID_OFFSET], type,
+          new Span(parseInt(values[BEGIN_OFFSET]), endOffset, type), "");
+    }
+  }
+
+  /**
+   * Parses lines which describe a relation between two annotations:
+   * id, type and two arguments.
+   */
+  static class RelationAnnotationParser extends BratAnnotationParser {
+
+    private static final int ARG1_OFFSET = 2;
+    private static final int ARG2_OFFSET = 3;
+
+    /**
+     * Cuts off the five character argument prefix (e.g. "Arg1:") and returns the
+     * referenced annotation id.
+     */
+    private String parseArg(String arg) throws InvalidFormatException {
+      // At least one character must remain after the prefix was removed
+      if (arg.length() > 5) {
+        return arg.substring(5).trim();
+      }
+      else {
+        throw new InvalidFormatException("Failed to parse argument: " + arg);
+      }
+    }
+
+    @Override
+    BratAnnotation parse(String[] values) throws IOException {
+
+      // Lines with only three fields reach this parser, fail with a format error
+      // instead of an ArrayIndexOutOfBoundsException.
+      if (values.length <= ARG2_OFFSET) {
+        throw new InvalidFormatException("Line must have at least 4 fields");
+      }
+
+      return new RelationAnnotation(values[BratAnnotationParser.ID_OFFSET],
+          values[BratAnnotationParser.TYPE_OFFSET], parseArg(values[ARG1_OFFSET]),
+          parseArg(values[ARG2_OFFSET]));
+    }
+  }
+
+  private final Map<String, BratAnnotationParser> parsers =
+      new HashMap<String, BratAnnotationParser>();
+  private final AnnotationConfiguration config;
+  private final BufferedReader reader;
+  private final String id;
+
+  /**
+   * @param config maps the annotation types to the type classes
+   * @param id the id of the document, only used in error messages
+   * @param in the content of the .ann file, it is decoded as UTF-8
+   */
+  BratAnnotationStream(AnnotationConfiguration config, String id, InputStream in) {
+    this.config = config;
+    this.id = id;
+
+    reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
+
+    parsers.put(AnnotationConfiguration.SPAN_TYPE, new SpanAnnotationParser());
+    parsers.put(AnnotationConfiguration.ENTITY_TYPE, new SpanAnnotationParser());
+    parsers.put(AnnotationConfiguration.RELATION_TYPE, new RelationAnnotationParser());
+  }
+
+  /**
+   * Reads the next annotation.
+   *
+   * @return the next annotation or null if the end of the input is reached
+   * @throws IOException if reading fails, a line is malformed or no parser
+   *     is registered for the type class of a line
+   */
+  public BratAnnotation read() throws IOException {
+
+    String line;
+    while ((line = reader.readLine()) != null) {
+      String values[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
+
+      // Lines with less than three fields (e.g. blank lines) carry no annotation.
+      // They are skipped, returning null here would signal the end of the stream
+      // and silently drop all following annotations.
+      if (values.length <= 2) {
+        continue;
+      }
+
+      String typeClass = config.getTypeClass(values[BratAnnotationParser.TYPE_OFFSET]);
+
+      BratAnnotationParser parser = parsers.get(typeClass);
+
+      if (parser == null) {
+        throw new IOException("Failed to parse ann document with id " + id +
+            " type class, no parser registered: " + values[BratAnnotationParser.TYPE_OFFSET]);
+      }
+
+      return parser.parse(values);
+    }
+
+    return null;
+  }
+
+  /**
+   * Delegates to {@link BufferedReader#reset()}. This class never marks the
+   * reader, so the call is expected to fail with an IOException.
+   */
+  public void reset() throws IOException, UnsupportedOperationException {
+    reader.reset();
+  }
+
+  /**
+   * Closes the underlying reader.
+   */
+  public void close() throws IOException {
+    reader.close();
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java Fri May 17 12:11:55 2013
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * A brat document, it consists of the document text and the annotations
+ * which can be looked up by their id.
+ */
+public class BratDocument {
+
+  private final AnnotationConfiguration config;
+  private final String id;
+  private final String text;
+  private final Map<String, BratAnnotation> annotationMap;
+
+  /**
+   * @param config the annotation configuration of the document
+   * @param id the id of the document
+   * @param text the text of the document
+   * @param annotations the annotations, they are indexed by their id
+   */
+  public BratDocument(AnnotationConfiguration config, String id, String text,
+      Collection<BratAnnotation> annotations) {
+    this.config = config;
+    this.id = id;
+    this.text = text;
+
+    Map<String, BratAnnotation> byId = new HashMap<String, BratAnnotation>();
+    for (BratAnnotation current : annotations) {
+      String annotationId = current.getId();
+      byId.put(annotationId, current);
+    }
+
+    this.annotationMap = Collections.unmodifiableMap(byId);
+  }
+
+  public AnnotationConfiguration getConfig() {
+    return this.config;
+  }
+
+  public String getId() {
+    return this.id;
+  }
+
+  public String getText() {
+    return this.text;
+  }
+
+  /**
+   * @return the annotation with the given id or null if there is none
+   */
+  public BratAnnotation getAnnotation(String id) {
+    return this.annotationMap.get(id);
+  }
+
+  /**
+   * @return an unmodifiable view of all annotations of the document
+   */
+  public Collection<BratAnnotation> getAnnotations() {
+    return this.annotationMap.values();
+  }
+
+  /**
+   * Reads the complete text (decoded as UTF-8) and all annotations from the two
+   * streams. The streams are not closed.
+   *
+   * @param config the annotation configuration used to parse the annotations
+   * @param id the id of the document
+   * @param txtIn the content of the .txt file
+   * @param annIn the content of the .ann file
+   * @return the parsed document
+   * @throws IOException if reading or parsing fails
+   */
+  public static BratDocument parseDocument(AnnotationConfiguration config, String id,
+      InputStream txtIn, InputStream annIn)
+      throws IOException {
+
+    Reader txtReader = new InputStreamReader(txtIn, Charset.forName("UTF-8"));
+
+    StringBuilder content = new StringBuilder();
+    char buffer[] = new char[1024];
+
+    int charsRead = txtReader.read(buffer);
+    while (charsRead > 0) {
+      content.append(buffer, 0, charsRead);
+      charsRead = txtReader.read(buffer);
+    }
+
+    ObjectStream<BratAnnotation> annStream = new BratAnnotationStream(config, id, annIn);
+
+    Collection<BratAnnotation> parsedAnnotations = new ArrayList<BratAnnotation>();
+
+    for (BratAnnotation next = annStream.read(); next != null; next = annStream.read()) {
+      parsedAnnotations.add(next);
+    }
+
+    return new BratDocument(config, id, content.toString(), parsedAnnotations);
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java Fri May 17 12:11:55 2013
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Stack;
+
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Streams the brat documents of a corpus directory. A document consists of an
+ * .ann file and a .txt file which share the same path up to the extension.
+ */
+public class BratDocumentStream implements ObjectStream<BratDocument> {
+
+  private final AnnotationConfiguration config;
+  private List<String> documentIds = new LinkedList<String>();
+  private Iterator<String> documentIdIterator;
+
+  /**
+   * Creates a BratDocumentStream which reads the documents from the given input directory.
+   *
+   * @param config the annotation configuration used to parse the annotation files
+   * @param bratCorpusDirectory the directory containing all the brat training data files
+   * @param searchRecursive specifies if the corpus directory should be traversed recursively
+   * to find training data files.
+   * @param fileFilter a custom file filter to filter out certain files or null to accept all files.
+   * The filter is also applied to sub directories, a directory which is rejected is not traversed.
+   *
+   * @throws IOException if the corpus directory is not a directory or if the content of a
+   * directory cannot be listed
+   */
+  public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirectory,
+      boolean searchRecursive, FileFilter fileFilter) throws IOException {
+
+    if (!bratCorpusDirectory.isDirectory()) {
+      throw new IOException("Input corpus directory must be a directory " +
+          "according to File.isDirectory()!");
+    }
+
+    this.config = config;
+
+    Stack<File> directoryStack = new Stack<File>();
+    directoryStack.push(bratCorpusDirectory);
+
+    while (!directoryStack.isEmpty()) {
+      File directory = directoryStack.pop();
+
+      // listFiles returns null if the directory cannot be read, report that
+      // as IOException instead of failing with a NullPointerException.
+      File files[] = directory.listFiles(fileFilter);
+
+      if (files == null) {
+        throw new IOException("Failed to list the files of directory: " +
+            directory.getAbsolutePath());
+      }
+
+      for (File file : files) {
+
+        if (file.isFile()) {
+          String annFilePath = file.getAbsolutePath();
+          if (annFilePath.endsWith(".ann")) {
+
+            // cutoff last 4 chars ...
+            String documentId = annFilePath.substring(0, annFilePath.length() - 4);
+
+            File txtFile = new File(documentId + ".txt");
+
+            // An annotation file without a matching text file is ignored
+            if (txtFile.exists() && txtFile.isFile()) {
+              documentIds.add(documentId);
+            }
+          }
+        }
+        else if (searchRecursive && file.isDirectory()) {
+          directoryStack.push(file);
+        }
+      }
+    }
+
+    reset();
+  }
+
+  /**
+   * Parses the next document from its .txt and .ann file.
+   *
+   * @return the next document or null if all documents have been read
+   * @throws IOException if opening, reading or parsing one of the two files fails
+   */
+  public BratDocument read() throws IOException {
+
+    BratDocument doc = null;
+
+    if (documentIdIterator.hasNext()) {
+      String id = documentIdIterator.next();
+
+      InputStream txtIn = null;
+      InputStream annIn = null;
+
+      try {
+        txtIn = new BufferedInputStream(new FileInputStream(id + ".txt"));
+        annIn = new BufferedInputStream(new FileInputStream(id + ".ann"));
+
+        doc = BratDocument.parseDocument(config, id, txtIn, annIn);
+      }
+      finally {
+        if (txtIn != null) {
+          try {
+            txtIn.close();
+          }
+          catch (IOException ignored) {
+            // The stream was only read, a failure to close it is intentionally
+            // ignored so it cannot hide an exception thrown in the try block.
+          }
+        }
+
+        if (annIn != null) {
+          try {
+            annIn.close();
+          }
+          catch (IOException ignored) {
+            // See above, intentionally ignored.
+          }
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  /**
+   * Restarts the iteration with the first document id.
+   */
+  public void reset() {
+    documentIdIterator = documentIds.iterator();
+  }
+
+  /**
+   * Drops the document id list, the stream must not be used afterwards.
+   */
+  public void close() {
+    // No longer needed, make the object unusable
+    documentIds = null;
+    documentIdIterator = null;
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java Fri May 17 12:11:55 2013
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Generates Name Sample objects for a Brat Document object.
+ */
+public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, NameSample> {
+
+ private SentenceDetector sentDetector;
+ private Tokenizer tokenizer;
+
+ protected BratNameSampleStream(SentenceDetector sentDetector,
+ Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
+ super(samples);
+
+ this.sentDetector = sentDetector;
+ this.tokenizer = tokenizer;
+ }
+
+ @Override
+ protected List<NameSample> read(BratDocument sample) throws IOException {
+
+ // Note: Some entities might not match sentence boundaries,
+ // to be able to print warning a set of entities id must be maintained
+ // to check if all entities have been used up after the matching is done
+
+ Set<String> entityIdSet = new HashSet<String>();
+
+ for (BratAnnotation ann : sample.getAnnotations()) {
+ if (ann instanceof SpanAnnotation) {
+ entityIdSet.add(ann.getId());
+ }
+ }
+
+ Span sentences[] = sentDetector.sentPosDetect(sample.getText());
+
+ List<NameSample> samples = new ArrayList<NameSample>(sentences.length);
+
+ for (Span sentence : sentences) {
+
+ String sentenceText = sentence.getCoveredText(
+ sample.getText()).toString();
+
+ Span tokens[] = tokenizer.tokenizePos(sentenceText);
+
+ // Note:
+ // A begin and end token index can be identical, but map to different
+ // tokens, to distinguish between between the two begin indexes are
+ // stored with a negative sign, and end indexes are stored with a positive sign
+ // in the tokenIndexMap.
+ // The tokenIndexMap maps to the sentence local token index.
+
+ Map<Integer, Integer> tokenIndexMap = new HashMap<Integer, Integer>();
+
+ for (int i = 0; i < tokens.length; i++) {
+ tokenIndexMap.put(-(sentence.getStart() + tokens[i].getStart()), i);
+ tokenIndexMap.put(sentence.getStart() + tokens[i].getEnd(), i);
+ }
+
+ List<Span> names = new ArrayList<Span>();
+
+ for (BratAnnotation ann : sample.getAnnotations()) {
+
+ if (ann instanceof SpanAnnotation) {
+ SpanAnnotation entity = (SpanAnnotation) ann;
+
+ Span entitySpan = entity.getSpan();
+
+ if (sentence.contains(entitySpan)) {
+ entityIdSet.remove(ann.getId());
+
+ Integer nameBeginIndex = tokenIndexMap.get(-entitySpan.getStart());
+ Integer nameEndIndex = tokenIndexMap.get(entitySpan.getEnd());
+
+ if (nameBeginIndex != null && nameEndIndex != null) {
+ names.add(new Span(nameBeginIndex, nameEndIndex, entity.getType()));
+ }
+ else {
+ System.err.println("Dropped entity " + entity.getId() + " in document " +
+ sample.getId() + ", it is not matching tokenization!");
+ }
+ }
+ }
+ }
+
+ samples.add(new NameSample(Span.spansToStrings(tokens, sentenceText),
+ names.toArray(new Span[names.size()]), samples.size() == 0));
+ }
+
+ for (String id : entityIdSet) {
+ System.err.println("Dropped entity " + id + " in document " +
+ sample.getId() + ", is not matching sentence segmentation!");
+ }
+
+ return samples;
+ }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java Fri May 17 12:11:55 2013
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates name sample streams which read their samples from a directory with
+ * brat annotated documents. The sentence detector and the tokenizer used to
+ * segment the documents are chosen via the parameters.
+ */
+public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<NameSample> {
+
+  interface Parameters {
+    @ParameterDescription(valueName = "bratDataDir", description = "location of brat data dir")
+    File getBratDataDir();
+
+    @ParameterDescription(valueName = "modelFile")
+    @OptionalParameter
+    File getSentenceDetectorModel();
+
+    @ParameterDescription(valueName = "modelFile")
+    @OptionalParameter
+    File getTokenizerModel();
+
+    @ParameterDescription(valueName = "name")
+    @OptionalParameter
+    String getRuleBasedTokenizer();
+
+    @ParameterDescription(valueName = "value")
+    @OptionalParameter(defaultValue = "false")
+    Boolean getRecursive();
+  }
+
+  protected BratNameSampleStreamFactory() {
+    super(Parameters.class);
+  }
+
+  /**
+   * Checks that none of the passed values are null.
+   *
+   * @param objects the values to check
+   * @return true if all values are non-null, false if at least one value is null
+   */
+  private boolean notNull(Object... objects) {
+
+    for (Object obj : objects) {
+      if (obj == null) {
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Creates the name sample stream for the given command line arguments.
+   *
+   * @param args the arguments, they are parsed according to {@link Parameters}
+   * @return the stream of name samples
+   * @throws TerminateToolException if both a rule based and a statistical tokenizer
+   * are specified, the tokenizer name is unknown, the brat data directory cannot be
+   * read or one of the models cannot be loaded
+   */
+  public ObjectStream<NameSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    if (notNull(params.getRuleBasedTokenizer(), params.getTokenizerModel())) {
+      throw new TerminateToolException(-1, "Either use rule based or statistical tokenizer!");
+    }
+
+    // TODO: This need to be loaded from the real file ...
+    Map<String, String> typeToClassMap = new HashMap<String, String>();
+
+    typeToClassMap.put("bumblebee_annotations_Person", AnnotationConfiguration.ENTITY_TYPE);
+    typeToClassMap.put("bumblebee_annotations_Organization", AnnotationConfiguration.ENTITY_TYPE);
+    typeToClassMap.put("bumblebee_annotations_DateMention", AnnotationConfiguration.ENTITY_TYPE);
+    typeToClassMap.put("bumblebee_annotations_Location", AnnotationConfiguration.ENTITY_TYPE);
+    typeToClassMap.put("bumblebee_annotations_CRN", AnnotationConfiguration.ENTITY_TYPE);
+    typeToClassMap.put("bumblebee_annotations_Money", AnnotationConfiguration.ENTITY_TYPE);
+    typeToClassMap.put("bumblebee_annotations_LocatedAt", AnnotationConfiguration.RELATION_TYPE);
+    typeToClassMap.put("bumblebee_annotations_BornIn", AnnotationConfiguration.RELATION_TYPE);
+    typeToClassMap.put("bumblebee_annotations_BornOn", AnnotationConfiguration.RELATION_TYPE);
+    typeToClassMap.put("bumblebee_annotations_MemberOf", AnnotationConfiguration.RELATION_TYPE);
+
+    AnnotationConfiguration annConfig = new AnnotationConfiguration(typeToClassMap);
+
+    // TODO: How to handle the error here ? terminate the tool? not nice if used by API!
+    ObjectStream<BratDocument> samples;
+    try {
+      samples = new BratDocumentStream(annConfig,
+          params.getBratDataDir(), params.getRecursive(), null);
+    } catch (IOException e) {
+      // Keep the cause, otherwise the stack trace of the failure is lost
+      throw new TerminateToolException(-1, e.getMessage(), e);
+    }
+
+    SentenceDetector sentDetector;
+
+    if (params.getSentenceDetectorModel() != null) {
+      try {
+        sentDetector = new SentenceDetectorME(new SentenceModel(params.getSentenceDetectorModel()));
+      } catch (IOException e) {
+        throw new TerminateToolException(-1, "Failed to load sentence detector model!", e);
+      }
+    }
+    else {
+      sentDetector = new NewlineSentenceDetector();
+    }
+
+    // The whitespace tokenizer is used if neither a model nor a name is specified
+    Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+
+    if (params.getTokenizerModel() != null) {
+      try {
+        tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel()));
+      } catch (IOException e) {
+        throw new TerminateToolException(-1, "Failed to load tokenizer model!", e);
+      }
+    }
+    else if (params.getRuleBasedTokenizer() != null) {
+      String tokenizerName = params.getRuleBasedTokenizer();
+
+      if ("simple".equals(tokenizerName)) {
+        tokenizer = SimpleTokenizer.INSTANCE;
+      }
+      else if ("whitespace".equals(tokenizerName)) {
+        tokenizer = WhitespaceTokenizer.INSTANCE;
+      }
+      else {
+        throw new TerminateToolException(-1, "Unkown tokenizer: " + tokenizerName);
+      }
+    }
+
+    return new BratNameSampleStream(sentDetector, tokenizer, samples);
+  }
+
+  /**
+   * Registers this factory for name samples under the format name "brat".
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(NameSample.class, "brat",
+        new BratNameSampleStreamFactory());
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java Fri May 17 12:11:55 2013
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+public class RelationAnnotation extends BratAnnotation {
+
+ private final String arg1;
+ private final String arg2;
+
+ protected RelationAnnotation(String id, String type, String arg1, String arg2) {
+ super(id, type);
+ this.arg1 = arg1;
+ this.arg2 = arg2;
+ }
+
+ public String getArg1() {
+ return arg1;
+ }
+
+ public String getArg2() {
+ return arg2;
+ }
+
+ @Override
+ public String toString() {
+ return super.toString() + " arg1:" + getArg1() + " arg2:" + getArg2();
+ }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java Fri May 17 12:11:55 2013
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * A stream which turns every object read from the underlying stream into a
+ * list of output objects and hands these out one at a time.
+ * <p>
+ * Subclasses implement {@link #read(Object)} to perform the actual
+ * segmentation of one input object.
+ * <p>
+ * NOTE(review): output objects which are already buffered from the current
+ * input object are not discarded by this class when the stream is reset;
+ * confirm whether callers of reset() depend on that.
+ *
+ * @param <S> the type of the objects read from the underlying stream
+ * @param <T> the type of the objects returned by this stream
+ */
+public abstract class SegmenterObjectStream<S, T> extends FilterObjectStream<S, T> {
+
+  /** Iterates over the not yet returned output objects of the last input object. */
+  private Iterator<T> sampleIt = Collections.<T>emptySet().iterator();
+
+  /**
+   * Initializes the stream.
+   *
+   * @param in the stream providing the objects to be segmented
+   */
+  public SegmenterObjectStream(ObjectStream<S> in) {
+    super(in);
+  }
+
+  /**
+   * Segments one input object into output objects.
+   *
+   * @param sample the object read from the underlying stream, never null
+   *
+   * @return the output objects; an empty list or null means that the input
+   *     object produces no output and the next input object is processed
+   *
+   * @throws IOException if the segmentation fails with an I/O error
+   */
+  protected abstract List<T> read(S sample) throws IOException;
+
+  /**
+   * Returns the next output object. Input objects are pulled from the
+   * underlying stream and segmented as needed.
+   *
+   * @return the next output object, or null if the underlying stream has
+   *     no more objects
+   *
+   * @throws IOException if reading from the underlying stream or the
+   *     segmentation fails
+   */
+  public final T read() throws IOException {
+
+    // Iterate instead of calling read() recursively: a long run of input
+    // objects which produce no output would otherwise add one stack frame
+    // per input object and could end in a StackOverflowError.
+    while (!sampleIt.hasNext()) {
+      S inSample = samples.read();
+
+      if (inSample == null) {
+        // the underlying stream is exhausted and nothing is buffered
+        return null;
+      }
+
+      List<T> outSamples = read(inSample);
+
+      if (outSamples != null) {
+        sampleIt = outSamples.iterator();
+      }
+    }
+
+    return sampleIt.next();
+  }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java Fri May 17 12:11:55 2013
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import opennlp.tools.util.Span;
+
+public class SpanAnnotation extends BratAnnotation {
+
+ private final Span span;
+ private final String coveredText;
+
+ SpanAnnotation(String id, String type, Span span, String coveredText) {
+ super(id, type);
+ this.span = span;
+ this.coveredText = coveredText;
+ }
+
+ public Span getSpan() {
+ return span;
+ }
+
+ @Override
+ public String toString() {
+ return super.toString() + " " + span.getStart() + " " + span.getEnd() + " " + coveredText;
+ }
+}
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java
------------------------------------------------------------------------------
svn:mime-type = text/plain