You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2013/05/17 14:11:56 UTC

svn commit: r1483759 - /opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/

Author: joern
Date: Fri May 17 12:11:55 2013
New Revision: 1483759

URL: http://svn.apache.org/r1483759
Log:
OPENNLP-560 Initial check in of brat format parsing code for the name finder.

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java   (with props)

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java Fri May 17 12:11:55 2013
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+public class AnnotationConfiguration {
+  
+  public static final String SPAN_TYPE = "Span";
+  public static final String ENTITY_TYPE = "Entity";
+  public static final String RELATION_TYPE = "Relation";
+  
+  private final Map<String, String> typeToClassMap;
+  
+  public AnnotationConfiguration(Map<String, String> typeToClassMap) {
+    
+    this.typeToClassMap = Collections.unmodifiableMap(
+        new HashMap<String, String>(typeToClassMap));
+  }
+
+  public String getTypeClass(String type) {
+    return typeToClassMap.get(type);
+  }
+  
+  // TODO: Add a parser for the brat configuration file!
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/AnnotationConfiguration.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java Fri May 17 12:11:55 2013
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+public abstract class BratAnnotation {
+
+  private final String id;
+  private final String type;
+  
+  protected BratAnnotation(String id, String type) {
+    this.id = id;
+    this.type =type;
+  }
+  
+  public String getId() {
+    return id;
+  }
+  
+  public String getType() {
+    return type;
+  }
+  
+  @Override
+  public String toString() {
+    return id + " " + type;
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java Fri May 17 12:11:55 2013
@@ -0,0 +1,159 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.Charset;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Reads the annotations from the brat .ann annotation file.
+ * <p>
+ * Every non-blank line is split on whitespace, the second field (the annotation
+ * type) is looked up in the {@link AnnotationConfiguration} to select the parser
+ * which turns the line into a {@link BratAnnotation}.
+ */
+public class BratAnnotationStream implements ObjectStream<BratAnnotation> {
+
+  /**
+   * Base class of the parsers which turn the whitespace separated fields
+   * of one annotation line into a {@link BratAnnotation}.
+   */
+  static abstract class BratAnnotationParser {
+
+    static final int ID_OFFSET = 0;
+    static final int TYPE_OFFSET = 1;
+
+    /**
+     * Parses the fields of one annotation line.
+     *
+     * @param values the whitespace separated fields of the line
+     * @return the parsed annotation, never null
+     * @throws IOException if the fields do not form a valid annotation
+     */
+    abstract BratAnnotation parse(String values[]) throws IOException;
+
+    /**
+     * Parses an integer field, a malformed number is reported as
+     * {@link InvalidFormatException} instead of an unchecked exception.
+     */
+    protected int parseInt(String intString) throws InvalidFormatException {
+      try {
+        return Integer.parseInt(intString);
+      }
+      catch (NumberFormatException e) {
+        throw new InvalidFormatException(e);
+      }
+    }
+  }
+
+  /**
+   * Parses span (text bound) annotation lines: id, type, begin offset,
+   * end offset and the covered text.
+   */
+  static class SpanAnnotationParser extends BratAnnotationParser {
+
+    private static final int BEGIN_OFFSET = 2;
+    private static final int END_OFFSET = 3;
+
+    @Override
+    BratAnnotation parse(String[] values) throws IOException {
+
+      if (values.length <= 4) {
+        throw new InvalidFormatException("Line must have at least 5 fields");
+      }
+
+      String id = values[BratAnnotationParser.ID_OFFSET];
+      String type = values[BratAnnotationParser.TYPE_OFFSET];
+
+      // Fields containing a ';' join two fragments of a discontinuous span,
+      // the first field without one is taken as the end of the whole span.
+      int endOffset = -1;
+
+      for (int i = END_OFFSET; i < values.length; i++) {
+        if (!values[i].contains(";")) {
+          endOffset = parseInt(values[i]);
+          break;
+        }
+      }
+
+      int beginOffset = parseInt(values[BEGIN_OFFSET]);
+
+      // Reject missing or inconsistent offsets here, so that malformed input is
+      // reported as InvalidFormatException (an IOException) like every other
+      // parse error, instead of depending on the argument checks of Span.
+      if (beginOffset < 0 || endOffset < beginOffset) {
+        throw new InvalidFormatException("Invalid span offsets in annotation " + id +
+            ": begin=" + beginOffset + ", end=" + endOffset);
+      }
+
+      // The covered text is not reconstructed from the fields, it is left empty.
+      return new SpanAnnotation(id, type, new Span(beginOffset, endOffset, type), "");
+    }
+  }
+
+  /**
+   * Parses relation annotation lines: id, type and the two arguments
+   * in the form role:id, e.g. Arg1:T1 Arg2:T2.
+   */
+  static class RelationAnnotationParser extends BratAnnotationParser {
+
+    private static final int ARG1_OFFSET = 2;
+    private static final int ARG2_OFFSET = 3;
+
+    /**
+     * Extracts the referenced annotation id which follows the role prefix.
+     *
+     * @throws InvalidFormatException if the role separator or the id is missing
+     */
+    private String parseArg(String arg) throws InvalidFormatException {
+      int separatorIndex = arg.indexOf(':');
+
+      if (separatorIndex != -1) {
+        String argId = arg.substring(separatorIndex + 1).trim();
+
+        if (argId.length() > 0) {
+          return argId;
+        }
+      }
+
+      throw new InvalidFormatException("Failed to parse argument: " + arg);
+    }
+
+    @Override
+    BratAnnotation parse(String[] values) throws IOException {
+
+      // Without this check a line with only one argument fails with an
+      // ArrayIndexOutOfBoundsException instead of a format error.
+      if (values.length <= ARG2_OFFSET) {
+        throw new InvalidFormatException("Line must have at least 4 fields");
+      }
+
+      return new RelationAnnotation(values[BratAnnotationParser.ID_OFFSET],
+          values[BratAnnotationParser.TYPE_OFFSET], parseArg(values[ARG1_OFFSET]),
+          parseArg(values[ARG2_OFFSET]));
+    }
+  }
+
+  private final Map<String, BratAnnotationParser> parsers =
+      new HashMap<String, BratAnnotationParser>();
+  private final AnnotationConfiguration config;
+  private final BufferedReader reader;
+  private final String id;
+
+  /**
+   * @param config maps the annotation types to their type class
+   * @param id the id of the document, only used in error messages
+   * @param in the content of the .ann file, it is decoded as UTF-8
+   */
+  BratAnnotationStream(AnnotationConfiguration config, String id, InputStream in) {
+    this.config = config;
+    this.id = id;
+
+    reader = new BufferedReader(new InputStreamReader(in, Charset.forName("UTF-8")));
+
+    parsers.put(AnnotationConfiguration.SPAN_TYPE, new SpanAnnotationParser());
+    parsers.put(AnnotationConfiguration.ENTITY_TYPE, new SpanAnnotationParser());
+    parsers.put(AnnotationConfiguration.RELATION_TYPE, new RelationAnnotationParser());
+  }
+
+  /**
+   * Reads the next annotation.
+   *
+   * @return the next annotation or null if the end of the file is reached
+   * @throws IOException if reading fails, a line is malformed or no parser
+   * is registered for the type class of an annotation
+   */
+  public BratAnnotation read() throws IOException {
+
+    String line;
+    while ((line = reader.readLine()) != null) {
+      String values[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
+
+      if (values.length == 0) {
+        // A blank line must be skipped, returning null for it would signal the
+        // end of the stream and silently drop all following annotations.
+        continue;
+      }
+
+      if (values.length < 3) {
+        throw new InvalidFormatException("Failed to parse ann document with id " + id +
+            ", line must have at least 3 fields: " + line);
+      }
+
+      String type = values[BratAnnotationParser.TYPE_OFFSET];
+
+      BratAnnotationParser parser = parsers.get(config.getTypeClass(type));
+
+      if (parser == null) {
+        throw new IOException("Failed to parse ann document with id " + id +
+            " type class, no parser registered: " + type);
+      }
+
+      return parser.parse(values);
+    }
+
+    return null;
+  }
+
+  /**
+   * Delegates to the reset method of the underlying reader.
+   * <p>
+   * Note: This class never marks the reader and BufferedReader.reset() is
+   * documented to fail if the stream has never been marked, a reset is
+   * therefore expected to fail with an IOException.
+   */
+  public void reset() throws IOException, UnsupportedOperationException {
+    reader.reset();
+  }
+
+  /**
+   * Closes the underlying reader and with it the input stream it wraps.
+   */
+  public void close() throws IOException {
+    reader.close();
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java Fri May 17 12:11:55 2013
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.Reader;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * A brat document, it consists of the document text and the annotations
+ * which are indexed by their annotation id.
+ */
+public class BratDocument {
+
+  private final AnnotationConfiguration config;
+  private final String id;
+  private final String text;
+  private final Map<String, BratAnnotation> annotationMap;
+
+  /**
+   * @param config the annotation configuration of the corpus
+   * @param id the id of the document
+   * @param text the text of the document
+   * @param annotations the annotations, if an id occurs more than once
+   * the annotation which comes last in the collection is kept
+   */
+  public BratDocument(AnnotationConfiguration config, String id, String text,
+      Collection<BratAnnotation> annotations) {
+    this.config = config;
+    this.id = id;
+    this.text = text;
+
+    Map<String, BratAnnotation> byId = new HashMap<String, BratAnnotation>();
+    for (BratAnnotation ann : annotations) {
+      byId.put(ann.getId(), ann);
+    }
+
+    this.annotationMap = Collections.unmodifiableMap(byId);
+  }
+
+  public AnnotationConfiguration getConfig() {
+    return config;
+  }
+
+  public String getId() {
+    return id;
+  }
+
+  public String getText() {
+    return text;
+  }
+
+  /**
+   * @return the annotation with the given id or null if there is none
+   */
+  public BratAnnotation getAnnotation(String id) {
+    return annotationMap.get(id);
+  }
+
+  /**
+   * @return a read-only view of all annotations of this document
+   */
+  public Collection<BratAnnotation> getAnnotations() {
+    return annotationMap.values();
+  }
+
+  /**
+   * Reads the text and the annotations of one document, both inputs
+   * are decoded as UTF-8. The passed streams are not closed.
+   *
+   * @param config the annotation configuration of the corpus
+   * @param id the id of the document
+   * @param txtIn the content of the .txt file
+   * @param annIn the content of the .ann file
+   * @return the parsed document
+   * @throws IOException if reading or parsing of one of the inputs fails
+   */
+  public static BratDocument parseDocument(AnnotationConfiguration config, String id,
+      InputStream txtIn, InputStream annIn)
+      throws IOException {
+
+    Reader txtReader = new InputStreamReader(txtIn, Charset.forName("UTF-8"));
+
+    StringBuilder content = new StringBuilder();
+    char buffer[] = new char[1024];
+
+    for (int count = txtReader.read(buffer); count > 0; count = txtReader.read(buffer)) {
+      content.append(buffer, 0, count);
+    }
+
+    ObjectStream<BratAnnotation> annStream = new BratAnnotationStream(config, id, annIn);
+
+    Collection<BratAnnotation> annotations = new ArrayList<BratAnnotation>();
+
+    for (BratAnnotation next = annStream.read(); next != null; next = annStream.read()) {
+      annotations.add(next);
+    }
+
+    return new BratDocument(config, id, content.toString(), annotations);
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java Fri May 17 12:11:55 2013
@@ -0,0 +1,133 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.BufferedInputStream;
+import java.io.File;
+import java.io.FileFilter;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Stack;
+
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Reads the brat documents of a corpus directory. A document is made up of
+ * an .ann file and a .txt file which share the same path prefix, this prefix
+ * is used as the document id.
+ */
+public class BratDocumentStream implements ObjectStream<BratDocument> {
+
+  private AnnotationConfiguration config;
+  private List<String> documentIds = new LinkedList<String>();
+  private Iterator<String> documentIdIterator;
+
+  /**
+   * Creates a BratDocumentStream which reads the documents from the given input directory.
+   *
+   * @param config the annotation configuration used to parse the .ann files
+   * @param bratCorpusDirectory the directory containing all the brat training data files
+   * @param searchRecursive specifies if the corpus directory should be traversed recursively
+   * to find training data files.
+   * @param fileFilter  a custom file filter to filter out certain files or null to accept all files
+   *
+   * @throws IOException if the corpus directory is not a directory or if the
+   * content of a directory cannot be listed
+   */
+  public BratDocumentStream(AnnotationConfiguration config, File bratCorpusDirectory,
+      boolean searchRecursive, FileFilter fileFilter) throws IOException {
+
+    if (!bratCorpusDirectory.isDirectory()) {
+      throw new IOException("Input corpus directory must be a directory " +
+          "according to File.isDirectory()!");
+    }
+
+    this.config = config;
+
+    Stack<File> directoryStack = new Stack<File>();
+    directoryStack.push(bratCorpusDirectory);
+
+    while (!directoryStack.isEmpty()) {
+      File directory = directoryStack.pop();
+
+      // File.listFiles returns null if the directory cannot be read or an
+      // I/O error occurs, iterating over the result unchecked fails with
+      // a NullPointerException.
+      File files[] = directory.listFiles(fileFilter);
+
+      if (files == null) {
+        throw new IOException("Failed to list the files in directory: " +
+            directory.getAbsolutePath());
+      }
+
+      for (File file : files) {
+
+        if (file.isFile()) {
+          String annFilePath = file.getAbsolutePath();
+          if (annFilePath.endsWith(".ann")) {
+
+            // cutoff last 4 chars ...
+            String documentId = annFilePath.substring(0, annFilePath.length() - 4);
+
+            File txtFile = new File(documentId + ".txt");
+
+            // An .ann file without a matching .txt file is not a document
+            if (txtFile.exists() && txtFile.isFile()) {
+              documentIds.add(documentId);
+            }
+          }
+        }
+        else if (searchRecursive && file.isDirectory()) {
+          directoryStack.push(file);
+        }
+      }
+    }
+
+    reset();
+  }
+
+  /**
+   * Reads and parses the next document.
+   *
+   * @return the next document or null if all documents have been read
+   * @throws IOException if opening, reading or parsing of the document files fails
+   */
+  public BratDocument read() throws IOException {
+
+    BratDocument doc = null;
+
+    if (documentIdIterator.hasNext()) {
+      String id = documentIdIterator.next();
+
+      InputStream txtIn = null;
+      InputStream annIn = null;
+
+      try {
+        txtIn = new BufferedInputStream(new FileInputStream(id + ".txt"));
+        annIn = new BufferedInputStream(new FileInputStream(id + ".ann"));
+
+        doc = BratDocument.parseDocument(config, id, txtIn, annIn);
+      }
+      finally{
+        if (txtIn != null) {
+          try {
+            txtIn.close();
+          }
+          catch (IOException e) {
+            // Deliberately ignored, the stream was only read and a failing
+            // close must not hide an exception thrown in the try block
+          }
+        }
+
+        if (annIn!= null) {
+          try {
+            annIn.close();
+          }
+          catch (IOException e) {
+            // Deliberately ignored, see above
+          }
+        }
+      }
+    }
+
+    return doc;
+  }
+
+  /**
+   * Restarts the stream with the first document found by the constructor,
+   * the corpus directory is not scanned again.
+   */
+  public void reset() {
+    documentIdIterator = documentIds.iterator();
+  }
+
+  public void close() {
+    // No longer needed, make the object unusable
+    documentIds = null;
+    documentIdIterator = null;
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java Fri May 17 12:11:55 2013
@@ -0,0 +1,127 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Generates Name Sample objects for a Brat Document object.
+ * <p>
+ * The document text is split into sentences and tokens, every span annotation
+ * which lies inside a sentence and matches the token boundaries is turned into
+ * a name of the sample created for that sentence. Annotations which cannot be
+ * mapped are dropped and reported on System.err.
+ */
+public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, NameSample> {
+
+  private SentenceDetector sentDetector;
+  private Tokenizer tokenizer;
+
+  /**
+   * @param sentDetector splits the document text into sentences
+   * @param tokenizer splits the sentences into tokens
+   * @param samples the documents to create the name samples from
+   */
+  protected BratNameSampleStream(SentenceDetector sentDetector,
+      Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
+    super(samples);
+
+    this.sentDetector = sentDetector;
+    this.tokenizer = tokenizer;
+  }
+
+  /**
+   * Creates one name sample per detected sentence of the document.
+   *
+   * @param sample the document to segment
+   * @return the name samples in sentence order, only the first one
+   * is flagged to clear the adaptive data
+   */
+  @Override
+  protected List<NameSample> read(BratDocument sample) throws IOException {
+
+    // Note: Some entities might not match sentence boundaries,
+    // to be able to print warning a set of entities id must be maintained
+    // to check if all entities have been used up after the matching is done
+
+    Set<String> entityIdSet = new HashSet<String>();
+
+    for (BratAnnotation ann : sample.getAnnotations()) {
+      if (ann instanceof SpanAnnotation) {
+        entityIdSet.add(ann.getId());
+      }
+    }
+
+    Span sentences[] = sentDetector.sentPosDetect(sample.getText());
+
+    List<NameSample> samples = new ArrayList<NameSample>(sentences.length);
+
+    for (Span sentence : sentences) {
+
+      String sentenceText = sentence.getCoveredText(
+          sample.getText()).toString();
+
+      Span tokens[] = tokenizer.tokenizePos(sentenceText);
+
+      // Note:
+      // A begin and end token index can be identical, but map to different
+      // tokens, to distinguish between the two, begin indexes are
+      // stored with a negative sign, and end indexes are stored with a positive sign
+      // in the tokenIndexMap.
+      // The tokenIndexMap maps document character offsets to the sentence local
+      // token index. A begin offset maps to the index of the token which starts
+      // there, an end offset maps to the index after the token which ends there,
+      // because the end of a name span is exclusive. Storing the index of the
+      // last token itself makes every name one token too short and turns
+      // single token names into empty spans.
+
+      Map<Integer, Integer> tokenIndexMap = new HashMap<Integer, Integer>();
+
+      for (int i = 0; i < tokens.length; i++) {
+        tokenIndexMap.put(-(sentence.getStart() + tokens[i].getStart()), i);
+        tokenIndexMap.put(sentence.getStart() + tokens[i].getEnd(), i + 1);
+      }
+
+      List<Span> names = new ArrayList<Span>();
+
+      for (BratAnnotation ann : sample.getAnnotations()) {
+
+        if (ann instanceof SpanAnnotation) {
+          SpanAnnotation entity = (SpanAnnotation) ann;
+
+          Span entitySpan = entity.getSpan();
+
+          if (sentence.contains(entitySpan)) {
+            entityIdSet.remove(ann.getId());
+
+            Integer nameBeginIndex = tokenIndexMap.get(-entitySpan.getStart());
+            Integer nameEndIndex = tokenIndexMap.get(entitySpan.getEnd());
+
+            if (nameBeginIndex != null && nameEndIndex != null) {
+              names.add(new Span(nameBeginIndex, nameEndIndex, entity.getType()));
+            }
+            else {
+              System.err.println("Dropped entity " + entity.getId() + " in document " +
+                  sample.getId() + ", it is not matching tokenization!");
+            }
+          }
+        }
+      }
+
+      // Only the first sample of a document clears the adaptive data
+      samples.add(new NameSample(Span.spansToStrings(tokens, sentenceText),
+          names.toArray(new Span[names.size()]), samples.size() == 0));
+    }
+
+    // Everything left in the set was not contained in any sentence
+    for (String id : entityIdSet) {
+      System.err.println("Dropped entity " + id + " in document " +
+          sample.getId() + ", is not matching sentence segmentation!");
+    }
+
+    return samples;
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java Fri May 17 12:11:55 2013
@@ -0,0 +1,164 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.NewlineSentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.sentdetect.SentenceDetectorME;
+import opennlp.tools.sentdetect.SentenceModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.tokenize.TokenizerME;
+import opennlp.tools.tokenize.TokenizerModel;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Creates name sample streams for corpora in the brat format, it is
+ * registered under the format name "brat".
+ */
+public class BratNameSampleStreamFactory extends AbstractSampleStreamFactory<NameSample> {
+
+  interface Parameters {
+    @ParameterDescription(valueName = "bratDataDir", description = "location of brat data dir")
+    File getBratDataDir();
+
+    @ParameterDescription(valueName = "modelFile")
+    @OptionalParameter
+    File getSentenceDetectorModel();
+
+    @ParameterDescription(valueName = "modelFile")
+    @OptionalParameter
+    File getTokenizerModel();
+
+    @ParameterDescription(valueName = "name")
+    @OptionalParameter
+    String getRuleBasedTokenizer();
+
+    @ParameterDescription(valueName = "value")
+    @OptionalParameter(defaultValue = "false")
+    Boolean getRecursive();
+  }
+
+  protected BratNameSampleStreamFactory() {
+    super(Parameters.class);
+  }
+
+  /**
+   * Checks that none of the passed values is null.
+   *
+   * @param objects the values to check
+   * @return true if all values are non-null, false as soon as one value is null
+   */
+  private boolean notNull(Object... objects) {
+
+    for (Object obj : objects) {
+      if (obj == null)
+        return false;
+    }
+
+    return true;
+  }
+
+  /**
+   * Creates the name sample stream for the brat corpus described by the arguments.
+   * <p>
+   * Sentences are detected with the given sentence detector model or otherwise
+   * line by line. Tokens are detected with the given tokenizer model or the named
+   * rule based tokenizer ("simple" or "whitespace"), the whitespace tokenizer
+   * is the default.
+   *
+   * @param args the command line arguments, see {@link Parameters}
+   * @return the name sample stream
+   * @throws TerminateToolException if both a tokenizer model and a rule based
+   * tokenizer are specified, the corpus cannot be read, a model cannot be loaded
+   * or the rule based tokenizer name is unknown
+   */
+  public ObjectStream<NameSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    // The two tokenizer options are mutually exclusive
+    if (notNull(params.getRuleBasedTokenizer(), params.getTokenizerModel())) {
+      throw new TerminateToolException(-1, "Either use rule based or statistical tokenizer!");
+    }
+
+    // TODO: This need to be loaded from the real file ...
+    Map<String, String> typeToClassMap = new HashMap<String, String>();
+
+    typeToClassMap.put("bumblebee_annotations_Person", "Entity");
+    typeToClassMap.put("bumblebee_annotations_Organization", "Entity");
+    typeToClassMap.put("bumblebee_annotations_DateMention", "Entity");
+    typeToClassMap.put("bumblebee_annotations_Location", "Entity");
+    typeToClassMap.put("bumblebee_annotations_CRN", "Entity");
+    typeToClassMap.put("bumblebee_annotations_Money", "Entity");
+    typeToClassMap.put("bumblebee_annotations_LocatedAt", AnnotationConfiguration.RELATION_TYPE);
+    typeToClassMap.put("bumblebee_annotations_BornIn", AnnotationConfiguration.RELATION_TYPE);
+    typeToClassMap.put("bumblebee_annotations_BornOn", AnnotationConfiguration.RELATION_TYPE);
+    typeToClassMap.put("bumblebee_annotations_MemberOf", AnnotationConfiguration.RELATION_TYPE);
+
+    AnnotationConfiguration annConfig = new AnnotationConfiguration(typeToClassMap);
+
+    // TODO: How to handle the error here ? terminate the tool? not nice if used by API!
+    ObjectStream<BratDocument> samples;
+    try {
+      samples = new BratDocumentStream(annConfig,
+          params.getBratDataDir(), params.getRecursive(), null);
+    } catch (IOException e) {
+      // Pass the cause on like the model loading below, otherwise
+      // the stack trace of the I/O error is lost
+      throw new TerminateToolException(-1, e.getMessage(), e);
+    }
+
+    SentenceDetector sentDetector;
+
+    if (params.getSentenceDetectorModel() != null) {
+      try {
+        sentDetector = new SentenceDetectorME(new SentenceModel(params.getSentenceDetectorModel()));
+      } catch (IOException e) {
+        throw new TerminateToolException(-1, "Failed to load sentence detector model!", e);
+      }
+    }
+    else {
+      sentDetector = new NewlineSentenceDetector();
+    }
+
+    Tokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
+
+    if (params.getTokenizerModel() != null) {
+      try {
+        tokenizer = new TokenizerME(new TokenizerModel(params.getTokenizerModel()));
+      } catch (IOException e) {
+        throw new TerminateToolException(-1, "Failed to load tokenizer model!", e);
+      }
+    }
+    else if (params.getRuleBasedTokenizer() != null) {
+      String tokenizerName = params.getRuleBasedTokenizer();
+
+      if ("simple".equals(tokenizerName)) {
+        tokenizer = SimpleTokenizer.INSTANCE;
+      }
+      else if("whitespace".equals(tokenizerName)) {
+        tokenizer = WhitespaceTokenizer.INSTANCE;
+      }
+      else {
+        throw new TerminateToolException(-1, "Unknown tokenizer: " + tokenizerName);
+      }
+    }
+
+    return new BratNameSampleStream(sentDetector, tokenizer, samples);
+  }
+
+  /**
+   * Registers this factory for name samples under the format name "brat".
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(NameSample.class, "brat",
+        new BratNameSampleStreamFactory());
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java Fri May 17 12:11:55 2013
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+public class RelationAnnotation extends BratAnnotation {
+
+  private final String arg1;
+  private final String arg2;
+  
+  protected RelationAnnotation(String id, String type, String arg1, String arg2) {
+    super(id, type);
+    this.arg1 = arg1;
+    this.arg2 = arg2;
+  }
+  
+  public String getArg1() {
+    return arg1;
+  }
+  
+  public String getArg2() {
+    return arg2;
+  }
+  
+  @Override
+  public String toString() {
+    return super.toString() + " arg1:" + getArg1() + " arg2:" + getArg2();
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/RelationAnnotation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java Fri May 17 12:11:55 2013
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * A filter stream that turns every object read from the wrapped stream into
+ * zero or more output objects and hands them out one at a time.
+ * <p>
+ * Subclasses implement {@link #read(Object)} to perform the segmentation of a
+ * single input object.
+ *
+ * @param <S> type of the objects read from the wrapped stream
+ * @param <T> type of the objects returned by this stream
+ */
+public abstract class SegmenterObjectStream<S, T> extends FilterObjectStream<S, T> {
+
+  // Output objects of the most recently segmented input which were not returned yet.
+  private Iterator<T> sampleIt = Collections.<T>emptySet().iterator();
+
+  /**
+   * @param in the stream that supplies the objects to segment
+   */
+  public SegmenterObjectStream(ObjectStream<S> in) {
+    super(in);
+  }
+
+  /**
+   * Segments one input object into output objects.
+   *
+   * @param sample an object read from the wrapped stream, never <code>null</code>
+   * @return the output objects in the order they should be returned; an empty
+   *     list or <code>null</code> makes the stream move on to the next input
+   * @throws IOException if the segmentation fails
+   */
+  protected abstract List<T> read(S sample) throws IOException;
+
+  /**
+   * Returns the next output object, pulling and segmenting further input
+   * objects as needed.
+   *
+   * @return the next output object, or <code>null</code> once the wrapped
+   *     stream returns <code>null</code> and no buffered objects are left
+   * @throws IOException if reading from the wrapped stream or segmenting fails
+   */
+  public final T read() throws IOException {
+
+    // Iterate instead of recursing: a long run of inputs that produce no
+    // output must not grow the call stack.
+    while (!sampleIt.hasNext()) {
+      S inSample = samples.read();
+
+      if (inSample == null) {
+        return null;
+      }
+
+      List<T> outSamples = read(inSample);
+
+      // A null result keeps the (exhausted) current iterator, so the loop
+      // simply continues with the next input object.
+      if (outSamples != null) {
+        sampleIt = outSamples.iterator();
+      }
+    }
+
+    return sampleIt.next();
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SegmenterObjectStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java?rev=1483759&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java Fri May 17 12:11:55 2013
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+import opennlp.tools.util.Span;
+
+public class SpanAnnotation extends BratAnnotation {
+
+  private final Span span;
+  private final String coveredText;
+  
+  SpanAnnotation(String id, String type, Span span, String coveredText) {
+    super(id, type);
+    this.span = span;
+    this.coveredText = coveredText;
+  }
+  
+  public Span getSpan() {
+    return span;
+  }
+  
+  @Override
+  public String toString() {
+    return super.toString() + " " + span.getStart() + " " + span.getEnd() + " " + coveredText;
+  }
+}

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/brat/SpanAnnotation.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain