You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/07/12 17:58:12 UTC

svn commit: r1145641 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/namefind/TokenNameFinderConverterTool.java formats/BioNLP2004NameSampleStream.java formats/BioNLP2004NameSampleStreamFactory.java

Author: joern
Date: Tue Jul 12 15:58:11 2011
New Revision: 1145641

URL: http://svn.apache.org/viewvc?rev=1145641&view=rev
Log:
OPENNLP-222 Added converter for bionlp 2004 shared task

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java   (with props)
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java   (with props)
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java?rev=1145641&r1=1145640&r2=1145641&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java Tue Jul 12 15:58:11 2011
@@ -23,6 +23,7 @@ import java.util.Map;
 
 import opennlp.tools.cmdline.AbstractConverterTool;
 import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.formats.BioNLP2004NameSampleStreamFactory;
 import opennlp.tools.formats.Conll02NameSampleStreamFactory;
 import opennlp.tools.formats.Conll03NameSampleStreamFactory;
 import opennlp.tools.formats.ad.ADNameSampleStreamFactory;
@@ -43,6 +44,7 @@ public class TokenNameFinderConverterToo
     mutableStreamFactories.put("conll02", new Conll02NameSampleStreamFactory());
     mutableStreamFactories.put("conll03", new Conll03NameSampleStreamFactory());
     mutableStreamFactories.put("ad", new ADNameSampleStreamFactory());
+    mutableStreamFactories.put("bionlp2004", new BioNLP2004NameSampleStreamFactory());
     
     streamFactories = Collections.unmodifiableMap(mutableStreamFactories);
   }

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java?rev=1145641&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java Tue Jul 12 15:58:11 2011
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Parser for the training files of the BioNLP/NLPBA 2004 shared task.
+ * <p>
+ * The data contains five named entity types: DNA, RNA, protein, cell_type and cell_line.<br>
+ * <p>
+ * Data can be found on this web site:<br>
+ * http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class BioNLP2004NameSampleStream implements ObjectStream<NameSample> {
+
+  public static final int GENERATE_DNA_ENTITIES = 0x01;
+  public static final int GENERATE_PROTEIN_ENTITIES = 0x01 << 1;
+  public static final int GENERATE_CELLTYPE_ENTITIES = 0x01 << 2;
+  public static final int GENERATE_CELLLINE_ENTITIES = 0x01 << 3;
+  public static final int GENERATE_RNA_ENTITIES = 0x01 << 4;
+  
+  private final int types;
+  
+  private final ObjectStream<String> lineStream;
+  
+  public BioNLP2004NameSampleStream(InputStream in, int types) {
+    try {
+      this.lineStream = new PlainTextByLineStream(in, "UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      // UTF-8 is available on all JVMs, will never happen
+      throw new IllegalStateException(e);
+    }
+    
+    this.types = types;
+  }
+  
+  public NameSample read() throws IOException {
+
+    List<String> sentence = new ArrayList<String>();
+    List<String> tags = new ArrayList<String>();
+    
+    boolean isClearAdaptiveData = false;
+    
+    // Empty line indicates end of sentence
+    
+    String line;
+    while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line.trim())) {
+      
+      if (line.startsWith("###MEDLINE:")) {
+        isClearAdaptiveData = true;
+        lineStream.read();
+        continue;
+      }
+      
+      if (line.contains("ABSTRACT TRUNCATED"))
+        continue;
+      
+      String fields[] = line.split("\t");
+      
+      if (fields.length == 2) {
+        sentence.add(fields[0]);
+        tags.add(fields[1]);
+      }
+      else {
+        throw new IOException("Expected two fields per line in training data!");
+      }
+    }
+    
+    if (sentence.size() > 0) {
+      
+      // convert name tags into spans
+      List<Span> names = new ArrayList<Span>();
+      
+      int beginIndex = -1;
+      int endIndex = -1;
+      for (int i = 0; i < tags.size(); i++) {
+        
+        String tag = tags.get(i);
+        
+        if (tag.endsWith("DNA") && (types & GENERATE_DNA_ENTITIES) == 0) 
+          tag = "O";
+        
+        if (tag.endsWith("protein") && (types & GENERATE_PROTEIN_ENTITIES) == 0) 
+          tag = "O";
+        
+        if (tag.endsWith("cell_type") && (types & GENERATE_CELLTYPE_ENTITIES) == 0) 
+          tag = "O";
+
+        if (tag.endsWith("cell_line") && (types & GENERATE_CELLTYPE_ENTITIES) == 0) 
+          tag = "O";
+        if (tag.endsWith("RNA") && (types & GENERATE_RNA_ENTITIES) == 0) 
+          tag = "O";
+        
+        if (tag.startsWith("B-")) {
+          
+          if (beginIndex != -1) {
+            names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
+            beginIndex = -1;
+            endIndex = -1;
+          }
+          
+          beginIndex = i;
+          endIndex = i +1;
+        }
+        else if (tag.startsWith("I-")) {
+          endIndex++;
+        }
+        else if (tag.equals("O")) {
+          if (beginIndex != -1) {
+            names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
+            beginIndex = -1;
+            endIndex = -1;
+          }
+        }
+        else {
+          throw new IOException("Invalid tag: " + tag);
+        }
+      }
+      
+      // if one span remains, create it here
+      if (beginIndex != -1)
+        names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
+      
+      return new NameSample(sentence.toArray(new String[sentence.size()]), names.toArray(new Span[names.size()]), isClearAdaptiveData);
+    }
+    else if (line != null) {
+      // Just filter out empty events, if two lines in a row are empty
+      return read();
+    }
+    else {
+      // source stream is not returning anymore lines
+      return null;
+    }
+  }
+
+  public void reset() throws IOException, UnsupportedOperationException {
+    lineStream.reset();
+  }
+
+  public void close() throws IOException {
+    lineStream.close();
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java?rev=1145641&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java Tue Jul 12 15:58:11 2011
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+public class BioNLP2004NameSampleStreamFactory
+    implements ObjectStreamFactory<NameSample>{
+
+  interface Parameters {
+    @ParameterDescription(valueName = "sampleData")
+    String getData();
+    
+    @ParameterDescription(valueName = "DNA,protein,cell_type,cell_line,RNA")
+    String getTypes();
+  }
+  
+  public String getUsage() {
+    return ArgumentParser.createUsage(Parameters.class);
+  }
+  
+  public boolean validateArguments(String[] args) {
+    return ArgumentParser.validateArguments(args, Parameters.class);
+  }
+
+  public ObjectStream<NameSample> create(String[] args) {
+    
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    int typesToGenerate = 0;
+    
+    if (params.getTypes().contains("DNA")) {
+      typesToGenerate = typesToGenerate | 
+          BioNLP2004NameSampleStream.GENERATE_DNA_ENTITIES;
+    }
+    else if (params.getTypes().contains("protein")) {
+      typesToGenerate = typesToGenerate | 
+          BioNLP2004NameSampleStream.GENERATE_PROTEIN_ENTITIES;
+    }
+    else if (params.getTypes().contains("cell_type")) {
+      typesToGenerate = typesToGenerate | 
+          BioNLP2004NameSampleStream.GENERATE_CELLTYPE_ENTITIES;
+    }
+    else if (params.getTypes().contains("cell_line")) {
+      typesToGenerate = typesToGenerate | 
+          BioNLP2004NameSampleStream.GENERATE_CELLLINE_ENTITIES;
+    }
+    else if (params.getTypes().contains("RNA")) {
+      typesToGenerate = typesToGenerate | 
+          BioNLP2004NameSampleStream.GENERATE_RNA_ENTITIES;
+    }
+
+    return new BioNLP2004NameSampleStream(
+        CmdLineUtil.openInFile(new File(params.getData())), typesToGenerate);
+  }
+}

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain