You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/07/12 17:58:12 UTC
svn commit: r1145641 - in
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools:
cmdline/namefind/TokenNameFinderConverterTool.java
formats/BioNLP2004NameSampleStream.java
formats/BioNLP2004NameSampleStreamFactory.java
Author: joern
Date: Tue Jul 12 15:58:11 2011
New Revision: 1145641
URL: http://svn.apache.org/viewvc?rev=1145641&view=rev
Log:
OPENNLP-222 Added converter for bionlp 2004 shared task
Added:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java (with props)
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java (with props)
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java?rev=1145641&r1=1145640&r2=1145641&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderConverterTool.java Tue Jul 12 15:58:11 2011
@@ -23,6 +23,7 @@ import java.util.Map;
import opennlp.tools.cmdline.AbstractConverterTool;
import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.formats.BioNLP2004NameSampleStreamFactory;
import opennlp.tools.formats.Conll02NameSampleStreamFactory;
import opennlp.tools.formats.Conll03NameSampleStreamFactory;
import opennlp.tools.formats.ad.ADNameSampleStreamFactory;
@@ -43,6 +44,7 @@ public class TokenNameFinderConverterToo
mutableStreamFactories.put("conll02", new Conll02NameSampleStreamFactory());
mutableStreamFactories.put("conll03", new Conll03NameSampleStreamFactory());
mutableStreamFactories.put("ad", new ADNameSampleStreamFactory());
+ mutableStreamFactories.put("bionlp2004", new BioNLP2004NameSampleStreamFactory());
streamFactories = Collections.unmodifiableMap(mutableStreamFactories);
}
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java?rev=1145641&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java Tue Jul 12 15:58:11 2011
@@ -0,0 +1,171 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Parser for the training files of the BioNLP/NLPBA 2004 shared task.
+ * <p>
+ * The data contains five named entity types: DNA, RNA, protein, cell_type and cell_line.<br>
+ * <p>
+ * Data can be found on this web site:<br>
+ * http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class BioNLP2004NameSampleStream implements ObjectStream<NameSample> {
+
+  // Bit flags selecting the entity types which are converted into name spans,
+  // combine them with bitwise OR. A tag whose type flag is not set is read as "O".
+  public static final int GENERATE_DNA_ENTITIES = 0x01;
+  public static final int GENERATE_PROTEIN_ENTITIES = 0x01 << 1;
+  public static final int GENERATE_CELLTYPE_ENTITIES = 0x01 << 2;
+  public static final int GENERATE_CELLLINE_ENTITIES = 0x01 << 3;
+  public static final int GENERATE_RNA_ENTITIES = 0x01 << 4;
+
+  private final int types;
+  private final ObjectStream<String> lineStream;
+
+  /**
+   * Creates a sample stream which decodes the given input as UTF-8 text.
+   *
+   * @param in the training data
+   * @param types bitwise OR of the <code>GENERATE_*_ENTITIES</code> flags
+   */
+  public BioNLP2004NameSampleStream(InputStream in, int types) {
+    try {
+      this.lineStream = new PlainTextByLineStream(in, "UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      // UTF-8 is available on all JVMs, will never happen
+      throw new IllegalStateException(e);
+    }
+    this.types = types;
+  }
+
+  /**
+   * Reads the next non-empty sentence and converts its tags into name spans.
+   *
+   * @return the next sample, or <code>null</code> if no sentence is left
+   * @throws IOException if a line does not consist of two tab separated
+   *     fields, a tag is invalid or reading from the line stream fails
+   */
+  public NameSample read() throws IOException {
+    List<String> sentence = new ArrayList<String>();
+    List<String> tags = new ArrayList<String>();
+
+    // Declared outside of the loop, so that a document marker is remembered
+    // even if empty blocks are skipped before the next sentence is found.
+    boolean isClearAdaptiveData = false;
+
+    String line;
+    do {
+      // Empty line indicates end of sentence
+      while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line.trim())) {
+
+        if (line.startsWith("###MEDLINE:")) {
+          isClearAdaptiveData = true;
+          lineStream.read(); // discard the line following the document marker
+          continue;
+        }
+
+        if (line.contains("ABSTRACT TRUNCATED"))
+          continue;
+
+        String[] fields = line.split("\t");
+        if (fields.length != 2)
+          throw new IOException("Expected two fields per line in training data!");
+
+        sentence.add(fields[0]);
+        tags.add(fields[1]);
+      }
+      // Filter out empty blocks, e.g. if two lines in a row are empty
+    } while (sentence.isEmpty() && line != null);
+
+    if (sentence.isEmpty())
+      return null; // source stream is not returning any more lines
+
+    // convert name tags into spans
+    List<Span> names = new ArrayList<Span>();
+
+    int beginIndex = -1;
+    int endIndex = -1;
+    for (int i = 0; i < tags.size(); i++) {
+      String tag = filterTag(tags.get(i));
+
+      if (tag.startsWith("I-")) {
+        endIndex++;
+        continue;
+      }
+
+      if (!tag.startsWith("B-") && !tag.equals("O"))
+        throw new IOException("Invalid tag: " + tag);
+
+      // a new name as well as an outside token ends the currently open name
+      if (beginIndex != -1) {
+        names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
+        beginIndex = -1;
+        endIndex = -1;
+      }
+
+      if (tag.startsWith("B-")) {
+        beginIndex = i;
+        endIndex = i + 1;
+      }
+    }
+
+    // if one span remains, create it here
+    if (beginIndex != -1)
+      names.add(new Span(beginIndex, endIndex, tags.get(beginIndex).substring(2)));
+
+    return new NameSample(sentence.toArray(new String[sentence.size()]),
+        names.toArray(new Span[names.size()]), isClearAdaptiveData);
+  }
+
+  /** Replaces a tag by "O" if the flag of its entity type is not set. */
+  private String filterTag(String tag) {
+    if ((tag.endsWith("DNA") && (types & GENERATE_DNA_ENTITIES) == 0)
+        || (tag.endsWith("protein") && (types & GENERATE_PROTEIN_ENTITIES) == 0)
+        || (tag.endsWith("cell_type") && (types & GENERATE_CELLTYPE_ENTITIES) == 0)
+        || (tag.endsWith("cell_line") && (types & GENERATE_CELLLINE_ENTITIES) == 0)
+        || (tag.endsWith("RNA") && (types & GENERATE_RNA_ENTITIES) == 0))
+      return "O";
+
+    return tag;
+  }
+
+  /** Resets the underlying line stream. */
+  public void reset() throws IOException, UnsupportedOperationException {
+    lineStream.reset();
+  }
+
+  /** Closes the underlying line stream. */
+  public void close() throws IOException {
+    lineStream.close();
+  }
+}
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java?rev=1145641&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java Tue Jul 12 15:58:11 2011
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.File;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.ObjectStreamFactory;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+public class BioNLP2004NameSampleStreamFactory
+ implements ObjectStreamFactory<NameSample>{
+
+ interface Parameters {
+ @ParameterDescription(valueName = "sampleData")
+ String getData();
+
+ @ParameterDescription(valueName = "DNA,protein,cell_type,cell_line,RNA")
+ String getTypes();
+ }
+
+ public String getUsage() {
+ return ArgumentParser.createUsage(Parameters.class);
+ }
+
+ public boolean validateArguments(String[] args) {
+ return ArgumentParser.validateArguments(args, Parameters.class);
+ }
+
+ public ObjectStream<NameSample> create(String[] args) {
+
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
+ int typesToGenerate = 0;
+
+ if (params.getTypes().contains("DNA")) {
+ typesToGenerate = typesToGenerate |
+ BioNLP2004NameSampleStream.GENERATE_DNA_ENTITIES;
+ }
+ else if (params.getTypes().contains("protein")) {
+ typesToGenerate = typesToGenerate |
+ BioNLP2004NameSampleStream.GENERATE_PROTEIN_ENTITIES;
+ }
+ else if (params.getTypes().contains("cell_type")) {
+ typesToGenerate = typesToGenerate |
+ BioNLP2004NameSampleStream.GENERATE_CELLTYPE_ENTITIES;
+ }
+ else if (params.getTypes().contains("cell_line")) {
+ typesToGenerate = typesToGenerate |
+ BioNLP2004NameSampleStream.GENERATE_CELLLINE_ENTITIES;
+ }
+ else if (params.getTypes().contains("RNA")) {
+ typesToGenerate = typesToGenerate |
+ BioNLP2004NameSampleStream.GENERATE_RNA_ENTITIES;
+ }
+
+ return new BioNLP2004NameSampleStream(
+ CmdLineUtil.openInFile(new File(params.getData())), typesToGenerate);
+ }
+}
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain