You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2013/04/15 16:54:35 UTC
svn commit: r1468104 - in /opennlp/trunk/opennlp-tools/src:
main/java/opennlp/tools/cmdline/ main/java/opennlp/tools/formats/
test/java/opennlp/tools/formats/ test/resources/opennlp/tools/formats/
Author: joern
Date: Mon Apr 15 14:54:34 2013
New Revision: 1468104
URL: http://svn.apache.org/r1468104
Log:
OPENNLP-551 Added support for EVALITA 07/09 NER datasets. Thanks to Rodrigo Agerri for providing a patch.
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java (with props)
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java (with props)
opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java (with props)
opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample
Modified:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java?rev=1468104&r1=1468103&r2=1468104&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java Mon Apr 15 14:54:34 2013
@@ -24,6 +24,7 @@ import opennlp.tools.formats.BioNLP2004N
import opennlp.tools.formats.ChunkerSampleStreamFactory;
import opennlp.tools.formats.Conll02NameSampleStreamFactory;
import opennlp.tools.formats.Conll03NameSampleStreamFactory;
+import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
import opennlp.tools.formats.ConllXPOSSampleStreamFactory;
import opennlp.tools.formats.ConllXSentenceSampleStreamFactory;
import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
@@ -82,6 +83,7 @@ public final class StreamFactoryRegistry
BioNLP2004NameSampleStreamFactory.registerFactory();
Conll02NameSampleStreamFactory.registerFactory();
Conll03NameSampleStreamFactory.registerFactory();
+ EvalitaNameSampleStreamFactory.registerFactory();
ConllXPOSSampleStreamFactory.registerFactory();
ConllXSentenceSampleStreamFactory.registerFactory();
ConllXTokenSampleStreamFactory.registerFactory();
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java?rev=1468104&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java Mon Apr 15 14:54:34 2013
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Parser for the Italian NER training files of the Evalita 2007 and 2009 NER shared tasks.
+ * <p>
+ * The data does not contain article boundaries, so
+ * adaptive data is cleared for every sentence.
+ * <p>
+ * Named Entities are annotated in the IOB2 format (as used in CoNLL 2002 shared task)
+ * <p>
+ * The Named Entity tag consists of two parts:
+ * 1. The IOB2 tag: 'B' (for 'begin') denotes the first token of a
+ * Named Entity, I (for 'inside') is used for all other tokens in a
+ * Named Entity, and 'O' (for 'outside') is used for all other words;
+ * 2. The Entity type tag: PER (for Person), ORG (for Organization),
+ * GPE (for Geo-Political Entity), or LOC (for Location).
+ * <p>
+ * Each file consists of four columns separated by a blank, containing
+ * respectively the token, the Elsnet PoS-tag, the Adige news story to
+ * which the token belongs, and the Named Entity tag.
+ * <p>
+ * Data can be found on this web site:<br>
+ * http://www.evalita.it
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class EvalitaNameSampleStream implements ObjectStream<NameSample>{
+ /** Languages this parser knows; read() rejects every data line unless the language is IT. */
+ public enum LANGUAGE {
+ IT
+ }
+ /** Bit flags selecting which entity types read() emits as spans; combine them with bitwise OR. */
+ public static final int GENERATE_PERSON_ENTITIES = 0x01;
+ public static final int GENERATE_ORGANIZATION_ENTITIES = 0x01 << 1;
+ public static final int GENERATE_LOCATION_ENTITIES = 0x01 << 2;
+ public static final int GENERATE_GPE_ENTITIES = 0x01 << 3;
+ /** Prefix of a document-start marker line; read() requires the line after it to be empty. */
+ public static final String DOCSTART = "-DOCSTART-";
+
+ private final LANGUAGE lang;
+ private final ObjectStream<String> lineStream;
+
+ private final int types;
+ /** Creates a stream over the given lines; types is a bit mask of the GENERATE_* flags. */
+ public EvalitaNameSampleStream(LANGUAGE lang, ObjectStream<String> lineStream, int types) {
+ this.lang = lang;
+ this.lineStream = lineStream;
+ this.types = types;
+ }
+ /** Reads the data as UTF-8 text; as a side effect the global System.out is re-wrapped as UTF-8.
+ * @param lang the language of the data
+ * @param in an Input Stream to read data.
+ * @param types bit mask of the GENERATE_* flags selecting the entity types to emit
+ */
+ public EvalitaNameSampleStream(LANGUAGE lang, InputStream in, int types) {
+
+ this.lang = lang;
+ try {
+ this.lineStream = new PlainTextByLineStream(in, "UTF-8");
+ System.setOut(new PrintStream(System.out, true, "UTF-8")); // NOTE(review): JVM-wide side effect inside a constructor
+ } catch (UnsupportedEncodingException e) {
+ // UTF-8 is available on all JVMs, will never happen
+ throw new IllegalStateException(e);
+ }
+ this.types = types;
+ }
+ /** Maps the type suffix of beginTag (text after its first two characters) to a type name and wraps it in a Span. */
+ static final Span extract(int begin, int end, String beginTag) throws InvalidFormatException {
+
+ String type = beginTag.substring(2); // drops the two-character "B-" prefix
+
+ if ("PER".equals(type)) {
+ type = "person";
+ }
+ else if ("LOC".equals(type)) {
+ type = "location";
+ }
+ else if ("GPE".equals(type)) {
+ type = "gpe";
+ }
+ else if ("ORG".equals(type)) {
+ type = "organization";
+ }
+ else {
+ throw new InvalidFormatException("Unknown type: " + type);
+ }
+
+ return new Span(begin, end, type);
+ }
+
+ /** Reads the next non-empty sentence and converts its IOB2 tags into name spans; returns null at end of input. */
+ public NameSample read() throws IOException {
+
+ List<String> sentence = new ArrayList<String>();
+ List<String> tags = new ArrayList<String>();
+
+ boolean isClearAdaptiveData = false;
+
+ // Empty line indicates end of sentence
+
+ String line;
+ while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) {
+
+ if (line.startsWith(DOCSTART)) {
+ isClearAdaptiveData = true;
+ String emptyLine = lineStream.read();
+
+ if (!StringUtil.isEmpty(emptyLine))
+ throw new IOException("Empty line after -DOCSTART- not empty: '" + emptyLine +"'!");
+
+ continue;
+ }
+
+ String fields[] = line.split(" ");
+
+ // For Italian: WORD POS-TAG SC-TAG NE-TAG
+ if (LANGUAGE.IT.equals(lang) && (fields.length == 4)) {
+ sentence.add(fields[0]);
+ tags.add(fields[3]); // 3 is NE-TAG
+ }
+ else {
+ throw new IOException("Incorrect number of fields per line for language: '" + line + "'!");
+ }
+ }
+
+ // Always clear adaptive data for Italian
+ if (LANGUAGE.IT.equals(lang))
+ isClearAdaptiveData = true;
+
+ if (sentence.size() > 0) {
+
+ // convert name tags into spans
+ List<Span> names = new ArrayList<Span>();
+ // beginIndex == -1 means that no span is currently open
+ int beginIndex = -1;
+ int endIndex = -1;
+ for (int i = 0; i < tags.size(); i++) {
+
+ String tag = tags.get(i);
+ // Entity types that are not selected in the types mask are treated as outside ("O")
+ if (tag.endsWith("PER") && (types & GENERATE_PERSON_ENTITIES) == 0)
+ tag = "O";
+
+ if (tag.endsWith("ORG") && (types & GENERATE_ORGANIZATION_ENTITIES) == 0)
+ tag = "O";
+
+ if (tag.endsWith("LOC") && (types & GENERATE_LOCATION_ENTITIES) == 0)
+ tag = "O";
+
+ if (tag.endsWith("GPE") && (types & GENERATE_GPE_ENTITIES) == 0)
+ tag = "O";
+
+ if (tag.startsWith("B-")) {
+ // A new begin tag closes the span that is still open, if any
+ if (beginIndex != -1) {
+ names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+ beginIndex = -1;
+ endIndex = -1;
+ }
+
+ beginIndex = i;
+ endIndex = i +1;
+ }
+ else if (tag.startsWith("I-")) {
+ endIndex++; // NOTE(review): an I- tag with no open span only bumps endIndex; no span results
+ }
+ else if (tag.equals("O")) {
+ if (beginIndex != -1) {
+ names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+ beginIndex = -1;
+ endIndex = -1;
+ }
+ }
+ else {
+ throw new IOException("Invalid tag: " + tag);
+ }
+ }
+
+ // if one span remains, create it here
+ if (beginIndex != -1)
+ names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+
+ return new NameSample(sentence.toArray(new String[sentence.size()]), names.toArray(new Span[names.size()]), isClearAdaptiveData);
+ }
+ else if (line != null) {
+ // Just filter out empty events, if two lines in a row are empty
+ return read(); // NOTE(review): recursion, one stack frame per consecutive empty line
+ }
+ else {
+ // source stream is not returning anymore lines
+ return null;
+ }
+ }
+ /** Resets the underlying line stream. */
+ public void reset() throws IOException, UnsupportedOperationException {
+ lineStream.reset();
+ }
+ /** Closes the underlying line stream. */
+ public void close() throws IOException {
+ lineStream.close();
+ }
+}
+
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java?rev=1468104&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java Mon Apr 15 14:54:34 2013
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.EvalitaNameSampleStream.LANGUAGE;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class EvalitaNameSampleStreamFactory extends LanguageSampleStreamFactory<NameSample> {
+ /** Command line parameters: the language (create() accepts only "it") and the entity types to generate. */
+ interface Parameters extends BasicFormatParams {
+ @ParameterDescription(valueName = "it")
+ String getLang();
+
+ @ParameterDescription(valueName = "per,loc,org,gpe")
+ String getTypes();
+ }
+ /** Registers an instance of this factory for NameSample under the format name "evalita". */
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(NameSample.class,
+ "evalita", new EvalitaNameSampleStreamFactory(Parameters.class));
+ }
+
+ protected <P> EvalitaNameSampleStreamFactory(Class<P> params) {
+ super(params);
+ }
+ /** Parses args and opens an EvalitaNameSampleStream on the data file; an unsupported language terminates the tool. */
+ public ObjectStream<NameSample> create(String[] args) {
+
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+ LANGUAGE lang;
+ if ("it".equals(params.getLang())) {
+ lang = LANGUAGE.IT;
+ language = params.getLang(); // presumably a field inherited from LanguageSampleStreamFactory - TODO confirm
+ }
+ else {
+ throw new TerminateToolException(1, "Unsupported language: " + params.getLang());
+ }
+
+ int typesToGenerate = 0;
+ // NOTE(review): substring matching on the raw types value; any text containing "per", "org", ... sets the flag
+ if (params.getTypes().contains("per")) {
+ typesToGenerate = typesToGenerate |
+ EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES;
+ }
+ if (params.getTypes().contains("org")) {
+ typesToGenerate = typesToGenerate |
+ EvalitaNameSampleStream.GENERATE_ORGANIZATION_ENTITIES;
+ }
+ if (params.getTypes().contains("loc")) {
+ typesToGenerate = typesToGenerate |
+ EvalitaNameSampleStream.GENERATE_LOCATION_ENTITIES;
+ }
+ if (params.getTypes().contains("gpe")) {
+ typesToGenerate = typesToGenerate |
+ EvalitaNameSampleStream.GENERATE_GPE_ENTITIES;
+ }
+
+ // Uses the InputStream constructor, which reads the data file as UTF-8
+ return new EvalitaNameSampleStream(lang,
+ CmdLineUtil.openInFile(params.getData()), typesToGenerate);
+ }
+}
+
Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java?rev=1468104&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java Mon Apr 15 14:54:34 2013
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.formats.EvalitaNameSampleStream.LANGUAGE;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+import org.junit.Test;
+
+/**
+ *
+ * Note:
+ * Sample training data must be UTF-8 encoded and uncompressed!
+ */
+public class EvalitaNameSampleStreamTest {
+ /** Opens the named resource under /opennlp/tools/formats/ as a stream that generates person entities only. */
+ private static ObjectStream<NameSample> openData(LANGUAGE lang, String name) throws IOException {
+ InputStream in = EvalitaNameSampleStreamTest.class.getResourceAsStream("/opennlp/tools/formats/" + name);
+
+ return new EvalitaNameSampleStream(lang, in, EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES);
+ }
+ /** Expects an 11-token sentence with one name span (start 8, end 10), a sentence without names, then end of stream. */
+ @Test
+ public void testParsingItalianSample() throws IOException {
+ // NOTE(review): the stream opened here is never closed
+ ObjectStream<NameSample> sampleStream = openData(LANGUAGE.IT, "evalita-ner-it.sample");
+
+ NameSample personName = sampleStream.read();
+
+ assertNotNull(personName);
+
+ assertEquals(11, personName.getSentence().length);
+ assertEquals(1, personName.getNames().length);
+ assertEquals(true, personName.isClearAdaptiveDataSet());
+
+ Span nameSpan = personName.getNames()[0];
+ assertEquals(8, nameSpan.getStart());
+ assertEquals(10, nameSpan.getEnd());
+ assertEquals(true, personName.isClearAdaptiveDataSet()); // NOTE(review): repeats the assertion made before nameSpan
+ // Second sentence of the sample carries only "O" tags
+ assertEquals(0, sampleStream.read().getNames().length);
+ // A further read must signal the end of the stream
+ assertNull(sampleStream.read());
+ }
+
+}
Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java
------------------------------------------------------------------------------
svn:mime-type = text/plain
Added: opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample?rev=1468104&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample (added)
+++ opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample Mon Apr 15 14:54:34 2013
@@ -0,0 +1,27 @@
+A E adige20041007_id413942 O
+parlare VF adige20041007_id413942 O
+di E adige20041007_id413942 O
+questi DP adige20041007_id413942 O
+problemi SP adige20041007_id413942 O
+sarà VI adige20041007_id413942 O
+il RS adige20041007_id413942 O
+neonatologo SS adige20041007_id413942 O
+Dino SPN adige20041007_id413942 B-PER
+Pedrotti SPN adige20041007_id413942 I-PER
+. XPS adige20041007_id413942 O
+
+Sono VIY adige20041008_id414214 O
+assicurate VPP adige20041008_id414214 O
+a E adige20041008_id414214 O
+tutta DS adige20041008_id414214 O
+la RS adige20041008_id414214 O
+popolazione SS adige20041008_id414214 O
+a E adige20041008_id414214 O
+titolo SS adige20041008_id414214 O
+gratuito AS adige20041008_id414214 O
+e C adige20041008_id414214 O
+con E adige20041008_id414214 O
+accesso SS adige20041008_id414214 O
+diretto AS adige20041008_id414214 O
+. XPS adige20041008_id414214 O
+