You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2013/04/15 16:54:35 UTC

svn commit: r1468104 - in /opennlp/trunk/opennlp-tools/src: main/java/opennlp/tools/cmdline/ main/java/opennlp/tools/formats/ test/java/opennlp/tools/formats/ test/resources/opennlp/tools/formats/

Author: joern
Date: Mon Apr 15 14:54:34 2013
New Revision: 1468104

URL: http://svn.apache.org/r1468104
Log:
OPENNLP-551 Added support for EVALITA 07/09 NER datasets. Thanks to Rodrigo Agerri for providing a patch.

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java   (with props)
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java   (with props)
    opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java   (with props)
    opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample
Modified:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java?rev=1468104&r1=1468103&r2=1468104&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java Mon Apr 15 14:54:34 2013
@@ -24,6 +24,7 @@ import opennlp.tools.formats.BioNLP2004N
 import opennlp.tools.formats.ChunkerSampleStreamFactory;
 import opennlp.tools.formats.Conll02NameSampleStreamFactory;
 import opennlp.tools.formats.Conll03NameSampleStreamFactory;
+import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
 import opennlp.tools.formats.ConllXPOSSampleStreamFactory;
 import opennlp.tools.formats.ConllXSentenceSampleStreamFactory;
 import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
@@ -82,6 +83,7 @@ public final class StreamFactoryRegistry
     BioNLP2004NameSampleStreamFactory.registerFactory();
     Conll02NameSampleStreamFactory.registerFactory();
     Conll03NameSampleStreamFactory.registerFactory();
+    EvalitaNameSampleStreamFactory.registerFactory();
     ConllXPOSSampleStreamFactory.registerFactory();
     ConllXSentenceSampleStreamFactory.registerFactory();
     ConllXTokenSampleStreamFactory.registerFactory();

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java?rev=1468104&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java Mon Apr 15 14:54:34 2013
@@ -0,0 +1,234 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.io.UnsupportedEncodingException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.Span;
+import opennlp.tools.util.StringUtil;
+
+/**
+ * Parser for the Italian NER training files of the Evalita 2007 and 2009 NER shared tasks.
+ * <p>
+ * The data does not contain article boundaries, so
+ * adaptive data is cleared for every sentence.
+ * <p>
+ * Named Entities are annotated in the IOB2 format (as used in CoNLL 2002 shared task)
+ * <p>
+ * The Named Entity tag consists of two parts:
+ * 1. The  IOB2 tag: 'B'  (for 'begin')  denotes the  first token  of a
+ *    Named Entity,  I (for 'inside')  is used for  all other tokens  in a
+ *    Named Entity, and 'O' (for 'outside') is used for all other words;
+ * 2. The Entity  type tag: PER  (for Person), ORG  (for Organization),
+ *    GPE (for Geo-Political Entity), or LOC (for Location).
+ * <p>
+ * Each file  consists of four  columns separated by a  blank, containing
+ * respectively the  token, the Elsnet  PoS-tag, the Adige news  story to
+ * which the token belongs, and the Named Entity tag.
+ * <p>
+ * Data can be found on this web site:<br>
+ * http://www.evalita.it
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class EvalitaNameSampleStream implements ObjectStream<NameSample> {
+
+  /** Languages this parser accepts; only Italian is defined. */
+  public enum LANGUAGE {
+    IT
+  }
+
+  /** Flag for the {@code types} bit mask: generate spans for PER tags. */
+  public static final int GENERATE_PERSON_ENTITIES = 0x01;
+
+  /** Flag for the {@code types} bit mask: generate spans for ORG tags. */
+  public static final int GENERATE_ORGANIZATION_ENTITIES = 0x01 << 1;
+
+  /** Flag for the {@code types} bit mask: generate spans for LOC tags. */
+  public static final int GENERATE_LOCATION_ENTITIES = 0x01 << 2;
+
+  /** Flag for the {@code types} bit mask: generate spans for GPE tags. */
+  public static final int GENERATE_GPE_ENTITIES = 0x01 << 3;
+
+  /** Prefix of a document separator line; it must be followed by an empty line. */
+  public static final String DOCSTART = "-DOCSTART-";
+
+  private final LANGUAGE lang;
+  private final ObjectStream<String> lineStream;
+  private final int types;
+
+  /**
+   * Creates a sample stream on top of an existing stream of lines.
+   *
+   * @param lang the language of the data
+   * @param lineStream the lines to parse, one token per line
+   * @param types bitwise OR of the {@code GENERATE_*_ENTITIES} flags selecting
+   *     which entity types are turned into spans; all others are treated as 'O'
+   */
+  public EvalitaNameSampleStream(LANGUAGE lang, ObjectStream<String> lineStream, int types) {
+    this.lang = lang;
+    this.lineStream = lineStream;
+    this.types = types;
+  }
+
+  /**
+   * Creates a sample stream which decodes the given input stream as UTF-8.
+   *
+   * @param lang the language of the data
+   * @param in an input stream to read the data from
+   * @param types bitwise OR of the {@code GENERATE_*_ENTITIES} flags selecting
+   *     which entity types are turned into spans; all others are treated as 'O'
+   */
+  public EvalitaNameSampleStream(LANGUAGE lang, InputStream in, int types) {
+
+    this.lang = lang;
+    try {
+      // Only the input is configured here. Replacing System.out is a
+      // process-wide side effect and does not belong in a parser constructor.
+      this.lineStream = new PlainTextByLineStream(in, "UTF-8");
+    } catch (UnsupportedEncodingException e) {
+      // UTF-8 is available on all JVMs, will never happen
+      throw new IllegalStateException(e);
+    }
+    this.types = types;
+  }
+
+  /**
+   * Creates a typed span from the token range of a named entity.
+   *
+   * @param begin index of the first token of the entity
+   * @param end index after the last token of the entity
+   * @param beginTag the tag of the first token, e.g. {@code B-PER}; the
+   *     entity type is taken from the text after the two character prefix
+   * @return a span typed "person", "location", "gpe" or "organization"
+   * @throws InvalidFormatException if the entity type is not PER, LOC, GPE or ORG
+   */
+  static final Span extract(int begin, int end, String beginTag) throws InvalidFormatException {
+
+    String type = beginTag.substring(2);
+
+    if ("PER".equals(type)) {
+      type = "person";
+    }
+    else if ("LOC".equals(type)) {
+      type = "location";
+    }
+    else if ("GPE".equals(type)) {
+      type = "gpe";
+    }
+    else if ("ORG".equals(type)) {
+      type = "organization";
+    }
+    else {
+      throw new InvalidFormatException("Unknown type: " + type);
+    }
+
+    return new Span(begin, end, type);
+  }
+
+  /**
+   * Reads the next sentence and converts it into a {@link NameSample}.
+   * Sentences are separated by empty lines; runs of empty lines are skipped.
+   *
+   * @return the next sample, or {@code null} if the line stream is exhausted
+   * @throws IOException if a line does not have four fields, a tag is invalid,
+   *     or the line after a {@code -DOCSTART-} marker is not empty
+   */
+  public NameSample read() throws IOException {
+
+    // Iterate instead of recursing over consecutive empty lines, so that a
+    // long run of blank lines cannot exhaust the call stack.
+    while (true) {
+      List<String> sentence = new ArrayList<String>();
+      List<String> tags = new ArrayList<String>();
+
+      boolean isClearAdaptiveData = false;
+
+      // Empty line indicates end of sentence
+      String line;
+      while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) {
+
+        if (line.startsWith(DOCSTART)) {
+          isClearAdaptiveData = true;
+          String emptyLine = lineStream.read();
+
+          // null means the stream ended right after the marker; that is not
+          // a format error and must not be passed to StringUtil.isEmpty
+          if (emptyLine != null && !StringUtil.isEmpty(emptyLine)) {
+            throw new IOException("Empty line after -DOCSTART- not empty: '" + emptyLine + "'!");
+          }
+
+          continue;
+        }
+
+        String[] fields = line.split(" ");
+
+        // For Italian: WORD  POS-TAG SC-TAG NE-TAG
+        if (LANGUAGE.IT.equals(lang) && (fields.length == 4)) {
+          sentence.add(fields[0]);
+          tags.add(fields[3]); // 3 is NE-TAG
+        }
+        else {
+          throw new IOException("Incorrect number of fields per line for language: '" + line + "'!");
+        }
+      }
+
+      // Always clear adaptive data for Italian
+      if (LANGUAGE.IT.equals(lang)) {
+        isClearAdaptiveData = true;
+      }
+
+      if (sentence.size() > 0) {
+        List<Span> names = toSpans(tags);
+
+        return new NameSample(sentence.toArray(new String[sentence.size()]),
+            names.toArray(new Span[names.size()]), isClearAdaptiveData);
+      }
+
+      if (line == null) {
+        // source stream is not returning anymore lines
+        return null;
+      }
+
+      // Otherwise two lines in a row were empty: filter out the empty event
+      // and try the next sentence.
+    }
+  }
+
+  /**
+   * Replaces the tag of an entity type which is not enabled in {@code types}
+   * with the outside tag 'O'; all other tags are returned unchanged.
+   */
+  private String maskDisabledType(String tag) {
+
+    if (tag.endsWith("PER") && (types & GENERATE_PERSON_ENTITIES) == 0) {
+      return "O";
+    }
+
+    if (tag.endsWith("ORG") && (types & GENERATE_ORGANIZATION_ENTITIES) == 0) {
+      return "O";
+    }
+
+    if (tag.endsWith("LOC") && (types & GENERATE_LOCATION_ENTITIES) == 0) {
+      return "O";
+    }
+
+    if (tag.endsWith("GPE") && (types & GENERATE_GPE_ENTITIES) == 0) {
+      return "O";
+    }
+
+    return tag;
+  }
+
+  /**
+   * Converts the IOB2 tags of one sentence into name spans, skipping the
+   * entity types which are not enabled.
+   */
+  private List<Span> toSpans(List<String> tags) throws IOException {
+
+    List<Span> names = new ArrayList<Span>();
+
+    int beginIndex = -1;
+    int endIndex = -1;
+    for (int i = 0; i < tags.size(); i++) {
+
+      String tag = maskDisabledType(tags.get(i));
+
+      if (tag.startsWith("B-")) {
+
+        // a new entity starts directly after another one, close the open span
+        if (beginIndex != -1) {
+          names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+        }
+
+        beginIndex = i;
+        endIndex = i + 1;
+      }
+      else if (tag.startsWith("I-")) {
+        endIndex++;
+      }
+      else if (tag.equals("O")) {
+        if (beginIndex != -1) {
+          names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+          beginIndex = -1;
+          endIndex = -1;
+        }
+      }
+      else {
+        throw new IOException("Invalid tag: " + tag);
+      }
+    }
+
+    // if one span remains, create it here
+    if (beginIndex != -1) {
+      names.add(extract(beginIndex, endIndex, tags.get(beginIndex)));
+    }
+
+    return names;
+  }
+
+  /** Resets the underlying line stream. */
+  public void reset() throws IOException, UnsupportedOperationException {
+    lineStream.reset();
+  }
+
+  /** Closes the underlying line stream. */
+  public void close() throws IOException {
+    lineStream.close();
+  }
+}
+

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java?rev=1468104&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java Mon Apr 15 14:54:34 2013
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.EvalitaNameSampleStream.LANGUAGE;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class EvalitaNameSampleStreamFactory extends LanguageSampleStreamFactory<NameSample> {
+
+  /** Command line parameters of the "evalita" format. */
+  interface Parameters extends BasicFormatParams {
+    @ParameterDescription(valueName = "it")
+    String getLang();
+
+    @ParameterDescription(valueName = "per,loc,org,gpe")
+    String getTypes();
+  }
+
+  /**
+   * Registers an instance of this factory for {@link NameSample} streams
+   * under the format name "evalita".
+   */
+  public static void registerFactory() {
+    EvalitaNameSampleStreamFactory factory =
+        new EvalitaNameSampleStreamFactory(Parameters.class);
+
+    StreamFactoryRegistry.registerFactory(NameSample.class, "evalita", factory);
+  }
+
+  protected <P> EvalitaNameSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Parses the arguments and opens the data file as an Evalita name sample stream.
+   *
+   * @param args the command line arguments described by {@link Parameters}
+   * @return a stream producing the enabled entity types from the data file
+   * @throws TerminateToolException if the language is anything other than "it"
+   */
+  public ObjectStream<NameSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    // Italian is the only supported language, reject everything else up front
+    String requestedLang = params.getLang();
+    if (!"it".equals(requestedLang)) {
+      throw new TerminateToolException(1, "Unsupported language: " + requestedLang);
+    }
+    language = requestedLang;
+
+    // Each type name found in the parameter value switches on one flag
+    String requestedTypes = params.getTypes();
+    int typesToGenerate = 0;
+
+    if (requestedTypes.contains("per")) {
+      typesToGenerate |= EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES;
+    }
+    if (requestedTypes.contains("org")) {
+      typesToGenerate |= EvalitaNameSampleStream.GENERATE_ORGANIZATION_ENTITIES;
+    }
+    if (requestedTypes.contains("loc")) {
+      typesToGenerate |= EvalitaNameSampleStream.GENERATE_LOCATION_ENTITIES;
+    }
+    if (requestedTypes.contains("gpe")) {
+      typesToGenerate |= EvalitaNameSampleStream.GENERATE_GPE_ENTITIES;
+    }
+
+    return new EvalitaNameSampleStream(LANGUAGE.IT,
+        CmdLineUtil.openInFile(params.getData()), typesToGenerate);
+  }
+}
+

Propchange: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java?rev=1468104&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java (added)
+++ opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java Mon Apr 15 14:54:34 2013
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+import static org.junit.Assert.assertNull;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.formats.EvalitaNameSampleStream.LANGUAGE;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+import org.junit.Test;
+
+/**
+ * 
+ * Note:
+ * Sample training data must be UTF-8 encoded and uncompressed!
+ */
+public class EvalitaNameSampleStreamTest {
+
+  /**
+   * Opens a sample file from the test resources as a name sample stream
+   * which only generates person entities.
+   *
+   * @param lang the language of the sample data
+   * @param name the file name below /opennlp/tools/formats/
+   */
+  private static ObjectStream<NameSample> openData(LANGUAGE lang, String name) throws IOException {
+    InputStream in = EvalitaNameSampleStreamTest.class.getResourceAsStream("/opennlp/tools/formats/" + name);
+
+    // fail with a clear message instead of an obscure error inside the parser
+    assertNotNull("Missing test resource: " + name, in);
+
+    return new EvalitaNameSampleStream(lang, in, EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES);
+  }
+
+  /**
+   * Parses the two sentence Italian sample: the first sentence contains one
+   * person name, the second one contains no names at all.
+   */
+  @Test
+  public void testParsingItalianSample() throws IOException {
+
+    ObjectStream<NameSample> sampleStream = openData(LANGUAGE.IT, "evalita-ner-it.sample");
+
+    try {
+      NameSample personName = sampleStream.read();
+
+      assertNotNull(personName);
+
+      assertEquals(11, personName.getSentence().length);
+      assertEquals(1, personName.getNames().length);
+      assertEquals(true, personName.isClearAdaptiveDataSet());
+
+      Span nameSpan = personName.getNames()[0];
+      assertEquals(8, nameSpan.getStart());
+      assertEquals(10, nameSpan.getEnd());
+      assertEquals("person", nameSpan.getType());
+
+      NameSample noNames = sampleStream.read();
+
+      assertNotNull(noNames);
+      assertEquals(14, noNames.getSentence().length);
+      assertEquals(0, noNames.getNames().length);
+
+      assertNull(sampleStream.read());
+    }
+    finally {
+      // release the resource stream even if an assertion fails
+      sampleStream.close();
+    }
+  }
+
+}

Propchange: opennlp/trunk/opennlp-tools/src/test/java/opennlp/tools/formats/EvalitaNameSampleStreamTest.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain

Added: opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample?rev=1468104&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample (added)
+++ opennlp/trunk/opennlp-tools/src/test/resources/opennlp/tools/formats/evalita-ner-it.sample Mon Apr 15 14:54:34 2013
@@ -0,0 +1,27 @@
+A E adige20041007_id413942 O
+parlare VF adige20041007_id413942 O
+di E adige20041007_id413942 O
+questi DP adige20041007_id413942 O
+problemi SP adige20041007_id413942 O
+sarà VI adige20041007_id413942 O
+il RS adige20041007_id413942 O
+neonatologo SS adige20041007_id413942 O
+Dino SPN adige20041007_id413942 B-PER
+Pedrotti SPN adige20041007_id413942 I-PER
+. XPS adige20041007_id413942 O
+
+Sono VIY adige20041008_id414214 O
+assicurate VPP adige20041008_id414214 O
+a E adige20041008_id414214 O
+tutta DS adige20041008_id414214 O
+la RS adige20041008_id414214 O
+popolazione SS adige20041008_id414214 O
+a E adige20041008_id414214 O
+titolo SS adige20041008_id414214 O
+gratuito AS adige20041008_id414214 O
+e C adige20041008_id414214 O
+con E adige20041008_id414214 O
+accesso SS adige20041008_id414214 O
+diretto AS adige20041008_id414214 O
+. XPS adige20041008_id414214 O
+