You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2013/12/02 17:41:35 UTC

svn commit: r1547097 - in /opennlp/trunk/opennlp-tools: lang/ src/main/java/opennlp/tools/cmdline/ src/main/java/opennlp/tools/formats/ontonotes/ src/main/java/opennlp/tools/ml/model/

Author: joern
Date: Mon Dec  2 16:41:34 2013
New Revision: 1547097

URL: http://svn.apache.org/r1547097
Log:
OPENNLP-623 Added support to train the name finder on OntoNotes data

Added:
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
Modified:
    opennlp/trunk/opennlp-tools/lang/TrainerParams.txt
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
    opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java

Modified: opennlp/trunk/opennlp-tools/lang/TrainerParams.txt
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/lang/TrainerParams.txt?rev=1547097&r1=1547096&r2=1547097&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/lang/TrainerParams.txt (original)
+++ opennlp/trunk/opennlp-tools/lang/TrainerParams.txt Mon Dec  2 16:41:34 2013
@@ -15,7 +15,6 @@
 
 # Sample machine learning properties file
 
-Algorithm=MAXENT
-Iterations=200
-Cutoff=5
-Threads=2
\ No newline at end of file
+Algorithm=PERCEPTRON
+Iterations=300
+Cutoff=0

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java?rev=1547097&r1=1547096&r2=1547097&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java Mon Dec  2 16:41:34 2013
@@ -50,6 +50,7 @@ import opennlp.tools.formats.convert.Par
 import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
 import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
+import opennlp.tools.formats.ontonotes.OntoNotesNameSampleStreamFactory;
 
 /**
  * Registry for object stream factories.
@@ -78,6 +79,7 @@ public final class StreamFactoryRegistry
     ParseToSentenceSampleStreamFactory.registerFactory();
     ParseToTokenSampleStreamFactory.registerFactory();
     
+    OntoNotesNameSampleStreamFactory.registerFactory();
     BioNLP2004NameSampleStreamFactory.registerFactory();
     Conll02NameSampleStreamFactory.registerFactory();
     Conll03NameSampleStreamFactory.registerFactory();

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java?rev=1547097&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java Mon Dec  2 16:41:34 2013
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.ontonotes;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Name Sample Stream parser for the OntoNotes 4.0 corpus.
+ * <p>
+ * Every string of the underlying stream is handled as one document. Each line
+ * between its DOC start and end lines becomes one {@link NameSample}, each
+ * ENAMEX element on a line becomes a name {@link Span} which is typed with
+ * the lower cased TYPE attribute.
+ */
+public class OntoNotesNameSampleStream extends
+    FilterObjectStream<String, NameSample> {
+
+  /** Maps escaped tokens, e.g. -LRB-, to the character they stand for. */
+  private final Map<String, String> tokenConversionMap;
+
+  /** Samples of the last read document which are not yet returned. */
+  private final List<NameSample> nameSamples = new LinkedList<NameSample>();
+
+  /** @param samples the stream which provides one document per read */
+  protected OntoNotesNameSampleStream(ObjectStream<String> samples) {
+    super(samples);
+
+    Map<String, String> conversions = new HashMap<String, String>();
+    conversions.put("-LRB-", "(");
+    conversions.put("-RRB-", ")");
+    conversions.put("-LSB-", "[");
+    conversions.put("-RSB-", "]");
+    conversions.put("-LCB-", "{");
+    conversions.put("-RCB-", "}");
+    conversions.put("-AMP-", "&");
+    this.tokenConversionMap = Collections.unmodifiableMap(conversions);
+  }
+
+  /**
+   * Removes the ENAMEX tag parts from a token and resolves escaped tokens.
+   * @param token a whitespace separated token of a corpus line
+   * @return the plain text of the token
+   */
+  private String convertToken(String token) {
+    StringBuilder text = new StringBuilder(token);
+
+    // A token like TYPE="ORG">IBM still starts with the tail of the start tag
+    int startTagEnd = text.indexOf(">");
+    if (token.contains("=\"") && startTagEnd != -1) {
+      text.delete(0, startTagEnd + 1);
+    }
+
+    // Search '>' behind '<', a stray '>' before it must not break the range
+    int endTagBegin = text.indexOf("<");
+    int endTagEnd = endTagBegin != -1 ? text.indexOf(">", endTagBegin) : -1;
+    if (endTagEnd != -1) {
+      text.delete(endTagBegin, endTagEnd + 1);
+    }
+
+    String cleanedToken = text.toString();
+    String replacement = tokenConversionMap.get(cleanedToken);
+    return replacement != null ? replacement : cleanedToken;
+  }
+
+  /**
+   * Retrieves the next sample, documents are parsed one at a time on demand.
+   * @return the next sample or null if the underlying stream is exhausted
+   * @throws IOException if reading a document fails
+   */
+  public NameSample read() throws IOException {
+    // Documents without samples are skipped instead of ending the stream
+    String doc;
+    while (nameSamples.isEmpty() && (doc = samples.read()) != null) {
+      parseDocument(doc);
+    }
+
+    return nameSamples.isEmpty() ? null : nameSamples.remove(0);
+  }
+
+  /** Queues one sample for every line inside the DOC element of a document. */
+  private void parseDocument(String doc) throws IOException {
+    BufferedReader docIn = new BufferedReader(new StringReader(doc));
+    // Only the first sample of a document clears the adaptive data
+    boolean clearAdaptiveData = true;
+    String line;
+    while ((line = docIn.readLine()) != null) {
+      if (line.startsWith("<DOC")) {
+        continue;
+      }
+      if (line.equals("</DOC>")) {
+        break;
+      }
+      nameSamples.add(parseLine(line, clearAdaptiveData));
+      clearAdaptiveData = false;
+    }
+  }
+
+  /** Creates the sample for one line, ENAMEX elements become name spans. */
+  private NameSample parseLine(String line, boolean clearAdaptiveData) {
+    String tokens[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
+    List<Span> entities = new LinkedList<Span>();
+    List<String> cleanedTokens = new ArrayList<String>(tokens.length);
+    String typeBegin = "TYPE=\"";
+    int entityBeginIndex = -1;
+    String entityType = null;
+    boolean insideStartEnamexTag = false;
+    for (String token : tokens) {
+      // Split here, the attributes of the tag are in the next tokens
+      if (token.startsWith("<ENAMEX")) {
+        insideStartEnamexTag = true;
+        continue;
+      }
+      if (insideStartEnamexTag) {
+        if (token.startsWith(typeBegin)) {
+          // Fixed locale, the default one might change the type (Turkish i)
+          entityType = token.substring(typeBegin.length(),
+              token.indexOf("\"", typeBegin.length()))
+              .toLowerCase(java.util.Locale.ENGLISH);
+        }
+        // Attribute tokens are dropped, the token with '>' starts the name
+        if (!token.contains(">")) {
+          continue;
+        }
+        entityBeginIndex = cleanedTokens.size();
+        insideStartEnamexTag = false;
+      }
+
+      if (token.endsWith("</ENAMEX>")) {
+        entities.add(new Span(entityBeginIndex, cleanedTokens.size() + 1,
+            entityType));
+        entityBeginIndex = -1;
+      }
+      cleanedTokens.add(convertToken(token));
+    }
+
+    return new NameSample(
+        cleanedTokens.toArray(new String[cleanedTokens.size()]),
+        entities.toArray(new Span[entities.size()]), clearAdaptiveData);
+  }
+}
\ No newline at end of file

Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java?rev=1547097&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java Mon Dec  2 16:41:34 2013
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.ontonotes;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.nio.charset.Charset;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.DirectorySampleStream;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+/** Factory for name sample streams which read the OntoNotes 4.0 corpus. */
+public class OntoNotesNameSampleStreamFactory extends
+    AbstractSampleStreamFactory<NameSample> {
+
+  interface Parameters {
+    @ParameterDescription(valueName = "OntoNotes 4.0 corpus directory")
+    String getOntoNotesDir();
+  }
+
+  public OntoNotesNameSampleStreamFactory() {
+    super(Parameters.class);
+  }
+
+  /** Creates a stream for the .name files of the given corpus directory. */
+  public ObjectStream<NameSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    File corpusDir = new File(params.getOntoNotesDir());
+
+    // Directories pass the filter, presumably to allow a recursive traversal
+    FileFilter nameFileFilter = new FileFilter() {
+      public boolean accept(File file) {
+        return file.isFile() ? file.getName().endsWith(".name")
+            : file.isDirectory();
+      }
+    };
+    ObjectStream<File> documentStream =
+        new DirectorySampleStream(corpusDir, nameFileFilter, true);
+    return new OntoNotesNameSampleStream(new FileToStringSampleStream(
+        documentStream, Charset.forName("UTF-8")));
+  }
+
+  /** Registers this factory for the "ontonotes" name sample format. */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(NameSample.class,
+        "ontonotes", new OntoNotesNameSampleStreamFactory());
+  }
+}
\ No newline at end of file

Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java?rev=1547097&r1=1547096&r2=1547097&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java Mon Dec  2 16:41:34 2013
@@ -78,6 +78,7 @@ public interface MaxentModel {
    *            probability (contained in the <code>double[] ocs</code>)
    *            for each one.
    **/
+  // TODO: This should be removed, can't be used anyway without format spec
   public String getAllOutcomes(double[] outcomes);
 
   /**
@@ -104,7 +105,7 @@ public interface MaxentModel {
   /**
    * Returns the data structures relevant to storing the model.
    **/
-  public Object[] getDataStructures();
+  // public Object[] getDataStructures();
 
   /** Returns the number of outcomes for this model.
    *  @return The number of outcomes.