You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2013/12/02 17:41:35 UTC
svn commit: r1547097 - in /opennlp/trunk/opennlp-tools: lang/
src/main/java/opennlp/tools/cmdline/
src/main/java/opennlp/tools/formats/ontonotes/
src/main/java/opennlp/tools/ml/model/
Author: joern
Date: Mon Dec 2 16:41:34 2013
New Revision: 1547097
URL: http://svn.apache.org/r1547097
Log:
OPENNLP-623 Added support to train the name finder on OntoNotes data
Added:
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
Modified:
opennlp/trunk/opennlp-tools/lang/TrainerParams.txt
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java
Modified: opennlp/trunk/opennlp-tools/lang/TrainerParams.txt
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/lang/TrainerParams.txt?rev=1547097&r1=1547096&r2=1547097&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/lang/TrainerParams.txt (original)
+++ opennlp/trunk/opennlp-tools/lang/TrainerParams.txt Mon Dec 2 16:41:34 2013
@@ -15,7 +15,6 @@
# Sample machine learning properties file
-Algorithm=MAXENT
-Iterations=200
-Cutoff=5
-Threads=2
\ No newline at end of file
+Algorithm=PERCEPTRON
+Iterations=300
+Cutoff=0
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java?rev=1547097&r1=1547096&r2=1547097&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java Mon Dec 2 16:41:34 2013
@@ -50,6 +50,7 @@ import opennlp.tools.formats.convert.Par
import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
+import opennlp.tools.formats.ontonotes.OntoNotesNameSampleStreamFactory;
/**
* Registry for object stream factories.
@@ -78,6 +79,7 @@ public final class StreamFactoryRegistry
ParseToSentenceSampleStreamFactory.registerFactory();
ParseToTokenSampleStreamFactory.registerFactory();
+ OntoNotesNameSampleStreamFactory.registerFactory();
BioNLP2004NameSampleStreamFactory.registerFactory();
Conll02NameSampleStreamFactory.registerFactory();
Conll03NameSampleStreamFactory.registerFactory();
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java?rev=1547097&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java Mon Dec 2 16:41:34 2013
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.ontonotes;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.Map;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+/**
+ * Name Sample Stream parser for the OntoNotes 4.0 corpus.
+ */
+public class OntoNotesNameSampleStream extends
+    FilterObjectStream<String, NameSample> {
+  /** Escape sequences mapped to their literal character, e.g. -LRB- to (. */
+  private final Map<String, String> tokenConversionMap;
+  /** Samples already parsed from a document but not yet returned by read(). */
+  private final List<NameSample> nameSamples = new LinkedList<NameSample>();
+
+  /** Wraps a stream whose elements are parsed as whole documents by read(). */
+  protected OntoNotesNameSampleStream(ObjectStream<String> samples) {
+    super(samples);
+    Map<String, String> tokenConversionMap = new HashMap<String, String>();
+    tokenConversionMap.put("-LRB-", "(");
+    tokenConversionMap.put("-RRB-", ")");
+    tokenConversionMap.put("-LSB-", "[");
+    tokenConversionMap.put("-RSB-", "]");
+    tokenConversionMap.put("-LCB-", "{");
+    tokenConversionMap.put("-RCB-", "}");
+    tokenConversionMap.put("-AMP-", "&");
+    this.tokenConversionMap = Collections.unmodifiableMap(tokenConversionMap);
+  }
+
+  /** Strips ENAMEX tag fragments from a token and unescapes it (e.g. -LRB-). */
+  private String convertToken(String token) {
+
+    StringBuilder convertedToken = new StringBuilder(token);
+
+    int startTagEndIndex = convertedToken.indexOf(">");
+
+    if (token.contains("=\"") && startTagEndIndex != -1) {
+      convertedToken.delete(0, startTagEndIndex + 1);
+    }
+
+    // Look for '>' only after '<'; an earlier '>' (as in ">_<") would make
+    // delete() throw because its start would exceed its end.
+    int endTagBeginIndex = convertedToken.indexOf("<");
+    int endTagEndIndex = endTagBeginIndex == -1 ? -1
+        : convertedToken.indexOf(">", endTagBeginIndex);
+
+    if (endTagEndIndex != -1) {
+      convertedToken.delete(endTagBeginIndex, endTagEndIndex + 1);
+    }
+
+    String replacement = tokenConversionMap.get(convertedToken.toString());
+
+    return replacement != null ? replacement : convertedToken.toString();
+  }
+
+  /** Returns the next sentence sample, or null when no document is left. */
+  public NameSample read() throws IOException {
+    // Loop instead of testing once: a document that yields no sample must
+    // not be reported as end of stream while further documents remain.
+    while (nameSamples.isEmpty()) {
+      String doc = samples.read();
+      if (doc == null) {
+        return null;
+      }
+      parseDocument(doc);
+    }
+
+    return nameSamples.remove(0);
+  }
+
+  /** Appends one sample per sentence line of the document to nameSamples. */
+  private void parseDocument(String doc) throws IOException {
+    BufferedReader docIn = new BufferedReader(new StringReader(doc));
+    // True only for the first sentence of each document.
+    boolean clearAdaptiveData = true;
+
+    String line;
+    while ((line = docIn.readLine()) != null) {
+      if (line.startsWith("<DOC")) {
+        continue;
+      }
+      if (line.equals("</DOC>")) {
+        break;
+      }
+
+      nameSamples.add(parseSentence(line, clearAdaptiveData));
+      clearAdaptiveData = false;
+    }
+  }
+
+  /** Builds the sample for one line: cleaned tokens plus ENAMEX spans. */
+  private NameSample parseSentence(String line, boolean clearAdaptiveData) {
+    String tokens[] = WhitespaceTokenizer.INSTANCE.tokenize(line);
+    List<Span> entities = new LinkedList<Span>();
+    List<String> cleanedTokens = new ArrayList<String>(tokens.length);
+
+    int tokenIndex = 0;
+    int entityBeginIndex = -1;
+    String entityType = null;
+    boolean insideStartEnmaxTag = false;
+    for (String token : tokens) {
+      // Split here, next part of tag is in new token
+      if (token.startsWith("<ENAMEX")) {
+        insideStartEnmaxTag = true;
+        continue;
+      }
+
+      if (insideStartEnmaxTag) {
+        String typeBegin = "TYPE=\"";
+        if (token.startsWith(typeBegin)) {
+          int typeEnd = token.indexOf("\"", typeBegin.length());
+          // Fixed locale so the type name does not depend on the default
+          // locale (Turkish would lower-case "I" to a dotless i).
+          entityType = token.substring(typeBegin.length(), typeEnd)
+              .toLowerCase(java.util.Locale.ENGLISH);
+        }
+
+        if (token.contains(">")) {
+          entityBeginIndex = tokenIndex;
+          insideStartEnmaxTag = false;
+        } else {
+          continue;
+        }
+      }
+
+      if (token.endsWith("</ENAMEX>")) {
+        entities.add(new Span(entityBeginIndex, tokenIndex + 1, entityType));
+        entityBeginIndex = -1;
+      }
+
+      cleanedTokens.add(convertToken(token));
+      tokenIndex++;
+    }
+
+    return new NameSample(cleanedTokens
+        .toArray(new String[cleanedTokens.size()]), entities
+        .toArray(new Span[entities.size()]), clearAdaptiveData);
+  }
+}
\ No newline at end of file
Added: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java?rev=1547097&view=auto
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java (added)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java Mon Dec 2 16:41:34 2013
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.ontonotes;
+
+import java.io.File;
+import java.io.FileFilter;
+import java.nio.charset.Charset;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.formats.DirectorySampleStream;
+import opennlp.tools.formats.convert.FileToStringSampleStream;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.util.ObjectStream;
+
+public class OntoNotesNameSampleStreamFactory extends
+    AbstractSampleStreamFactory<NameSample> {
+
+  interface Parameters {
+    @ParameterDescription(valueName = "OntoNotes 4.0 corpus directory")
+    String getOntoNotesDir();
+  }
+
+  /** Accepts every directory and those regular files that end in ".name". */
+  private static final FileFilter NAME_FILE_FILTER = new FileFilter() {
+    public boolean accept(File file) {
+      return file.isFile() ? file.getName().endsWith(".name")
+          : file.isDirectory();
+    }
+  };
+
+  public OntoNotesNameSampleStreamFactory() {
+    super(Parameters.class);
+  }
+
+  /** Creates the sample stream over the directory named in the arguments. */
+  public ObjectStream<NameSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    File corpusDir = new File(params.getOntoNotesDir());
+    ObjectStream<File> documentStream =
+        new DirectorySampleStream(corpusDir, NAME_FILE_FILTER, true);
+    ObjectStream<String> documentTexts = new FileToStringSampleStream(
+        documentStream, Charset.forName("UTF-8"));
+    return new OntoNotesNameSampleStream(documentTexts);
+  }
+
+  /** Registers a new instance for NameSample under the name "ontonotes". */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(NameSample.class,
+        "ontonotes", new OntoNotesNameSampleStreamFactory());
+  }
+}
\ No newline at end of file
Modified: opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java
URL: http://svn.apache.org/viewvc/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java?rev=1547097&r1=1547096&r2=1547097&view=diff
==============================================================================
--- opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java (original)
+++ opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/ml/model/MaxentModel.java Mon Dec 2 16:41:34 2013
@@ -78,6 +78,7 @@ public interface MaxentModel {
* probability (contained in the <code>double[] ocs</code>)
* for each one.
**/
+ // TODO: This should be removed, can't be used anyway without format spec
public String getAllOutcomes(double[] outcomes);
/**
@@ -104,7 +105,7 @@ public interface MaxentModel {
/**
* Returns the data structures relevant to storing the model.
**/
- public Object[] getDataStructures();
+ // public Object[] getDataStructures();
/** Returns the number of outcomes for this model.
* @return The number of outcomes.