You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/24 20:30:57 UTC
svn commit: r1127204 - in
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools:
cmdline/namefind/TokenNameFinderTrainerTool.java
namefind/NameFinderEventStream.java namefind/NameFinderME.java
namefind/NameSampleSequenceStream.java
Author: joern
Date: Tue May 24 18:30:57 2011
New Revision: 1127204
URL: http://svn.apache.org/viewvc?rev=1127204&view=rev
Log:
OPENNLP-183 Initial version of name finder sequence training
Added:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java (with props)
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1127204&r1=1127203&r2=1127204&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Tue May 24 18:30:57 2011
@@ -77,7 +77,7 @@ public final class TokenNameFinderTraine
}
opennlp.tools.util.TrainingParameters mlParams =
- CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args), false);
+ CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args), true);
File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java?rev=1127204&r1=1127203&r2=1127204&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java Tue May 24 18:30:57 2011
@@ -24,6 +24,7 @@ import java.util.Map;
import opennlp.model.Event;
import opennlp.model.EventStream;
+import opennlp.tools.postag.POSContextGenerator;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.Span;
@@ -98,6 +99,17 @@ public class NameFinderEventStream exten
return outcomes;
}
+  /** Builds one event per outcome from the contexts of {@code cg}, then updates its adaptive data once. */
+  public static List<Event> generateEvents(String[] sentence, String[] outcomes, NameContextGenerator cg) {
+    List<Event> result = new ArrayList<Event>(outcomes.length);
+    for (int pos = 0; pos < outcomes.length; pos++) {
+      String[] context = cg.getContext(pos, sentence, outcomes, null);
+      result.add(new Event(outcomes[pos], context));
+    }
+    cg.updateAdaptiveData(sentence, outcomes); // runs only after every context above has been generated
+    return result;
+  }
+
@Override
protected Iterator<Event> createEvents(NameSample sample) {
@@ -108,17 +120,12 @@ public class NameFinderEventStream exten
String outcomes[] = generateOutcomes(sample.getNames(), type, sample.getSentence().length);
additionalContextFeatureGenerator.setCurrentContext(sample.getAdditionalContext());
String[] tokens = new String[sample.getSentence().length];
- List<Event> events = new ArrayList<Event>(outcomes.length);
+
for (int i = 0; i < sample.getSentence().length; i++) {
tokens[i] = sample.getSentence()[i];
}
- for (int i = 0; i < outcomes.length; i++) {
- events.add(new Event((String) outcomes[i], contextGenerator.getContext(i, sample.getSentence(), outcomes,null)));
- }
-
- contextGenerator.updateAdaptiveData(tokens, outcomes);
- return events.iterator();
+ return generateEvents(tokens, outcomes, contextGenerator).iterator();
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java?rev=1127204&r1=1127203&r2=1127204&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java Tue May 24 18:30:57 2011
@@ -39,6 +39,7 @@ import opennlp.model.EventStream;
import opennlp.model.MaxentModel;
import opennlp.model.TrainUtil;
import opennlp.model.TwoPassDataIndexer;
+import opennlp.tools.postag.POSSampleSequenceStream;
import opennlp.tools.util.BeamSearch;
import opennlp.tools.util.HashSumEventStream;
import opennlp.tools.util.ObjectStream;
@@ -214,6 +215,10 @@ public class NameFinderME implements Tok
public Span[] find(String[] tokens, String[][] additionalContext) {
additionalContextFeatureGenerator.setCurrentContext(additionalContext);
bestSequence = beam.bestSequence(tokens, additionalContext);
+
+ if (bestSequence == null) // TODO: Fix this in extra jira issue!!!
+ return new Span[0];
+
List<String> c = bestSequence.getOutcomes();
contextGenerator.updateAdaptiveData(tokens, (String[]) c.toArray(new String[c.size()]));
@@ -316,10 +321,6 @@ public class NameFinderME implements Tok
public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples,
TrainingParameters trainParams, AdaptiveFeatureGenerator generator, final Map<String, Object> resources) throws IOException {
- if (TrainUtil.isSequenceTraining(trainParams.getSettings())) {
- throw new IllegalArgumentException("Sequence training is not supported!");
- }
-
Map<String, String> manifestInfoEntries = new HashMap<String, String>();
AdaptiveFeatureGenerator featureGenerator;
@@ -329,10 +330,19 @@ public class NameFinderME implements Tok
else
featureGenerator = createFeatureGenerator();
- EventStream eventStream = new NameFinderEventStream(samples, type,
- new DefaultNameContextGenerator(featureGenerator));
+ AbstractModel nameFinderModel;
- AbstractModel nameFinderModel = TrainUtil.train(eventStream, trainParams.getSettings(), manifestInfoEntries);
+ if (!TrainUtil.isSequenceTraining(trainParams.getSettings())) {
+ EventStream eventStream = new NameFinderEventStream(samples, type,
+ new DefaultNameContextGenerator(featureGenerator));
+
+ nameFinderModel = TrainUtil.train(eventStream, trainParams.getSettings(), manifestInfoEntries);
+ }
+ else {
+ NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, featureGenerator);
+
+ nameFinderModel = TrainUtil.train(ss, trainParams.getSettings(), manifestInfoEntries);
+ }
return new TokenNameFinderModel(languageCode, nameFinderModel,
resources, manifestInfoEntries);
Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java?rev=1127204&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java Tue May 24 18:30:57 2011
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+ package opennlp.tools.namefind;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.model.AbstractModel;
+import opennlp.model.Event;
+import opennlp.model.Sequence;
+import opennlp.model.SequenceStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+/** A {@link SequenceStream} over {@link NameSample}s; every sample is read into memory on construction. */
+public class NameSampleSequenceStream implements SequenceStream {
+
+  private final NameContextGenerator pcg;
+  private final List<NameSample> samples;
+
+  /** Reads all samples from {@code psi}; the context generator is built from a null feature generator. */
+  public NameSampleSequenceStream(ObjectStream<NameSample> psi) throws IOException {
+    this(psi, new DefaultNameContextGenerator((AdaptiveFeatureGenerator) null));
+  }
+
+  /** Reads all samples from {@code psi}; the context generator wraps {@code featureGen}. */
+  public NameSampleSequenceStream(ObjectStream<NameSample> psi, AdaptiveFeatureGenerator featureGen) throws IOException {
+    this(psi, new DefaultNameContextGenerator(featureGen));
+  }
+
+  /** Drains {@code psi} into memory and keeps {@code pcg} for context generation; read failures propagate. */
+  public NameSampleSequenceStream(ObjectStream<NameSample> psi, NameContextGenerator pcg) throws IOException {
+    this.pcg = pcg;
+    this.samples = new ArrayList<NameSample>();
+    for (NameSample sample = psi.read(); sample != null; sample = psi.read()) {
+      samples.add(sample);
+    }
+
+    System.err.println("Got "+samples.size()+" sequences");
+  }
+
+  /** Re-tags the sentence behind {@code sequence} using {@code model}; returns the events for the predicted tags. */
+  @SuppressWarnings("unchecked")
+  public Event[] updateContext(Sequence sequence, AbstractModel model) {
+    Sequence<NameSample> pss = (Sequence<NameSample>) sequence;
+    TokenNameFinder tagger = new NameFinderME(new TokenNameFinderModel("x-unspecified", model, Collections.<String, Object>emptyMap(), null));
+    String[] sentence = pss.getSource().getSentence();
+    String[] tags = NameFinderEventStream.generateOutcomes(tagger.find(sentence), null, sentence.length);
+    Event[] events = new Event[sentence.length];
+
+    // generateEvents already covers the whole sentence, so it is called exactly once; calling it
+    // per token only rebuilt the same events and repeated the adaptive data update.
+    NameFinderEventStream.generateEvents(sentence, tags, pcg).toArray(events);
+    return events;
+  }
+
+  /** Iterates the buffered samples, building events with the same context generator as updateContext. */
+  @SuppressWarnings("unchecked")
+  public Iterator<Sequence> iterator() {
+    return new NameSampleSequenceIterator(samples.iterator(), pcg);
+  }
+}
+
+/** Turns each {@link NameSample} into a {@link Sequence} of events derived from its annotated names. */
+class NameSampleSequenceIterator implements Iterator<Sequence> {
+
+  private final Iterator<NameSample> psi;
+  private final NameContextGenerator cg;
+
+  /** Uses a context generator created as {@code new DefaultNameContextGenerator(null)}. */
+  public NameSampleSequenceIterator(Iterator<NameSample> psi) {
+    this(psi, new DefaultNameContextGenerator(null));
+  }
+
+  /** Uses the caller's context generator so that all events are built by one generator. */
+  NameSampleSequenceIterator(Iterator<NameSample> psi, NameContextGenerator cg) {
+    this.psi = psi;
+    this.cg = cg;
+  }
+
+  public boolean hasNext() {
+    return psi.hasNext();
+  }
+
+  /** Builds one event per token from the sample's annotated names and wraps them with the sample. */
+  public Sequence<NameSample> next() {
+    NameSample sample = psi.next();
+    String[] sentence = sample.getSentence();
+    String[] tags = NameFinderEventStream.generateOutcomes(sample.getNames(), null, sentence.length);
+    Event[] events = new Event[sentence.length];
+
+    for (int i = 0; i < sentence.length; i++) {
+      // it is safe to pass the tags as previous tags because
+      // the context generator does not look for non predicted tags
+      events[i] = new Event(tags[i], cg.getContext(i, sentence, tags, null));
+    }
+    return new Sequence<NameSample>(events, sample);
+  }
+
+  public void remove() {
+    throw new UnsupportedOperationException();
+  }
+}
+
Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java
------------------------------------------------------------------------------
svn:mime-type = text/plain