You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/24 20:30:57 UTC

svn commit: r1127204 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools: cmdline/namefind/TokenNameFinderTrainerTool.java namefind/NameFinderEventStream.java namefind/NameFinderME.java namefind/NameSampleSequenceStream.java

Author: joern
Date: Tue May 24 18:30:57 2011
New Revision: 1127204

URL: http://svn.apache.org/viewvc?rev=1127204&view=rev
Log:
OPENNLP-183 Initial version of name finder sequence training

Added:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java   (with props)
Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java?rev=1127204&r1=1127203&r2=1127204&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java Tue May 24 18:30:57 2011
@@ -77,7 +77,7 @@ public final class TokenNameFinderTraine
     }
     
     opennlp.tools.util.TrainingParameters mlParams = 
-      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args), false);
+      CmdLineUtil.loadTrainingParameters(CmdLineUtil.getParameter("-params", args), true);
     
     File trainingDataInFile = new File(CmdLineUtil.getParameter("-data", args));
     File modelOutFile = new File(CmdLineUtil.getParameter("-model", args));

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java?rev=1127204&r1=1127203&r2=1127204&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java Tue May 24 18:30:57 2011
@@ -24,6 +24,7 @@ import java.util.Map;
 
 import opennlp.model.Event;
 import opennlp.model.EventStream;
+import opennlp.tools.postag.POSContextGenerator;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
@@ -98,6 +99,17 @@ public class NameFinderEventStream exten
     return outcomes;
   }
 
+  public static List<Event> generateEvents(String[] sentence, String[] outcomes, NameContextGenerator cg) {
+    List<Event> events = new ArrayList<Event>(outcomes.length);
+    for (int i = 0; i < outcomes.length; i++) {
+      events.add(new Event((String) outcomes[i], cg.getContext(i, sentence, outcomes,null)));
+    }
+    
+    cg.updateAdaptiveData(sentence, outcomes);
+
+    return events;
+  }
+  
   @Override
   protected Iterator<Event> createEvents(NameSample sample) {
     
@@ -108,17 +120,12 @@ public class NameFinderEventStream exten
     String outcomes[] = generateOutcomes(sample.getNames(), type, sample.getSentence().length);
     additionalContextFeatureGenerator.setCurrentContext(sample.getAdditionalContext());
     String[] tokens = new String[sample.getSentence().length];
-    List<Event> events = new ArrayList<Event>(outcomes.length);
+    
     for (int i = 0; i < sample.getSentence().length; i++) {
       tokens[i] = sample.getSentence()[i];
     }
-    for (int i = 0; i < outcomes.length; i++) {
-      events.add(new Event((String) outcomes[i], contextGenerator.getContext(i, sample.getSentence(), outcomes,null)));
-    }
-    
-    contextGenerator.updateAdaptiveData(tokens, outcomes);
     
-    return events.iterator();
+    return generateEvents(tokens, outcomes, contextGenerator).iterator();
   }
 
 

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java?rev=1127204&r1=1127203&r2=1127204&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java Tue May 24 18:30:57 2011
@@ -39,6 +39,7 @@ import opennlp.model.EventStream;
 import opennlp.model.MaxentModel;
 import opennlp.model.TrainUtil;
 import opennlp.model.TwoPassDataIndexer;
+import opennlp.tools.postag.POSSampleSequenceStream;
 import opennlp.tools.util.BeamSearch;
 import opennlp.tools.util.HashSumEventStream;
 import opennlp.tools.util.ObjectStream;
@@ -214,6 +215,10 @@ public class NameFinderME implements Tok
   public Span[] find(String[] tokens, String[][] additionalContext) {
     additionalContextFeatureGenerator.setCurrentContext(additionalContext);
     bestSequence = beam.bestSequence(tokens, additionalContext);
+    
+    if (bestSequence == null) // TODO: Fix this in extra jira issue!!!
+      return new Span[0];
+    
     List<String> c = bestSequence.getOutcomes();
 
     contextGenerator.updateAdaptiveData(tokens, (String[]) c.toArray(new String[c.size()]));
@@ -316,10 +321,6 @@ public class NameFinderME implements Tok
    public static TokenNameFinderModel train(String languageCode, String type, ObjectStream<NameSample> samples, 
        TrainingParameters trainParams, AdaptiveFeatureGenerator generator, final Map<String, Object> resources) throws IOException {
      
-     if (TrainUtil.isSequenceTraining(trainParams.getSettings())) {
-       throw new IllegalArgumentException("Sequence training is not supported!");
-     }
-     
      Map<String, String> manifestInfoEntries = new HashMap<String, String>();
      
      AdaptiveFeatureGenerator featureGenerator;
@@ -329,10 +330,19 @@ public class NameFinderME implements Tok
      else 
        featureGenerator = createFeatureGenerator();
      
-     EventStream eventStream = new NameFinderEventStream(samples, type,
-         new DefaultNameContextGenerator(featureGenerator));
+     AbstractModel nameFinderModel;
      
-     AbstractModel nameFinderModel = TrainUtil.train(eventStream, trainParams.getSettings(), manifestInfoEntries);
+     if (!TrainUtil.isSequenceTraining(trainParams.getSettings())) {
+       EventStream eventStream = new NameFinderEventStream(samples, type,
+           new DefaultNameContextGenerator(featureGenerator));
+       
+       nameFinderModel = TrainUtil.train(eventStream, trainParams.getSettings(), manifestInfoEntries);
+     }
+     else {
+       NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, featureGenerator);
+
+       nameFinderModel = TrainUtil.train(ss, trainParams.getSettings(), manifestInfoEntries);
+     }
      
      return new TokenNameFinderModel(languageCode, nameFinderModel,
          resources, manifestInfoEntries);

Added: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java?rev=1127204&view=auto
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java (added)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java Tue May 24 18:30:57 2011
@@ -0,0 +1,122 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+ 
+ package opennlp.tools.namefind;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.model.AbstractModel;
+import opennlp.model.Event;
+import opennlp.model.Sequence;
+import opennlp.model.SequenceStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+
+public class NameSampleSequenceStream implements SequenceStream {
+
+  private NameContextGenerator pcg;
+  private List<NameSample> samples;
+  
+  public NameSampleSequenceStream(ObjectStream<NameSample> psi) throws IOException {
+    this(psi, new DefaultNameContextGenerator((AdaptiveFeatureGenerator) null));
+  }
+  
+  public NameSampleSequenceStream(ObjectStream<NameSample> psi, AdaptiveFeatureGenerator featureGen) 
+  throws IOException {
+    this(psi, new DefaultNameContextGenerator(featureGen));
+  }
+  
+  public NameSampleSequenceStream(ObjectStream<NameSample> psi, NameContextGenerator pcg)
+      throws IOException {
+    samples = new ArrayList<NameSample>();
+    
+    NameSample sample;
+    while((sample = psi.read()) != null) {
+      samples.add(sample);
+    }
+    
+    System.err.println("Got "+samples.size()+" sequences");
+    
+    this.pcg = pcg;
+  }
+  
+  
+  @SuppressWarnings("unchecked")
+  public Event[] updateContext(Sequence sequence, AbstractModel model) {
+    Sequence<NameSample> pss = (Sequence<NameSample>) sequence;
+    TokenNameFinder tagger = new NameFinderME(new TokenNameFinderModel("x-unspecified", model, Collections.<String, Object>emptyMap(), null));
+    String[] sentence = pss.getSource().getSentence();
+    String[] tags = NameFinderEventStream.generateOutcomes(tagger.find(sentence), null, sentence.length);
+    Event[] events = new Event[sentence.length];
+    
+    for (int si=0;si<events.length;si++) {
+      NameFinderEventStream.generateEvents(sentence,tags,pcg).toArray(events);
+    }
+    
+    return events;
+  }
+  
+  @SuppressWarnings("unchecked")
+  public Iterator<Sequence> iterator() {
+    return new NameSampleSequenceIterator(samples.iterator());
+  }
+
+}
+
+class NameSampleSequenceIterator implements Iterator<Sequence> {
+
+  private Iterator<NameSample> psi;
+  private NameContextGenerator cg;
+  
+  public NameSampleSequenceIterator(Iterator<NameSample> psi) {
+    this.psi = psi;
+    cg = new DefaultNameContextGenerator(null);
+  }
+  
+  public boolean hasNext() {
+    return psi.hasNext();
+  }
+
+  public Sequence<NameSample> next() {
+    NameSample sample = (NameSample) psi.next();
+    
+    String sentence[] = sample.getSentence();
+    String tags[] = NameFinderEventStream.generateOutcomes(sample.getNames(), null, sentence.length);
+    Event[] events = new Event[sentence.length];
+    
+    for (int i=0; i < sentence.length; i++) {
+
+      // it is safe to pass the tags as previous tags because
+      // the context generator does not look for non predicted tags
+      String[] context = cg.getContext(i, sentence, tags, null);
+
+      events[i] = new Event(tags[i], context);
+    }
+    Sequence<NameSample> sequence = new Sequence<NameSample>(events,sample);
+    return sequence;
+  }
+
+  public void remove() {
+    throw new UnsupportedOperationException();
+  }
+  
+}
+

Propchange: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java
------------------------------------------------------------------------------
    svn:mime-type = text/plain