You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/24 11:08:13 UTC

svn commit: r1126943 - in /incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser: AbstractBottomUpParser.java chunking/Parser.java treeinsert/Parser.java

Author: joern
Date: Tue May 24 09:08:13 2011
New Revision: 1126943

URL: http://svn.apache.org/viewvc?rev=1126943&view=rev
Log:
OPENNLP-175 Removed duplicate code in parser train methods

Modified:
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
    incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java?rev=1126943&r1=1126942&r2=1126943&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java Tue May 24 09:08:13 2011
@@ -34,6 +34,7 @@ import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.StringList;
+import opennlp.tools.util.TrainingParameters;
 
 /**
  * Abstract class which contains code to tag and chunk parses for bottom up parsing and
@@ -504,8 +505,19 @@ public abstract class AbstractBottomUpPa
    * @param cutoff The minimum number of entries required for the n-gram to be saved as part of the dictionary.
    * @return A dictionary object.
    */
-  public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules, int cutoff)
+  public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules, TrainingParameters params)
       throws IOException {
+    
+    int cutoff = 5;
+    
+    String cutoffString = params.getSettings("dict").
+        get(TrainingParameters.CUTOFF_PARAM);
+    
+    if (cutoffString != null) {
+      // TODO: Maybe throw illegal argument exception if not parseable
+      cutoff = Integer.parseInt(cutoffString);
+    }
+    
     NGramModel mdict = new NGramModel();
     Parse p;
     while((p = data.read()) != null) {
@@ -570,4 +582,13 @@ public abstract class AbstractBottomUpPa
     mdict.cutoff(cutoff, Integer.MAX_VALUE);
     return mdict.toDictionary(true);
   }
+  
+  public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules, int cutoff)
+      throws IOException {
+    
+    TrainingParameters params = new TrainingParameters();
+    params.put("dict", TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+    
+    return buildDictionary(data, rules, params);
+  }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java?rev=1126943&r1=1126942&r2=1126943&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java Tue May 24 09:08:13 2011
@@ -277,11 +277,11 @@ public class Parser extends AbstractBott
   }
   
   public static ParserModel train(String languageCode, ObjectStream<Parse> parseSamples, HeadRules rules, TrainingParameters mlParams)
-  throws IOException {
+          throws IOException {
     
     System.err.println("Building dictionary");
- // TODO: Discuss and make dict cutoff configurable
-    Dictionary mdict = buildDictionary(parseSamples, rules, 5); 
+    
+    Dictionary mdict = buildDictionary(parseSamples, rules, mlParams);
     
     parseSamples.reset();
     
@@ -298,13 +298,13 @@ public class Parser extends AbstractBott
     
     // tag
     POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(parseSamples), 
-        mlParams.getParameters("tagger"), null, null); // <- pass on name space corrected TrainingParameters ...
+        mlParams.getParameters("tagger"), null, null);
     
     parseSamples.reset();
     
     // chunk
     ChunkerModel chunkModel = ChunkerME.train(languageCode, 
-        new ChunkSampleStream(parseSamples), // <- pass on name space corrected TrainingParameters ...
+        new ChunkSampleStream(parseSamples),
         new ChunkContextGenerator(), mlParams.getParameters("chunker"));
     
     parseSamples.reset();
@@ -315,58 +315,28 @@ public class Parser extends AbstractBott
     Map<String, String> checkReportMap = new HashMap<String, String>();
     AbstractModel checkModel = TrainUtil.train(kes, mlParams.getSettings("check"), checkReportMap);
     mergeReportIntoManifest(manifestInfoEntries, checkReportMap, "check");
-    
+
     // TODO: Remove cast for HeadRules
     return new ParserModel(languageCode, buildModel, checkModel,
         posModel, chunkModel, (opennlp.tools.parser.lang.en.HeadRules) rules,
         ParserType.CHUNKING, manifestInfoEntries);
   }
-  
+
   public static ParserModel train(String languageCode, ObjectStream<Parse> parseSamples, HeadRules rules, int iterations, int cut)
       throws IOException {
     
-    System.err.println("Building dictionary");
-    Dictionary mdict = buildDictionary(parseSamples, rules, cut);
-    
-    parseSamples.reset();
-    
-    Map<String, String> manifestInfoEntries = new HashMap<String, String>();
-    ModelUtil.addCutoffAndIterations(manifestInfoEntries, cut, iterations);
-    
-    // build
-    System.err.println("Training builder");
-    opennlp.model.EventStream bes = new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.BUILD, mdict);
-    HashSumEventStream hsbes = new HashSumEventStream(bes);
-    AbstractModel buildModel = train(hsbes, iterations, cut);
-    manifestInfoEntries.put("Training-Builder-Eventhash", 
-        hsbes.calculateHashSum().toString(16));
-    
-    parseSamples.reset();
-    
-    // tag
-    POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(parseSamples), 
-        ModelType.MAXENT, null, null, cut, iterations);
-    
-    parseSamples.reset();
-    
-    // chunk
-    ChunkerModel chunkModel = ChunkerME.train(languageCode, 
-        new ChunkSampleStream(parseSamples), cut, iterations,
-        new ChunkContextGenerator());
-    
-    parseSamples.reset();
-    
-    // check
-    System.err.println("Training checker");
-    opennlp.model.EventStream kes = new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.CHECK);
-    HashSumEventStream hskes = new HashSumEventStream(kes);
-    AbstractModel checkModel = train(hskes, iterations, cut);
-    manifestInfoEntries.put("Training-Checker-Eventhash", 
-        hskes.calculateHashSum().toString(16));
-    
-    // TODO: Remove cast for HeadRules
-    return new ParserModel(languageCode, buildModel, checkModel,
-        posModel, chunkModel, (opennlp.tools.parser.lang.en.HeadRules) rules,
-        ParserType.CHUNKING, manifestInfoEntries);
+    TrainingParameters params = new TrainingParameters();
+    params.put("dict", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+
+    params.put("tagger", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+    params.put("tagger", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    params.put("chunker", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+    params.put("chunker", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    params.put("check", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+    params.put("check", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    params.put("build", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+    params.put("build", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+
+    return train(languageCode, parseSamples, rules, params);
   }
 }

Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java?rev=1126943&r1=1126942&r2=1126943&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java Tue May 24 09:08:13 2011
@@ -443,10 +443,8 @@ public class Parser extends AbstractBott
     
     Map<String, String> manifestInfoEntries = new HashMap<String, String>();
     
-    // TODO: training code should be shared between two parsers
     System.err.println("Building dictionary");
-    // TODO: Make cutoff configurable ... which cutoff should be used here?
-    Dictionary mdict = buildDictionary(parseSamples, rules, 5);
+    Dictionary mdict = buildDictionary(parseSamples, rules, mlParams);
     
     parseSamples.reset();
     
@@ -500,50 +498,19 @@ public class Parser extends AbstractBott
       ObjectStream<Parse> parseSamples, HeadRules rules, int iterations, int cut)
       throws IOException {
     
-    // TODO: training code should be shared between two parsers
-    System.err.println("Building dictionary");
-    Dictionary mdict = buildDictionary(parseSamples, rules, cut);
-
-    parseSamples.reset();
-
-    // tag
-    POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(
-        parseSamples), ModelType.MAXENT, null, null, cut, iterations);
-
-    parseSamples.reset();
-
-    // chunk
-    ChunkerModel chunkModel = ChunkerME.train(languageCode, new ChunkSampleStream(
-        parseSamples), cut, iterations, new ChunkContextGenerator());
-
-    parseSamples.reset();
-
-    // build
-    System.err.println("Training builder");
-    opennlp.model.EventStream bes = new ParserEventStream(parseSamples, rules,
-        ParserEventTypeEnum.BUILD, mdict);
-    AbstractModel buildModel = train(bes, iterations, cut);
-
-    parseSamples.reset();
+    TrainingParameters params = new TrainingParameters();
+    params.put("dict", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
 
-    // check
-    System.err.println("Training checker");
-    opennlp.model.EventStream kes = new ParserEventStream(parseSamples, rules,
-        ParserEventTypeEnum.CHECK);
-    AbstractModel checkModel = train(kes, iterations, cut);
-
-    parseSamples.reset();
+    params.put("tagger", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+    params.put("tagger", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    params.put("chunker", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+    params.put("chunker", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    params.put("check", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+    params.put("check", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    params.put("build", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+    params.put("build", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
     
-    // attach 
-    System.err.println("Training attacher");
-    opennlp.model.EventStream attachEvents = new ParserEventStream(parseSamples, rules,
-        ParserEventTypeEnum.ATTACH);
-    AbstractModel attachModel = train(attachEvents, iterations, cut);
-    
-    // TODO: Remove cast for HeadRules
-    return new ParserModel(languageCode, buildModel, checkModel,
-        attachModel, posModel, chunkModel, 
-        (opennlp.tools.parser.lang.en.HeadRules) rules, ParserType.TREEINSERT);
+    return train(languageCode, parseSamples, rules, params);
   }
   
   @Deprecated