You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2011/05/24 11:08:13 UTC
svn commit: r1126943 - in
/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser:
AbstractBottomUpParser.java chunking/Parser.java treeinsert/Parser.java
Author: joern
Date: Tue May 24 09:08:13 2011
New Revision: 1126943
URL: http://svn.apache.org/viewvc?rev=1126943&view=rev
Log:
OPENNLP-175 Removed duplicate code in parser train methods
Modified:
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java?rev=1126943&r1=1126942&r2=1126943&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java Tue May 24 09:08:13 2011
@@ -34,6 +34,7 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Sequence;
import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
+import opennlp.tools.util.TrainingParameters;
/**
* Abstract class which contains code to tag and chunk parses for bottom up parsing and
@@ -504,8 +505,19 @@ public abstract class AbstractBottomUpPa
* @param cutoff The minimum number of entries required for the n-gram to be saved as part of the dictionary.
* @return A dictionary object.
*/
- public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules, int cutoff)
+ public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules, TrainingParameters params)
throws IOException {
+
+ int cutoff = 5;
+
+ String cutoffString = params.getSettings("dict").
+ get(TrainingParameters.CUTOFF_PARAM);
+
+ if (cutoffString != null) {
+ // TODO: Maybe throw an IllegalArgumentException if the value is not parseable
+ cutoff = Integer.parseInt(cutoffString);
+ }
+
NGramModel mdict = new NGramModel();
Parse p;
while((p = data.read()) != null) {
@@ -570,4 +582,13 @@ public abstract class AbstractBottomUpPa
mdict.cutoff(cutoff, Integer.MAX_VALUE);
return mdict.toDictionary(true);
}
+
+ public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules, int cutoff)
+ throws IOException {
+
+ TrainingParameters params = new TrainingParameters();
+ params.put("dict", TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+
+ return buildDictionary(data, rules, params);
+ }
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java?rev=1126943&r1=1126942&r2=1126943&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java Tue May 24 09:08:13 2011
@@ -277,11 +277,11 @@ public class Parser extends AbstractBott
}
public static ParserModel train(String languageCode, ObjectStream<Parse> parseSamples, HeadRules rules, TrainingParameters mlParams)
- throws IOException {
+ throws IOException {
System.err.println("Building dictionary");
- // TODO: Discuss and make dict cutoff configurable
- Dictionary mdict = buildDictionary(parseSamples, rules, 5);
+
+ Dictionary mdict = buildDictionary(parseSamples, rules, mlParams);
parseSamples.reset();
@@ -298,13 +298,13 @@ public class Parser extends AbstractBott
// tag
POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(parseSamples),
- mlParams.getParameters("tagger"), null, null); // <- pass on name space corrected TrainingParameters ...
+ mlParams.getParameters("tagger"), null, null);
parseSamples.reset();
// chunk
ChunkerModel chunkModel = ChunkerME.train(languageCode,
- new ChunkSampleStream(parseSamples), // <- pass on name space corrected TrainingParameters ...
+ new ChunkSampleStream(parseSamples),
new ChunkContextGenerator(), mlParams.getParameters("chunker"));
parseSamples.reset();
@@ -315,58 +315,28 @@ public class Parser extends AbstractBott
Map<String, String> checkReportMap = new HashMap<String, String>();
AbstractModel checkModel = TrainUtil.train(kes, mlParams.getSettings("check"), checkReportMap);
mergeReportIntoManifest(manifestInfoEntries, checkReportMap, "check");
-
+
// TODO: Remove cast for HeadRules
return new ParserModel(languageCode, buildModel, checkModel,
posModel, chunkModel, (opennlp.tools.parser.lang.en.HeadRules) rules,
ParserType.CHUNKING, manifestInfoEntries);
}
-
+
public static ParserModel train(String languageCode, ObjectStream<Parse> parseSamples, HeadRules rules, int iterations, int cut)
throws IOException {
- System.err.println("Building dictionary");
- Dictionary mdict = buildDictionary(parseSamples, rules, cut);
-
- parseSamples.reset();
-
- Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- ModelUtil.addCutoffAndIterations(manifestInfoEntries, cut, iterations);
-
- // build
- System.err.println("Training builder");
- opennlp.model.EventStream bes = new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.BUILD, mdict);
- HashSumEventStream hsbes = new HashSumEventStream(bes);
- AbstractModel buildModel = train(hsbes, iterations, cut);
- manifestInfoEntries.put("Training-Builder-Eventhash",
- hsbes.calculateHashSum().toString(16));
-
- parseSamples.reset();
-
- // tag
- POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(parseSamples),
- ModelType.MAXENT, null, null, cut, iterations);
-
- parseSamples.reset();
-
- // chunk
- ChunkerModel chunkModel = ChunkerME.train(languageCode,
- new ChunkSampleStream(parseSamples), cut, iterations,
- new ChunkContextGenerator());
-
- parseSamples.reset();
-
- // check
- System.err.println("Training checker");
- opennlp.model.EventStream kes = new ParserEventStream(parseSamples, rules, ParserEventTypeEnum.CHECK);
- HashSumEventStream hskes = new HashSumEventStream(kes);
- AbstractModel checkModel = train(hskes, iterations, cut);
- manifestInfoEntries.put("Training-Checker-Eventhash",
- hskes.calculateHashSum().toString(16));
-
- // TODO: Remove cast for HeadRules
- return new ParserModel(languageCode, buildModel, checkModel,
- posModel, chunkModel, (opennlp.tools.parser.lang.en.HeadRules) rules,
- ParserType.CHUNKING, manifestInfoEntries);
+ TrainingParameters params = new TrainingParameters();
+ params.put("dict", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+
+ params.put("tagger", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+ params.put("tagger", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ params.put("chunker", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+ params.put("chunker", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ params.put("check", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+ params.put("check", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ params.put("build", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+ params.put("build", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+
+ return train(languageCode, parseSamples, rules, params);
}
}
Modified: incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
URL: http://svn.apache.org/viewvc/incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java?rev=1126943&r1=1126942&r2=1126943&view=diff
==============================================================================
--- incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java (original)
+++ incubator/opennlp/trunk/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java Tue May 24 09:08:13 2011
@@ -443,10 +443,8 @@ public class Parser extends AbstractBott
Map<String, String> manifestInfoEntries = new HashMap<String, String>();
- // TODO: training code should be shared between two parsers
System.err.println("Building dictionary");
- // TODO: Make cutoff configurable ... which cutoff should be used here?
- Dictionary mdict = buildDictionary(parseSamples, rules, 5);
+ Dictionary mdict = buildDictionary(parseSamples, rules, mlParams);
parseSamples.reset();
@@ -500,50 +498,19 @@ public class Parser extends AbstractBott
ObjectStream<Parse> parseSamples, HeadRules rules, int iterations, int cut)
throws IOException {
- // TODO: training code should be shared between two parsers
- System.err.println("Building dictionary");
- Dictionary mdict = buildDictionary(parseSamples, rules, cut);
-
- parseSamples.reset();
-
- // tag
- POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(
- parseSamples), ModelType.MAXENT, null, null, cut, iterations);
-
- parseSamples.reset();
-
- // chunk
- ChunkerModel chunkModel = ChunkerME.train(languageCode, new ChunkSampleStream(
- parseSamples), cut, iterations, new ChunkContextGenerator());
-
- parseSamples.reset();
-
- // build
- System.err.println("Training builder");
- opennlp.model.EventStream bes = new ParserEventStream(parseSamples, rules,
- ParserEventTypeEnum.BUILD, mdict);
- AbstractModel buildModel = train(bes, iterations, cut);
-
- parseSamples.reset();
+ TrainingParameters params = new TrainingParameters();
+ params.put("dict", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
- // check
- System.err.println("Training checker");
- opennlp.model.EventStream kes = new ParserEventStream(parseSamples, rules,
- ParserEventTypeEnum.CHECK);
- AbstractModel checkModel = train(kes, iterations, cut);
-
- parseSamples.reset();
+ params.put("tagger", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+ params.put("tagger", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ params.put("chunker", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+ params.put("chunker", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ params.put("check", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+ params.put("check", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+ params.put("build", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
+ params.put("build", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
- // attach
- System.err.println("Training attacher");
- opennlp.model.EventStream attachEvents = new ParserEventStream(parseSamples, rules,
- ParserEventTypeEnum.ATTACH);
- AbstractModel attachModel = train(attachEvents, iterations, cut);
-
- // TODO: Remove cast for HeadRules
- return new ParserModel(languageCode, buildModel, checkModel,
- attachModel, posModel, chunkModel,
- (opennlp.tools.parser.lang.en.HeadRules) rules, ParserType.TREEINSERT);
+ return train(languageCode, parseSamples, rules, params);
}
@Deprecated