You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2016/12/23 17:47:23 UTC
[6/7] opennlp git commit: OPENNLP-871: Cleanup for Java 8 and remove deprecated tokenizer code
OPENNLP-871: Cleanup for Java 8 and remove deprecated tokenizer code
Closes #17
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/c49a87ab
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/c49a87ab
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/c49a87ab
Branch: refs/heads/897
Commit: c49a87abdb6c07a123a2234f4ec5ca3d21306d69
Parents: 99323ad
Author: smarthi <sm...@apache.org>
Authored: Thu Dec 22 22:39:33 2016 -0500
Committer: Kottmann <jo...@apache.org>
Committed: Fri Dec 23 16:46:42 2016 +0100
----------------------------------------------------------------------
.../tools/cmdline/chunker/ChunkerMETool.java | 6 +-
.../opennlp/tools/dictionary/Dictionary.java | 7 +-
.../tools/tokenize/TokenizerCrossValidator.java | 36 --------
.../tools/tokenize/TokenizerFactory.java | 22 ++---
.../opennlp/tools/tokenize/TokenizerME.java | 89 --------------------
.../opennlp/tools/tokenize/TokenizerModel.java | 53 ------------
.../opennlp/tools/eval/Conll00ChunkerEval.java | 3 +-
7 files changed, 13 insertions(+), 203 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java
index 5b87c9e..b511a0b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/chunker/ChunkerMETool.java
@@ -69,11 +69,9 @@ public class ChunkerMETool extends BasicCmdLineTool {
continue;
}
- String[] chunks = chunker.chunk(posSample.getSentence(),
- posSample.getTags());
+ String[] chunks = chunker.chunk(posSample.getSentence(), posSample.getTags());
- System.out.println(new ChunkSample(posSample.getSentence(),
- posSample.getTags(), chunks).nicePrint());
+ System.out.println(new ChunkSample(posSample.getSentence(), posSample.getTags(), chunks).nicePrint());
perfMon.incrementCounter();
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java b/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java
index 58b7a6e..4961741 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/dictionary/Dictionary.java
@@ -31,7 +31,6 @@ import java.util.StringTokenizer;
import opennlp.tools.dictionary.serializer.Attributes;
import opennlp.tools.dictionary.serializer.DictionarySerializer;
import opennlp.tools.dictionary.serializer.Entry;
-import opennlp.tools.dictionary.serializer.EntryInserter;
import opennlp.tools.util.StringList;
import opennlp.tools.util.StringUtil;
@@ -113,11 +112,7 @@ public class Dictionary implements Iterable<StringList> {
* @throws IOException
*/
public Dictionary(InputStream in) throws IOException {
- isCaseSensitive = DictionarySerializer.create(in, new EntryInserter() {
- public void insert(Entry entry) {
- put(entry.getTokens());
- }
- });
+ isCaseSensitive = DictionarySerializer.create(in, entry -> put(entry.getTokens()));
}
/**
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
index 3ca3c1d..fe9e4c6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerCrossValidator.java
@@ -18,13 +18,10 @@
package opennlp.tools.tokenize;
import java.io.IOException;
-
-import opennlp.tools.dictionary.Dictionary;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.CrossValidationPartitioner;
import opennlp.tools.util.eval.FMeasure;
-import opennlp.tools.util.model.ModelUtil;
public class TokenizerCrossValidator {
@@ -42,39 +39,6 @@ public class TokenizerCrossValidator {
}
/**
- * @deprecated use
- * {@link #TokenizerCrossValidator(TrainingParameters, TokenizerFactory, TokenizerEvaluationMonitor...)}
- * instead and pass in a {@link TokenizerFactory}
- */
- public TokenizerCrossValidator(String language, Dictionary abbreviations,
- boolean alphaNumericOptimization, TrainingParameters params,
- TokenizerEvaluationMonitor ... listeners) {
- this(params, new TokenizerFactory(language, abbreviations,
- alphaNumericOptimization, null), listeners);
- }
-
- /**
- * @deprecated use
- * {@link #TokenizerCrossValidator(TrainingParameters, TokenizerFactory, TokenizerEvaluationMonitor...)}
- * instead and pass in a {@link TokenizerFactory}
- */
- public TokenizerCrossValidator(String language, boolean alphaNumericOptimization) {
- this(language, alphaNumericOptimization, ModelUtil.createDefaultTrainingParameters());
- }
-
- /**
- * @deprecated use
- * {@link #TokenizerCrossValidator(TrainingParameters, TokenizerFactory, TokenizerEvaluationMonitor...)}
- * instead and pass in a {@link TokenizerFactory}
- */
- public TokenizerCrossValidator(String language,
- boolean alphaNumericOptimization, TrainingParameters params,
- TokenizerEvaluationMonitor ... listeners) {
- this(language, null, alphaNumericOptimization, params, listeners);
- }
-
-
- /**
* Starts the evaluation.
*
* @param samples
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
index f9e789a..ffa793b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerFactory.java
@@ -37,7 +37,7 @@ public class TokenizerFactory extends BaseToolFactory {
private String languageCode;
private Dictionary abbreviationDictionary;
- private Boolean useAlphaNumericOptimization;
+ private Boolean useAlphaNumericOptimization = false;
private Pattern alphaNumericPattern;
private static final String ABBREVIATIONS_ENTRY_NAME = "abbreviations.dictionary";
@@ -112,13 +112,11 @@ public class TokenizerFactory extends BaseToolFactory {
public Map<String, String> createManifestEntries() {
Map<String, String> manifestEntries = super.createManifestEntries();
- manifestEntries.put(USE_ALPHA_NUMERIC_OPTIMIZATION,
- Boolean.toString(isUseAlphaNumericOptmization()));
+ manifestEntries.put(USE_ALPHA_NUMERIC_OPTIMIZATION, Boolean.toString(isUseAlphaNumericOptmization()));
// alphanumeric pattern is optional
if (getAlphaNumericPattern() != null)
- manifestEntries.put(ALPHA_NUMERIC_PATTERN, getAlphaNumericPattern()
- .pattern());
+ manifestEntries.put(ALPHA_NUMERIC_PATTERN, getAlphaNumericPattern().pattern());
return manifestEntries;
}
@@ -167,9 +165,8 @@ public class TokenizerFactory extends BaseToolFactory {
*/
public Pattern getAlphaNumericPattern() {
if (this.alphaNumericPattern == null) {
- if (artifactProvider != null) {
- String prop = this.artifactProvider
- .getManifestProperty(ALPHA_NUMERIC_PATTERN);
+ if (this.artifactProvider != null) {
+ String prop = this.artifactProvider.getManifestProperty(ALPHA_NUMERIC_PATTERN);
if (prop != null) {
this.alphaNumericPattern = Pattern.compile(prop);
}
@@ -189,8 +186,8 @@ public class TokenizerFactory extends BaseToolFactory {
* @return true if the alpha numeric optimization is enabled, otherwise false
*/
public boolean isUseAlphaNumericOptmization() {
- if (this.useAlphaNumericOptimization == null && artifactProvider != null) {
- this.useAlphaNumericOptimization = Boolean.valueOf(artifactProvider
+ if (artifactProvider != null) {
+ this.useAlphaNumericOptimization = Boolean.valueOf(this.artifactProvider
.getManifestProperty(USE_ALPHA_NUMERIC_OPTIMIZATION));
}
return this.useAlphaNumericOptimization;
@@ -203,8 +200,7 @@ public class TokenizerFactory extends BaseToolFactory {
*/
public Dictionary getAbbreviationDictionary() {
if (this.abbreviationDictionary == null && artifactProvider != null) {
- this.abbreviationDictionary = artifactProvider
- .getArtifact(ABBREVIATIONS_ENTRY_NAME);
+ this.abbreviationDictionary = this.artifactProvider.getArtifact(ABBREVIATIONS_ENTRY_NAME);
}
return this.abbreviationDictionary;
}
@@ -215,7 +211,7 @@ public class TokenizerFactory extends BaseToolFactory {
* @return the language code
*/
public String getLanguageCode() {
- if (this.languageCode == null && artifactProvider != null) {
+ if (this.languageCode == null && this.artifactProvider != null) {
this.languageCode = this.artifactProvider.getLanguage();
}
return this.languageCode;
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
index 4c4c638..491b6fa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerME.java
@@ -34,7 +34,6 @@ import opennlp.tools.tokenize.lang.Factory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
import opennlp.tools.util.TrainingParameters;
-import opennlp.tools.util.model.ModelUtil;
/**
* A Tokenizer for converting raw text into separated tokens. It uses
@@ -254,94 +253,6 @@ public class TokenizerME extends AbstractTokenizer {
}
/**
- * Trains a model for the {@link TokenizerME}.
- *
- * @param languageCode the language of the natural text
- * @param samples the samples used for the training.
- * @param useAlphaNumericOptimization - if true alpha numerics are skipped
- * @param mlParams the machine learning train parameters
- *
- * @return the trained {@link TokenizerModel}
- *
- * @throws IOException it throws an {@link IOException} if an {@link IOException}
- * is thrown during IO operations on a temp file which is created during training.
- * Or if reading from the {@link ObjectStream} fails.
- *
- * @deprecated Use
- * {@link #train(ObjectStream, TokenizerFactory, TrainingParameters)}
- * and pass in a {@link TokenizerFactory}
- */
- public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
- boolean useAlphaNumericOptimization, TrainingParameters mlParams) throws IOException {
- return train(languageCode, samples, null, useAlphaNumericOptimization,
- mlParams);
- }
-
- /**
- * Trains a model for the {@link TokenizerME}.
- *
- * @param languageCode the language of the natural text
- * @param samples the samples used for the training.
- * @param abbreviations an abbreviations dictionary
- * @param useAlphaNumericOptimization - if true alpha numerics are skipped
- * @param mlParams the machine learning train parameters
- *
- * @return the trained {@link TokenizerModel}
- *
- * @throws IOException it throws an {@link IOException} if an {@link IOException}
- * is thrown during IO operations on a temp file which is created during training.
- * Or if reading from the {@link ObjectStream} fails.
- *
- * @deprecated Use
- * {@link #train(ObjectStream, TokenizerFactory, TrainingParameters)}
- * and pass in a {@link TokenizerFactory}
- */
- public static TokenizerModel train(String languageCode,
- ObjectStream<TokenSample> samples, Dictionary abbreviations,
- boolean useAlphaNumericOptimization, TrainingParameters mlParams)
- throws IOException {
- Factory factory = new Factory();
-
- Map<String, String> manifestInfoEntries = new HashMap<>();
-
- ObjectStream<Event> eventStream = new TokSpanEventStream(samples,
- useAlphaNumericOptimization, factory.getAlphanumeric(languageCode),
- factory.createTokenContextGenerator(languageCode,
- getAbbreviations(abbreviations)));
-
- EventTrainer trainer = TrainerFactory.getEventTrainer(
- mlParams.getSettings(), manifestInfoEntries);
-
- MaxentModel maxentModel = trainer.train(eventStream);
-
- return new TokenizerModel(languageCode, maxentModel, abbreviations,
- useAlphaNumericOptimization, manifestInfoEntries);
- }
-
-
- /**
- * Trains a model for the {@link TokenizerME} with a default cutoff of 5 and 100 iterations.
- *
- * @param languageCode the language of the natural text
- * @param samples the samples used for the training.
- * @param useAlphaNumericOptimization - if true alpha numerics are skipped
- *
- * @return the trained {@link TokenizerModel}
- *
- * @throws IOException it throws an {@link IOException} if an {@link IOException}
- * is thrown during IO operations on a temp file which is
- *
- * @deprecated Use
- * {@link #train(ObjectStream, TokenizerFactory, TrainingParameters)}
- * and pass in a {@link TokenizerFactory}
- */
- public static TokenizerModel train(String languageCode, ObjectStream<TokenSample> samples,
- boolean useAlphaNumericOptimization) throws IOException {
- return train(samples, TokenizerFactory.create(null, languageCode, null, useAlphaNumericOptimization, null),
- ModelUtil.createDefaultTrainingParameters());
- }
-
- /**
* Returns the value of the alpha-numeric optimization flag.
*
* @return true if the tokenizer should use alpha-numeric optimization, false otherwise.
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
index e63b946..ed84b4e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
@@ -66,59 +66,6 @@ public final class TokenizerModel extends BaseModel {
/**
* Initializes the current instance.
*
- * @param language the language the tokenizer should use
- * @param tokenizerMaxentModel the statistical model of the tokenizer
- * @param abbreviations the dictionary containing the abbreviations
- * @param useAlphaNumericOptimization if true alpha numeric optimization is enabled, otherwise not
- * @param manifestInfoEntries the additional meta data which should be written into manifest
- *
- * @deprecated Use
- * {@link TokenizerModel#TokenizerModel(MaxentModel, Map, TokenizerFactory)}
- * instead and pass in a {@link TokenizerFactory}.
- */
- public TokenizerModel(String language, MaxentModel tokenizerMaxentModel,
- Dictionary abbreviations, boolean useAlphaNumericOptimization,
- Map<String, String> manifestInfoEntries) {
- this(tokenizerMaxentModel, manifestInfoEntries,
- new TokenizerFactory(language, abbreviations, useAlphaNumericOptimization, null));
- }
-
- /**
- * Initializes the current instance.
- *
- * @param language the language the tokenizer should use
- * @param tokenizerMaxentModel the statistical model of the tokenizer
- * @param useAlphaNumericOptimization if true alpha numeric optimization is enabled, otherwise not
- * @param manifestInfoEntries the additional meta data which should be written into manifest
- *
- * @deprecated Use
- * {@link TokenizerModel#TokenizerModel(MaxentModel, Map, TokenizerFactory)}
- * instead and pass in a {@link TokenizerFactory}.
- */
- public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
- boolean useAlphaNumericOptimization, Map<String, String> manifestInfoEntries) {
- this(language, tokenizerMaxentModel, null, useAlphaNumericOptimization, manifestInfoEntries);
- }
-
- /**
- * Initializes the current instance.
- *
- * @param language the language the tokenizer should use
- * @param tokenizerMaxentModel the statistical model of the tokenizer
- * @param useAlphaNumericOptimization if true alpha numeric optimization is enabled, otherwise not
- *
- * @deprecated Use
- * {@link TokenizerModel#TokenizerModel(MaxentModel, Map, TokenizerFactory)}
- * instead and pass in a {@link TokenizerFactory}.
- */
- public TokenizerModel(String language, AbstractModel tokenizerMaxentModel,
- boolean useAlphaNumericOptimization) {
- this(language, tokenizerMaxentModel, useAlphaNumericOptimization, null);
- }
-
- /**
- * Initializes the current instance.
- *
* @param in the Input Stream to load the model from
*
* @throws IOException if reading from the stream fails in anyway
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c49a87ab/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
index 3b8e060..0ff3002 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
@@ -59,8 +59,7 @@ public class Conll00ChunkerEval {
double expectedFMeasure) throws IOException {
ObjectStream<ChunkSample> samples = new ChunkSampleStream(
- new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData),
- "UTF-8"));
+ new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData),"UTF-8"));
ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model));
evaluator.evaluate(samples);