You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/04/20 10:41:16 UTC
[29/50] [abbrv] opennlp git commit: OPENNLP-994: Remove deprecated
methods from the Document Categorizer, this closes apache/opennlp#133
OPENNLP-994: Remove deprecated methods from the Document Categorizer, this closes apache/opennlp#133
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/c6ecbf24
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/c6ecbf24
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/c6ecbf24
Branch: refs/heads/parser_regression
Commit: c6ecbf243c7ab63f83d7c2267e052552bc6672f9
Parents: ede6901
Author: smarthi <sm...@apache.org>
Authored: Mon Feb 27 17:23:40 2017 -0500
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu Apr 20 12:40:22 2017 +0200
----------------------------------------------------------------------
.../doccat/DoccatCrossValidatorTool.java | 7 +-
.../tools/cmdline/doccat/DoccatTool.java | 11 +-
.../tools/cmdline/doccat/DoccatTrainerTool.java | 5 +-
.../opennlp/tools/doccat/DoccatFactory.java | 93 +----------------
.../tools/doccat/DocumentCategorizer.java | 54 ++--------
.../doccat/DocumentCategorizerEvaluator.java | 2 +-
.../tools/doccat/DocumentCategorizerME.java | 101 ++-----------------
.../opennlp/tools/doccat/DocumentSample.java | 6 --
.../formats/LeipzigDoccatSampleStream.java | 19 ++--
.../tools/doccat/DocumentCategorizerMETest.java | 18 ++--
.../tools/doccat/DocumentCategorizerNBTest.java | 17 ++--
.../tools/doccat/DocumentSampleTest.java | 4 +-
.../doccat/AbstractDocumentCategorizer.java | 29 +++---
.../java/opennlp/uima/util/AnnotatorUtil.java | 6 +-
14 files changed, 66 insertions(+), 306 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
index f0f1712..a73aba7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatCrossValidatorTool.java
@@ -36,7 +36,6 @@ import opennlp.tools.doccat.DoccatEvaluationMonitor;
import opennlp.tools.doccat.DoccatFactory;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.doccat.FeatureGenerator;
-import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.eval.EvaluationMonitor;
import opennlp.tools.util.model.ModelUtil;
@@ -84,16 +83,12 @@ public final class DoccatCrossValidatorTool extends
FeatureGenerator[] featureGenerators = DoccatTrainerTool
.createFeatureGenerators(params.getFeatureGenerators());
- Tokenizer tokenizer = DoccatTrainerTool.createTokenizer(params
- .getTokenizer());
-
DoccatEvaluationMonitor[] listenersArr = listeners
.toArray(new DoccatEvaluationMonitor[listeners.size()]);
DoccatCrossValidator validator;
try {
- DoccatFactory factory = DoccatFactory.create(params.getFactory(),
- tokenizer, featureGenerators);
+ DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators);
validator = new DoccatCrossValidator(params.getLang(), mlParams,
factory, listenersArr);
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
index a01d354..49a640c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTool.java
@@ -28,6 +28,7 @@ import opennlp.tools.cmdline.SystemInputStreamFactory;
import opennlp.tools.doccat.DoccatModel;
import opennlp.tools.doccat.DocumentCategorizerME;
import opennlp.tools.doccat.DocumentSample;
+import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.ParagraphStream;
import opennlp.tools.util.PlainTextByLineStream;
@@ -36,7 +37,7 @@ public class DoccatTool extends BasicCmdLineTool {
@Override
public String getShortDescription() {
- return "learnable document categorizer";
+ return "learned document categorizer";
}
@Override
@@ -53,7 +54,7 @@ public class DoccatTool extends BasicCmdLineTool {
DoccatModel model = new DoccatModelLoader().load(new File(args[0]));
- DocumentCategorizerME doccat = new DocumentCategorizerME(model);
+ DocumentCategorizerME documentCategorizerME = new DocumentCategorizerME(model);
/*
* moved initialization to the try block to catch new IOException
@@ -68,10 +69,10 @@ public class DoccatTool extends BasicCmdLineTool {
new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
String document;
while ((document = documentStream.read()) != null) {
- String[] tokens = model.getFactory().getTokenizer().tokenize(document);
+ String[] tokens = WhitespaceTokenizer.INSTANCE.tokenize(document);
- double[] prob = doccat.categorize(tokens);
- String category = doccat.getBestCategory(prob);
+ double[] prob = documentCategorizerME.categorize(tokens);
+ String category = documentCategorizerME.getBestCategory(prob);
DocumentSample sample = new DocumentSample(category, tokens);
System.out.println(sample.toString());
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
index 6ef5d88..8ebb5a8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/doccat/DoccatTrainerTool.java
@@ -66,12 +66,9 @@ public class DoccatTrainerTool
FeatureGenerator[] featureGenerators = createFeatureGenerators(params
.getFeatureGenerators());
- Tokenizer tokenizer = createTokenizer(params.getTokenizer());
-
DoccatModel model;
try {
- DoccatFactory factory = DoccatFactory.create(params.getFactory(),
- tokenizer, featureGenerators);
+ DoccatFactory factory = DoccatFactory.create(params.getFactory(), featureGenerators);
model = DocumentCategorizerME.train(params.getLang(), sampleStream,
mlParams, factory);
} catch (IOException e) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
index a6c815b..babab7c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
@@ -22,8 +22,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
-import opennlp.tools.tokenize.Tokenizer;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.BaseToolFactory;
import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ext.ExtensionLoader;
@@ -34,47 +32,17 @@ import opennlp.tools.util.ext.ExtensionLoader;
public class DoccatFactory extends BaseToolFactory {
private static final String FEATURE_GENERATORS = "doccat.featureGenerators";
- private static final String TOKENIZER_NAME = "doccat.tokenizer";
private FeatureGenerator[] featureGenerators;
- private Tokenizer tokenizer;
/**
* Creates a {@link DoccatFactory} that provides the default implementation of
* the resources.
*/
- public DoccatFactory() {
- this.tokenizer = WhitespaceTokenizer.INSTANCE;
- }
+ public DoccatFactory() {}
public DoccatFactory(final FeatureGenerator[] featureGenerators) {
- this.tokenizer = WhitespaceTokenizer.INSTANCE;
- this.featureGenerators = featureGenerators;
- }
-
- /**
- * Creates a {@link DoccatFactory}. Use this constructor to programmatically
- * create a factory.
- *
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- * @param tokenizer the tokenizer
- * @param featureGenerators the feature generators
- */
- @Deprecated
- public DoccatFactory(Tokenizer tokenizer, FeatureGenerator[] featureGenerators) {
- this.init(tokenizer, featureGenerators);
- }
-
- /**
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- * @param tokenizer the tokenizer
- * @param featureGenerators feature generators
- */
- @Deprecated
- protected void init(Tokenizer tokenizer, FeatureGenerator[] featureGenerators) {
-
this.featureGenerators = featureGenerators;
- this.tokenizer = tokenizer;
}
protected void init(FeatureGenerator[] featureGenerators) {
@@ -85,11 +53,6 @@ public class DoccatFactory extends BaseToolFactory {
public Map<String, String> createManifestEntries() {
Map<String, String> manifestEntries = super.createManifestEntries();
- if (getTokenizer() != null) {
- manifestEntries.put(TOKENIZER_NAME, getTokenizer().getClass()
- .getCanonicalName());
- }
-
if (getFeatureGenerators() != null) {
manifestEntries.put(FEATURE_GENERATORS, featureGeneratorsAsString());
}
@@ -115,31 +78,6 @@ public class DoccatFactory extends BaseToolFactory {
// nothing to validate
}
- /**
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- */
- @Deprecated
- public static DoccatFactory create(String subclassName, Tokenizer tokenizer,
- FeatureGenerator[] featureGenerators) throws InvalidFormatException {
- if (subclassName == null) {
- // will create the default factory
- return new DoccatFactory(tokenizer, featureGenerators);
- }
- try {
- DoccatFactory theFactory = ExtensionLoader.instantiateExtension(
- DoccatFactory.class, subclassName);
- theFactory.init(tokenizer, featureGenerators);
- return theFactory;
- } catch (Exception e) {
- String msg = "Could not instantiate the " + subclassName
- + ". The initialization throw an exception.";
- System.err.println(msg);
- e.printStackTrace();
- throw new InvalidFormatException(msg, e);
- }
-
- }
-
public static DoccatFactory create(String subclassName, FeatureGenerator[] featureGenerators)
throws InvalidFormatException {
if (subclassName == null) {
@@ -192,33 +130,4 @@ public class DoccatFactory extends BaseToolFactory {
this.featureGenerators = featureGenerators;
}
- /**
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- */
- @Deprecated
- public Tokenizer getTokenizer() {
- if (this.tokenizer == null) {
- if (artifactProvider != null) {
- String className = artifactProvider.getManifestProperty(TOKENIZER_NAME);
- if (className != null) {
- this.tokenizer = ExtensionLoader.instantiateExtension(
- Tokenizer.class, className);
- }
- }
- if (this.tokenizer == null) { // could not load using artifact provider
- this.tokenizer = WhitespaceTokenizer.INSTANCE;
- }
- }
- return tokenizer;
- }
-
- /**
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- * @param tokenizer tokenizer
- */
- @Deprecated
- public void setTokenizer(Tokenizer tokenizer) {
- this.tokenizer = tokenizer;
- }
-
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
index 88bf8f9..b180549 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
@@ -27,23 +27,21 @@ import java.util.SortedMap;
public interface DocumentCategorizer {
/**
- * Categorizes the given text, provided in separate tokens.
+ * Categorize the given text provided as tokens along with
+ * the provided extra information
*
* @param text the tokens of text to categorize
+ * @param extraInformation extra information
* @return per category probabilities
*/
- double[] categorize(String[] text);
+ double[] categorize(String[] text, Map<String, Object> extraInformation);
/**
* Categorizes the given text, provided in separate tokens.
- *
- * @param text the tokens of text to categorize
- * @param extraInformation optional extra information to pass for evaluation
+ * @param text the tokens of text to categorize
* @return per category probabilities
- * @deprecated will be removed after 1.7.1 release. Don't use it.
*/
- @Deprecated
- double[] categorize(String[] text, Map<String, Object> extraInformation);
+ double[] categorize(String[] text);
/**
* get the best category from previously generated outcome probabilities
@@ -77,25 +75,6 @@ public interface DocumentCategorizer {
int getNumberOfCategories();
/**
- * categorize a piece of text
- *
- * @param documentText the text to categorize
- * @return the probabilities of each category (sum up to 1)
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- */
- @Deprecated
- double[] categorize(String documentText);
-
- /**
- * categorize a piece of text, providing extra metadata.
- *
- * @param documentText the text to categorize
- * @param extraInformation extra metadata
- * @return the probabilities of each category (sum up to 1)
- */
- double[] categorize(String documentText, Map<String, Object> extraInformation);
-
- /**
* get the name of the category associated with the given probabilties
*
* @param results the probabilities of each category
@@ -108,16 +87,6 @@ public interface DocumentCategorizer {
*
* @param text the input text to classify
* @return a map with the score as a key. The value is a Set of categories with the score.
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- */
- @Deprecated
- Map<String, Double> scoreMap(String text);
-
- /**
- * Returns a map in which the key is the category name and the value is the score
- *
- * @param text the input text to classify
- * @return a map with the score as a key. The value is a Set of categories with the score.
*/
Map<String, Double> scoreMap(String[] text);
@@ -127,17 +96,6 @@ public interface DocumentCategorizer {
*
* @param text the input text to classify
* @return a map with the score as a key. The value is a Set of categories with the score.
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- */
- @Deprecated
- SortedMap<Double, Set<String>> sortedScoreMap(String text);
-
- /**
- * Get a map of the scores sorted in ascending aorder together with their associated categories.
- * Many categories can have the same score, hence the Set as value
- *
- * @param text the input text to classify
- * @return a map with the score as a key. The value is a Set of categories with the score.
*/
SortedMap<Double, Set<String>> sortedScoreMap(String[] text);
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
index 63e0768..c501280 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
@@ -59,7 +59,7 @@ public class DocumentCategorizerEvaluator extends Evaluator<DocumentSample> {
String[] document = sample.getText();
- double[] probs = categorizer.categorize(document, sample.getExtraInformation());
+ double[] probs = categorizer.categorize(document);
String cat = categorizer.getBestCategory(probs);
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
index e743b9d..9dc41d7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
@@ -29,8 +29,6 @@ import java.util.TreeMap;
import opennlp.tools.ml.EventTrainer;
import opennlp.tools.ml.TrainerFactory;
import opennlp.tools.ml.model.MaxentModel;
-import opennlp.tools.tokenize.SimpleTokenizer;
-import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
@@ -48,22 +46,6 @@ public class DocumentCategorizerME implements DocumentCategorizer {
private DocumentCategorizerContextGenerator mContextGenerator;
/**
- * Initializes the current instance with a doccat model and custom feature
- * generation. The feature generation must be identical to the configuration
- * at training time.
- *
- * @param model the doccat model
- * @param featureGenerators the feature generators
- * @deprecated train a {@link DoccatModel} with a specific
- * {@link DoccatFactory} to customize the {@link FeatureGenerator}s
- */
- @Deprecated
- public DocumentCategorizerME(DoccatModel model, FeatureGenerator... featureGenerators) {
- this.model = model;
- this.mContextGenerator = new DocumentCategorizerContextGenerator(featureGenerators);
- }
-
- /**
* Initializes the current instance with a doccat model. Default feature
* generation is used.
*
@@ -75,6 +57,13 @@ public class DocumentCategorizerME implements DocumentCategorizer {
.getFactory().getFeatureGenerators());
}
+ /**
+ * Categorize the given text provided as tokens along with
+ * the provided extra information
+ *
+ * @param text text tokens to categorize
+ * @param extraInformation additional information
+ */
@Override
public double[] categorize(String[] text, Map<String, Object> extraInformation) {
return model.getMaxentModel().eval(
@@ -83,58 +72,15 @@ public class DocumentCategorizerME implements DocumentCategorizer {
/**
* Categorizes the given text.
+ *
* @param text the text to categorize
*/
+ @Override
public double[] categorize(String[] text) {
return this.categorize(text, Collections.emptyMap());
}
/**
- * Categorizes the given text. The Tokenizer is obtained from
- * {@link DoccatFactory#getTokenizer()} and defaults to
- * {@link SimpleTokenizer}.
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- */
- @Deprecated
- @Override
- public double[] categorize(String documentText,
- Map<String, Object> extraInformation) {
- Tokenizer tokenizer = model.getFactory().getTokenizer();
- return categorize(tokenizer.tokenize(documentText), extraInformation);
- }
-
- /**
- * Categorizes the given text. The text is tokenized with the SimpleTokenizer
- * before it is passed to the feature generation.
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- */
- @Deprecated
- public double[] categorize(String documentText) {
- Tokenizer tokenizer = model.getFactory().getTokenizer();
- return categorize(tokenizer.tokenize(documentText), Collections.emptyMap());
- }
-
- /**
- * Returns a map in which the key is the category name and the value is the score
- *
- * @param text the input text to classify
- * @return the score map
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- */
- @Deprecated
- public Map<String, Double> scoreMap(String text) {
- Map<String, Double> probDist = new HashMap<>();
-
- double[] categorize = categorize(text);
- int catSize = getNumberOfCategories();
- for (int i = 0; i < catSize; i++) {
- String category = getCategory(i);
- probDist.put(category, categorize[getIndex(category)]);
- }
- return probDist;
- }
-
- /**
* Returns a map in which the key is the category name and the value is the score
*
* @param text the input text to classify
@@ -160,35 +106,6 @@ public class DocumentCategorizerME implements DocumentCategorizer {
*
* @param text the input text to classify
* @return the sorted score map
- * @deprecated will be removed after 1.7.1 release. Don't use it.
- */
- @Deprecated
- @Override
- public SortedMap<Double, Set<String>> sortedScoreMap(String text) {
- SortedMap<Double, Set<String>> descendingMap = new TreeMap<>();
- double[] categorize = categorize(text);
- int catSize = getNumberOfCategories();
- for (int i = 0; i < catSize; i++) {
- String category = getCategory(i);
- double score = categorize[getIndex(category)];
- if (descendingMap.containsKey(score)) {
- descendingMap.get(score).add(category);
- } else {
- Set<String> newset = new HashSet<>();
- newset.add(category);
- descendingMap.put(score, newset);
- }
- }
- return descendingMap;
- }
-
- /**
- * Returns a map with the score as a key in ascending order.
- * The value is a Set of categories with the score.
- * Many categories can have the same score, hence the Set as value
- *
- * @param text the input text to classify
- * @return the sorted score map
*/
@Override
public SortedMap<Double, Set<String>> sortedScoreMap(String[] text) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
index 3d107fa..adddc27 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
@@ -24,8 +24,6 @@ import java.util.List;
import java.util.Map;
import java.util.Objects;
-import opennlp.tools.tokenize.WhitespaceTokenizer;
-
/**
* Class which holds a classified document and its category.
*/
@@ -35,10 +33,6 @@ public class DocumentSample {
private final List<String> text;
private final Map<String, Object> extraInformation;
- public DocumentSample(String category, String text) {
- this(category, WhitespaceTokenizer.INSTANCE.tokenize(text));
- }
-
public DocumentSample(String category, String[] text) {
this(category, text, null);
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 1ca0484..8ed0036 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -20,6 +20,9 @@ package opennlp.tools.formats;
import java.io.IOException;
import java.io.PrintStream;
import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
import opennlp.tools.doccat.DocumentSample;
import opennlp.tools.tokenize.SimpleTokenizer;
@@ -36,7 +39,7 @@ import opennlp.tools.util.PlainTextByLineStream;
* <p>
* The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
* by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.
+ exactly the same tokenization during testing and training.
*/
public class LeipzigDoccatSampleStream extends
FilterObjectStream<String, DocumentSample> {
@@ -79,10 +82,8 @@ public class LeipzigDoccatSampleStream extends
}
public DocumentSample read() throws IOException {
-
int count = 0;
-
- StringBuilder sampleText = new StringBuilder();
+ List<String> tokensList = new ArrayList<>();
String line;
while (count < sentencesPerDocument && (line = samples.read()) != null) {
@@ -94,17 +95,13 @@ public class LeipzigDoccatSampleStream extends
}
// Always skip first token, that is the sentence number!
- for (int i = 1; i < tokens.length; i++) {
- sampleText.append(tokens[i]);
- sampleText.append(' ');
- }
+ tokensList.addAll(Arrays.asList(tokens).subList(1, tokens.length));
count++;
}
-
- if (sampleText.length() > 0) {
- return new DocumentSample(language, sampleText.toString());
+ if (tokensList.size() > 0) {
+ return new DocumentSample(language, tokensList.toArray(new String[tokensList.size()]));
}
return null;
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
index 6389530..220df87 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
@@ -42,27 +42,23 @@ public class DocumentCategorizerMETest {
new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"}));
TrainingParameters params = new TrainingParameters();
- params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
- params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
+ params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+ params.put(TrainingParameters.CUTOFF_PARAM, "0");
DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,
params, new DoccatFactory());
DocumentCategorizer doccat = new DocumentCategorizerME(model);
- double[] aProbs = doccat.categorize("a");
+ double[] aProbs = doccat.categorize(new String[]{"a"});
Assert.assertEquals("1", doccat.getBestCategory(aProbs));
- double[] bProbs = doccat.categorize("x");
+ double[] bProbs = doccat.categorize(new String[]{"x"});
Assert.assertEquals("0", doccat.getBestCategory(bProbs));
//test to make sure sorted map's last key is cat 1 because it has the highest score.
- SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap("a");
- for (String cat : sortedScoreMap.get(sortedScoreMap.lastKey())) {
- Assert.assertEquals("1", cat);
- break;
- }
- System.out.println("");
-
+ SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"});
+ Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey());
+ Assert.assertEquals(1, cat.size());
}
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
index de3f098..0847690 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
@@ -44,8 +44,8 @@ public class DocumentCategorizerNBTest {
new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"}));
TrainingParameters params = new TrainingParameters();
- params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
- params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
+ params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+ params.put(TrainingParameters.CUTOFF_PARAM, "0");
params.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE);
DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,
@@ -53,19 +53,16 @@ public class DocumentCategorizerNBTest {
DocumentCategorizer doccat = new DocumentCategorizerME(model);
- double[] aProbs = doccat.categorize("a");
+ double[] aProbs = doccat.categorize(new String[]{"a"});
Assert.assertEquals("1", doccat.getBestCategory(aProbs));
- double[] bProbs = doccat.categorize("x");
+ double[] bProbs = doccat.categorize(new String[]{"x"});
Assert.assertEquals("0", doccat.getBestCategory(bProbs));
//test to make sure sorted map's last key is cat 1 because it has the highest score.
- SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap("a");
- for (String cat : sortedScoreMap.get(sortedScoreMap.lastKey())) {
- Assert.assertEquals("1", cat);
- break;
- }
- System.out.println("");
+ SortedMap<Double, Set<String>> sortedScoreMap = doccat.sortedScoreMap(new String[]{"a"});
+ Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey());
+ Assert.assertEquals(1, cat.size());
}
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java
index 232158b..8cf8fef 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentSampleTest.java
@@ -31,11 +31,11 @@ public class DocumentSampleTest {
}
public static DocumentSample createGoldSample() {
- return new DocumentSample("aCategory", "a small text");
+ return new DocumentSample("aCategory", new String[]{"a", "small", "text"});
}
public static DocumentSample createPredSample() {
- return new DocumentSample("anotherCategory", "a small text");
+ return new DocumentSample("anotherCategory", new String[]{"a", "small", "text"});
}
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
----------------------------------------------------------------------
diff --git a/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java b/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
index db9c075..4b49dca 100644
--- a/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
+++ b/opennlp-uima/src/main/java/opennlp/uima/doccat/AbstractDocumentCategorizer.java
@@ -17,12 +17,17 @@
package opennlp.uima.doccat;
+import java.util.ArrayList;
+import java.util.List;
+
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CAS;
+import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
+import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.ResourceAccessException;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.Level;
@@ -72,29 +77,25 @@ abstract class AbstractDocumentCategorizer extends CasAnnotator_ImplBase {
mCategorizer = new DocumentCategorizerME(model);
}
- public void typeSystemInit(TypeSystem typeSystem)
- throws AnalysisEngineProcessException {
+ public void typeSystemInit(TypeSystem typeSystem) throws AnalysisEngineProcessException {
mTokenType = AnnotatorUtil.getRequiredTypeParameter(context, typeSystem,
- UimaUtil.SENTENCE_TYPE_PARAMETER);
+ UimaUtil.TOKEN_TYPE_PARAMETER);
}
protected abstract void setBestCategory(CAS cas, String bestCategory);
public void process(CAS cas) {
- double[] result;
-
- if (mTokenType != null) {
- // TODO:
- // count tokens
- // create token array
- // pass array to doccat
- // create result annotation
- result = mCategorizer.categorize(cas.getDocumentText());
- } else {
- result = mCategorizer.categorize(cas.getDocumentText());
+ FSIterator<AnnotationFS> tokenAnnotations = cas.getAnnotationIndex(mTokenType).iterator();
+ List<String> tokensList = new ArrayList<>();
+
+ while (tokenAnnotations.hasNext()) {
+ tokensList.add(tokenAnnotations.next().getCoveredText());
}
+ double[] result =
+ mCategorizer.categorize(tokensList.toArray(new String[tokensList.size()]));
+
String bestCategory = mCategorizer.getBestCategory(result);
setBestCategory(cas, bestCategory);
http://git-wip-us.apache.org/repos/asf/opennlp/blob/c6ecbf24/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java
----------------------------------------------------------------------
diff --git a/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java b/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java
index 8847107..730d6be 100644
--- a/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java
+++ b/opennlp-uima/src/main/java/opennlp/uima/util/AnnotatorUtil.java
@@ -329,8 +329,7 @@ public final class AnnotatorUtil {
} else {
throw new ResourceInitializationException(
ExceptionMessages.MESSAGE_CATALOG,
- ExceptionMessages.WRONG_PARAMETER_TYPE, new Object[] {parameter,
- "String array"});
+ ExceptionMessages.WRONG_PARAMETER_TYPE, new Object[] {parameter, "String array"});
}
}
@@ -443,8 +442,7 @@ public final class AnnotatorUtil {
if (inResource == null) {
throw new ResourceInitializationException(
ExceptionMessages.MESSAGE_CATALOG,
- ExceptionMessages.IO_ERROR_MODEL_READING, new Object[] {name
- + " could not be found!"});
+ ExceptionMessages.IO_ERROR_MODEL_READING, new Object[] {name + " could not be found!"});
}
return inResource;