You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/12/12 13:03:05 UTC
[opennlp] branch master updated: OPENNLP-1408 Enhance JavaDoc in opennlp.tools.doccat package (#451)
This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 6e9b3b4b OPENNLP-1408 Enhance JavaDoc in opennlp.tools.doccat package (#451)
6e9b3b4b is described below
commit 6e9b3b4b8f94f5e20754edbee88612ea59fb6814
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Mon Dec 12 14:02:59 2022 +0100
OPENNLP-1408 Enhance JavaDoc in opennlp.tools.doccat package (#451)
- adds missing JavaDoc
- improves existing documentation for clarity
- removes superfluous text
- adds 'final' modifier where useful and applicable
- adds 'Override' annotation where useful and applicable
- fixes several typos
---
.../tools/doccat/BagOfWordsFeatureGenerator.java | 8 +++
.../opennlp/tools/doccat/DoccatCrossValidator.java | 35 +++++-----
.../tools/doccat/DoccatEvaluationMonitor.java | 2 +-
.../java/opennlp/tools/doccat/DoccatFactory.java | 33 ++++++++--
.../java/opennlp/tools/doccat/DoccatModel.java | 42 ++++++++++++
.../opennlp/tools/doccat/DocumentCategorizer.java | 59 +++++++++--------
.../DocumentCategorizerContextGenerator.java | 11 +++-
.../tools/doccat/DocumentCategorizerEvaluator.java | 27 ++++----
.../doccat/DocumentCategorizerEventStream.java | 31 +++++----
.../tools/doccat/DocumentCategorizerME.java | 74 ++++++++++------------
.../java/opennlp/tools/doccat/DocumentSample.java | 23 ++++++-
.../opennlp/tools/doccat/DocumentSampleStream.java | 16 ++++-
.../opennlp/tools/doccat/FeatureGenerator.java | 8 +--
.../tools/doccat/NGramFeatureGenerator.java | 23 +++----
14 files changed, 246 insertions(+), 146 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
index 51a3277e..fc598c9f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
@@ -32,10 +32,18 @@ public class BagOfWordsFeatureGenerator implements FeatureGenerator {
private final boolean useOnlyAllLetterTokens;
+ /**
+ * Instantiates a default {@link BagOfWordsFeatureGenerator} instance.
+ */
public BagOfWordsFeatureGenerator() {
this(false);
}
+ /**
+ * Instantiates a {@link BagOfWordsFeatureGenerator} instance.
+ *
+ * @param useOnlyAllLetterTokens Whether to use only all-letter tokens, or not.
+ */
BagOfWordsFeatureGenerator(boolean useOnlyAllLetterTokens) {
this.useOnlyAllLetterTokens = useOnlyAllLetterTokens;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
index 106b82d2..ebfe6a09 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatCrossValidator.java
@@ -25,7 +25,7 @@ import opennlp.tools.util.eval.CrossValidationPartitioner;
import opennlp.tools.util.eval.Mean;
/**
- * Cross validator for document categorization
+ * Cross validator for {@link DocumentCategorizer}.
*/
public class DoccatCrossValidator {
@@ -33,16 +33,21 @@ public class DoccatCrossValidator {
private final TrainingParameters params;
- private Mean documentAccuracy = new Mean();
+ private final Mean documentAccuracy = new Mean();
- private DoccatEvaluationMonitor[] listeners;
+ private final DoccatEvaluationMonitor[] listeners;
- private DoccatFactory factory;
+ private final DoccatFactory factory;
/**
- * Creates a {@link DoccatCrossValidator} with the given
- * {@link FeatureGenerator}s.
+ * Instantiates a {@link DoccatCrossValidator} with the
+ * given {@link FeatureGenerator generators}.
+ *
+ * @param languageCode An ISO conform language code.
+ * @param mlParams The {@link TrainingParameters} for the context of cross validation.
+ * @param factory The {@link DoccatFactory} for creating related objects.
+ * @param listeners the {@link DoccatEvaluationMonitor evaluation listeners}.
*/
public DoccatCrossValidator(String languageCode, TrainingParameters mlParams,
DoccatFactory factory, DoccatEvaluationMonitor ... listeners) {
@@ -55,12 +60,10 @@ public class DoccatCrossValidator {
/**
* Starts the evaluation.
*
- * @param samples
- * the data to train and test
- * @param nFolds
- * number of folds
+ * @param samples The {@link ObjectStream} of {@link DocumentSample samples} to train and test with.
+ * @param nFolds Number of folds. It must be greater than zero.
*
- * @throws IOException
+ * @throws IOException Thrown if IO errors occurred.
*/
public void evaluate(ObjectStream<DocumentSample> samples, int nFolds)
throws IOException {
@@ -88,19 +91,15 @@ public class DoccatCrossValidator {
}
/**
- * Retrieves the accuracy for all iterations.
- *
- * @return the word accuracy
+ * @return Retrieves the accuracy for all iterations.
*/
public double getDocumentAccuracy() {
return documentAccuracy.mean();
}
/**
- * Retrieves the number of words which where validated over all iterations.
- * The result is the amount of folds multiplied by the total number of words.
- *
- * @return the word count
+ * @return Retrieves the number of documents which were validated over all iterations.
+ * The result is the number of folds multiplied by the total number of documents.
*/
public long getDocumentCount() {
return documentAccuracy.count();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatEvaluationMonitor.java
index f7b5a6f5..951f8d0f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatEvaluationMonitor.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatEvaluationMonitor.java
@@ -20,7 +20,7 @@ package opennlp.tools.doccat;
import opennlp.tools.util.eval.EvaluationMonitor;
/**
- * {@link EvaluationMonitor} for doccat.
+ * A marker interface for evaluating {@link DocumentCategorizer doccat}.
*/
public interface DoccatEvaluationMonitor extends
EvaluationMonitor<DocumentSample> {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
index babab7cd..9d2fb946 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatFactory.java
@@ -27,7 +27,7 @@ import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.ext.ExtensionLoader;
/**
- * The factory that provides Doccat default implementations and resources
+ * The factory that provides Doccat default implementations and resources.
*/
public class DoccatFactory extends BaseToolFactory {
@@ -36,13 +36,19 @@ public class DoccatFactory extends BaseToolFactory {
private FeatureGenerator[] featureGenerators;
/**
- * Creates a {@link DoccatFactory} that provides the default implementation of
+ * Instantiates a {@link DoccatFactory} that provides the default implementation of
* the resources.
*/
public DoccatFactory() {}
+ /**
+ * Instantiates a {@link DoccatFactory} that provides the default implementation of
+ * the resources.
+ *
+ * @param featureGenerators The {@link FeatureGenerator featureGenerators} to use.
+ */
public DoccatFactory(final FeatureGenerator[] featureGenerators) {
- this.featureGenerators = featureGenerators;
+ init(featureGenerators);
}
protected void init(FeatureGenerator[] featureGenerators) {
@@ -78,6 +84,17 @@ public class DoccatFactory extends BaseToolFactory {
// nothing to validate
}
+ /**
+ * Factory method the framework uses to create a new {@link DoccatFactory}.
+ *
+ * @param subclassName The name of the class implementing the {@link DoccatFactory}.
+ * @param featureGenerators The {@link FeatureGenerator featureGenerators} to use.
+ *
+ * @return A valid {@link DoccatFactory} instance.
+ *
+ * @throws InvalidFormatException Thrown if the {@link ExtensionLoader} mechanism failed to
+ * create the factory associated with {@code subclassName}.
+ */
public static DoccatFactory create(String subclassName, FeatureGenerator[] featureGenerators)
throws InvalidFormatException {
if (subclassName == null) {
@@ -91,9 +108,7 @@ public class DoccatFactory extends BaseToolFactory {
return theFactory;
} catch (Exception e) {
String msg = "Could not instantiate the " + subclassName
- + ". The initialization throw an exception.";
- System.err.println(msg);
- e.printStackTrace();
+ + ". The initialization threw an exception.";
throw new InvalidFormatException(msg, e);
}
}
@@ -109,6 +124,9 @@ public class DoccatFactory extends BaseToolFactory {
return fgs;
}
+ /**
+ * @return Retrieves the {@link FeatureGenerator generators} used.
+ */
public FeatureGenerator[] getFeatureGenerators() {
if (featureGenerators == null) {
if (artifactProvider != null) {
@@ -126,6 +144,9 @@ public class DoccatFactory extends BaseToolFactory {
return featureGenerators;
}
+ /**
+ * @param featureGenerators The {@link FeatureGenerator featureGenerators} to use.
+ */
public void setFeatureGenerators(FeatureGenerator[] featureGenerators) {
this.featureGenerators = featureGenerators;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
index 1b5c1640..26776eae 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
@@ -38,6 +38,14 @@ public class DoccatModel extends BaseModel {
private static final String COMPONENT_NAME = "DocumentCategorizerME";
private static final String DOCCAT_MODEL_ENTRY_NAME = "doccat.model";
+ /**
+ * Initializes a {@link DoccatModel} instance via given parameters.
+ *
+ * @param languageCode An ISO conform language code.
+ * @param doccatModel A valid {@link MaxentModel} to be used.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ * @param factory The {@link DoccatFactory} to be used.
+ */
public DoccatModel(String languageCode, MaxentModel doccatModel,
Map<String, String> manifestInfoEntries, DoccatFactory factory) {
super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
@@ -46,18 +54,46 @@ public class DoccatModel extends BaseModel {
checkArtifactMap();
}
+ /**
+ * Initializes a {@link DoccatModel} instance via a valid {@link InputStream}.
+ *
+ * @param in The {@link InputStream} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public DoccatModel(InputStream in) throws IOException {
super(COMPONENT_NAME, in);
}
+ /**
+ * Initializes a {@link DoccatModel} instance via a valid {@link File}.
+ *
+ * @param modelFile The {@link File} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public DoccatModel(File modelFile) throws IOException {
super(COMPONENT_NAME, modelFile);
}
+ /**
+ * Initializes a {@link DoccatModel} instance via a valid {@link Path}.
+ *
+ * @param modelPath The {@link Path} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public DoccatModel(Path modelPath) throws IOException {
this(modelPath.toFile());
}
+ /**
+ * Initializes a {@link DoccatModel} instance via a valid {@link URL}.
+ *
+ * @param modelURL The {@link URL} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public DoccatModel(URL modelURL) throws IOException {
super(COMPONENT_NAME, modelURL);
}
@@ -71,6 +107,9 @@ public class DoccatModel extends BaseModel {
}
}
+ /**
+ * @return Retrieves the active {@link DoccatFactory}.
+ */
public DoccatFactory getFactory() {
return (DoccatFactory) this.toolFactory;
}
@@ -80,6 +119,9 @@ public class DoccatModel extends BaseModel {
return DoccatFactory.class;
}
+ /**
+ * @return Retrieves the active {@link MaxentModel}.
+ */
public MaxentModel getMaxentModel() {
return (MaxentModel) artifactMap.get(DOCCAT_MODEL_ENTRY_NAME);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
index b180549b..3ef92c4c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizer.java
@@ -27,75 +27,78 @@ import java.util.SortedMap;
public interface DocumentCategorizer {
/**
- * Categorize the given text provided as tokens along with
- * the provided extra information
+ * Categorizes the given {@code text} provided as tokens along with
+ * the provided {@code extraInformation}.
*
- * @param text the tokens of text to categorize
- * @param extraInformation extra information
- * @return per category probabilities
+ * @param text The tokens of text to categorize.
+ * @param extraInformation The extra information used for this context.
+ * @return The per category probabilities.
*/
double[] categorize(String[] text, Map<String, Object> extraInformation);
/**
- * Categorizes the given text, provided in separate tokens.
- * @param text the tokens of text to categorize
- * @return per category probabilities
+ * Categorizes the given {@code text}, provided in separate tokens.
+ *
+ * @param text The tokens of text to categorize.
+ * @return The per category probabilities.
*/
double[] categorize(String[] text);
/**
- * get the best category from previously generated outcome probabilities
+ * Retrieves the best category from previously generated {@code outcome} probabilities.
*
- * @param outcome a vector of outcome probabilities
- * @return the best category String
+ * @param outcome An array of computed outcome probabilities.
+ * @return The best category represented as String.
*/
String getBestCategory(double[] outcome);
/**
- * get the index of a certain category
+ * Retrieves the index of a certain category.
*
- * @param category the category
- * @return an index
+ * @param category The category for which the {@code index} is to be found.
+ * @return The index.
*/
int getIndex(String category);
/**
- * get the category at a given index
+ * Retrieves the category at a given {@code index}.
*
- * @param index the index
- * @return a category
+ * @param index The index for which the {@code category} shall be found.
+ * @return The category represented as String.
*/
String getCategory(int index);
/**
- * get the number of categories
+ * Retrieves the number of categories.
*
- * @return the no. of categories
+ * @return The number of categories.
*/
int getNumberOfCategories();
/**
- * get the name of the category associated with the given probabilties
+ * Retrieves the name of the category associated with the given probabilities.
*
- * @param results the probabilities of each category
- * @return the name of the outcome
+ * @param results The probabilities of each category.
+ * @return The name of the outcome.
*/
String getAllResults(double[] results);
/**
- * Returns a map in which the key is the category name and the value is the score
+ * Retrieves a {@link Map} in which the key is the category name and the value is the score.
*
- * @param text the input text to classify
- * @return a map with the score as a key. The value is a Set of categories with the score.
+ * @param text The tokenized input text to classify.
+ * @return A {@link Map} with the category name as key and the score as value.
*/
Map<String, Double> scoreMap(String[] text);
/**
- * Get a map of the scores sorted in ascending aorder together with their associated categories.
- * Many categories can have the same score, hence the Set as value
+ * Retrieves a {@link SortedMap} of the scores sorted in ascending order,
+ * together with their associated categories.
+ * <p>
+ * Many categories can have the same score, hence the {@link Set} as value.
*
* @param text the input text to classify
- * @return a map with the score as a key. The value is a Set of categories with the score.
+ * @return A {@link SortedMap} with the score as a key.
*/
SortedMap<Double, Set<String>> sortedScoreMap(String[] text);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
index e12f16bf..24dc288f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerContextGenerator.java
@@ -22,12 +22,19 @@ import java.util.LinkedList;
import java.util.Map;
/**
- * Context generator for document categorizer
+ * Context generator for {@link DocumentCategorizer}.
+ *
+ * @see DocumentCategorizer
*/
class DocumentCategorizerContextGenerator {
- private FeatureGenerator[] mFeatureGenerators;
+ private final FeatureGenerator[] mFeatureGenerators;
+ /**
+ * Instantiates a {@link DocumentCategorizerContextGenerator} instance.
+ *
+ * @param featureGenerators The {@link FeatureGenerator featureGenerators} to use.
+ */
DocumentCategorizerContextGenerator(FeatureGenerator... featureGenerators) {
mFeatureGenerators = featureGenerators;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
index c501280f..a417f974 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEvaluator.java
@@ -24,21 +24,23 @@ import opennlp.tools.util.eval.Mean;
/**
* The {@link DocumentCategorizerEvaluator} measures the performance of
* the given {@link DocumentCategorizer} with the provided reference
- * {@link DocumentSample}s.
+ * {@link DocumentSample samples}.
*
* @see DocumentCategorizer
* @see DocumentSample
+ * @see Evaluator
*/
public class DocumentCategorizerEvaluator extends Evaluator<DocumentSample> {
- private DocumentCategorizer categorizer;
+ private final DocumentCategorizer categorizer;
- private Mean accuracy = new Mean();
+ private final Mean accuracy = new Mean();
/**
- * Initializes the current instance.
+ * Initializes a {@link DocumentCategorizerEvaluator} instance.
*
- * @param categorizer the document categorizer instance
+ * @param categorizer the {@link DocumentCategorizer} instance.
+ * @param listeners the {@link DoccatEvaluationMonitor evaluation listeners}.
*/
public DocumentCategorizerEvaluator(DocumentCategorizer categorizer,
DoccatEvaluationMonitor ... listeners) {
@@ -47,13 +49,14 @@ public class DocumentCategorizerEvaluator extends Evaluator<DocumentSample> {
}
/**
- * Evaluates the given reference {@link DocumentSample} object.
- *
+ * Evaluates the given reference {@link DocumentSample sample}.
+ * <p>
* This is done by categorizing the document from the provided
* {@link DocumentSample}. The detected category is then used
* to calculate and update the score.
*
- * @param sample the reference {@link TokenSample}.
+ * @param sample The reference {@link DocumentSample}.
+ * @return The processed {@link DocumentSample}.
*/
public DocumentSample processSample(DocumentSample sample) {
@@ -74,11 +77,9 @@ public class DocumentCategorizerEvaluator extends Evaluator<DocumentSample> {
}
/**
- * Retrieves the accuracy of provided {@link DocumentCategorizer}.
- *
- * accuracy = correctly categorized documents / total documents
+ * {@code accuracy = correctly categorized documents / total documents}
*
- * @return the accuracy
+ * @return Retrieves the accuracy of provided {@link DocumentCategorizer}.
*/
public double getAccuracy() {
return accuracy.mean();
@@ -89,7 +90,7 @@ public class DocumentCategorizerEvaluator extends Evaluator<DocumentSample> {
}
/**
- * Represents this objects as human readable {@link String}.
+ * Represents this object as human-readable {@link String}.
*/
@Override
public String toString() {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
index 3fbe63d7..723a73e8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerEventStream.java
@@ -28,46 +28,45 @@ import opennlp.tools.util.ObjectStream;
*/
public class DocumentCategorizerEventStream extends AbstractEventStream<DocumentSample> {
- private DocumentCategorizerContextGenerator mContextGenerator;
+ private final DocumentCategorizerContextGenerator mContextGenerator;
/**
- * Initializes the current instance via samples and feature generators.
+ * Initializes a {@link DocumentCategorizerEventStream} via samples and
+ * {@link FeatureGenerator feature generators}.
*
- * @param data {@link ObjectStream} of {@link DocumentSample}s
- *
- * @param featureGenerators the feature generators
+ * @param samples {@link ObjectStream} of {@link DocumentSample samples}.
+ * @param featureGenerators One or more {@link FeatureGenerator} to use.
*/
- public DocumentCategorizerEventStream(ObjectStream<DocumentSample> data,
+ public DocumentCategorizerEventStream(ObjectStream<DocumentSample> samples,
FeatureGenerator... featureGenerators) {
- super(data);
-
- mContextGenerator =
- new DocumentCategorizerContextGenerator(featureGenerators);
+ super(samples);
+ mContextGenerator = new DocumentCategorizerContextGenerator(featureGenerators);
}
/**
- * Initializes the current instance.
+ * Initializes a {@link DocumentCategorizerEventStream} via samples.
+ * {@link BagOfWordsFeatureGenerator} is used as feature generator.
*
- * @param samples {@link ObjectStream} of {@link DocumentSample}s
+ * @param samples {@link ObjectStream} of {@link DocumentSample samples}.
*/
public DocumentCategorizerEventStream(ObjectStream<DocumentSample> samples) {
super(samples);
-
- mContextGenerator =
- new DocumentCategorizerContextGenerator(new BagOfWordsFeatureGenerator());
+ mContextGenerator = new DocumentCategorizerContextGenerator(new BagOfWordsFeatureGenerator());
}
@Override
protected Iterator<Event> createEvents(final DocumentSample sample) {
- return new Iterator<Event>() {
+ return new Iterator<>() {
private boolean isVirgin = true;
+ @Override
public boolean hasNext() {
return isVirgin;
}
+ @Override
public Event next() {
isVirgin = false;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
index 9dc41d74..8d2155d7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentCategorizerME.java
@@ -33,23 +33,18 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
/**
- * Maxent implementation of {@link DocumentCategorizer}.
+ * A Max-Ent based implementation of {@link DocumentCategorizer}.
*/
public class DocumentCategorizerME implements DocumentCategorizer {
+
+ private final DoccatModel model;
+ private final DocumentCategorizerContextGenerator mContextGenerator;
/**
- * Shared default thread safe feature generator.
- */
- private static FeatureGenerator defaultFeatureGenerator = new BagOfWordsFeatureGenerator();
-
- private DoccatModel model;
- private DocumentCategorizerContextGenerator mContextGenerator;
-
- /**
- * Initializes the current instance with a doccat model. Default feature
- * generation is used.
+ * Initializes a {@link DocumentCategorizerME} instance with a doccat model.
+ * Default feature generation is used.
*
- * @param model the doccat model
+ * @param model the {@link DoccatModel} to be used for categorization.
*/
public DocumentCategorizerME(DoccatModel model) {
this.model = model;
@@ -58,34 +53,24 @@ public class DocumentCategorizerME implements DocumentCategorizer {
}
/**
- * Categorize the given text provided as tokens along with
- * the provided extra information
+ * Categorizes the given {@code text} provided as tokens along with
+ * the provided extra information.
*
- * @param text text tokens to categorize
- * @param extraInformation additional information
+ * @param text The text tokens to categorize.
+ * @param extraInformation Additional information for context to be used by the feature generator.
+ * @return The per category probabilities.
*/
@Override
public double[] categorize(String[] text, Map<String, Object> extraInformation) {
return model.getMaxentModel().eval(
mContextGenerator.getContext(text, extraInformation));
}
-
- /**
- * Categorizes the given text.
- *
- * @param text the text to categorize
- */
+
@Override
public double[] categorize(String[] text) {
return this.categorize(text, Collections.emptyMap());
}
- /**
- * Returns a map in which the key is the category name and the value is the score
- *
- * @param text the input text to classify
- * @return the score map
- */
@Override
public Map<String, Double> scoreMap(String[] text) {
Map<String, Double> probDist = new HashMap<>();
@@ -98,15 +83,7 @@ public class DocumentCategorizerME implements DocumentCategorizer {
}
return probDist;
}
-
- /**
- * Returns a map with the score as a key in ascending order.
- * The value is a Set of categories with the score.
- * Many categories can have the same score, hence the Set as value
- *
- * @param text the input text to classify
- * @return the sorted score map
- */
+
@Override
public SortedMap<Double, Set<String>> sortedScoreMap(String[] text) {
SortedMap<Double, Set<String>> descendingMap = new TreeMap<>();
@@ -126,29 +103,44 @@ public class DocumentCategorizerME implements DocumentCategorizer {
return descendingMap;
}
+ @Override
public String getBestCategory(double[] outcome) {
return model.getMaxentModel().getBestOutcome(outcome);
}
+ @Override
public int getIndex(String category) {
return model.getMaxentModel().getIndex(category);
}
+ @Override
public String getCategory(int index) {
return model.getMaxentModel().getOutcome(index);
}
+ @Override
public int getNumberOfCategories() {
return model.getMaxentModel().getNumOutcomes();
}
+ @Override
public String getAllResults(double[] results) {
return model.getMaxentModel().getAllOutcomes(results);
}
- public static DoccatModel train(String languageCode, ObjectStream<DocumentSample> samples,
- TrainingParameters mlParams, DoccatFactory factory)
- throws IOException {
+ /**
+ * Starts a training of a {@link DoccatModel} with the given parameters.
+ *
+ * @param lang The ISO conform language code.
+ * @param samples The {@link ObjectStream} of {@link DocumentSample} used as input for training.
+ * @param mlParams The {@link TrainingParameters} for the context of the training.
+ * @param factory The {@link DoccatFactory} for creating related objects defined via {@code mlParams}.
+ *
+ * @return A valid, trained {@link DoccatModel} instance.
+ * @throws IOException Thrown if IO errors occurred.
+ */
+ public static DoccatModel train(String lang, ObjectStream<DocumentSample> samples,
+ TrainingParameters mlParams, DoccatFactory factory) throws IOException {
Map<String, String> manifestInfoEntries = new HashMap<>();
@@ -158,6 +150,6 @@ public class DocumentCategorizerME implements DocumentCategorizer {
MaxentModel model = trainer.train(
new DocumentCategorizerEventStream(samples, factory.getFeatureGenerators()));
- return new DoccatModel(languageCode, model, manifestInfoEntries, factory);
+ return new DoccatModel(lang, model, manifestInfoEntries, factory);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
index 560a9b62..f06efff6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSample.java
@@ -36,10 +36,23 @@ public class DocumentSample implements Sample {
private final List<String> text;
private final Map<String, Object> extraInformation;
+ /**
+ * Initializes a {@link DocumentSample} instance.
+ *
+ * @param category The category to be used. Must not be {@code null}.
+ * @param text The plain text in a tokenized form. Must not be {@code null}.
+ */
public DocumentSample(String category, String[] text) {
this(category, text, null);
}
+ /**
+ * Initializes a {@link DocumentSample} instance.
+ *
+ * @param category The category to be used. Must not be {@code null}.
+ * @param text The plain text in a tokenized form. Must not be {@code null}.
+ * @param extraInformation Additional information for context.
+ */
public DocumentSample(String category, String[] text, Map<String, Object> extraInformation) {
Objects.requireNonNull(text, "text must not be null");
@@ -53,14 +66,23 @@ public class DocumentSample implements Sample {
}
}
+ /**
+ * @return Retrieves the category.
+ */
public String getCategory() {
return category;
}
+ /**
+ * @return Retrieves the text in a tokenized form.
+ */
public String[] getText() {
return text.toArray(new String[text.size()]);
}
+ /**
+ * @return Retrieves contextual extra information.
+ */
public Map<String, Object> getExtraInformation() {
return extraInformation;
}
@@ -69,7 +91,6 @@ public class DocumentSample implements Sample {
public String toString() {
StringBuilder sampleString = new StringBuilder();
-
sampleString.append(category).append('\t');
for (String s : text) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSampleStream.java
index 9054eb7b..da8d9501 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DocumentSampleStream.java
@@ -24,21 +24,31 @@ import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
/**
- * This class reads in string encoded training samples, parses them and
+ * Reads in string encoded training samples, parses them and
* outputs {@link DocumentSample} objects.
* <p>
* Format:<br>
* Each line contains one sample document.<br>
* The category is the first string in the line followed by a tab and whitespace
- * separated document tokens.<br>
- * Sample line: category-string tab-char whitespace-separated-tokens line-break-char(s)<br>
+ * separated document tokens.
+ * <p>
+ * Sample line:
+ * {@code category-string tab-char whitespace-separated-tokens line-break-char(s)}
+ *
+ * @see DocumentSample
*/
public class DocumentSampleStream extends FilterObjectStream<String, DocumentSample> {
+ /**
+ * Initializes a {@link DocumentSampleStream} instance.
+ *
+ * @param samples A plain text {@link ObjectStream line stream}.
+ */
public DocumentSampleStream(ObjectStream<String> samples) {
super(samples);
}
+ @Override
public DocumentSample read() throws IOException {
String sampleString = samples.read();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
index 2ed5a30a..ebb8fa0c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/FeatureGenerator.java
@@ -27,11 +27,11 @@ import java.util.Map;
public interface FeatureGenerator {
/**
- * Extract features from given text fragments
+ * Extracts features from given {@code text} fragments.
*
- * @param text the text fragments to extract features from
- * @param extraInformation optional extra information to be used by the feature generator
- * @return a collection of features
+ * @param text The text fragments to extract features from.
+ * @param extraInformation Optional extra information to be used by the {@link FeatureGenerator}.
+ * @return A collection of features.
*/
Collection<String> extractFeatures(String[] text, Map<String, Object> extraInformation);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
index 15accdf9..54909064 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
@@ -27,7 +27,8 @@ import opennlp.tools.util.InvalidFormatException;
/**
* Generates ngram features for a document.
- * n-gram {@link FeatureGenerator}
+ *
+ * @see FeatureGenerator
*/
public class NGramFeatureGenerator implements FeatureGenerator {
@@ -35,11 +36,11 @@ public class NGramFeatureGenerator implements FeatureGenerator {
private final int maxGram;
/**
- * Constructor for ngrams.
+ * Instantiates an {@link NGramFeatureGenerator} instance with configurable ngram parameters.
*
- * @param minGram minGram value - which means minimum words in ngram features
- * @param maxGram maxGram value - which means maximum words in ngram features
- * @throws InvalidFormatException
+ * @param minGram The minimum words in ngram features.
+ * @param maxGram The maximum words in ngram features.
+ * @throws InvalidFormatException Thrown if parameter values are invalid or inconsistent.
*/
public NGramFeatureGenerator(int minGram, int maxGram) throws InvalidFormatException {
if (minGram > 0 && maxGram > 0) {
@@ -57,19 +58,15 @@ public class NGramFeatureGenerator implements FeatureGenerator {
}
/**
- * Default constructor for Bi grams
+ * Instantiates an {@link NGramFeatureGenerator} instance with a bigram configuration.
+ *
+ * @throws InvalidFormatException Thrown if parameter values are invalid or inconsistent.
*/
public NGramFeatureGenerator() throws InvalidFormatException {
this(2, 2);
}
- /**
- * Extract ngram features from given text fragments
- *
- * @param text the text fragments to extract features from
- * @param extraInfo optional extra information
- * @return a collection of n gram features
- */
+ @Override
public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInfo) {
Objects.requireNonNull(text, "text must not be null");
List<String> features = new ArrayList<>();