You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ki...@apache.org on 2022/12/04 13:29:36 UTC
[opennlp] branch master updated: OPENNLP-1403 Enhance JavaDoc in opennlp.tools.langdetect and opennlp.tools.languagemodel packages
This is an automated email from the ASF dual-hosted git repository.
kinow pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new babe7310 OPENNLP-1403 Enhance JavaDoc in opennlp.tools.langdetect and opennlp.tools.languagemodel packages
babe7310 is described below
commit babe7310d327f1d61864f05f73fecf91e498ba10
Author: Martin Wiesner <ma...@hs-heilbronn.de>
AuthorDate: Sun Dec 4 12:53:56 2022 +0100
OPENNLP-1403 Enhance JavaDoc in opennlp.tools.langdetect and opennlp.tools.languagemodel packages
- adds missing JavaDoc
- improves existing documentation for clarity
- removes superfluous text
- adds package-info.java for opennlp.tools.langdetect package
- adds 'final' modifier where useful and applicable
- adds 'Override' annotation where useful and applicable
- removes deprecated `calculateProbability(StringList tokens)` method in LanguageModel and privatized related impl in NGramLanguageModel
- fixes some typos
---
.../opennlp/tools/chunker/ChunkSampleStream.java | 4 +-
.../tools/chunker/ChunkerCrossValidator.java | 3 +
.../main/java/opennlp/tools/chunker/ChunkerME.java | 2 +-
.../DefaultLanguageDetectorContextGenerator.java | 16 ++---
.../java/opennlp/tools/langdetect/Language.java | 7 ++
.../opennlp/tools/langdetect/LanguageDetector.java | 17 ++++-
.../tools/langdetect/LanguageDetectorConfig.java | 29 +++-----
.../LanguageDetectorContextGenerator.java | 10 ++-
.../langdetect/LanguageDetectorCrossValidator.java | 35 +++++----
.../LanguageDetectorEvaluationMonitor.java | 5 +-
.../langdetect/LanguageDetectorEvaluator.java | 25 +++----
.../langdetect/LanguageDetectorEventStream.java | 15 ++--
.../tools/langdetect/LanguageDetectorFactory.java | 20 +++++-
.../tools/langdetect/LanguageDetectorME.java | 51 ++++++-------
.../tools/langdetect/LanguageDetectorModel.java | 38 +++++++++-
.../langdetect/LanguageDetectorSampleStream.java | 10 ++-
.../opennlp/tools/langdetect/LanguageSample.java | 2 +-
.../langdetect/ProbingLanguageDetectionResult.java | 9 ++-
...torEvaluationMonitor.java => package-info.java} | 11 +--
.../opennlp/tools/languagemodel/LanguageModel.java | 17 ++---
.../tools/languagemodel/NGramLanguageModel.java | 83 +++++++++++++++-------
21 files changed, 251 insertions(+), 158 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java
index 3d70fa13..85e62bbc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkSampleStream.java
@@ -34,9 +34,9 @@ import opennlp.tools.util.ObjectStream;
public class ChunkSampleStream extends FilterObjectStream<String, ChunkSample> {
/**
- * Initializes the current instance.
+ * Initializes a {@link ChunkSampleStream instance}.
*
- * @param samples a plain text line stream
+ * @param samples A plain text {@link ObjectStream line stream}.
*/
public ChunkSampleStream(ObjectStream<String> samples) {
super(samples);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerCrossValidator.java
index 41b54cd5..41b7c0d6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerCrossValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerCrossValidator.java
@@ -24,6 +24,9 @@ import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.CrossValidationPartitioner;
import opennlp.tools.util.eval.FMeasure;
+/**
+ * Cross validator for {@link Chunker}.
+ */
public class ChunkerCrossValidator {
private final String languageCode;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
index ebaf6f62..e1246fa7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
@@ -154,7 +154,7 @@ public class ChunkerME implements Chunker {
}
/**
- * Start a training of a {@link ChunkerModel} with the given parameters.
+ * Starts a training of a {@link ChunkerModel} with the given parameters.
*
* @param lang The ISO conform language code.
* @param in The {@link ObjectStream} of {@link ChunkSample} used as input for training.
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
index 8d25201b..26fffa8b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/DefaultLanguageDetectorContextGenerator.java
@@ -34,11 +34,12 @@ public class DefaultLanguageDetectorContextGenerator implements LanguageDetector
protected final CharSequenceNormalizer normalizer;
/**
- * Creates a customizable @{@link DefaultLanguageDetectorContextGenerator} that computes ngrams from text
- * @param minLength min ngrams chars
- * @param maxLength max ngrams chars
- * @param normalizers zero or more normalizers to
- * be applied in to the text before extracting ngrams
+ * Creates a customizable {@link DefaultLanguageDetectorContextGenerator} that computes ngrams from text.
+ *
+ * @param minLength The min number of ngrams characters. Must be greater than {@code 0}.
+ * @param maxLength The max number of ngrams characters. Must be greater than {@code 0}
+ * and must be greater than {@code minLength}.
+ * @param normalizers zero or more normalizers to be applied to the text before extracting ngrams.
*/
public DefaultLanguageDetectorContextGenerator(int minLength, int maxLength,
CharSequenceNormalizer... normalizers) {
@@ -48,11 +49,6 @@ public class DefaultLanguageDetectorContextGenerator implements LanguageDetector
this.normalizer = new AggregateCharSequenceNormalizer(normalizers);
}
- /**
- * Generates the context for a document using character ngrams.
- * @param document document to extract context from
- * @return the generated context
- */
@Override
public String[] getContext(CharSequence document) {
Collection<String> context = new ArrayList<>();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
index 05f40dcd..02b2d0b7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -29,10 +29,17 @@ public class Language implements Serializable {
private final String lang;
private final double confidence;
+ /**
+ * @param lang The language identifier.
+ */
public Language(String lang) {
this(lang, 0);
}
+ /**
+ * @param lang The language identifier.
+ * @param confidence The confidence computed during language detection.
+ */
public Language(String lang, double confidence) {
Objects.requireNonNull(lang, "lang must not be null");
this.lang = lang;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
index 7e4579e2..8de9805f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -20,14 +20,29 @@ package opennlp.tools.langdetect;
import java.io.Serializable;
/**
- * The interface for LanguageDetector which provide the @{@link Language} according to the context.
+ * The interface for {@link LanguageDetector} which predicts the {@link Language} for a context.
*/
public interface LanguageDetector extends Serializable {
+ /**
+ * Predicts the {@link Language languages} for the full {@code content} length.
+ *
+ * @param content The textual content to detect potential {@link Language languages} from.
+ * @return the predicted languages
+ */
Language[] predictLanguages(CharSequence content);
+ /**
+ * Predicts the {@link Language} for the full {@code content} length.
+ *
+ * @param content The textual content to detect potential {@link Language languages} from.
+ * @return the language with the highest confidence
+ */
Language predictLanguage(CharSequence content);
+ /**
+ * @return Retrieves an array of languages (codes) that are supported by a {@link LanguageDetector}.
+ */
String[] getSupportedLanguages();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorConfig.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorConfig.java
index 303bda37..eb9290ec 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorConfig.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorConfig.java
@@ -34,11 +34,8 @@ public class LanguageDetectorConfig {
private int minConsecImprovements = DEFAULT_MIN_CONSEC_IMPROVEMENTS;
private double minDiff = DEFAULT_MIN_DIFF;
-
/**
- * Maximum length in codepoints of text to process.
- *
- * @return
+ * @return The maximum length in codepoints of text to process.
*/
public int getMaxLength() {
return maxLength;
@@ -49,14 +46,12 @@ public class LanguageDetectorConfig {
}
/**
- * Size in codepoints of chunk to process at each
- * step for the probing detection.
- * <p>
* After processing a chunk of this size, the probing
* detection will compute probabilities and determine
* if there is enough confidence to stop.
*
- * @return
+ * @return The size in codepoints of chunk to process at each step for
+ * the probing detection.
*/
public int getChunkSize() {
return chunkSize;
@@ -67,14 +62,12 @@ public class LanguageDetectorConfig {
}
/**
- * Minimum number of consecutive increased probabilities
- * for the top language required in probing detection
- * to stop early.
- * <p>
- * If this value equals 0, probing detection will
+ * If this value equals {@code 0}, probing detection will
* rely solely on {@link #getMinDiff()}
*
- * @return minimum consecutive improvements
+ * @return The minimum number of consecutive increased probabilities
+ * for the top language required in probing detection
+ * to stop early.
*/
public int getMinConsecImprovements() {
return minConsecImprovements;
@@ -85,13 +78,11 @@ public class LanguageDetectorConfig {
}
/**
- * Minimum difference in confidence between the top predicted
- * language and the next most likely language.
- * <p>
- * If this value equals 0, probing detection will
+ * If this value equals {@code 0}, probing detection will
* rely solely on {@link #getMinConsecImprovements()}
*
- * @return
+ * @return The minimum difference in confidence between the top predicted
+ * language and the next most likely language.
*/
public double getMinDiff() {
return minDiff;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
index 39de4d7a..af0d13e6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -20,8 +20,16 @@ package opennlp.tools.langdetect;
import java.io.Serializable;
/**
- * A context generator interface for language detector.
+ * A context generator interface for {@link LanguageDetector}.
*/
public interface LanguageDetectorContextGenerator extends Serializable {
+
+ /**
+ * Retrieves the contexts for a {@code document} using character ngrams.
+ *
+ * @param document The textual input used to extract context from.
+ *
+ * @return An array of contexts on which a model bases its decisions.
+ */
String[] getContext(CharSequence document);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
index ce1823af..9aa85153 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
@@ -19,29 +19,32 @@ package opennlp.tools.langdetect;
import java.io.IOException;
-import opennlp.tools.doccat.FeatureGenerator;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.CrossValidationPartitioner;
import opennlp.tools.util.eval.Mean;
/**
- * Cross validator for language detector
+ * Cross validator for {@link LanguageDetector}.
*/
public class LanguageDetectorCrossValidator {
private final TrainingParameters params;
- private Mean documentAccuracy = new Mean();
+ private final Mean documentAccuracy = new Mean();
- private LanguageDetectorEvaluationMonitor[] listeners;
+ private final LanguageDetectorEvaluationMonitor[] listeners;
- private LanguageDetectorFactory factory;
+ private final LanguageDetectorFactory factory;
/**
- * Creates a {@link LanguageDetectorCrossValidator} with the given
- * {@link FeatureGenerator}s.
+ * Initializes a {@link LanguageDetectorCrossValidator} with the
+ * given {@link TrainingParameters parameters}.
+ *
+ * @param mlParams The {@link TrainingParameters} for the context of cross validation.
+ * @param factory The {@link LanguageDetectorFactory} for creating related objects.
+ * @param listeners the {@link LanguageDetectorEvaluationMonitor evaluation listeners}.
*/
public LanguageDetectorCrossValidator(TrainingParameters mlParams,
LanguageDetectorFactory factory,
@@ -54,12 +57,10 @@ public class LanguageDetectorCrossValidator {
/**
* Starts the evaluation.
*
- * @param samples
- * the data to train and test
- * @param nFolds
- * number of folds
+ * @param samples The {@link ObjectStream} of {@link LanguageSample samples} to train and test with.
+ * @param nFolds Number of folds. It must be greater than zero.
*
- * @throws IOException
+ * @throws IOException Thrown if IO errors occurred.
*/
public void evaluate(ObjectStream<LanguageSample> samples, int nFolds)
throws IOException {
@@ -87,19 +88,15 @@ public class LanguageDetectorCrossValidator {
}
/**
- * Retrieves the accuracy for all iterations.
- *
- * @return the word accuracy
+ * @return Retrieves the document accuracy for all iterations.
*/
public double getDocumentAccuracy() {
return documentAccuracy.mean();
}
/**
- * Retrieves the number of words which where validated over all iterations.
- * The result is the amount of folds multiplied by the total number of words.
- *
- * @return the word count
+ * @return Retrieves the number of words which were validated over all iterations.
+ * The result is the amount of folds multiplied by the total number of words.
*/
public long getDocumentCount() {
return documentAccuracy.count();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
index 30f33137..f3ca3f88 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
@@ -20,9 +20,8 @@ package opennlp.tools.langdetect;
import opennlp.tools.util.eval.EvaluationMonitor;
/**
- * {@link EvaluationMonitor} for Language Detector.
+ * A marker interface for evaluating {@link LanguageDetector language detectors}.
*/
-public interface LanguageDetectorEvaluationMonitor extends
- EvaluationMonitor<LanguageSample> {
+public interface LanguageDetectorEvaluationMonitor extends EvaluationMonitor<LanguageSample> {
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
index bbf73c32..211ec36e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
@@ -31,14 +31,15 @@ import opennlp.tools.util.eval.Mean;
*/
public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> {
- private LanguageDetector languageDetector;
+ private final LanguageDetector languageDetector;
- private Mean accuracy = new Mean();
+ private final Mean accuracy = new Mean();
/**
- * Initializes the current instance.
+ * Initializes an instance to evaluate a {@link LanguageDetector}.
*
- * @param langDetect the language detector instance
+ * @param langDetect the {@link LanguageDetector} to evaluate.
+ * @param listeners the {@link LanguageDetectorEvaluationMonitor evaluation listeners}.
*/
public LanguageDetectorEvaluator(LanguageDetector langDetect,
LanguageDetectorEvaluationMonitor ... listeners) {
@@ -48,21 +49,18 @@ public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> {
/**
* Evaluates the given reference {@link LanguageSample} object.
- *
- * This is done by categorizing the document from the provided
+ * This is achieved by categorizing the document of the provided
* {@link LanguageSample}. The detected language is then used
* to calculate and update the score.
*
* @param sample the reference {@link LanguageSample}.
+ * @return The processed {@link LanguageSample}.
*/
public LanguageSample processSample(LanguageSample sample) {
CharSequence document = sample.getContext();
-
Language predicted = languageDetector.predictLanguage(document);
-
-
if (sample.getLanguage().getLang().equals(predicted.getLang())) {
accuracy.add(1);
}
@@ -74,11 +72,8 @@ public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> {
}
/**
- * Retrieves the accuracy of provided {@link DocumentCategorizer}.
- *
- * accuracy = correctly categorized documents / total documents
- *
- * @return the accuracy
+ * @return Retrieves the accuracy of provided {@link LanguageDetector}.
+ * Here: {@code accuracy = correctly categorized documents / total documents}.
*/
public double getAccuracy() {
return accuracy.mean();
@@ -89,7 +84,7 @@ public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> {
}
/**
- * Represents this objects as human readable {@link String}.
+ * Represents this object as human-readable {@link String}.
*/
@Override
public String toString() {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
index 19e6d466..8c76ff4b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
@@ -24,28 +24,29 @@ import opennlp.tools.util.AbstractEventStream;
import opennlp.tools.util.ObjectStream;
/**
- * Iterator-like class for modeling language detector events.
+ * Iterator-like class for modeling an event stream of {@link LanguageSample samples}.
*/
public class LanguageDetectorEventStream extends AbstractEventStream<LanguageSample> {
- private LanguageDetectorContextGenerator mContextGenerator;
+ private final LanguageDetectorContextGenerator mContextGenerator;
/**
- * Initializes the current instance via samples and feature generators.
+ * Initializes an instance via samples and feature generators.
*
- * @param data {@link ObjectStream} of {@link LanguageSample}s
+ * @param data An {@link ObjectStream} of {@link LanguageSample samples} as input data.
+ * @param cg A {@link LanguageDetectorContextGenerator} used for the event stream {@code data}.
*/
public LanguageDetectorEventStream(ObjectStream<LanguageSample> data,
- LanguageDetectorContextGenerator contextGenerator) {
+ LanguageDetectorContextGenerator cg) {
super(data);
- mContextGenerator = contextGenerator;
+ mContextGenerator = cg;
}
@Override
protected Iterator<Event> createEvents(final LanguageSample sample) {
- return new Iterator<Event>() {
+ return new Iterator<>() {
private boolean isVirgin = true;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
index b9898877..a397f4ac 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -28,8 +28,10 @@ import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
/**
- * Default factory used by Language Detector. Extend this class to change the Language Detector
- * behaviour, such as the {@link LanguageDetectorContextGenerator}.
+ * Default factory used by {@link LanguageDetector}.
+ *
+ * Extend this class to change the Language Detector behaviour,
+ * such as the {@link LanguageDetectorContextGenerator}.
* The default {@link DefaultLanguageDetectorContextGenerator} will use char n-grams of
* size 1 to 3 and the following normalizers:
* <ul>
@@ -39,10 +41,12 @@ import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
* <li> {@link NumberCharSequenceNormalizer}
* <li> {@link ShrinkCharSequenceNormalizer}
* </ul>
- *
*/
public class LanguageDetectorFactory extends BaseToolFactory {
+ /**
+ * @return Retrieves a {@link LanguageDetectorContextGenerator}.
+ */
public LanguageDetectorContextGenerator getContextGenerator() {
return new DefaultLanguageDetectorContextGenerator(1, 3,
EmojiCharSequenceNormalizer.getInstance(),
@@ -52,6 +56,16 @@ public class LanguageDetectorFactory extends BaseToolFactory {
ShrinkCharSequenceNormalizer.getInstance());
}
+ /**
+ * Instantiates a {@link LanguageDetectorFactory} via a given {@code subclassName}.
+ *
+ * @param subclassName The class name used for instantiation. If {@code null}, an
+ * instance of {@link LanguageDetectorFactory} will be returned
+ * per default. Otherwise, the {@link ExtensionLoader} mechanism
+ * is applied to load the requested {@code subclassName}.
+ *
+ * @return A valid {@link LanguageDetectorFactory} instance.
+ */
public static LanguageDetectorFactory create(String subclassName)
throws InvalidFormatException {
if (subclassName == null) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
index a10b0e22..5679252a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -33,7 +33,7 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.TrainingParameters;
/**
- * Implements learnable Language Detector
+ * Implements a learnable {@link LanguageDetector}.
*
* <p>
* This will process the entire string when called with
@@ -63,37 +63,25 @@ import opennlp.tools.util.TrainingParameters;
public class LanguageDetectorME implements LanguageDetector {
protected LanguageDetectorModel model;
- private LanguageDetectorContextGenerator mContextGenerator;
+ private final LanguageDetectorContextGenerator mContextGenerator;
/**
- * Initializes the current instance with a language detector model. Default feature
- * generation is used.
+ * Initializes an instance with a specific {@link LanguageDetectorModel}.
+ * Default feature generation is used.
*
- * @param model the language detector model
+ * @param model the {@link LanguageDetectorModel} to be used.
*/
public LanguageDetectorME(LanguageDetectorModel model) {
this.model = model;
this.mContextGenerator = model.getFactory().getContextGenerator();
}
- /**
- * This will process the full content length.
- *
- * @param content
- * @return the predicted languages
- */
@Override
public Language[] predictLanguages(CharSequence content) {
return predict(arrayToCounts(
mContextGenerator.getContext(content)));
}
-
- /**
- * This will process the full content length.
- *
- * @param content
- * @return the language with the highest confidence
- */
+
@Override
public Language predictLanguage(CharSequence content) {
return predictLanguages(content)[0];
@@ -115,7 +103,7 @@ public class LanguageDetectorME implements LanguageDetector {
* are met.
*
* @param content content to be processed
- * @return result
+ * @return A computed {@link ProbingLanguageDetectionResult}.
*/
public ProbingLanguageDetectionResult probingPredictLanguages(CharSequence content) {
return probingPredictLanguages(content,
@@ -127,15 +115,16 @@ public class LanguageDetectorME implements LanguageDetector {
* specified in {@link LanguageDetectorConfig#DEFAULT_LANGUAGE_DETECTOR_CONFIG}
* are met.
*
- * @param content content to process
- * @param config config to customize detection
- * @return
+ * @param content The textual content to process.
+ * @param config The {@link LanguageDetectorConfig} to customize detection.
+ *
+ * @return A computed {@link ProbingLanguageDetectionResult}.
*/
public ProbingLanguageDetectionResult probingPredictLanguages(CharSequence content,
LanguageDetectorConfig config) {
//list of the languages that received the highest
//confidence over the last n chunk detections
- List<Language[]> predictions = new LinkedList();
+ List<Language[]> predictions = new LinkedList<>();
int start = 0;//where to start the next chunk in codepoints
Language[] currPredictions = null;
//cache ngram counts across chunks
@@ -202,13 +191,14 @@ public class LanguageDetectorME implements LanguageDetector {
}
/**
- * Override this for different behavior to determine if there is enough
+ * Override this method for different behavior to determine if there is enough
* confidence in the predictions to stop.
*
* @param predictionsQueue queue of earlier predictions
* @param newPredictions most recent predictions
* @param ngramCounts -- not currently used, but might be useful
- * @return whether or not enough text has been processed to make a determination
+ * @return {@code true} if enough text has been processed to make a determination,
+ * else {@code false}.
*/
boolean seenEnough(List<Language[]> predictionsQueue, Language[] newPredictions,
Map<String, MutableInt> ngramCounts, LanguageDetectorConfig config) {
@@ -265,6 +255,17 @@ public class LanguageDetectorME implements LanguageDetector {
codepoints.length);
}
+ /**
+ * Starts a training of a {@link LanguageDetectorModel} with the given parameters.
+ *
+ * @param samples The {@link ObjectStream} of {@link LanguageSample} used as input for training.
+ * @param mlParams The {@link TrainingParameters} for the context of the training.
+ * @param factory The {@link LanguageDetectorFactory} for creating related objects defined
+ * via {@code mlParams}.
+ *
+ * @return A valid, trained {@link LanguageDetectorModel} instance.
+ * @throws IOException Thrown if IO errors occurred.
+ */
public static LanguageDetectorModel train(ObjectStream<LanguageSample> samples,
TrainingParameters mlParams,
LanguageDetectorFactory factory)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
index c0d9703e..a37b64f0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
@@ -30,13 +30,22 @@ import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.model.BaseModel;
/**
- * A model for language detection
+ * The {@link LanguageDetectorModel} is the model used by a learnable {@link LanguageDetector}.
+ *
+ * @see LanguageDetectorME
*/
public class LanguageDetectorModel extends BaseModel {
private static final String COMPONENT_NAME = "LanguageDetectorME";
private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model";
+ /**
+ * Initializes a {@link LanguageDetectorModel} instance via given parameters.
+ *
+ * @param langdetectModel A valid {@link MaxentModel}.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ * @param factory The {@link LanguageDetectorFactory} for creating related objects.
+ */
public LanguageDetectorModel(MaxentModel langdetectModel,
Map<String, String> manifestInfoEntries,
LanguageDetectorFactory factory) {
@@ -46,14 +55,35 @@ public class LanguageDetectorModel extends BaseModel {
checkArtifactMap();
}
+ /**
+ * Initializes a {@link LanguageDetectorModel} instance via a valid {@link InputStream}.
+ *
+ * @param in The {@link InputStream} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public LanguageDetectorModel(InputStream in) throws IOException {
super(COMPONENT_NAME, in);
}
+ /**
+ * Initializes a {@link LanguageDetectorModel} instance via a valid {@link File}.
+ *
+ * @param modelFile The {@link File} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public LanguageDetectorModel(File modelFile) throws IOException {
super(COMPONENT_NAME, modelFile);
}
+ /**
+ * Initializes a {@link LanguageDetectorModel} instance via a valid {@link URL}.
+ *
+ * @param modelURL The {@link URL} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public LanguageDetectorModel(URL modelURL) throws IOException {
super(COMPONENT_NAME, modelURL);
}
@@ -67,6 +97,9 @@ public class LanguageDetectorModel extends BaseModel {
}
}
+ /**
+ * @return Retrieves the active {@link LanguageDetectorFactory}.
+ */
public LanguageDetectorFactory getFactory() {
return (LanguageDetectorFactory) this.toolFactory;
}
@@ -76,6 +109,9 @@ public class LanguageDetectorModel extends BaseModel {
return LanguageDetectorFactory.class;
}
+ /**
+ * @return Retrieves a {@link MaxentModel}.
+ */
public MaxentModel getMaxentModel() {
return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
index 2a407f7f..bb06547e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
@@ -23,8 +23,8 @@ import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
/**
- * This class reads in string encoded training samples, parses them and
- * outputs {@link LanguageSample} objects.
+ * This class reads in string encoded {@link ObjectStream training samples}, parses them
+ * and outputs {@link LanguageSample} objects.
* <p>
* Format:<br>
* Each line contains one sample document.<br>
@@ -34,10 +34,16 @@ import opennlp.tools.util.ObjectStream;
public class LanguageDetectorSampleStream
extends FilterObjectStream<String, LanguageSample> {
+ /**
+ * Initializes a {@link LanguageDetectorSampleStream instance}.
+ *
+ * @param samples A plain text {@link ObjectStream line stream}.
+ */
public LanguageDetectorSampleStream(ObjectStream<String> samples) {
super(samples);
}
+ @Override
public LanguageSample read() throws IOException {
String sampleString;
while ((sampleString = samples.read()) != null) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
index 041d5966..ae7ead62 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
@@ -21,7 +21,7 @@ import java.io.Serializable;
import java.util.Objects;
/**
- * Class which holds a classified document and its @{@link Language}.
+ * Holds a classified document and its {@link Language}.
*/
public class LanguageSample implements Serializable {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/ProbingLanguageDetectionResult.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/ProbingLanguageDetectionResult.java
index b3e8c1ff..bd56b777 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/ProbingLanguageDetectionResult.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/ProbingLanguageDetectionResult.java
@@ -17,6 +17,9 @@
package opennlp.tools.langdetect;
+/**
+ * A data container encapsulating language detection results.
+ */
public class ProbingLanguageDetectionResult {
private final Language[] languages;
private final int length;
@@ -26,13 +29,15 @@ public class ProbingLanguageDetectionResult {
this.length = length;
}
+ /**
+ * @return The {@link Language languages} detected.
+ */
public Language[] getLanguages() {
return languages;
}
/**
- *
- * @return length in codepoints of text processed
+ * @return The length in codepoints of text processed.
*/
public int getLength() {
return length;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/package-info.java
similarity index 80%
copy from opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
copy to opennlp-tools/src/main/java/opennlp/tools/langdetect/package-info.java
index 30f33137..35c7f754 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/package-info.java
@@ -15,14 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.langdetect;
-
-import opennlp.tools.util.eval.EvaluationMonitor;
-
/**
- * {@link EvaluationMonitor} for Language Detector.
+ * Package related to predicting languages from samples of text.
*/
-public interface LanguageDetectorEvaluationMonitor extends
- EvaluationMonitor<LanguageSample> {
-
-}
+package opennlp.tools.langdetect;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java b/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
index 8366925f..2e8a1909 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/languagemodel/LanguageModel.java
@@ -28,25 +28,16 @@ public interface LanguageModel {
/**
* Calculate the probability of a series of tokens (e.g. a sentence), given a vocabulary.
*
- * @param tokens the text tokens to calculate the probability for
- * @return the probability of the given text tokens in the vocabulary
- * @deprecated use {@link #calculateProbability(String...)}
- */
- @Deprecated
- double calculateProbability(StringList tokens);
-
- /**
- * Calculate the probability of a series of tokens (e.g. a sentence), given a vocabulary.
+ * @param tokens the text tokens to calculate the {@code probability} for.
*
- * @param tokens the text tokens to calculate the probability for
* @return the probability of the given text tokens in the vocabulary
*/
double calculateProbability(String... tokens);
/**
- * Predict the most probable output sequence of tokens, given an input sequence of tokens.
+ * Predict the most probable output sequence of tokens, given an input sequence of {@code tokens}.
*
- * @param tokens a sequence of tokens
+ * @param tokens a sequence of tokens.
* @return the most probable subsequent token sequence
* @deprecated use {@link #predictNextTokens(String...)}
*/
@@ -54,7 +45,7 @@ public interface LanguageModel {
StringList predictNextTokens(StringList tokens);
/**
- * Predict the most probable output sequence of tokens, given an input sequence of tokens.
+ * Predict the most probable output sequence of tokens, given an input sequence of {@code tokens}.
*
* @param tokens a sequence of tokens
* @return the most probable subsequent token sequence
diff --git a/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java b/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
index 4dda6875..2b3888ae 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/languagemodel/NGramLanguageModel.java
@@ -25,8 +25,8 @@ import opennlp.tools.ngram.NGramUtils;
import opennlp.tools.util.StringList;
/**
- * A {@link opennlp.tools.languagemodel.LanguageModel} based on a {@link opennlp.tools.ngram.NGramModel}
- * using Stupid Backoff to get the probabilities of the ngrams.
+ * A {@link LanguageModel} based on a {@link NGramModel} using Stupid Backoff to get
+ * the probabilities of the ngrams.
*/
public class NGramLanguageModel extends NGramModel implements LanguageModel {
@@ -34,45 +34,65 @@ public class NGramLanguageModel extends NGramModel implements LanguageModel {
private final int n;
+ /**
+ * Initializes an {@link NGramLanguageModel} with {@link #DEFAULT_N}.
+ */
public NGramLanguageModel() {
this(DEFAULT_N);
}
+ /**
+ * Initializes an {@link NGramLanguageModel} with the given {@code n} for the ngram size.
+ *
+ * @param n The size of the ngrams to be used. Must be greater than {@code 0}.
+ *
+ * @throws IllegalArgumentException Thrown if {@code n} is not greater than {@code 0}.
+ */
public NGramLanguageModel(int n) {
+ if (n <= 0) {
+ throw new IllegalArgumentException("Parameter 'n' must be greater than 0.");
+ }
this.n = n;
}
+ /**
+ * Initializes an {@link NGramLanguageModel} instance via a valid {@link InputStream}.
+ *
+ * @param in The {@link InputStream} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ * @throws IllegalArgumentException Thrown if one of the arguments was invalid.
+ */
public NGramLanguageModel(InputStream in) throws IOException {
this(in, DEFAULT_N);
}
- public NGramLanguageModel(InputStream in, int n)
- throws IOException {
+ /**
+ * Initializes an {@link NGramLanguageModel} instance via a valid {@link InputStream}.
+ *
+ * @param in The {@link InputStream} used for loading the model.
+ * @param n The size of the ngrams to be used. Must be greater than {@code 0}.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ * @throws IllegalArgumentException Thrown if one of the arguments was invalid.
+ */
+ public NGramLanguageModel(InputStream in, int n) throws IOException {
super(in);
+ if (n <= 0) {
+ throw new IllegalArgumentException("Parameter 'n' must be greater than 0.");
+ }
this.n = n;
}
+ /**
+ * Adds further tokens.
+ *
+ * @param tokens Text elements to add to the {@link NGramLanguageModel}.
+ */
public void add(String... tokens) {
add(new StringList(tokens), 1, n);
}
- @Override
- public double calculateProbability(StringList tokens) {
- double probability = 0d;
- if (size() > 0) {
- for (StringList ngram : NGramUtils.getNGrams(tokens, n)) {
- double score = stupidBackoff(ngram);
- probability += StrictMath.log(score);
- if (Double.isNaN(probability)) {
- probability = 0d;
- break;
- }
- }
- probability = StrictMath.exp(probability);
- }
- return probability;
- }
-
@Override
public double calculateProbability(String... tokens) {
double probability = 0d;
@@ -91,6 +111,7 @@ public class NGramLanguageModel extends NGramModel implements LanguageModel {
}
@Override
+ @Deprecated
public StringList predictNextTokens(StringList tokens) {
double maxProb = Double.NEGATIVE_INFINITY;
StringList token = null;
@@ -114,6 +135,22 @@ public class NGramLanguageModel extends NGramModel implements LanguageModel {
return token;
}
+ private double calculateProbability(StringList tokens) {
+ double probability = 0d;
+ if (size() > 0) {
+ for (StringList ngram : NGramUtils.getNGrams(tokens, n)) {
+ double score = stupidBackoff(ngram);
+ probability += StrictMath.log(score);
+ if (Double.isNaN(probability)) {
+ probability = 0d;
+ break;
+ }
+ }
+ probability = StrictMath.exp(probability);
+ }
+ return probability;
+ }
+
@Override
public String[] predictNextTokens(String... tokens) {
double maxProb = Double.NEGATIVE_INFINITY;
@@ -121,9 +158,7 @@ public class NGramLanguageModel extends NGramModel implements LanguageModel {
for (StringList ngram : this) {
String[] sequence = new String[ngram.size() + tokens.length];
- for (int i = 0; i < tokens.length; i++) {
- sequence[i] = tokens[i];
- }
+ System.arraycopy(tokens, 0, sequence, 0, tokens.length);
for (int i = 0; i < ngram.size(); i++) {
sequence[i + tokens.length] = ngram.getToken(i);
}