You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2022/12/24 21:19:51 UTC
[opennlp] branch main updated: OPENNLP-1410 Enhance JavaDoc in opennlp.tools.namefind package (#456)
This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 9b9aa600 OPENNLP-1410 Enhance JavaDoc in opennlp.tools.namefind package (#456)
9b9aa600 is described below
commit 9b9aa6001be5a1ebc29c602a1be7b46d1dc2a6e1
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Sat Dec 24 22:19:46 2022 +0100
OPENNLP-1410 Enhance JavaDoc in opennlp.tools.namefind package (#456)
---
.../TokenNameFinderCrossValidatorTool.java | 9 +-
.../namefind/TokenNameFinderTrainerTool.java | 8 +-
.../java/opennlp/tools/namefind/BilouCodec.java | 35 ++++--
.../namefind/BilouNameFinderSequenceValidator.java | 9 +-
.../main/java/opennlp/tools/namefind/BioCodec.java | 22 +++-
.../namefind/DefaultNameContextGenerator.java | 43 ++++---
.../tools/namefind/DictionaryNameFinder.java | 21 ++--
.../opennlp/tools/namefind/DocumentNameFinder.java | 13 +-
.../tools/namefind/NameContextGenerator.java | 23 ++--
.../tools/namefind/NameFinderEventStream.java | 56 ++++++---
.../java/opennlp/tools/namefind/NameFinderME.java | 116 +++++++++--------
.../namefind/NameFinderSequenceValidator.java | 10 +-
.../java/opennlp/tools/namefind/NameSample.java | 93 +++++++++++---
.../tools/namefind/NameSampleDataStream.java | 16 ++-
.../tools/namefind/NameSampleSequenceStream.java | 66 ++++++++--
.../tools/namefind/NameSampleTypeFilter.java | 18 ++-
.../opennlp/tools/namefind/RegexNameFinder.java | 77 +++++++-----
.../tools/namefind/RegexNameFinderFactory.java | 25 ++--
.../opennlp/tools/namefind/TokenNameFinder.java | 14 ++-
.../namefind/TokenNameFinderCrossValidator.java | 109 +++++++++-------
.../namefind/TokenNameFinderEvaluationMonitor.java | 3 +
.../tools/namefind/TokenNameFinderEvaluator.java | 22 ++--
.../tools/namefind/TokenNameFinderFactory.java | 131 +++++++++++++++----
.../tools/namefind/TokenNameFinderModel.java | 139 +++++++++++++++++++--
.../util/featuregen/CachedFeatureGenerator.java | 13 +-
.../java/opennlp/tools/util/model/BaseModel.java | 9 ++
26 files changed, 782 insertions(+), 318 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
index 7f4cbd12..ea356a71 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
@@ -104,9 +104,12 @@ public final class TokenNameFinderCrossValidatorTool
sequenceCodecImplName = BilouCodec.class.getName();
}
- SequenceCodec<String> sequenceCodec =
- TokenNameFinderFactory.instantiateSequenceCodec(sequenceCodecImplName);
-
+ SequenceCodec<String> sequenceCodec;
+ try {
+ sequenceCodec = TokenNameFinderFactory.instantiateSequenceCodec(sequenceCodecImplName);
+ } catch (InvalidFormatException e) {
+ throw new TerminateToolException(-1, e.getMessage(), e);
+ }
TokenNameFinderFineGrainedReportListener reportListener = null;
File reportFile = params.getReportOutputFile();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
index bc2acbac..ecc23677 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
@@ -150,8 +150,12 @@ public final class TokenNameFinderTrainerTool
sequenceCodecImplName = BilouCodec.class.getName();
}
- SequenceCodec<String> sequenceCodec =
- TokenNameFinderFactory.instantiateSequenceCodec(sequenceCodecImplName);
+ SequenceCodec<String> sequenceCodec;
+ try {
+ sequenceCodec = TokenNameFinderFactory.instantiateSequenceCodec(sequenceCodecImplName);
+ } catch (InvalidFormatException e) {
+ throw new TerminateToolException(-1, e.getMessage(), e);
+ }
TokenNameFinderFactory nameFinderFactory;
try {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/BilouCodec.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/BilouCodec.java
index 43c0e343..b0d540e4 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/BilouCodec.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/BilouCodec.java
@@ -27,6 +27,23 @@ import opennlp.tools.util.SequenceCodec;
import opennlp.tools.util.SequenceValidator;
import opennlp.tools.util.Span;
+/**
+ * The default {@link SequenceCodec} implementation according to the {@code BILOU} scheme.
+ * <ul>
+ * <li>B: 'beginning' of a NE</li>
+ * <li>I: 'inside', the word is inside a NE</li>
+ * <li>L: 'last', the last (I) word inside a NE</li>
+ * <li>O: 'outside', the word is a regular word outside a NE</li>
+ * <li>U: 'unit', any standalone token following words outside of NE</li>
+ * </ul>
+ *
+ * See paper by Roth D. and Ratinov L. (2009):
+ * <a href="https://cogcomp.seas.upenn.edu/page/publication_view/199">
+ * Design Challenges and Misconceptions in Named Entity Recognition</a>.
+ *
+ * @see SequenceCodec
+ * @see BioCodec
+ */
public class BilouCodec implements SequenceCodec<String> {
public static final String START = "start";
@@ -114,15 +131,17 @@ public class BilouCodec implements SequenceCodec<String> {
}
/**
- * B requires CL or L
- * C requires BL
- * L requires B
- * O requires any valid combo/unit
- * U requires none
- *
- * @param outcomes all possible model outcomes
+ * {@code
+ * B requires CL or L,
+ * C requires BL,
+ * L requires B,
+ * O requires any valid combo/unit,
+ * U requires none.
+ * }
+ *
+ * @param outcomes All potential model outcomes to check.
*
- * @return true, if model outcomes are compatible
+ * @return {@code true}, if model outcomes are compatible, {@code false} otherwise.
*/
@Override
public boolean areOutcomesCompatible(String[] outcomes) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/BilouNameFinderSequenceValidator.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/BilouNameFinderSequenceValidator.java
index 9916c93a..8a8d689c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/BilouNameFinderSequenceValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/BilouNameFinderSequenceValidator.java
@@ -19,9 +19,14 @@ package opennlp.tools.namefind;
import opennlp.tools.util.SequenceValidator;
-public class BilouNameFinderSequenceValidator implements
- SequenceValidator<String> {
+/**
+ * A {@link SequenceValidator} implementation for the {@link BilouCodec}.
+ *
+ * @see BilouCodec
+ */
+public class BilouNameFinderSequenceValidator implements SequenceValidator<String> {
+ @Override
public boolean validSequence(int i, String[] inputSequence,
String[] outcomesSequence, String outcome) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/BioCodec.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/BioCodec.java
index 3935ba99..5b63173f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/BioCodec.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/BioCodec.java
@@ -26,16 +26,31 @@ import java.util.regex.Pattern;
import opennlp.tools.util.SequenceCodec;
import opennlp.tools.util.Span;
+/**
+ * The default {@link SequenceCodec} implementation according to the {@code BIO} scheme:
+ * <ul>
+ * <li>B: 'beginning' of a NE</li>
+ * <li>I: 'inside', the word is inside a NE</li>
+ * <li>O: 'outside', the word is a regular word outside a NE</li>
+ * </ul>
+ *
+ * See also the paper by Roth D. and Ratinov L.:
+ * <a href="https://cogcomp.seas.upenn.edu/page/publication_view/199">
+ * Design Challenges and Misconceptions in Named Entity Recognition</a>.
+ *
+ * @see SequenceCodec
+ * @see BilouCodec
+ */
public class BioCodec implements SequenceCodec<String> {
public static final String START = "start";
public static final String CONTINUE = "cont";
public static final String OTHER = "other";
- private static final Pattern typedOutcomePattern = Pattern.compile("(.+)-\\w+");
+ private static final Pattern TYPED_OUTCOME_PATTERN = Pattern.compile("(.+)-\\w+");
static String extractNameType(String outcome) {
- Matcher matcher = typedOutcomePattern.matcher(outcome);
+ Matcher matcher = TYPED_OUTCOME_PATTERN.matcher(outcome);
if (matcher.matches()) {
return matcher.group(1);
}
@@ -43,6 +58,7 @@ public class BioCodec implements SequenceCodec<String> {
return null;
}
+ @Override
public Span[] decode(List<String> c) {
int start = -1;
int end = -1;
@@ -77,6 +93,7 @@ public class BioCodec implements SequenceCodec<String> {
return spans.toArray(new Span[spans.size()]);
}
+ @Override
public String[] encode(Span[] names, int length) {
String[] outcomes = new String[length];
Arrays.fill(outcomes, BioCodec.OTHER);
@@ -101,6 +118,7 @@ public class BioCodec implements SequenceCodec<String> {
return outcomes;
}
+ @Override
public NameFinderSequenceValidator createSequenceValidator() {
return new NameFinderSequenceValidator();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/DefaultNameContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/DefaultNameContextGenerator.java
index 5361165b..e591c0bf 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/DefaultNameContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/DefaultNameContextGenerator.java
@@ -31,15 +31,15 @@ import opennlp.tools.util.featuregen.TokenFeatureGenerator;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;
/**
- * Class for determining contextual features for a tag/chunk style
- * named-entity recognizer.
+ * A {@link NameContextGenerator} implementation for determining contextual features
+ * for a {@code tag-chunk} style named-entity recognizer.
*/
public class DefaultNameContextGenerator implements NameContextGenerator {
protected AdaptiveFeatureGenerator[] featureGenerators;
@Deprecated
- private static AdaptiveFeatureGenerator windowFeatures = new CachedFeatureGenerator(
+ private static final AdaptiveFeatureGenerator WINDOW_FEATURES = new CachedFeatureGenerator(
new WindowFeatureGenerator(new TokenFeatureGenerator(), 2, 2),
new WindowFeatureGenerator(new TokenClassFeatureGenerator(true), 2, 2),
new OutcomePriorFeatureGenerator(),
@@ -48,7 +48,9 @@ public class DefaultNameContextGenerator implements NameContextGenerator {
/**
* Creates a name context generator.
- * @deprecated use the other constructor and always provide the feature generators
+ *
+ * @deprecated use {@link #DefaultNameContextGenerator(AdaptiveFeatureGenerator...)} and
+ * always provide one or more {@link AdaptiveFeatureGenerator feature generators}.
*/
@Deprecated
public DefaultNameContextGenerator() {
@@ -56,22 +58,25 @@ public class DefaultNameContextGenerator implements NameContextGenerator {
}
/**
- * Creates a name context generator with the specified cache size.
+ * Creates a name context generator with the specified
+ * {@link AdaptiveFeatureGenerator feature generators}.
+ *
+ * @param featureGenerators One or more {@link AdaptiveFeatureGenerator feature generators}.
+ * If none are provided, a default config ({@link #WINDOW_FEATURES})
+ * will be used.
*/
public DefaultNameContextGenerator(AdaptiveFeatureGenerator... featureGenerators) {
if (featureGenerators != null) {
this.featureGenerators = featureGenerators;
}
- else {
- // use defaults
-
- this.featureGenerators = new AdaptiveFeatureGenerator[]{
- windowFeatures,
- new PreviousMapFeatureGenerator()};
+ else { // use defaults
+ this.featureGenerators =
+ new AdaptiveFeatureGenerator[]{WINDOW_FEATURES, new PreviousMapFeatureGenerator()};
}
}
+ @Override
public void addFeatureGenerator(AdaptiveFeatureGenerator generator) {
AdaptiveFeatureGenerator[] generators = featureGenerators;
@@ -82,6 +87,7 @@ public class DefaultNameContextGenerator implements NameContextGenerator {
featureGenerators[featureGenerators.length - 1] = generator;
}
+ @Override
public void updateAdaptiveData(String[] tokens, String[] outcomes) {
if (tokens != null && outcomes != null && tokens.length != outcomes.length) {
@@ -94,6 +100,7 @@ public class DefaultNameContextGenerator implements NameContextGenerator {
}
}
+ @Override
public void clearAdaptiveData() {
for (AdaptiveFeatureGenerator featureGenerator : featureGenerators) {
featureGenerator.clearAdaptiveData();
@@ -101,17 +108,19 @@ public class DefaultNameContextGenerator implements NameContextGenerator {
}
/**
- * Return the context for finding names at the specified index.
- * @param index The index of the token in the specified toks array for which the
+ * Finds the context for finding names at the specified index.
+ *
+ * @param index The index of the token in the specified {@code tokens} for which the
* context should be constructed.
- * @param tokens The tokens of the sentence. The <code>toString</code> methods
+ * @param tokens The tokens of the sentence. The {@code toString()} methods
* of these objects should return the token text.
* @param preds The previous decisions made in the tagging of this sequence.
- * Only indices less than i will be examined.
- * @param additionalContext Addition features which may be based on a context outside of the sentence.
+ * Only indices less than {@code index} will be examined.
+ * @param additionalContext Additional features which may be based on a context outside the sentence.
*
- * @return the context for finding names at the specified index.
+ * @return The context for finding names at the specified {@code index}.
*/
+ @Override
public String[] getContext(int index, String[] tokens, String[] preds, Object[] additionalContext) {
List<String> features = new ArrayList<>();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/DictionaryNameFinder.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/DictionaryNameFinder.java
index d186ef98..4c85a182 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/DictionaryNameFinder.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/DictionaryNameFinder.java
@@ -26,22 +26,22 @@ import opennlp.tools.util.Span;
import opennlp.tools.util.StringList;
/**
- * This is a dictionary based name finder, it scans text
- * for names inside a dictionary.
+ * This is a {@link Dictionary} based {@link TokenNameFinder name finder}.
+ * It scans text for names inside a provided dictionary.
*/
public class DictionaryNameFinder implements TokenNameFinder {
private static final String DEFAULT_TYPE = "default";
- private Dictionary mDictionary;
+ private final Dictionary mDictionary;
private final String type;
/**
- * Initialized the current instance with he provided dictionary
- * and a type.
+ * Initializes a {@link DictionaryNameFinder} with the provided {@link Dictionary}
+ * and a {@code type}.
*
- * @param dictionary
- * @param type the name type used for the produced spans
+ * @param dictionary The {@link Dictionary} to use. Must not be {@code null}.
+ * @param type the name type used for the produced spans. Must not be {@code null}.
*/
public DictionaryNameFinder(Dictionary dictionary, String type) {
mDictionary = Objects.requireNonNull(dictionary, "dictionary must not be null");
@@ -49,14 +49,16 @@ public class DictionaryNameFinder implements TokenNameFinder {
}
/**
- * Initializes the current instance with the provided dictionary.
+ * Initializes a {@link DictionaryNameFinder} with the provided {@link Dictionary}
+ * and {@link #DEFAULT_TYPE}.
*
- * @param dictionary
+ * @param dictionary The {@link Dictionary} to use. Must not be {@code null}.
*/
public DictionaryNameFinder(Dictionary dictionary) {
this(dictionary, DEFAULT_TYPE);
}
+ @Override
public Span[] find(String[] textTokenized) {
List<Span> namesFound = new LinkedList<>();
@@ -91,6 +93,7 @@ public class DictionaryNameFinder implements TokenNameFinder {
return namesFound.toArray(new Span[namesFound.size()]);
}
+ @Override
public void clearAdaptiveData() {
// nothing to clear
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/DocumentNameFinder.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/DocumentNameFinder.java
index b6514c2a..0f09a986 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/DocumentNameFinder.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/DocumentNameFinder.java
@@ -20,22 +20,23 @@ package opennlp.tools.namefind;
import opennlp.tools.util.Span;
/**
- * Name finding interface which processes an entire document allowing the name finder to use context
+ * Interface for processing an entire document allowing a {@link TokenNameFinder} to use context
* from the entire document.
*
- * <strong>EXPERIMENTAL</strong>.
+ * <strong>EXPERIMENTAL</strong>:
* This interface has been added as part of a work in progress and might change without notice.
*/
public interface DocumentNameFinder {
/**
- * Returns tokens span for the specified document of sentences and their tokens.
+ * Finds tokens {@link Span spans} for the specified document of sentences and their tokens.
+ * <p>
* Span start and end indices are relative to the sentence they are in.
* For example, a span identifying a name consisting of the first and second word
- * of the second sentence would be 0..2 and be referenced as spans[1][0].
+ * of the second sentence would be {@code 0..2} and be referenced as {@code spans[1][0]}.
*
- * @param document An array of tokens for each sentence of a document.
- * @return The token spans for each sentence of the specified document.
+ * @param document A 2-dimensional array of tokens for each sentence of a document.
+ * @return The {@link Span token spans} for each sentence of the specified document.
*/
Span[][] find(String[][] document);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameContextGenerator.java
index a438cecf..062e249f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameContextGenerator.java
@@ -21,28 +21,33 @@ import opennlp.tools.util.BeamSearchContextGenerator;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
/**
- * Interface for generating the context for an name finder by specifying a set of geature generators.
+ * Interface for generating the context for a {@link TokenNameFinder name finder} by
+ * specifying a set of feature generators.
*
+ * @see BeamSearchContextGenerator
+ * @see AdaptiveFeatureGenerator
*/
public interface NameContextGenerator extends BeamSearchContextGenerator<String> {
/**
- * Adds a feature generator to this set of feature generators.
- * @param generator The feature generator to add.
+ * Adds a feature generator.
+ *
+ * @param generator The {@link AdaptiveFeatureGenerator feature generator} to add.
*/
void addFeatureGenerator(AdaptiveFeatureGenerator generator);
/**
- * Informs all the feature generators for a name finder that the specified tokens have
- * been classified with the coorisponds set of specified outcomes.
- * @param tokens The tokens of the sentence or other text unit which has been processed.
- * @param outcomes The outcomes associated with the specified tokens.
+ * Informs all the feature generators that the specified {@code tokens}
+ * have been classified with the corresponding set of specified {@code outcomes}.
+ *
+ * @param tokens The tokens of a sentence or another text unit which has been processed.
+ * @param outcomes The outcomes associated with the specified {@code tokens}.
*/
void updateAdaptiveData(String[] tokens, String[] outcomes);
/**
- * Informs all the feature generators for a name finder that the context of the adaptive
- * data (typically a document) is no longer valid.
+ * Informs all the feature generators that the context of the adaptive
+ * data (typically a document) is no longer valid and should be cleared.
*/
void clearAdaptiveData();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java
index 11f1b908..a77bfbf6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderEventStream.java
@@ -32,36 +32,37 @@ import opennlp.tools.util.featuregen.AdditionalContextFeatureGenerator;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;
/**
- * Class for creating an event stream out of data files for training an name
- * finder.
+ * Class for creating an event stream out of data files for training a {@link TokenNameFinder}.
*/
public class NameFinderEventStream extends opennlp.tools.util.AbstractEventStream<NameSample> {
- private NameContextGenerator contextGenerator;
+ private final NameContextGenerator contextGenerator;
- private AdditionalContextFeatureGenerator additionalContextFeatureGenerator =
+ private final AdditionalContextFeatureGenerator additionalContextFeatureGenerator =
new AdditionalContextFeatureGenerator();
- private SequenceCodec<String> codec;
+ private final SequenceCodec<String> codec;
private final String defaultType;
/**
- * Creates a new name finder event stream using the specified data stream and context generator.
- * @param dataStream The data stream of events.
- * @param type null or overrides the type parameter in the provided samples
- * @param contextGenerator The context generator used to generate features for the event stream.
+ * Initializes a {@link NameFinderEventStream} using the specified {@code dataStream} and
+ * {@link NameContextGenerator}.
+ *
+ * @param dataStream The {@link ObjectStream data stream} of events.
+ * @param type {@code null} or overrides the type parameter in the provided samples.
+ * @param contextGenerator The {@link NameContextGenerator} used to generate features for the event stream.
+ * @param codec The {@link SequenceCodec} to use.
*/
public NameFinderEventStream(ObjectStream<NameSample> dataStream, String type,
NameContextGenerator contextGenerator, SequenceCodec<String> codec) {
super(dataStream);
- this.codec = codec;
-
if (codec == null) {
this.codec = new BioCodec();
+ } else {
+ this.codec = codec;
}
-
this.contextGenerator = contextGenerator;
this.contextGenerator.addFeatureGenerator(
new WindowFeatureGenerator(additionalContextFeatureGenerator, 8, 8));
@@ -70,14 +71,17 @@ public class NameFinderEventStream extends opennlp.tools.util.AbstractEventStrea
}
/**
- * Generates the name tag outcomes (start, continue, other) for each token in a sentence
- * with the specified length using the specified name spans.
- * @param names Token spans for each of the names.
- * @param type null or overrides the type parameter in the provided samples
+ * Generates the name tag outcomes ({@code start}, {@code continue}, {@code other}) for each
+ * token in a sentence with the specified {@code length} using the specified {@link Span names}.
+ *
+ * @param names Token {@link Span spans} for each of the names.
+ * @param type {@code null} or overrides the type parameter in the provided samples
* @param length The length of the sentence.
- * @return An array of start, continue, other outcomes based on the specified names and sentence length.
+ *
+ * @return An array of {@code start}, {@code continue}, {@code other} outcomes based on the
+ * specified names and sentence {@code length}.
*
- * @deprecated use the BioCodec implementation of the SequenceValidator instead!
+ * @deprecated use the {@link BioCodec} implementation of the SequenceCodec instead!
*/
@Deprecated
public static String[] generateOutcomes(Span[] names, String type, int length) {
@@ -103,6 +107,16 @@ public class NameFinderEventStream extends opennlp.tools.util.AbstractEventStrea
return outcomes;
}
+ /**
+ * Generates {@link Event events} for each token in a {@code sentence}
+ * with the specified {@code outcomes} using the specified {@link NameContextGenerator}.
+ *
+ * @param sentence Tokens representing a sentence.
+ * @param outcomes An array of outcomes.
+ * @param cg The {@link NameContextGenerator} to use.
+ *
+ * @return A list of {@link Event events} generated.
+ */
public static List<Event> generateEvents(String[] sentence, String[] outcomes,
NameContextGenerator cg) {
List<Event> events = new ArrayList<>(outcomes.length);
@@ -148,10 +162,12 @@ public class NameFinderEventStream extends opennlp.tools.util.AbstractEventStrea
}
/**
- * Generated previous decision features for each token based on contents of the specified map.
+ * Generates previous decision features for each token based on contents of the
+ * specified {@code prevMap}.
+ *
* @param tokens The token for which the context is generated.
* @param prevMap A mapping of tokens to their previous decisions.
- * @return An additional context array with features for each token.
+ * @return A 2-dimensional array with additional context with features for each token.
*/
public static String[][] additionalContext(String[] tokens, Map<String, String> prevMap) {
String[][] ac = new String[tokens.length][1];
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
index a869d029..7b563af9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
@@ -50,11 +50,11 @@ import opennlp.tools.util.featuregen.GeneratorFactory;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;
/**
- * Class for creating a maximum-entropy-based name finder.
+ * A maximum-entropy-based {@link TokenNameFinder name finder} implementation.
*/
public class NameFinderME implements TokenNameFinder {
- private static String[][] EMPTY = new String[0][0];
+ private static final String[][] EMPTY = new String[0][0];
public static final int DEFAULT_BEAM_SIZE = 3;
private static final Pattern typedOutcomePattern = Pattern.compile("(.+)-\\w+");
@@ -69,10 +69,15 @@ public class NameFinderME implements TokenNameFinder {
protected NameContextGenerator contextGenerator;
private Sequence bestSequence;
- private AdditionalContextFeatureGenerator additionalContextFeatureGenerator
- = new AdditionalContextFeatureGenerator();
- private SequenceValidator<String> sequenceValidator;
+ private final AdditionalContextFeatureGenerator additionalContextFeatureGenerator =
+ new AdditionalContextFeatureGenerator();
+ private final SequenceValidator<String> sequenceValidator;
+ /**
+ * Initializes a {@link NameFinderME} with a {@link TokenNameFinderModel}.
+ *
+ * @param model The {@link TokenNameFinderModel} to initialize with.
+ */
public NameFinderME(TokenNameFinderModel model) {
TokenNameFinderFactory factory = model.getFactory();
@@ -107,24 +112,24 @@ public class NameFinderME implements TokenNameFinder {
return featureGenerator;
}
+ @Override
public Span[] find(String[] tokens) {
return find(tokens, EMPTY);
}
/**
* Generates name tags for the given sequence, typically a sentence, returning
- * token spans for any identified names.
+ * {@link Span token spans} for any identified names.
*
- * @param tokens an array of the tokens or words of the sequence, typically a sentence.
- * @param additionalContext features which are based on context outside of the
- * sentence but which should also be used.
+ * @param tokens An array of the tokens or words of a sequence, typically a sentence.
+ * @param additionalContext Features which are based on context outside of the
+ * sentence but which should also be used.
*
- * @return an array of spans for each of the names identified.
+ * @return An array of {@link Span token spans} for each of the names identified.
*/
public Span[] find(String[] tokens, String[][] additionalContext) {
additionalContextFeatureGenerator.setCurrentContext(additionalContext);
-
bestSequence = model.bestSequence(tokens, additionalContext, contextGenerator, sequenceValidator);
List<String> c = bestSequence.getOutcomes();
@@ -134,13 +139,8 @@ public class NameFinderME implements TokenNameFinder {
spans = setProbs(spans);
return spans;
}
-
- /**
- * Forgets all adaptive data which was collected during previous calls to one
- * of the find methods.
- *
- * This method is typical called at the end of a document.
- */
+
+ @Override
public void clearAdaptiveData() {
contextGenerator.clearAdaptiveData();
}
@@ -148,32 +148,32 @@ public class NameFinderME implements TokenNameFinder {
/**
* Populates the specified array with the probabilities of the last decoded
* sequence. The sequence was determined based on the previous call to
- * <code>chunk</code>. The specified array should be at least as large as the
- * number of tokens in the previous call to <code>chunk</code>.
+ * {@link #find(String[])}. The specified array should be at least as large as the
+ * number of tokens in the previous call to {@link #find(String[])}.
*
- * @param probs An array used to hold the probabilities of the last decoded
- * sequence.
+ * @param probs An array with the probabilities of the last decoded sequence.
*/
public void probs(double[] probs) {
bestSequence.getProbs(probs);
}
/**
- * Returns an array with the probabilities of the last decoded sequence. The
- * sequence was determined based on the previous call to <code>chunk</code>.
+ * Retrieves the probabilities of the last decoded sequence. The
+ * sequence was determined based on the previous call to {@link #find(String[])}.
*
* @return An array with the same number of probabilities as tokens were sent
- * to <code>chunk</code> when it was last called.
+ * to {@link #find(String[])} when it was last called.
*/
public double[] probs() {
return bestSequence.getProbs();
}
/**
- * sets the probs for the spans
+ * Sets probabilities for the spans.
*
- * @param spans
- * @return
+ * @param spans The {@link Span spans} to set probabilities for.
+ *
+ * @return The {@link Span spans} with populated values.
*/
private Span[] setProbs(Span[] spans) {
double[] probs = probs(spans);
@@ -188,13 +188,14 @@ public class NameFinderME implements TokenNameFinder {
}
/**
- * Returns an array of probabilities for each of the specified spans which is
+ * Retrieves an array of probabilities for each of the specified spans which is
* the arithmetic mean of the probabilities for each of the outcomes which
* make up the span.
*
- * @param spans The spans of the names for which probabilities are desired.
+ * @param spans The {@link Span spans} of the names for which probabilities
+ * are requested.
*
- * @return an array of probabilities for each of the specified spans.
+ * @return An array of probabilities for each of the specified spans.
*/
public double[] probs(Span[] spans) {
@@ -217,42 +218,53 @@ public class NameFinderME implements TokenNameFinder {
return sprobs;
}
+ /**
+ * Starts a training of a {@link TokenNameFinderModel} with the given parameters.
+ *
+ * @param languageCode The ISO-conformant language code.
+ * @param type The type to use.
+ * @param samples The {@link ObjectStream} of {@link NameSample} used as input for training.
+ * @param params The {@link TrainingParameters} for the context of the training.
+ * @param factory The {@link TokenNameFinderFactory} for creating related objects defined
+ * via {@code params}.
+ *
+ * @return A valid, trained {@link TokenNameFinderModel} instance.
+ * @throws IOException Thrown if IO errors occurred during training.
+ */
public static TokenNameFinderModel train(String languageCode, String type,
- ObjectStream<NameSample> samples, TrainingParameters trainParams,
+ ObjectStream<NameSample> samples, TrainingParameters params,
TokenNameFinderFactory factory) throws IOException {
- trainParams.putIfAbsent(TrainingParameters.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
- trainParams.putIfAbsent(TrainingParameters.CUTOFF_PARAM, 0);
- trainParams.putIfAbsent(TrainingParameters.ITERATIONS_PARAM, 300);
+ params.putIfAbsent(TrainingParameters.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
+ params.putIfAbsent(TrainingParameters.CUTOFF_PARAM, 0);
+ params.putIfAbsent(TrainingParameters.ITERATIONS_PARAM, 300);
- int beamSize = trainParams.getIntParameter(BeamSearch.BEAM_SIZE_PARAMETER,
- NameFinderME.DEFAULT_BEAM_SIZE);
+ int beamSize = params.getIntParameter(BeamSearch.BEAM_SIZE_PARAMETER, NameFinderME.DEFAULT_BEAM_SIZE);
Map<String, String> manifestInfoEntries = new HashMap<>();
MaxentModel nameFinderModel = null;
-
SequenceClassificationModel<String> seqModel = null;
- TrainerType trainerType = TrainerFactory.getTrainerType(trainParams);
+ TrainerType trainerType = TrainerFactory.getTrainerType(params);
if (TrainerType.EVENT_MODEL_TRAINER.equals(trainerType)) {
ObjectStream<Event> eventStream = new NameFinderEventStream(samples, type,
factory.createContextGenerator(), factory.createSequenceCodec());
- EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, manifestInfoEntries);
+ EventTrainer trainer = TrainerFactory.getEventTrainer(params, manifestInfoEntries);
nameFinderModel = trainer.train(eventStream);
} // TODO: Maybe it is not a good idea, that these two don't use the context generator ?!
// These also don't use the sequence codec ?!
else if (TrainerType.EVENT_MODEL_SEQUENCE_TRAINER.equals(trainerType)) {
NameSampleSequenceStream ss = new NameSampleSequenceStream(samples, factory.createContextGenerator());
- EventModelSequenceTrainer<NameSample> trainer = TrainerFactory.getEventModelSequenceTrainer(
- trainParams, manifestInfoEntries);
+ EventModelSequenceTrainer trainer = TrainerFactory.getEventModelSequenceTrainer(
+ params, manifestInfoEntries);
nameFinderModel = trainer.train(ss);
} else if (TrainerType.SEQUENCE_TRAINER.equals(trainerType)) {
SequenceTrainer trainer = TrainerFactory.getSequenceModelTrainer(
- trainParams, manifestInfoEntries);
+ params, manifestInfoEntries);
NameSampleSequenceStream ss =
new NameSampleSequenceStream(samples, factory.createContextGenerator(), false);
@@ -271,10 +283,10 @@ public class NameFinderME implements TokenNameFinder {
}
/**
- * Gets the name type from the outcome
+ * Extracts the name type from the {@code outcome}.
*
- * @param outcome the outcome
- * @return the name type, or null if not set
+ * @param outcome The outcome
+ * @return The name type, or {@code null} if not set.
*/
static String extractNameType(String outcome) {
Matcher matcher = typedOutcomePattern.matcher(outcome);
@@ -286,17 +298,17 @@ public class NameFinderME implements TokenNameFinder {
}
/**
- * Removes spans with are intersecting or crossing in anyway.
+ * Removes {@link Span spans} which are intersecting or crossing in any way.
*
* <p>
* The following rules are used to remove the spans:<br>
- * Identical spans: The first span in the array after sorting it remains<br>
- * Intersecting spans: The first span after sorting remains<br>
- * Contained spans: All spans which are contained by another are removed<br>
+ * Identical spans: The first span in the array after sorting it remains.<br>
+ * Intersecting spans: The first span after sorting remains.<br>
+ * Contained spans: All spans which are contained by another are removed.<br>
*
- * @param spans
+ * @param spans The input {@link Span spans}.
*
- * @return non-overlapping spans
+ * @return The resulting non-overlapping {@link Span spans}.
*/
public static Span[] dropOverlappingSpans(Span[] spans) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderSequenceValidator.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderSequenceValidator.java
index 3054cb86..cbfa657a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderSequenceValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderSequenceValidator.java
@@ -20,11 +20,15 @@ package opennlp.tools.namefind;
import opennlp.tools.util.SequenceValidator;
/**
- * This class is created by the {@link BioCodec}.
+ * The default name finder {@link SequenceValidator} implementation.
+ * Created by the {@link BioCodec}.
+ *
+ * @see TokenNameFinder
+ * @see BioCodec
*/
-public class NameFinderSequenceValidator implements
- SequenceValidator<String> {
+public class NameFinderSequenceValidator implements SequenceValidator<String> {
+ @Override
public boolean validSequence(int i, String[] inputSequence,
String[] outcomesSequence, String outcome) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSample.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSample.java
index 1954073f..19577b7f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSample.java
@@ -31,7 +31,7 @@ import opennlp.tools.tokenize.WhitespaceTokenizer;
import opennlp.tools.util.Span;
/**
- * Class for holding names for a single unit of text.
+ * Encapsulates names for a single unit of text.
*/
public class NameSample implements Sample {
@@ -42,11 +42,22 @@ public class NameSample implements Sample {
private final String[][] additionalContext;
private final boolean isClearAdaptiveData;
- /** The a default type value when there is no type in training data. */
+ /** The default type value when there is no type in training data. */
public static final String DEFAULT_TYPE = "default";
- public NameSample(String id, String[] sentence, Span[] names,
- String[][] additionalContext, boolean clearAdaptiveData) {
+ /**
+ * Initializes a {@link NameSample} instance with given parameters.
+ *
+ * @param id The identifier to use.
+ * @param sentence The tokens representing a training sentence. Must not be {@code null}.
+ * @param names The {@link Span names} to use.
+ * @param additionalContext Additional context in a 2-dimensional array.
+ * @param clearAdaptiveData If {@code true} the adaptive data of the feature generators is cleared.
+ *
+ * @throws RuntimeException Thrown if name spans are overlapping.
+ */
+ public NameSample(String id, String[] sentence, Span[] names, String[][] additionalContext,
+ boolean clearAdaptiveData) {
this.id = id;
Objects.requireNonNull(sentence, "sentence must not be null");
@@ -84,41 +95,68 @@ public class NameSample implements Sample {
}
}
}
-
+
/**
- * Initializes the current instance.
+ * Initializes a {@link NameSample} instance with given parameters.
+ *
+ * @param sentence The tokens representing a sentence. Must not be {@code null}.
+ * @param names The {@link Span names} to use.
+ * @param additionalContext Additional context in a 2-dimensional array.
+ * @param clearAdaptiveData If {@code true} the adaptive data of the feature generators is cleared.
*
- * @param sentence training sentence
- * @param names
- * @param additionalContext
- * @param clearAdaptiveData if true the adaptive data of the
- * feature generators is cleared
+ * @throws RuntimeException Thrown if name spans are overlapping.
*/
public NameSample(String[] sentence, Span[] names,
String[][] additionalContext, boolean clearAdaptiveData) {
this(null, sentence, names, additionalContext, clearAdaptiveData);
}
+ /**
+ * Initializes a {@link NameSample} instance with given parameters.
+ *
+ * @param sentence The tokens representing a sentence. Must not be {@code null}.
+ * @param names The {@link Span names} to use.
+ * @param clearAdaptiveData If {@code true} the adaptive data of the feature generators is cleared.
+ *
+ * @throws RuntimeException Thrown if name spans are overlapping.
+ */
public NameSample(String[] sentence, Span[] names, boolean clearAdaptiveData) {
this(sentence, names, null, clearAdaptiveData);
}
+ /**
+ * @return The current identifier. May be {@code null}.
+ */
public String getId() {
return id;
}
+ /**
+ * @return The sentence in tokenized form.
+ */
public String[] getSentence() {
return sentence.toArray(new String[sentence.size()]);
}
+ /**
+ * @return The {@link Span names}.
+ */
public Span[] getNames() {
return names.toArray(new Span[names.size()]);
}
+
+ /**
+ * @return The additional context. May be {@code null}.
+ */
public String[][] getAdditionalContext() {
return additionalContext;
}
+ /**
+ * @return {@code true} if the adaptive data of the feature generators is cleared,
+ * {@code false} otherwise.
+ */
public boolean isClearAdaptiveDataSet() {
return isClearAdaptiveData;
}
@@ -219,14 +257,33 @@ public class NameSample implements Sample {
private static final Pattern START_TAG_PATTERN = Pattern.compile("<START(:([^:>\\s]*))?>");
- public static NameSample parse(String taggedTokens,
- boolean isClearAdaptiveData) throws IOException {
- return parse(taggedTokens, DEFAULT_TYPE, isClearAdaptiveData);
+
+ /**
+ * Parses given input into a {@link NameSample}.
+ *
+ * @param taggedTokens The input data to parse.
+ * @param clearAdaptiveData {@code true} if the adaptive data of the feature generators should be cleared,
+ * {@code false} otherwise.
+ * @return A {@link NameSample} instance resulting from the parsing.
+ * @throws IOException Thrown if IO errors occurred during parsing.
+ */
+ public static NameSample parse(String taggedTokens, boolean clearAdaptiveData) throws IOException {
+ return parse(taggedTokens, DEFAULT_TYPE, clearAdaptiveData);
}
- public static NameSample parse(String taggedTokens, String defaultType,
- boolean isClearAdaptiveData) throws IOException {
- // TODO: Should throw another exception, and then convert it into an IOException in the stream
+ /**
+ * Parses given input into a {@link NameSample}.
+ *
+ * @param taggedTokens The input data to parse.
+ * @param defaultType The type to set by default.
+ * @param clearAdaptiveData {@code true} if the adaptive data of the feature generators should be cleared,
+ * {@code false} otherwise.
+ * @return A {@link NameSample} instance resulting from the parsing.
+ * @throws IOException Thrown if IO errors occurred during parsing.
+ */
+ // TODO: Should throw another exception, and then convert it into an IOException in the stream
+ public static NameSample parse(String taggedTokens, String defaultType, boolean clearAdaptiveData)
+ throws IOException {
String[] parts = WhitespaceTokenizer.INSTANCE.tokenize(taggedTokens);
@@ -276,6 +333,6 @@ public class NameSample implements Sample {
String[] sentence = tokenList.toArray(new String[tokenList.size()]);
Span[] names = nameList.toArray(new Span[nameList.size()]);
- return new NameSample(sentence, names, isClearAdaptiveData);
+ return new NameSample(sentence, names, clearAdaptiveData);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleDataStream.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleDataStream.java
index 3c4cd68c..acfdbbae 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleDataStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleDataStream.java
@@ -24,10 +24,12 @@ import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
/**
- * The {@link NameSampleDataStream} class converts tagged {@link String}s
+ * The {@link NameSampleDataStream} class converts tagged {@link String strings}
* provided by a {@link DataStream} to {@link NameSample} objects.
- * It uses text that is is one-sentence per line and tokenized
- * with names identified by <code><START></code> and <code><END></code> tags.
+ * It uses text that is one-sentence per line and tokenized
+ * with names identified by:
+ * <p>
+ * {@code <START>} and {@code <END>} tags.
*/
public class NameSampleDataStream extends FilterObjectStream<String, NameSample> {
@@ -35,16 +37,22 @@ public class NameSampleDataStream extends FilterObjectStream<String, NameSample>
public static final String START_TAG = "<START>";
public static final String END_TAG = "<END>";
+ /**
+ * Initializes a {@link NameSampleDataStream} with the given {@code in} samples.
+ *
+ * @param in The {@link ObjectStream stream} of data samples.
+ */
public NameSampleDataStream(ObjectStream<String> in) {
super(in);
}
+ @Override
public NameSample read() throws IOException {
String token = samples.read();
boolean isClearAdaptiveData = false;
- // An empty line indicates the begin of a new article
+ // An empty line indicates the start of a new article
// for which the adaptive data in the feature generators
// must be cleared
while (token != null && token.trim().length() == 0) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java
index d1a17873..0f86928c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleSequenceStream.java
@@ -28,6 +28,9 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.SequenceCodec;
import opennlp.tools.util.featuregen.AdaptiveFeatureGenerator;
+/**
+ * A {@link SequenceStream} implementation encapsulating {@link NameSample samples}.
+ */
public class NameSampleSequenceStream implements SequenceStream<NameSample> {
private final NameContextGenerator pcg;
@@ -35,40 +38,81 @@ public class NameSampleSequenceStream implements SequenceStream<NameSample> {
private final ObjectStream<NameSample> psi;
private final SequenceCodec<String> seqCodec;
- public NameSampleSequenceStream(ObjectStream<NameSample> psi) throws IOException {
+ /**
+ * Initializes a {@link NameSampleSequenceStream} with given {@code psi} samples.
+ *
+ * @param psi The data stream of {@link NameSample samples}.
+ */
+ public NameSampleSequenceStream(ObjectStream<NameSample> psi) {
this(psi, new DefaultNameContextGenerator((AdaptiveFeatureGenerator) null), true);
}
- public NameSampleSequenceStream(ObjectStream<NameSample> psi, AdaptiveFeatureGenerator featureGen)
- throws IOException {
+ /**
+ * Initializes a {@link NameSampleSequenceStream} with given {@code psi} samples
+ * and an {@link AdaptiveFeatureGenerator feature generator}.
+ *
+ * @param psi The data stream of {@link NameSample samples}.
+ * @param featureGen The {@link AdaptiveFeatureGenerator feature generator} to use.
+ */
+ public NameSampleSequenceStream(ObjectStream<NameSample> psi, AdaptiveFeatureGenerator featureGen) {
this(psi, new DefaultNameContextGenerator(featureGen), true);
}
+ /**
+ * Initializes a {@link NameSampleSequenceStream} with given {@code psi} samples
+ * and an {@link AdaptiveFeatureGenerator feature generator}.
+ *
+ * @param psi The data stream of {@link NameSample samples}.
+ * @param featureGen The {@link AdaptiveFeatureGenerator feature generator} to use.
+ * @param useOutcomes Whether to use outcomes or not.
+ */
public NameSampleSequenceStream(ObjectStream<NameSample> psi,
- AdaptiveFeatureGenerator featureGen, boolean useOutcomes)
- throws IOException {
+ AdaptiveFeatureGenerator featureGen, boolean useOutcomes) {
this(psi, new DefaultNameContextGenerator(featureGen), useOutcomes);
}
- public NameSampleSequenceStream(ObjectStream<NameSample> psi, NameContextGenerator pcg)
- throws IOException {
+ /**
+ * Initializes a {@link NameSampleSequenceStream} with given {@code psi} samples
+ * and a {@link NameContextGenerator context generator}.
+ *
+ * @param psi The data stream of {@link NameSample samples}.
+ * @param pcg The {@link NameContextGenerator context generator} to use.
+ */
+ public NameSampleSequenceStream(ObjectStream<NameSample> psi, NameContextGenerator pcg) {
this(psi, pcg, true);
}
- public NameSampleSequenceStream(ObjectStream<NameSample> psi, NameContextGenerator pcg, boolean useOutcomes)
- throws IOException {
+ /**
+ * Initializes a {@link NameSampleSequenceStream} with given {@code psi} samples
+ * and a {@link NameContextGenerator context generator}.
+ *
+ * @param psi The data stream of {@link NameSample samples}.
+ * @param pcg The {@link NameContextGenerator context generator} to use.
+ * @param useOutcomes Whether to use outcomes or not.
+ */
+ public NameSampleSequenceStream(ObjectStream<NameSample> psi, NameContextGenerator pcg,
+ boolean useOutcomes) {
this(psi, pcg, useOutcomes, new BioCodec());
}
+ /**
+ * Initializes a {@link NameSampleSequenceStream} with given {@code psi} samples,
+ * a {@link NameContextGenerator context generator} and a {@link SequenceCodec}.
+ *
+ * @param psi The data stream of {@link NameSample samples}.
+ * @param pcg The {@link NameContextGenerator context generator} to use.
+ * @param useOutcomes Whether to use outcomes or not.
+ * @param seqCodec The {@link SequenceCodec} to use.
+ */
public NameSampleSequenceStream(ObjectStream<NameSample> psi, NameContextGenerator pcg, boolean useOutcomes,
- SequenceCodec<String> seqCodec)
- throws IOException {
+ SequenceCodec<String> seqCodec) {
this.psi = psi;
this.useOutcomes = useOutcomes;
this.pcg = pcg;
this.seqCodec = seqCodec;
}
+ @Override
public Event[] updateContext(Sequence<NameSample> sequence, AbstractModel model) {
TokenNameFinder tagger = new NameFinderME(new TokenNameFinderModel(
"x-unspecified", model, Collections.emptyMap(), null));
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleTypeFilter.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleTypeFilter.java
index 9ae1dc91..84f5cca2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleTypeFilter.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameSampleTypeFilter.java
@@ -30,30 +30,42 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
/**
- * A stream which removes Name Samples which do not have a certain type.
+ * A {@link FilterObjectStream stream} which removes {@link NameSample name samples}
+ * which do not have a certain type.
*/
public class NameSampleTypeFilter extends FilterObjectStream<NameSample, NameSample> {
private final Set<String> types;
+ /**
+ * Initializes a {@link NameSampleTypeFilter}.
+ *
+ * @param types An array with types to use.
+ * @param samples An {@link ObjectStream} of {@link NameSample samples} to filter.
+ */
public NameSampleTypeFilter(String[] types, ObjectStream<NameSample> samples) {
super(samples);
this.types = Collections.unmodifiableSet(new HashSet<>(Arrays.asList(types)));
}
+ /**
+ * Initializes a {@link NameSampleTypeFilter}.
+ *
+ * @param types A {@link Set} with types to use.
+ * @param samples An {@link ObjectStream} of {@link NameSample samples} to filter.
+ */
public NameSampleTypeFilter(Set<String> types, ObjectStream<NameSample> samples) {
super(samples);
this.types = Collections.unmodifiableSet(new HashSet<>(types));
}
+ @Override
public NameSample read() throws IOException {
NameSample sample = samples.read();
-
if (sample != null) {
List<Span> filteredNames = new ArrayList<>();
-
for (Span name : sample.getNames()) {
if (types.contains(name.getType())) {
filteredNames.add(name);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/RegexNameFinder.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/RegexNameFinder.java
index aff3392a..a5e6193a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/RegexNameFinder.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/RegexNameFinder.java
@@ -28,7 +28,7 @@ import java.util.regex.Pattern;
import opennlp.tools.util.Span;
/**
- * Name finder based on a series of regular expressions.
+ * A {@link TokenNameFinder} implementation based on a series of regular expressions.
*/
public final class RegexNameFinder implements TokenNameFinder {
@@ -36,10 +36,25 @@ public final class RegexNameFinder implements TokenNameFinder {
private String sType;
private Map<String, Pattern[]> regexMap;
+ /**
+ * Initializes a {@link RegexNameFinder} instance.
+ *
+ * @param regexMap A {@link Map} where the key is a type, and the value is a
+ * {@link Pattern[]}. Must not be {@code null}.
+ */
public RegexNameFinder(Map<String, Pattern[]> regexMap) {
this.regexMap = Objects.requireNonNull(regexMap, "regexMap must not be null");
}
+ /**
+ * Initializes a {@link RegexNameFinder} instance.
+ *
+ * @param patterns The {@link Pattern[] patterns} to use.
+ * Must not be {@code null} and not be empty.
+ * @param type The type to use.
+ *
+ * @throws IllegalArgumentException Thrown if {@code patterns} were {@code null} or empty.
+ */
public RegexNameFinder(Pattern[] patterns, String type) {
if (patterns == null || patterns.length == 0) {
throw new IllegalArgumentException("patterns must not be null or empty!");
@@ -49,10 +64,16 @@ public final class RegexNameFinder implements TokenNameFinder {
sType = type;
}
+
/**
- * use constructor {@link #RegexNameFinder(Pattern[], String)}
- * for single types, and/or constructor
- * {@link #RegexNameFinder(Map)}
+ * Initializes a {@link RegexNameFinder} instance.
+ *
+ * @param patterns The {@link Pattern[] patterns} to use.
+ * Must not be {@code null} and not be empty.
+ *
+ * @throws IllegalArgumentException Thrown if {@code patterns} were {@code null} or empty.
+ * @deprecated Use constructor {@link #RegexNameFinder(Pattern[], String)}
+ * for single types, and/or constructor {@link #RegexNameFinder(Map)} instead.
*/
@Deprecated
public RegexNameFinder(Pattern[] patterns) {
@@ -67,14 +88,12 @@ public final class RegexNameFinder implements TokenNameFinder {
@Override
public Span[] find(String[] tokens) {
Map<Integer, Integer> sentencePosTokenMap = new HashMap<>();
-
StringBuilder sentenceString = new StringBuilder(tokens.length * 10);
for (int i = 0; i < tokens.length; i++) {
int startIndex = sentenceString.length();
sentencePosTokenMap.put(startIndex, i);
-
sentenceString.append(tokens[i]);
int endIndex = sentenceString.length();
@@ -123,17 +142,14 @@ public final class RegexNameFinder implements TokenNameFinder {
}
}
-
- return annotations.toArray(
- new Span[annotations.size()]);
+ return annotations.toArray(new Span[annotations.size()]);
}
/**
- * NEW. This method removes the need for tokenization, but returns the Span
- * with character indices, rather than word.
+ * Finds {@link Span spans} with character indices, rather than word indices.
*
- * @param text
- * @return
+ * @param text The text to use.
+ * @return A {@link Span[]} representing the annotations.
*/
public Span[] find(String text) {
return getAnnotations(text);
@@ -145,32 +161,23 @@ public final class RegexNameFinder implements TokenNameFinder {
for (Map.Entry<String, Pattern[]> entry : regexMap.entrySet()) {
for (Pattern mPattern : entry.getValue()) {
Matcher matcher = mPattern.matcher(text);
-
while (matcher.find()) {
- Integer tokenStartIndex = matcher.start();
- Integer tokenEndIndex = matcher.end();
- Span annotation = new Span(tokenStartIndex, tokenEndIndex, entry.getKey());
+ Span annotation = new Span(matcher.start(), matcher.end(), entry.getKey());
annotations.add(annotation);
-
}
}
}
} else {
for (Pattern mPattern : mPatterns) {
Matcher matcher = mPattern.matcher(text);
-
while (matcher.find()) {
- Integer tokenStartIndex = matcher.start();
- Integer tokenEndIndex = matcher.end();
- Span annotation = new Span(tokenStartIndex, tokenEndIndex, sType);
+ Span annotation = new Span(matcher.start(), matcher.end(), sType);
annotations.add(annotation);
-
}
}
}
- return annotations.toArray(
- new Span[annotations.size()]);
+ return annotations.toArray(new Span[annotations.size()]);
}
@Override
@@ -178,19 +185,31 @@ public final class RegexNameFinder implements TokenNameFinder {
// nothing to clear
}
- public Pattern[] getmPatterns() {
+ /**
+ * @return The {@link Pattern matching patterns} used.
+ */
+ public Pattern[] getMatchingPatterns() {
return mPatterns;
}
- public void setmPatterns(Pattern[] mPatterns) {
+ /**
+ * @param mPatterns The {@link Pattern matching patterns} to be set.
+ */
+ public void setMatchingPatterns(Pattern[] mPatterns) {
this.mPatterns = mPatterns;
}
- public String getsType() {
+ /**
+ * @return The {@link Span} type used.
+ */
+ public String getSpanType() {
return sType;
}
- public void setsType(String sType) {
+ /**
+ * @param sType The (different) {@link Span} type to set.
+ */
+ public void setSpanType(String sType) {
this.sType = sType;
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/RegexNameFinderFactory.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/RegexNameFinderFactory.java
index ab99a94e..7c2bc79e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/RegexNameFinderFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/RegexNameFinderFactory.java
@@ -24,20 +24,20 @@ import java.util.regex.Pattern;
/**
*
- * Returns a RegexNameFinder based on A selection of
- * defaults or a configuration and a selection of defaults
+ * Returns a {@link RegexNameFinder} based on a selection of
+ * defaults or a configuration and a selection of defaults.
*/
public class RegexNameFinderFactory {
/**
* Allows for use of selected Defaults as well as regexes from external
- * configuration
+ * configuration.
*
- * @param config a map where the key is a type, and the value is a
- * Pattern[]. If the keys clash with default keys, the config
- * map will win
- * @param defaults the OpenNLP default regexes
- * @return {@link RegexNameFinder}
+ * @param config A {@link Map} where the key is a type, and the value is a
+ * {@link Pattern[]}. If a key clashes with one of the default keys,
+ * the config map entry will be taken.
+ * @param defaults One or more of the default {@link DEFAULT_REGEX_NAME_FINDER} enum values.
+ * @return A {@link RegexNameFinder} instance.
*/
public static synchronized RegexNameFinder getDefaultRegexNameFinders(
Map<String, Pattern[]> config, DEFAULT_REGEX_NAME_FINDER... defaults) {
@@ -52,10 +52,10 @@ public class RegexNameFinderFactory {
}
/**
- * Returns a RegexNamefinder that will utilize specified default regexes.
+ * Retrieves a {@link RegexNameFinder} that will utilize specified default regexes.
*
- * @param defaults the OpenNLP default regexes
- * @return {@link RegexNameFinder}
+ * @param defaults One or more of the default {@link DEFAULT_REGEX_NAME_FINDER} enum values.
+ * @return A {@link RegexNameFinder} instance.
*/
public static synchronized RegexNameFinder getDefaultRegexNameFinders(
DEFAULT_REGEX_NAME_FINDER... defaults) {
@@ -79,6 +79,9 @@ public class RegexNameFinderFactory {
String getType();
}
+ /**
+ * Enumeration of typical regular expressions available in OpenNLP.
+ */
public enum DEFAULT_REGEX_NAME_FINDER implements RegexAble {
USA_PHONE_NUM {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
index c9de988e..c2ab7d30 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinder.java
@@ -24,19 +24,21 @@ import opennlp.tools.util.Span;
*/
public interface TokenNameFinder {
- /** Generates name tags for the given sequence, typically a sentence,
- * returning token spans for any identified names.
+ /**
+ * Generates name tags for the given sequence, typically a sentence,
+ * returning {@link Span token spans} for any identified names.
*
- * @param tokens an array of the tokens or words of the sequence, typically a sentence.
- * @return an array of spans for each of the names identified.
+ * @param tokens An array of the tokens or words of the sequence, typically a sentence.
+ * @return An array of {@link Span spans} for each of the names identified.
*/
Span[] find(String[] tokens);
/**
* Forgets all adaptive data which was collected during previous
* calls to one of the find methods.
- *
- * This method is typical called at the end of a document.
+ * <p>
+ * Note:
+ * This method should typically be called at the end of the processing of a document.
*/
void clearAdaptiveData();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java
index 0a2fb4db..8e5bad90 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderCrossValidator.java
@@ -18,7 +18,6 @@
package opennlp.tools.namefind;
import java.io.IOException;
-import java.io.Serializable;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -26,6 +25,7 @@ import java.util.Iterator;
import java.util.List;
import java.util.Map;
+import opennlp.tools.commons.Sample;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.SequenceCodec;
@@ -33,11 +33,16 @@ import opennlp.tools.util.TrainingParameters;
import opennlp.tools.util.eval.CrossValidationPartitioner;
import opennlp.tools.util.eval.FMeasure;
+/**
+ * Cross validator for {@link TokenNameFinder}.
+ */
public class TokenNameFinderCrossValidator {
- private static class DocumentSample implements Serializable {
+ private static class DocumentSample implements Sample {
- private NameSample[] samples;
+ private static final long serialVersionUID = -7032022251020271479L;
+
+ private final NameSample[] samples;
DocumentSample(NameSample[] samples) {
this.samples = samples;
@@ -49,7 +54,7 @@ public class TokenNameFinderCrossValidator {
}
/**
- * Reads Name Samples to group them as a document based on the clear adaptive data flag.
+ * Reads {@link NameSample samples} to group them as a document based on the clear adaptive data flag.
*/
private static class NameToDocumentSampleStream extends FilterObjectStream<NameSample, DocumentSample> {
@@ -59,6 +64,7 @@ public class TokenNameFinderCrossValidator {
super(samples);
}
+ @Override
public DocumentSample read() throws IOException {
List<NameSample> document = new ArrayList<>();
@@ -104,7 +110,7 @@ public class TokenNameFinderCrossValidator {
}
/**
- * Splits DocumentSample into NameSamples.
+ * Splits {@link DocumentSample document samples} into {@link NameSample name samples}.
*/
private static class DocumentToNameSampleStream extends FilterObjectStream<DocumentSample, NameSample> {
@@ -112,8 +118,9 @@ public class TokenNameFinderCrossValidator {
super(samples);
}
- private Iterator<NameSample> documentSamples = Collections.<NameSample>emptyList().iterator();
+ private Iterator<NameSample> documentSamples = Collections.emptyIterator();
+ @Override
public NameSample read() throws IOException {
// Note: Empty document samples should be skipped
@@ -139,72 +146,80 @@ public class TokenNameFinderCrossValidator {
private final String languageCode;
private final TrainingParameters params;
private final String type;
- private byte[] featureGeneratorBytes;
- private Map<String, Object> resources;
- private TokenNameFinderEvaluationMonitor[] listeners;
+ private final byte[] featureGeneratorBytes;
+ private final Map<String, Object> resources;
+ private final TokenNameFinderEvaluationMonitor[] listeners;
- private FMeasure fmeasure = new FMeasure();
+ private final FMeasure fmeasure = new FMeasure();
private TokenNameFinderFactory factory;
-
+
/**
- * Name finder cross validator
+ * Initializes a {@link TokenNameFinderCrossValidator} with the given parameters.
*
- * @param languageCode
- * the language of the training data
- * @param type
- * null or an override type for all types in the training data
- * @param trainParams
- * machine learning train parameters
- * @param featureGeneratorBytes
- * descriptor to configure the feature generation or null
- * @param listeners
- * a list of listeners
- * @param resources
- * the resources for the name finder or null if none
+ * @param languageCode The ISO conform language code.
+ * @param type {@code null} or an override type for all types in the training data.
+ * @param featureGeneratorBytes The {@code byte[]} representing the feature generator descriptor.
+ * @param resources Additional resources in a mapping.
+ * @param codec The {@link SequenceCodec} to use.
+ * @param params The {@link TrainingParameters} for the context of cross validation.
+ * @param listeners the {@link TokenNameFinderEvaluationMonitor evaluation listeners}.
*/
- public TokenNameFinderCrossValidator(String languageCode, String type,
- TrainingParameters trainParams, byte[] featureGeneratorBytes,
- Map<String, Object> resources, SequenceCodec<String> codec,
- TokenNameFinderEvaluationMonitor... listeners) {
+ public TokenNameFinderCrossValidator(String languageCode, String type, TrainingParameters params,
+ byte[] featureGeneratorBytes, Map<String, Object> resources,
+ SequenceCodec<String> codec,
+ TokenNameFinderEvaluationMonitor... listeners) {
this.languageCode = languageCode;
this.type = type;
this.featureGeneratorBytes = featureGeneratorBytes;
this.resources = resources;
- this.params = trainParams;
+ this.params = params;
this.listeners = listeners;
}
- public TokenNameFinderCrossValidator(String languageCode, String type,
- TrainingParameters trainParams, byte[] featureGeneratorBytes,
- Map<String, Object> resources,
- TokenNameFinderEvaluationMonitor... listeners) {
+ /**
+ * Initializes a {@link TokenNameFinderCrossValidator} with the given parameters.
+ *
+ * @param languageCode The ISO conform language code.
+ * @param type {@code null} or an override type for all types in the training data.
+ * @param featureGeneratorBytes The {@code byte[]} representing the feature generator descriptor.
+ * @param resources Additional resources in a mapping.
+ * @param listeners the {@link TokenNameFinderEvaluationMonitor evaluation listeners}.
+ */
+ public TokenNameFinderCrossValidator(String languageCode, String type, TrainingParameters trainParams,
+ byte[] featureGeneratorBytes, Map<String, Object> resources,
+ TokenNameFinderEvaluationMonitor... listeners) {
this(languageCode, type, trainParams, featureGeneratorBytes, resources, new BioCodec(), listeners);
}
+ /**
+ * Initializes a {@link TokenNameFinderCrossValidator} with the given parameters.
+ *
+ * @param languageCode The ISO conform language code.
+ * @param type {@code null} or an override type for all types in the training data.
+ * @param params The {@link TrainingParameters} for the context of cross validation.
+ * @param factory The {@link TokenNameFinderFactory} for creating related objects.
+ * @param listeners the {@link TokenNameFinderEvaluationMonitor evaluation listeners}.
+ */
public TokenNameFinderCrossValidator(String languageCode, String type,
- TrainingParameters trainParams, TokenNameFinderFactory factory,
+ TrainingParameters params, TokenNameFinderFactory factory,
TokenNameFinderEvaluationMonitor... listeners) {
- this.languageCode = languageCode;
- this.type = type;
- this.params = trainParams;
+ this(languageCode, type, params, null, null, new BioCodec(), listeners);
this.factory = factory;
- this.listeners = listeners;
}
/**
* Starts the evaluation.
+ * <p>
+ * Note:
+ * The name samples need to be grouped on a document basis.
*
- * @param samples
- * the data to train and test
- * @param nFolds
- * number of folds
- * @throws IOException
+ * @param samples The {@link ObjectStream} of {@link NameSample samples} to train and test with.
+ * @param nFolds Number of folds. It must be greater than zero.
+ *
+ * @throws IOException Thrown if IO errors occurred.
*/
- public void evaluate(ObjectStream<NameSample> samples, int nFolds)
- throws IOException {
-
- // Note: The name samples need to be grouped on a document basis.
+ public void evaluate(ObjectStream<NameSample> samples, int nFolds) throws IOException {
CrossValidationPartitioner<DocumentSample> partitioner = new CrossValidationPartitioner<>(
new NameToDocumentSampleStream(samples), nFolds);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderEvaluationMonitor.java
index e77eff69..c920e8c0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderEvaluationMonitor.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderEvaluationMonitor.java
@@ -19,6 +19,9 @@ package opennlp.tools.namefind;
import opennlp.tools.util.eval.EvaluationMonitor;
+/**
+ * A marker interface for evaluating {@link TokenNameFinder name finders}.
+ */
public interface TokenNameFinderEvaluationMonitor extends EvaluationMonitor<NameSample> {
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderEvaluator.java
index a84ebb81..f52be54c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderEvaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderEvaluator.java
@@ -25,7 +25,7 @@ import opennlp.tools.util.eval.FMeasure;
/**
* The {@link TokenNameFinderEvaluator} measures the performance
* of the given {@link TokenNameFinder} with the provided
- * reference {@link NameSample}s.
+ * reference {@link NameSample samples}.
*
* @see Evaluator
* @see TokenNameFinder
@@ -33,20 +33,18 @@ import opennlp.tools.util.eval.FMeasure;
*/
public class TokenNameFinderEvaluator extends Evaluator<NameSample> {
- private FMeasure fmeasure = new FMeasure();
+ private final FMeasure fmeasure = new FMeasure();
/**
- * The {@link TokenNameFinder} used to create the predicted
- * {@link NameSample} objects.
+ * The {@link TokenNameFinder} used to create the predicted {@link NameSample} objects.
*/
- private TokenNameFinder nameFinder;
+ private final TokenNameFinder nameFinder;
/**
- * Initializes the current instance with the given
- * {@link TokenNameFinder}.
+ * Initializes a {@link TokenNameFinderEvaluator} for a given {@link TokenNameFinder}.
*
- * @param nameFinder the {@link TokenNameFinder} to evaluate.
- * @param listeners evaluation sample listeners
+ * @param nameFinder The {@link TokenNameFinder} to evaluate.
+ * @param listeners The {@link TokenNameFinderEvaluationMonitor evaluation listeners}.
*/
public TokenNameFinderEvaluator(TokenNameFinder nameFinder,
TokenNameFinderEvaluationMonitor ... listeners) {
@@ -56,15 +54,15 @@ public class TokenNameFinderEvaluator extends Evaluator<NameSample> {
/**
* Evaluates the given reference {@link NameSample} object.
- *
+ * <p>
* This is done by finding the names with the
* {@link TokenNameFinder} in the sentence from the reference
* {@link NameSample}. The found names are then used to
* calculate and update the scores.
*
- * @param reference the reference {@link NameSample}.
+ * @param reference The reference {@link NameSample}.
*
- * @return the predicted {@link NameSample}.
+ * @return The predicted {@link NameSample}.
*/
@Override
protected NameSample processSample(NameSample reference) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java
index f570be30..4a87ed14 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderFactory.java
@@ -40,10 +40,10 @@ import opennlp.tools.util.featuregen.TokenClassFeatureGenerator;
import opennlp.tools.util.featuregen.TokenFeatureGenerator;
import opennlp.tools.util.featuregen.WindowFeatureGenerator;
-// Idea of this factory is that most resources/impls used by the name finder
-// can be modified through this class!
-// That only works if that's the central class used for training/runtime
-
+/**
+ * The factory that provides {@link TokenNameFinder} default implementations and
+ * resources. This only works if this factory is the central class used for training/runtime.
+ */
public class TokenNameFinderFactory extends BaseToolFactory {
private byte[] featureGeneratorBytes;
@@ -51,25 +51,42 @@ public class TokenNameFinderFactory extends BaseToolFactory {
private SequenceCodec<String> seqCodec;
/**
- * Creates a {@link TokenNameFinderFactory} that provides the default implementation
- * of the resources.
+ * Initializes a {@link TokenNameFinderFactory} that provides the default implementation
+ * of the resources. {@link BioCodec} will be used as default {@link SequenceCodec}.
*/
public TokenNameFinderFactory() {
this.seqCodec = new BioCodec();
}
+ /**
+ * Initializes a {@link TokenNameFinderFactory} instance via given parameters.
+ *
+ * @param featureGeneratorBytes The {@code byte[]} representing the feature generator descriptor.
+ * @param resources Additional resources in a mapping.
+ * @param seqCodec The {@link SequenceCodec} to use.
+ */
public TokenNameFinderFactory(byte[] featureGeneratorBytes, final Map<String, Object> resources,
- SequenceCodec<String> seqCodec) {
+ SequenceCodec<String> seqCodec) {
init(featureGeneratorBytes, resources, seqCodec);
}
+ /**
+ * Initializes via given parameters.
+ *
+ * @param featureGeneratorBytes The {@code byte[]} representing the feature generator descriptor.
+ * @param resources Additional resources in a mapping.
+ * @param seqCodec The {@link SequenceCodec} to use.
+ */
void init(byte[] featureGeneratorBytes, final Map<String, Object> resources,
- SequenceCodec<String> seqCodec) {
+ SequenceCodec<String> seqCodec) {
this.featureGeneratorBytes = featureGeneratorBytes;
this.resources = resources;
this.seqCodec = seqCodec;
}
+ /*
+ * Loads the default feature generator bytes via classpath resources.
+ */
private static byte[] loadDefaultFeatureGeneratorBytes() {
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
@@ -77,7 +94,7 @@ public class TokenNameFinderFactory extends BaseToolFactory {
"/opennlp/tools/namefind/ner-default-features.xml")) {
if (in == null) {
- throw new IllegalStateException("Classpath must contain ner-default-features.xml file!");
+ throw new IllegalStateException("Classpath must contain 'ner-default-features.xml' file!");
}
byte[] buf = new byte[1024];
@@ -87,24 +104,49 @@ public class TokenNameFinderFactory extends BaseToolFactory {
}
}
catch (IOException e) {
- throw new IllegalStateException("Failed reading from ner-default-features.xml file on classpath!");
+ throw new IllegalStateException("Failed reading from 'ner-default-features.xml' file on classpath!");
}
return bytes.toByteArray();
}
+ /**
+ * @return Retrieves the {@link SequenceCodec} in use.
+ */
protected SequenceCodec<String> getSequenceCodec() {
return seqCodec;
}
+ /**
+ * @return Retrieves the additional {@code resources} in use.
+ */
protected Map<String, Object> getResources() {
return resources;
}
+ /**
+ * @return Retrieves {@code byte[]} in use representing the feature generator descriptor.
+ */
protected byte[] getFeatureGenerator() {
return featureGeneratorBytes;
}
+
+ /**
+ * Initializes a {@link TokenNameFinderFactory} instance via given parameters.
+ *
+ * @param subclassName The class name used for instantiation. If {@code null}, an
+ * instance of {@link TokenNameFinderFactory} will be returned
+ * per default. Otherwise, the {@link ExtensionLoader} mechanism
+ * is applied to load the requested {@code subclassName}.
+ * @param featureGeneratorBytes The {@code byte[]} representing the feature generator descriptor.
+ * @param resources Additional resources in a mapping.
+ * @param seqCodec The {@link SequenceCodec} to use.
+ *
+ * @return A valid {@link TokenNameFinderFactory} instance.
+ * @throws InvalidFormatException Thrown if the {@link ExtensionLoader} mechanism failed to
+ * create the factory associated with {@code subclassName}.
+ */
public static TokenNameFinderFactory create(String subclassName, byte[] featureGeneratorBytes,
final Map<String, Object> resources, SequenceCodec<String> seqCodec)
throws InvalidFormatException {
@@ -118,9 +160,7 @@ public class TokenNameFinderFactory extends BaseToolFactory {
TokenNameFinderFactory.class, subclassName);
} catch (Exception e) {
String msg = "Could not instantiate the " + subclassName
- + ". The initialization throw an exception.";
- System.err.println(msg);
- e.printStackTrace();
+ + ". The initialization threw an exception.";
throw new InvalidFormatException(msg, e);
}
}
@@ -133,18 +173,39 @@ public class TokenNameFinderFactory extends BaseToolFactory {
// no additional artifacts
}
+ /**
+ * @return Initializes and returns a {@link SequenceCodec} via its class name configured in a manifest.
+ * If that initialization fails (e.g., if no matching class could be loaded for the configured
+ * class name at runtime), the currently loaded (default) {@link SequenceCodec} is returned.
+ *
+ * @see BioCodec
+ * @see BilouCodec
+ */
public SequenceCodec<String> createSequenceCodec() {
if (artifactProvider != null) {
- String sequeceCodecImplName = artifactProvider.getManifestProperty(
+ String sequenceCodecImplName = artifactProvider.getManifestProperty(
TokenNameFinderModel.SEQUENCE_CODEC_CLASS_NAME_PARAMETER);
- return instantiateSequenceCodec(sequeceCodecImplName);
+ try {
+ return instantiateSequenceCodec(sequenceCodecImplName);
+ } catch (InvalidFormatException e) {
+ // Uses the (already) available SequenceCodec instance. Default: BioCodec, see no-arg constructor
+ return seqCodec;
+ }
}
else {
return seqCodec;
}
}
+ /**
+ * Creates and configures a new {@link NameContextGenerator} in a default combination.
+ *
+ * @return A {@link NameContextGenerator} instance.
+ *
+ * @see DefaultNameContextGenerator
+ * @see AdaptiveFeatureGenerator
+ */
public NameContextGenerator createContextGenerator() {
AdaptiveFeatureGenerator featureGenerator = createFeatureGenerators();
@@ -164,12 +225,16 @@ public class TokenNameFinderFactory extends BaseToolFactory {
/**
* Creates the {@link AdaptiveFeatureGenerator}. Usually this
- * is a set of generators contained in the {@link AggregatedFeatureGenerator}.
- *
+ * is a set of generators contained in {@link AggregatedFeatureGenerator}.
+ * <p>
* Note:
* The generators are created on every call to this method.
*
- * @return the feature generator or null if there is no descriptor in the model
+ * @return The {@link AdaptiveFeatureGenerator} or {@code null} if there
+ * is no descriptor in the model.
+ *
+ * @throws FeatureGeneratorCreationError Thrown if configuration errors occurred.
+ * @throws IllegalStateException Thrown if inconsistencies occurred during creation.
*/
public AdaptiveFeatureGenerator createFeatureGenerators() {
@@ -214,15 +279,35 @@ public class TokenNameFinderFactory extends BaseToolFactory {
return generator;
}
- public static SequenceCodec<String> instantiateSequenceCodec(
- String sequenceCodecImplName) {
+ /**
+ * Initializes a {@link SequenceCodec} instance via given parameters.
+ *
+ * @param sequenceCodecImplName The class name used for instantiation. If {@code null},
+ * an instance of {@link BioCodec} will be returned
+ * per default. Otherwise, the {@link ExtensionLoader}
+ * mechanism is applied to load the requested {@code sequenceCodecImplName}.
+ *
+ * @return A valid {@link SequenceCodec} instance.
+ * @throws InvalidFormatException Thrown if the {@link ExtensionLoader} mechanism failed to
+ * create the codec associated with {@code sequenceCodecImplName}.
+ * @see SequenceCodec
+ * @see BioCodec
+ * @see BilouCodec
+ */
+ public static SequenceCodec<String> instantiateSequenceCodec(String sequenceCodecImplName)
+ throws InvalidFormatException {
if (sequenceCodecImplName != null) {
- return ExtensionLoader.instantiateExtension(
- SequenceCodec.class, sequenceCodecImplName);
+ try {
+ return ExtensionLoader.instantiateExtension(SequenceCodec.class, sequenceCodecImplName);
+ } catch (Exception e) {
+ String msg = "Could not instantiate the " + sequenceCodecImplName
+ + ". The initialization threw an exception.";
+ throw new InvalidFormatException(msg, e);
+ }
}
else {
- // If nothing is specified return old default!
+ // If nothing is specified return default codec!
return new BioCodec();
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
index 9ad79420..2a89a4c3 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
@@ -34,13 +34,13 @@ import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.SequenceCodec;
import opennlp.tools.util.featuregen.BrownCluster;
import opennlp.tools.util.featuregen.WordClusterDictionary;
+import opennlp.tools.util.featuregen.WordClusterFeatureGenerator;
import opennlp.tools.util.model.ArtifactSerializer;
import opennlp.tools.util.model.BaseModel;
import opennlp.tools.util.model.ByteArraySerializer;
/**
- * The {@link TokenNameFinderModel} is the model used
- * by a learnable {@link TokenNameFinder}.
+ * The {@link TokenNameFinderModel} is the model used by a learnable {@link TokenNameFinder}.
*
* @see NameFinderME
*/
@@ -60,6 +60,20 @@ public class TokenNameFinderModel extends BaseModel {
static final String SEQUENCE_CODEC_CLASS_NAME_PARAMETER = "sequenceCodecImplName";
+ /**
+ * Initializes a {@link TokenNameFinderModel} instance via given parameters.
+ *
+ * @param languageCode The ISO conform language code.
+ * @param nameFinderModel A valid {@link SequenceClassificationModel}.
+ * @param generatorDescriptor The {@code byte[]} representing the feature generator descriptor.
+ * @param resources Additional resources in a mapping.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ * @param seqCodec The {@link SequenceCodec} to use.
+ * @param factory The {@link TokenNameFinderFactory} for creating related objects.
+ *
+ * @throws IllegalArgumentException Thrown if the {@code nameFinderModel} is incompatible
+ * with {@code seqCodec}.
+ */
public TokenNameFinderModel(String languageCode, SequenceClassificationModel<String> nameFinderModel,
byte[] generatorDescriptor, Map<String, Object> resources, Map<String, String> manifestInfoEntries,
SequenceCodec<String> seqCodec, TokenNameFinderFactory factory) {
@@ -72,6 +86,21 @@ public class TokenNameFinderModel extends BaseModel {
}
}
+ /**
+ * Initializes a {@link TokenNameFinderModel} instance via given parameters.
+ *
+ * @param languageCode The ISO conform language code.
+ * @param nameFinderModel A valid {@link MaxentModel}.
+ * @param beamSize The beam size. Must be greater than {@code 0}.
+ * @param generatorDescriptor The {@code byte[]} representing the feature generator descriptor.
+ * @param resources Additional resources in a mapping.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ * @param seqCodec The {@link SequenceCodec} to use.
+ * @param factory The {@link TokenNameFinderFactory} for creating related objects.
+ *
+ * @throws IllegalArgumentException Thrown if the {@code nameFinderModel} is incompatible
+ * with {@code seqCodec}.
+ */
public TokenNameFinderModel(String languageCode, MaxentModel nameFinderModel, int beamSize,
byte[] generatorDescriptor, Map<String, Object> resources, Map<String, String> manifestInfoEntries,
SequenceCodec<String> seqCodec, TokenNameFinderFactory factory) {
@@ -88,37 +117,109 @@ public class TokenNameFinderModel extends BaseModel {
}
}
- // TODO: Extend this one with beam size!
- public TokenNameFinderModel(String languageCode, MaxentModel nameFinderModel,
- byte[] generatorDescriptor, Map<String, Object> resources, Map<String, String> manifestInfoEntries) {
+ /**
+ * Initializes a {@link TokenNameFinderModel} instance via given parameters.
+ *
+ * @param languageCode The ISO conform language code.
+ * @param nameFinderModel A valid {@link MaxentModel}.
+ * @param generatorDescriptor The {@code byte[]} representing the feature generator descriptor.
+ * @param resources Additional resources in a mapping.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ *
+ * @throws IllegalArgumentException Thrown if the {@code nameFinderModel} is incompatible
+ * with {@code seqCodec}.
+ */
+ public TokenNameFinderModel(String languageCode, MaxentModel nameFinderModel, byte[] generatorDescriptor,
+ Map<String, Object> resources, Map<String, String> manifestInfoEntries) {
this(languageCode, nameFinderModel, NameFinderME.DEFAULT_BEAM_SIZE,
- generatorDescriptor, resources, manifestInfoEntries, new BioCodec(), new TokenNameFinderFactory());
+ generatorDescriptor, resources, manifestInfoEntries,
+ new BioCodec(), new TokenNameFinderFactory());
}
+ /**
+ * Initializes a {@link TokenNameFinderModel} instance via given parameters.
+ *
+ * @param languageCode The ISO conform language code.
+ * @param nameFinderModel A valid {@link MaxentModel}.
+ * @param beamSize The beam size. Must be greater than {@code 0}.
+ * @param generatorDescriptor The {@code byte[]} representing the feature generator descriptor.
+ * @param resources Additional resources in a mapping.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ *
+ * @throws IllegalArgumentException Thrown if the {@code nameFinderModel} is incompatible
+ * with {@code seqCodec}.
+ */
+ public TokenNameFinderModel(String languageCode, MaxentModel nameFinderModel, int beamSize,
+ byte[] generatorDescriptor, Map<String, Object> resources,
+ Map<String, String> manifestInfoEntries) {
+ this(languageCode, nameFinderModel, beamSize,
+ generatorDescriptor, resources, manifestInfoEntries,
+ new BioCodec(), new TokenNameFinderFactory());
+ }
+
+ /**
+ * Initializes a {@link TokenNameFinderModel} instance via given parameters.
+ *
+ * @param languageCode The ISO conform language code.
+ * @param nameFinderModel A valid {@link MaxentModel}.
+ * @param resources Additional resources in a mapping.
+ * @param manifestInfoEntries Additional information kept in the manifest.
+ *
+ * @throws IllegalArgumentException Thrown if the {@code nameFinderModel} is incompatible
+ * with {@code seqCodec}.
+ */
public TokenNameFinderModel(String languageCode, MaxentModel nameFinderModel,
Map<String, Object> resources, Map<String, String> manifestInfoEntries) {
this(languageCode, nameFinderModel, null, resources, manifestInfoEntries);
}
+ /**
+ * Initializes a {@link TokenNameFinderModel} instance via a valid {@link InputStream}.
+ *
+ * @param in The {@link InputStream} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public TokenNameFinderModel(InputStream in) throws IOException {
super(COMPONENT_NAME, in);
}
+ /**
+ * Initializes a {@link TokenNameFinderModel} instance via a valid {@link File}.
+ *
+ * @param modelFile The {@link File} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public TokenNameFinderModel(File modelFile) throws IOException {
super(COMPONENT_NAME, modelFile);
}
+ /**
+ * Initializes a {@link TokenNameFinderModel} instance via a valid {@link Path}.
+ *
+ * @param modelPath The {@link Path} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public TokenNameFinderModel(Path modelPath) throws IOException {
- this(modelPath.toFile());
+ super(COMPONENT_NAME, modelPath);
}
+ /**
+ * Initializes a {@link TokenNameFinderModel} instance via a valid {@link URL}.
+ *
+ * @param modelURL The {@link URL} used for loading the model.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public TokenNameFinderModel(URL modelURL) throws IOException {
super(COMPONENT_NAME, modelURL);
}
- private void init(Object nameFinderModel,
- byte[] generatorDescriptor, Map<String, Object> resources, Map<String, String> manifestInfoEntries,
- SequenceCodec<String> seqCodec) {
+ private void init(Object nameFinderModel, byte[] generatorDescriptor,
+ Map<String, Object> resources, Map<String, String> manifestInfoEntries,
+ SequenceCodec<String> seqCodec) {
Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
manifest.put(SEQUENCE_CODEC_CLASS_NAME_PARAMETER, seqCodec.getClass().getName());
@@ -143,6 +244,10 @@ public class TokenNameFinderModel extends BaseModel {
checkArtifactMap();
}
+ /**
+ * @return Retrieves a valid {@link SequenceClassificationModel} or {@code null}
+ * if no matching one could be found.
+ */
public SequenceClassificationModel<String> getNameFinderSequenceModel() {
Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
@@ -170,10 +275,16 @@ public class TokenNameFinderModel extends BaseModel {
return TokenNameFinderFactory.class;
}
+ /**
+ * @return Retrieves the {@link SequenceCodec} in use.
+ */
public SequenceCodec<String> getSequenceCodec() {
return this.getFactory().getSequenceCodec();
}
+ /**
+ * @return Retrieves the {@link TokenNameFinderFactory} in use.
+ */
public TokenNameFinderFactory getFactory() {
return (TokenNameFinderFactory) this.toolFactory;
}
@@ -186,13 +297,15 @@ public class TokenNameFinderModel extends BaseModel {
}
/**
- * Create the artifact serializers. Currently for serializers related to
+ * Create the {@link ArtifactSerializer serializers}. Currently, for serializers related to
* features that require external resources, such as {@code W2VClassesDictionary}
* objects, the convention is to add its element tag name as key of the serializer map.
- * For example, the element tag name for the {@code WordClusterFeatureGenerator} which
+ * <p>
+ * For example, the element tag name for the {@link WordClusterFeatureGenerator} which
* uses {@code W2VClassesDictionary} objects serialized by the {@code W2VClassesDictionarySerializer}
* is 'wordcluster', which is the key used to add the serializer to the map.
- * @return the map containing the added serializers
+ *
+ * @return A {@link Map} containing the added {@link ArtifactSerializer serializers}.
*/
public static Map<String, ArtifactSerializer<?>> createArtifactSerializers() {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java
index f02a47e8..dd79cbb1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/CachedFeatureGenerator.java
@@ -24,7 +24,7 @@ import java.util.List;
import opennlp.tools.util.Cache;
/**
- * Caches features of the aggregated {@link AdaptiveFeatureGenerator}s.
+ * Caches features of the aggregated {@link AdaptiveFeatureGenerator generators}.
*/
public class CachedFeatureGenerator implements AdaptiveFeatureGenerator {
@@ -32,7 +32,7 @@ public class CachedFeatureGenerator implements AdaptiveFeatureGenerator {
private String[] prevTokens;
- private Cache<Integer, List<String>> contextsCache;
+ private final Cache<Integer, List<String>> contextsCache;
private long numberOfCacheHits;
private long numberOfCacheMisses;
@@ -48,6 +48,7 @@ public class CachedFeatureGenerator implements AdaptiveFeatureGenerator {
contextsCache = new Cache<>(100);
}
+ @Override
public void createFeatures(List<String> features, String[] tokens, int index,
String[] previousOutcomes) {
@@ -86,18 +87,14 @@ public class CachedFeatureGenerator implements AdaptiveFeatureGenerator {
}
/**
- * Retrieves the number of times a cache hit occurred.
- *
- * @return number of cache hits
+ * @return Retrieves the number of times a cache hit occurred.
*/
public long getNumberOfCacheHits() {
return numberOfCacheHits;
}
/**
- * Retrieves the number of times a cache miss occurred.
- *
- * @return number of cache misses
+ * @return Retrieves the number of times a cache miss occurred.
*/
public long getNumberOfCacheMisses() {
return numberOfCacheMisses;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
index e8f7b415..782d01e8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
@@ -29,6 +29,7 @@ import java.io.ObjectOutputStream;
import java.io.OutputStream;
import java.io.Serializable;
import java.net.URL;
+import java.nio.file.Files;
import java.nio.file.Path;
import java.util.HashMap;
import java.util.Map;
@@ -184,6 +185,14 @@ public abstract class BaseModel implements ArtifactProvider, Serializable {
}
}
+ protected BaseModel(String componentName, Path modelPath) throws IOException {
+ this(componentName, true);
+
+ try (InputStream in = Files.newInputStream(modelPath)) {
+ loadModel(in);
+ }
+ }
+
protected BaseModel(String componentName, URL modelURL) throws IOException {
this(componentName, true);