You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jz...@apache.org on 2022/12/20 16:25:10 UTC
[opennlp] branch master updated: OPENNLP-1416 Enhance JavaDoc in opennlp.tools.formats.ad package (#461)
This is an automated email from the ASF dual-hosted git repository.
jzemerick pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new f250404a OPENNLP-1416 Enhance JavaDoc in opennlp.tools.formats.ad package (#461)
f250404a is described below
commit f250404afce780a12947b513f2a2ac423aa5c7b3
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Tue Dec 20 17:25:05 2022 +0100
OPENNLP-1416 Enhance JavaDoc in opennlp.tools.formats.ad package (#461)
- adds missing JavaDoc
- improves existing documentation for clarity
- removes superfluous text
- adds 'final' modifier where useful and applicable
- adds 'Override' annotation where useful and applicable
- simplifies several constructors, removing duplicate code
- fixes naming of constants that did not comply with the Java naming conventions (JNC)
- fixes several typos
---
.../tools/formats/ad/ADChunkSampleStream.java | 44 +++---
.../formats/ad/ADChunkSampleStreamFactory.java | 7 +-
.../tools/formats/ad/ADNameSampleStream.java | 162 +++++++++------------
.../formats/ad/ADNameSampleStreamFactory.java | 6 +-
.../tools/formats/ad/ADPOSSampleStream.java | 56 +++----
.../tools/formats/ad/ADPOSSampleStreamFactory.java | 9 +-
.../tools/formats/ad/ADSentenceSampleStream.java | 57 +++-----
.../formats/ad/ADSentenceSampleStreamFactory.java | 6 +-
.../opennlp/tools/formats/ad/ADSentenceStream.java | 114 ++++++---------
.../formats/ad/ADTokenSampleStreamFactory.java | 4 +-
.../formats/ad/PortugueseContractionUtility.java | 17 +--
11 files changed, 205 insertions(+), 277 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java
index c3f78121..6b7471f2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java
@@ -18,7 +18,6 @@
package opennlp.tools.formats.ad;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
@@ -27,7 +26,6 @@ import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Leaf;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Node;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.TreeElement;
-import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
@@ -41,18 +39,19 @@ import opennlp.tools.util.StringUtil;
* Approach to Portuguese Clause Identification', (Eraldo Fernandes, Cicero
* Santos and Ruy Milidiú).<br>
* <p>
- * Data can be found on this web site:<br>
- * http://www.linguateca.pt/floresta/corpus.html
+ * Data can be found on
+ * <a href="http://www.linguateca.pt/floresta/corpus.html">this web site</a>.
+ *
* <p>
* Information about the format:<br>
* Susana Afonso.
- * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"
- * .<br>
+ * <a href="http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf">
+ * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"</a>.
+ * <br>
* 12 de Fevereiro de 2006.
- * http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf
* <p>
- * Detailed info about the NER tagset:
- * http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html#semtags_names
+ * Detailed info about the
+ * <a href="http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html#semtags_names">NER tagset</a>.
* <p>
* <b>Note:</b> Do not use this class, internal use only!
*/
@@ -68,28 +67,27 @@ public class ADChunkSampleStream implements ObjectStream<ChunkSample> {
public static final String OTHER = "O";
/**
- * Creates a new {@link NameSample} stream from a line stream, i.e.
- * {@link ObjectStream}<{@link String}>, that could be a
- * {@link PlainTextByLineStream} object.
+ * Instantiates a {@link ADChunkSampleStream} stream from {@link ObjectStream<String>},
+ * that could be a {@link PlainTextByLineStream} object.
*
- * @param lineStream
- * a stream of lines as {@link String}
+ * @param lineStream An {@link ObjectStream<String>} as input.
*/
public ADChunkSampleStream(ObjectStream<String> lineStream) {
this.adSentenceStream = new ADSentenceStream(lineStream);
}
+ /**
+ * Instantiates a {@link ADChunkSampleStream} stream from an {@link InputStreamFactory}.
+ *
+ * @param in The {@link InputStreamFactory} for the corpus.
+ * @param charsetName The {@link java.nio.charset.Charset charset} to use
+ * for reading of the corpus.
+ */
public ADChunkSampleStream(InputStreamFactory in, String charsetName) throws IOException {
-
- try {
- this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
- in, charsetName));
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
+ this(new PlainTextByLineStream(in, charsetName));
}
+ @Override
public ChunkSample read() throws IOException {
Sentence paragraph;
@@ -270,10 +268,12 @@ public class ADChunkSampleStream implements ObjectStream<ChunkSample> {
this.end = aEnd;
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
adSentenceStream.reset();
}
+ @Override
public void close() throws IOException {
adSentenceStream.close();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java
index 42a6ee28..fcd8a1ca 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java
@@ -36,7 +36,8 @@ import opennlp.tools.util.PlainTextByLineStream;
* A Factory to create a Arvores Deitadas ChunkStream from the command line
* utility.
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
public class ADChunkSampleStreamFactory<P> extends LanguageSampleStreamFactory<ChunkSample, P> {
@@ -71,14 +72,13 @@ public class ADChunkSampleStreamFactory<P> extends LanguageSampleStreamFactory<C
super(params);
}
+ @Override
public ObjectStream<ChunkSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
-
language = params.getLang();
InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
-
ObjectStream<String> lineStream = null;
try {
lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
@@ -87,7 +87,6 @@ public class ADChunkSampleStreamFactory<P> extends LanguageSampleStreamFactory<C
}
ADChunkSampleStream sampleStream = new ADChunkSampleStream(lineStream);
-
if (params.getStart() != null && params.getStart() > -1) {
sampleStream.setStart(params.getStart());
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
index 5b4b926c..a0b77585 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStream.java
@@ -18,8 +18,6 @@
package opennlp.tools.formats.ad;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -47,35 +45,36 @@ import opennlp.tools.util.Span;
* The data contains four named entity types: Person, Organization, Group,
* Place, Event, ArtProd, Abstract, Thing, Time and Numeric.<br>
* <p>
- * Data can be found on this web site:<br>
- * http://www.linguateca.pt/floresta/corpus.html
+ * Data can be found on
+ * <a href="http://www.linguateca.pt/floresta/corpus.html">this web site</a>.
+ *
* <p>
* Information about the format:<br>
* Susana Afonso.
- * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"
- * .<br>
+ * <a href="http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf">
+ * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"</a>.
+ * <br>
* 12 de Fevereiro de 2006.
- * http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf
* <p>
- * Detailed info about the NER tagset:
- * http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html#semtags_names
+ * Detailed info about the
+ * <a href="http://beta.visl.sdu.dk/visl/pt/info/portsymbol.html#semtags_names">NER tagset</a>.
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
public class ADNameSampleStream implements ObjectStream<NameSample> {
- /**
+ /*
* Pattern of a NER tag in Arvores Deitadas
*/
- private static final Pattern tagPattern = Pattern.compile("<(NER:)?(.*?)>");
-
- private static final Pattern whitespacePattern = Pattern.compile("\\s+");
- private static final Pattern underlinePattern = Pattern.compile("[_]+");
- private static final Pattern hyphenPattern =
+ private static final Pattern TAG_PATTERN = Pattern.compile("<(NER:)?(.*?)>");
+ private static final Pattern WHITESPACE_PATTERN = Pattern.compile("\\s+");
+ private static final Pattern UNDERLINE_PATTERN = Pattern.compile("[_]+");
+ private static final Pattern HYPHEN_PATTERN =
Pattern.compile("((\\p{L}+)-$)|(^-(\\p{L}+)(.*))|((\\p{L}+)-(\\p{L}+)(.*))");
- private static final Pattern alphanumericPattern = Pattern.compile("^[\\p{L}\\p{Nd}]+$");
+ private static final Pattern ALPHANUMERIC_PATTERN = Pattern.compile("^[\\p{L}\\p{Nd}]+$");
- /**
+ /*
* Map to the Arvores Deitadas types to our types. It is read-only.
*/
private static final Map<String, String> HAREM;
@@ -154,7 +153,7 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;
- /**
+ /*
* To keep the last left contraction part
*/
private String leftContractionPart = null;
@@ -162,15 +161,12 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
private final boolean splitHyphenatedTokens;
/**
- * Creates a new {@link NameSample} stream from a line stream, i.e.
- * {@link ObjectStream}<{@link String}>, that could be a
- * {@link PlainTextByLineStream} object.
+ * Initializes a new {@link ADNameSampleStream} stream from a {@link ObjectStream<String>},
+ * that could be a {@link PlainTextByLineStream} object.
*
- * @param lineStream
- * a stream of lines as {@link String}
- * @param splitHyphenatedTokens
- * if true hyphenated tokens will be separated: "carros-monstro" >
- * "carros" "-" "monstro"
+ * @param lineStream An {@link ObjectStream<String>} as input.
+ * @param splitHyphenatedTokens If {@code true} hyphenated tokens will be separated:
+ * "carros-monstro" > "carros" "-" "monstro".
*/
public ADNameSampleStream(ObjectStream<String> lineStream, boolean splitHyphenatedTokens) {
this.adSentenceStream = new ADSentenceStream(lineStream);
@@ -178,37 +174,28 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
}
/**
- * Creates a new {@link NameSample} stream from a {@link InputStream}
+ * Initializes a new {@link ADNameSampleStream} from an {@link InputStreamFactory}
*
- * @param in
- * the Corpus {@link InputStream}
- * @param charsetName
- * the charset of the Arvores Deitadas Corpus
- * @param splitHyphenatedTokens
- * if true hyphenated tokens will be separated: "carros-monstro" >
- * "carros" "-" "monstro"
+ * @param in The Corpus {@link InputStreamFactory}.
+ * @param charsetName The {@link java.nio.charset.Charset charset} to use
+ * for reading of the corpus.
+ * @param splitHyphenatedTokens If {@code true} hyphenated tokens will be separated:
+ * "carros-monstro" > "carros" "-" "monstro".
*/
@Deprecated
public ADNameSampleStream(InputStreamFactory in, String charsetName,
boolean splitHyphenatedTokens) throws IOException {
-
- try {
- this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
- in, charsetName));
- this.splitHyphenatedTokens = splitHyphenatedTokens;
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
+ this(new PlainTextByLineStream(in, charsetName), splitHyphenatedTokens);
}
private int textID = -1;
+ @Override
public NameSample read() throws IOException {
Sentence paragraph;
// we should look for text here.
- while ((paragraph = this.adSentenceStream.read()) != null) {
+ if ((paragraph = this.adSentenceStream.read()) != null) {
int currentTextID = getTextID(paragraph);
boolean clearData = false;
@@ -229,14 +216,11 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
}
/**
- * Recursive method to process a node in Arvores Deitadas format.
+ * Recursive method to process a {@link Node} in Arvores Deitadas format.
*
- * @param node
- * the node to be processed
- * @param sentence
- * the sentence tokens we got so far
- * @param names
- * the names we got so far
+ * @param node The {@link Node} to be processed.
+ * @param sentence The {@link List<String> sentence tokens} processed so far.
+ * @param names The {@link List<Span> names} processed so far.
*/
private void process(Node node, List<String> sentence, List<Span> names) {
if (node != null) {
@@ -251,17 +235,13 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
}
/**
- * Process a Leaf of Arvores Detaitadas format
+ * Processes a {@link Leaf} of Arvores Deitadas format
*
- * @param leaf
- * the leaf to be processed
- * @param sentence
- * the sentence tokens we got so far
- * @param names
- * the names we got so far
+ * @param leaf The {@link Leaf} to be processed
+ * @param sentence The {@link List<String> sentence tokens} processed so far.
+ * @param names The {@link List<Span> names} processed so far.
*/
- private void processLeaf(Leaf leaf, List<String> sentence,
- List<Span> names) {
+ private void processLeaf(Leaf leaf, List<String> sentence, List<Span> names) {
boolean alreadyAdded = false;
@@ -272,7 +252,7 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
String c = PortugueseContractionUtility.toContraction(
leftContractionPart, right);
if (c != null) {
- String[] parts = whitespacePattern.split(c);
+ String[] parts = WHITESPACE_PATTERN.split(c);
sentence.addAll(Arrays.asList(parts));
alreadyAdded = true;
} else {
@@ -291,7 +271,7 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
if (leafTag != null) {
if (leafTag.contains("<sam->") && !alreadyAdded) {
- String[] lexemes = underlinePattern.split(leaf.getLexeme());
+ String[] lexemes = UNDERLINE_PATTERN.split(leaf.getLexeme());
if (lexemes.length > 1) {
sentence.addAll(Arrays.asList(lexemes).subList(0, lexemes.length - 1));
}
@@ -336,9 +316,9 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
private List<String> processLexeme(String lexemeStr) {
List<String> out = new ArrayList<>();
- String[] parts = underlinePattern.split(lexemeStr);
+ String[] parts = UNDERLINE_PATTERN.split(lexemeStr);
for (String tok : parts) {
- if (tok.length() > 1 && !alphanumericPattern.matcher(tok).matches()) {
+ if (tok.length() > 1 && !ALPHANUMERIC_PATTERN.matcher(tok).matches()) {
out.addAll(processTok(tok));
} else {
out.add(tok);
@@ -365,7 +345,7 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
// lets split all hyphens
if (this.splitHyphenatedTokens && tok.contains("-") && tok.length() > 1) {
- Matcher matcher = hyphenPattern.matcher(tok);
+ Matcher matcher = HYPHEN_PATTERN.matcher(tok);
String firstTok = null;
String hyphen = "-";
@@ -393,7 +373,7 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
}
if (!tokAdded) {
if (!original.equals(tok) && tok.length() > 1
- && !alphanumericPattern.matcher(tok).matches()) {
+ && !ALPHANUMERIC_PATTERN.matcher(tok).matches()) {
out.addAll(processTok(tok));
} else {
out.add(tok);
@@ -410,11 +390,10 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
}
/**
- * Parse a NER tag in Arvores Deitadas format.
+ * Parses a NER tag in Arvores Deitadas format.
*
- * @param tags
- * the NER tag in Arvores Deitadas format
- * @return the NER tag, or null if not a NER tag in Arvores Deitadas format
+ * @param tags The NER tag in Arvores Deitadas format.
+ * @return The NER tag, or {@code null} if not a NER tag in Arvores Deitadas format.
*/
private static String getNER(String tags) {
if (tags.contains("<NER2>")) {
@@ -422,7 +401,7 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
}
String[] tag = tags.split("\\s+");
for (String t : tag) {
- Matcher matcher = tagPattern.matcher(t);
+ Matcher matcher = TAG_PATTERN.matcher(t);
if (matcher.matches()) {
String ner = matcher.group(2);
if (HAREM.containsKey(ner)) {
@@ -433,10 +412,12 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
return null;
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
adSentenceStream.reset();
}
+ @Override
public void close() throws IOException {
adSentenceStream.close();
}
@@ -445,10 +426,6 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
ama, cie, lit
}
- private Type corpusType = null;
-
- private Pattern metaPattern;
-
// works for Amazonia
// private static final Pattern meta1 = Pattern
// .compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*");
@@ -457,24 +434,23 @@ public class ADNameSampleStream implements ObjectStream<NameSample> {
// private static final Pattern meta2 = Pattern
// .compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*");
- private int textIdMeta2 = -1;
- private String textMeta2 = "";
-
private int getTextID(Sentence paragraph) {
-
- String meta = paragraph.getMetadata();
-
- if (corpusType == null) {
- if (meta.startsWith("LIT")) {
- corpusType = Type.lit;
- metaPattern = Pattern.compile("^([a-zA-Z\\-]+)(\\d+).*?p=(\\d+).*");
- } else if (meta.startsWith("CIE")) {
- corpusType = Type.cie;
- metaPattern = Pattern.compile("^.*?source=\"(.*?)\".*");
- } else { // ama
- corpusType = Type.ama;
- metaPattern = Pattern.compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*");
- }
+
+ final String meta = paragraph.getMetadata();
+ Type corpusType;
+ Pattern metaPattern;
+ int textIdMeta2 = -1;
+ String textMeta2 = "";
+
+ if (meta.startsWith("LIT")) {
+ corpusType = Type.lit;
+ metaPattern = Pattern.compile("^([a-zA-Z\\-]+)(\\d+).*?p=(\\d+).*");
+ } else if (meta.startsWith("CIE")) {
+ corpusType = Type.cie;
+ metaPattern = Pattern.compile("^.*?source=\"(.*?)\".*");
+ } else { // ama
+ corpusType = Type.ama;
+ metaPattern = Pattern.compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*");
}
if (corpusType.equals(Type.lit)) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
index ca371c28..f813771a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
@@ -36,7 +36,8 @@ import opennlp.tools.util.PlainTextByLineStream;
* A Factory to create a Arvores Deitadas NameSampleDataStream from the command line
* utility.
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
public class ADNameSampleStreamFactory<P> extends LanguageSampleStreamFactory<NameSample, P> {
@@ -68,14 +69,13 @@ public class ADNameSampleStreamFactory<P> extends LanguageSampleStreamFactory<Na
super(params);
}
+ @Override
public ObjectStream<NameSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
-
language = params.getLang();
InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
-
ObjectStream<String> lineStream = null;
try {
lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
index 7b8139aa..782ce7bd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStream.java
@@ -18,8 +18,6 @@
package opennlp.tools.formats.ad;
import java.io.IOException;
-import java.io.InputStream;
-import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.List;
import java.util.StringTokenizer;
@@ -34,27 +32,24 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
public class ADPOSSampleStream implements ObjectStream<POSSample> {
private final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;
- private boolean expandME;
- private boolean isIncludeFeatures;
+ private final boolean expandME;
+ private final boolean isIncludeFeatures;
/**
- * Creates a new {@link POSSample} stream from a line stream, i.e.
- * {@link ObjectStream}<{@link String}>, that could be a
- * {@link PlainTextByLineStream} object.
+ * Creates a new {@link ADPOSSampleStream} stream from a {@link ObjectStream<String>},
+ * that could be a {@link PlainTextByLineStream} object.
*
- * @param lineStream
- * a stream of lines as {@link String}
- * @param expandME
- * if true will expand the multiword expressions, each word of the
+ * @param lineStream A {@link ObjectStream<String>} stream as input.
+ * @param expandME If {@code true} will expand the multiword expressions, each word of the
* expression will have the POS Tag that was attributed to the
- * expression plus the prefix B- or I- (CONLL convention)
- * @param includeFeatures
- * if true will combine the POS Tag with the feature tags
+ * expression plus the prefix {@code B-} or {@code I-} (CONLL convention).
+ * @param includeFeatures If {@code true} will combine the POS Tag with the feature tags.
*/
public ADPOSSampleStream(ObjectStream<String> lineStream, boolean expandME,
boolean includeFeatures) {
@@ -64,35 +59,26 @@ public class ADPOSSampleStream implements ObjectStream<POSSample> {
}
/**
- * Creates a new {@link POSSample} stream from a {@link InputStream}
+ * Creates a new {@link POSSample} stream from an {@link InputStreamFactory}
*
- * @param in
- * the Corpus {@link InputStream}
- * @param charsetName
- * the charset of the Arvores Deitadas Corpus
- * @param expandME
- * if true will expand the multiword expressions, each word of the
+ * @param in The {@link InputStreamFactory} for the corpus.
+ * @param charsetName The {@link java.nio.charset.Charset charset} to use
+ * for reading of the corpus.
+ * @param expandME If {@code true} will expand the multiword expressions, each word of the
* expression will have the POS Tag that was attributed to the
- * expression plus the prefix B- or I- (CONLL convention)
- * @param includeFeatures
- * if true will combine the POS Tag with the feature tags
+ * expression plus the prefix {@code B-} or {@code I-} (CONLL convention).
+ * @param includeFeatures If {@code true} will combine the POS Tag with the feature tags.
*/
public ADPOSSampleStream(InputStreamFactory in, String charsetName,
boolean expandME, boolean includeFeatures) throws IOException {
- try {
- this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(in, charsetName));
- this.expandME = expandME;
- this.isIncludeFeatures = includeFeatures;
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
+ this(new PlainTextByLineStream(in, charsetName), expandME, includeFeatures);
}
+ @Override
public POSSample read() throws IOException {
Sentence paragraph;
- while ((paragraph = this.adSentenceStream.read()) != null) {
+ if ((paragraph = this.adSentenceStream.read()) != null) {
Node root = paragraph.getRoot();
List<String> sentence = new ArrayList<>();
List<String> tags = new ArrayList<>();
@@ -161,10 +147,12 @@ public class ADPOSSampleStream implements ObjectStream<POSSample> {
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
adSentenceStream.reset();
}
+ @Override
public void close() throws IOException {
adSentenceStream.close();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java
index 71fbce5e..cdfbc30f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java
@@ -33,7 +33,8 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
public class ADPOSSampleStreamFactory<P> extends
LanguageSampleStreamFactory<POSSample, P> {
@@ -68,14 +69,13 @@ public class ADPOSSampleStreamFactory<P> extends
super(params);
}
+ @Override
public ObjectStream<POSSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
-
language = params.getLang();
InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
-
ObjectStream<String> lineStream = null;
try {
lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
@@ -83,8 +83,7 @@ public class ADPOSSampleStreamFactory<P> extends
CmdLineUtil.handleCreateObjectStreamError(ex);
}
- return new ADPOSSampleStream(lineStream,
- params.getExpandME(), params.getIncludeFeatures());
+ return new ADPOSSampleStream(lineStream, params.getExpandME(), params.getIncludeFeatures());
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java
index 744d0cbb..89b819a0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStream.java
@@ -17,9 +17,7 @@
package opennlp.tools.formats.ad;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.UnsupportedEncodingException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
@@ -46,20 +44,18 @@ public class ADSentenceSampleStream implements ObjectStream<SentenceSample> {
private boolean isSameText;
private boolean isSamePara;
private Sentence sent;
- private boolean isIncludeTitles = true;
+ private final boolean isIncludeTitles;
private boolean isTitle;
private final char[] ptEosCharacters;
/**
- * Creates a new {@link SentenceSample} stream from a line stream, i.e.
- * {@link ObjectStream}<{@link String}>, that could be a
- * {@link PlainTextByLineStream} object.
+ * Initializes a new {@link ADSentenceSampleStream} from a {@link ObjectStream<String>},
+ * that could be a {@link PlainTextByLineStream} object.
*
- * @param lineStream
- * a stream of lines as {@link String}
- * @param includeHeadlines
- * if true will output the sentences marked as news headlines
+ * @param lineStream A stream of lines as {@link String}.
+ * @param includeHeadlines If {@code true} will output the sentences marked
+ * as news headlines.
*/
public ADSentenceSampleStream(ObjectStream<String> lineStream, boolean includeHeadlines) {
this.adSentenceStream = new ADSentenceStream(lineStream);
@@ -69,30 +65,22 @@ public class ADSentenceSampleStream implements ObjectStream<SentenceSample> {
}
/**
- * Creates a new {@link SentenceSample} stream from a {@link FileInputStream}
+ * Initializes a new {@link ADSentenceSampleStream}.
*
- * @param in
- * input stream from the corpus
- * @param charsetName
- * the charset to use while reading the corpus
- * @param includeHeadlines
- * if true will output the sentences marked as news headlines
+ * @param in The {@link InputStreamFactory} for the corpus.
+ * @param charsetName The {@link java.nio.charset.Charset charset} to use
+ * for reading of the corpus.
+ * @param includeHeadlines If {@code true} will output the sentences marked
+ * as news headlines.
+ * @throws IOException Thrown if IO errors occurred.
*/
- public ADSentenceSampleStream(InputStreamFactory in, String charsetName,
- boolean includeHeadlines) throws IOException {
- try {
- this.adSentenceStream = new ADSentenceStream(new PlainTextByLineStream(
- in, charsetName));
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
- ptEosCharacters = Factory.ptEosCharacters;
- Arrays.sort(ptEosCharacters);
- this.isIncludeTitles = includeHeadlines;
+ public ADSentenceSampleStream(InputStreamFactory in, String charsetName, boolean includeHeadlines)
+ throws IOException {
+ this(new PlainTextByLineStream(in, charsetName), includeHeadlines);
}
// The Arvores Deitadas Corpus has information about texts and paragraphs.
+ @Override
public SentenceSample read() throws IOException {
if (sent == null) {
@@ -131,8 +119,7 @@ public class ADSentenceSampleStream implements ObjectStream<SentenceSample> {
doc = document.toString();
}
- return new SentenceSample(doc,
- sentences.toArray(new Span[sentences.size()]));
+ return new SentenceSample(doc, sentences.toArray(new Span[sentences.size()]));
}
private boolean hasPunctuation(String text) {
@@ -145,14 +132,14 @@ public class ADSentenceSampleStream implements ObjectStream<SentenceSample> {
}
// there are some different types of metadata depending on the corpus.
- // todo: merge this patterns
- private Pattern meta1 = Pattern
+ // TODO Merge these patterns
+ private static final Pattern META_1 = Pattern
.compile("^(?:[a-zA-Z\\-]*(\\d+)).*?p=(\\d+).*");
private void updateMeta() {
if (this.sent != null) {
String meta = this.sent.getMetadata();
- Matcher m = meta1.matcher(meta);
+ Matcher m = META_1.matcher(meta);
int currentText;
int currentPara;
if (m.matches()) {
@@ -178,10 +165,12 @@ public class ADSentenceSampleStream implements ObjectStream<SentenceSample> {
}
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
adSentenceStream.reset();
}
+ @Override
public void close() throws IOException {
adSentenceStream.close();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java
index ba8a11d1..bb6ae661 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java
@@ -33,7 +33,8 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
public class ADSentenceSampleStreamFactory<P> extends
LanguageSampleStreamFactory<SentenceSample, P> {
@@ -63,14 +64,13 @@ public class ADSentenceSampleStreamFactory<P> extends
super(params);
}
+ @Override
public ObjectStream<SentenceSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
-
language = params.getLang();
boolean includeTitle = params.getIncludeTitles();
-
InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
ObjectStream<String> lineStream = null;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
index 5a9e60e8..47c0dbca 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceStream.java
@@ -36,12 +36,13 @@ import opennlp.tools.util.ObjectStream;
* <p>
* Information about the format:<br>
* Susana Afonso.
- * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"
- * .<br>
+ * <a href="http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf">
+ * "Árvores deitadas: Descrição do formato e das opções de análise na Floresta Sintáctica"</a>.
+ * <br>
* 12 de Fevereiro de 2006.
- * http://www.linguateca.pt/documentos/Afonso2006ArvoresDeitadas.pdf
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStream.Sentence> {
@@ -86,24 +87,30 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
*/
public static class SentenceParser {
- private Pattern nodePattern = Pattern
+ private static final Pattern NODE_PATTERN = Pattern
.compile("([=-]*)([^:=]+:[^\\(\\s]+)(\\(([^\\)]+)\\))?\\s*(?:(\\((<.+>)\\))*)\\s*$");
- private Pattern leafPattern = Pattern
+ private static final Pattern LEAF_PATTERN = Pattern
.compile("^([=-]*)([^:=]+):([^\\(\\s]+)\\([\"'](.+)[\"']\\s*((?:<.+>)*)\\s*([^\\)]+)?\\)\\s+(.+)");
- private Pattern bizarreLeafPattern = Pattern
+ private static final Pattern BIZARRE_LEAF_PATTERN = Pattern
.compile("^([=-]*)([^:=]+=[^\\(\\s]+)\\(([\"'].+[\"'])?\\s*([^\\)]+)?\\)\\s+(.+)");
- private Pattern punctuationPattern = Pattern.compile("^(=*)(\\W+)$");
+ private static final Pattern PUNCTUATION_PATTERN = Pattern.compile("^(=*)(\\W+)$");
private String text,meta;
/**
- * Parse the sentence
+ * Parses a sentence string into a {@link Sentence}.
+ *
+ * @param sentenceString The input string to parse.
+ * @param isTitle {@code true} if it represents a title element, {@code false} otherwise.
+ * @param para The paragraph number.
+ * @param isBox {@code true} if it represents a box element, {@code false} otherwise.
+ *
+ * @return A {@link Sentence} instance parsed from {@code sentenceString}.
*/
public Sentence parse(String sentenceString, int para, boolean isTitle, boolean isBox) {
- BufferedReader reader = new BufferedReader(new StringReader(sentenceString));
Sentence sentence = new Sentence();
Node root = new Node();
- try {
+ try (BufferedReader reader = new BufferedReader(new StringReader(sentenceString))) {
// first line is <s ...>
String line = reader.readLine();
@@ -245,7 +252,7 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
// Note: all levels are higher than 0, because 0 is reserved for the root.
// try node
- Matcher nodeMatcher = nodePattern.matcher(line);
+ Matcher nodeMatcher = NODE_PATTERN.matcher(line);
if (nodeMatcher.matches()) {
int level = nodeMatcher.group(1).length() + 1;
String syntacticTag = nodeMatcher.group(2);
@@ -255,7 +262,7 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
return node;
}
- Matcher leafMatcher = leafPattern.matcher(line);
+ Matcher leafMatcher = LEAF_PATTERN.matcher(line);
if (leafMatcher.matches()) {
int level = leafMatcher.group(1).length() + 1;
String syntacticTag = leafMatcher.group(2);
@@ -276,7 +283,7 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
return leaf;
}
- Matcher punctuationMatcher = punctuationPattern.matcher(line);
+ Matcher punctuationMatcher = PUNCTUATION_PATTERN.matcher(line);
if (punctuationMatcher.matches()) {
int level = punctuationMatcher.group(1).length() + 1;
String lexeme = punctuationMatcher.group(2);
@@ -292,7 +299,7 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
}
if (line.startsWith("=")) {
- Matcher bizarreLeafMatcher = bizarreLeafPattern.matcher(line);
+ Matcher bizarreLeafMatcher = BIZARRE_LEAF_PATTERN.matcher(line);
if (bizarreLeafMatcher.matches()) {
int level = bizarreLeafMatcher.group(1).length() + 1;
String syntacticTag = bizarreLeafMatcher.group(2);
@@ -341,7 +348,7 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
}
/** Represents a tree element, Node or Leaf */
- public abstract class TreeElement {
+ public abstract static class TreeElement {
private String syntacticTag;
private String morphologicalTag;
@@ -377,7 +384,7 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
}
/** Represents the AD node */
- public class Node extends TreeElement {
+ public static class Node extends TreeElement {
private List<TreeElement> elems = new ArrayList<>();
public void addElement(TreeElement element) {
@@ -408,7 +415,7 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
}
/** Represents the AD leaf */
- public class Leaf extends TreeElement {
+ public static class Leaf extends TreeElement {
private String word;
private String lemma;
@@ -478,49 +485,17 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
}
- /**
- * The start sentence pattern
- */
- private static final Pattern sentStart = Pattern.compile("<s[^>]*>");
-
- /**
- * The end sentence pattern
- */
- private static final Pattern sentEnd = Pattern.compile("</s>");
- private static final Pattern extEnd = Pattern.compile("</ext>");
-
- /**
- * The start sentence pattern
- */
- private static final Pattern titleStart = Pattern.compile("<t[^>]*>");
-
- /**
- * The end sentence pattern
- */
- private static final Pattern titleEnd = Pattern.compile("</t>");
-
- /**
- * The start sentence pattern
- */
- private static final Pattern boxStart = Pattern.compile("<caixa[^>]*>");
-
- /**
- * The end sentence pattern
- */
- private static final Pattern boxEnd = Pattern.compile("</caixa>");
-
-
- /**
- * The start sentence pattern
- */
- private static final Pattern paraStart = Pattern.compile("<p[^>]*>");
-
- /**
- * The start sentence pattern
- */
- private static final Pattern textStart = Pattern.compile("<ext[^>]*>");
+ private static final Pattern SENT_START = Pattern.compile("<s[^>]*>");
+ private static final Pattern SENT_END = Pattern.compile("</s>");
+ private static final Pattern EXT_END = Pattern.compile("</ext>");
+ private static final Pattern TITLE_START = Pattern.compile("<t[^>]*>");
+ private static final Pattern TITLE_END = Pattern.compile("</t>");
+ private static final Pattern BOX_START = Pattern.compile("<caixa[^>]*>");
+ private static final Pattern BOX_END = Pattern.compile("</caixa>");
+ private static final Pattern PARA_START = Pattern.compile("<p[^>]*>");
+ private static final Pattern TEXT_START = Pattern.compile("<ext[^>]*>");
- private SentenceParser parser;
+ private final SentenceParser parser;
private int paraID = 0;
private boolean isTitle = false;
@@ -532,9 +507,10 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
}
+ @Override
public Sentence read() throws IOException {
- StringBuilder sentence = new StringBuilder();
+ final StringBuilder sentence = new StringBuilder();
boolean sentenceStarted = false;
while (true) {
@@ -543,25 +519,25 @@ public class ADSentenceStream extends FilterObjectStream<String, ADSentenceStrea
if (line != null) {
if (sentenceStarted) {
- if (sentEnd.matcher(line).matches() || extEnd.matcher(line).matches()) {
+ if (SENT_END.matcher(line).matches() || EXT_END.matcher(line).matches()) {
sentenceStarted = false;
} else if (!line.startsWith("A1")) {
sentence.append(line).append('\n');
}
} else {
- if (sentStart.matcher(line).matches()) {
+ if (SENT_START.matcher(line).matches()) {
sentenceStarted = true;
- } else if (paraStart.matcher(line).matches()) {
+ } else if (PARA_START.matcher(line).matches()) {
paraID++;
- } else if (titleStart.matcher(line).matches()) {
+ } else if (TITLE_START.matcher(line).matches()) {
isTitle = true;
- } else if (titleEnd.matcher(line).matches()) {
+ } else if (TITLE_END.matcher(line).matches()) {
isTitle = false;
- } else if (textStart.matcher(line).matches()) {
+ } else if (TEXT_START.matcher(line).matches()) {
paraID = 0;
- } else if (boxStart.matcher(line).matches()) {
+ } else if (BOX_START.matcher(line).matches()) {
isBox = true;
- } else if (boxEnd.matcher(line).matches()) {
+ } else if (BOX_END.matcher(line).matches()) {
isBox = false;
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
index ddddd498..1a99f40c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
@@ -27,7 +27,8 @@ import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
public class ADTokenSampleStreamFactory<P> extends
DetokenizerSampleStreamFactory<TokenSample, P> {
@@ -44,6 +45,7 @@ public class ADTokenSampleStreamFactory<P> extends
super(params);
}
+ @Override
public ObjectStream<TokenSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java
index eeecc0e0..4eb7a3e9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/PortugueseContractionUtility.java
@@ -26,13 +26,14 @@ import opennlp.tools.util.StringUtil;
/**
* Utility class to handle Portuguese contractions.
* <p>
- * Some Corpora splits contractions in its parts, for example, "da" > "de" +
- * "a", but according to the fase of language processing, NER for instance, we
+ * Some corpora split contractions into their parts, for example, "da" > "de" +
+ * "a", but according to the phase of language processing, NER for instance, we
* can't decide if to split a contraction or not, specially because contractions
* inside names are not separated, but outside are.
*
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
public class PortugueseContractionUtility {
@@ -152,13 +153,11 @@ public class PortugueseContractionUtility {
}
/**
- * Merges a contraction
+ * Merges a contraction.
*
- * @param left
- * the left component
- * @param right
- * the right component
- * @return the merged contraction
+ * @param left The left component.
+ * @param right The right component.
+ * @return The merged contraction.
*/
public static String toContraction(String left, String right) {
String key = left + "+" + right;