You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by ma...@apache.org on 2022/12/26 16:59:39 UTC
[opennlp] branch main updated: OPENNLP-1422 Enhance JavaDoc in opennlp.tools.formats sub-packages (#468)
This is an automated email from the ASF dual-hosted git repository.
mawiesne pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/main by this push:
new 82ed136d OPENNLP-1422 Enhance JavaDoc in opennlp.tools.formats sub-packages (#468)
82ed136d is described below
commit 82ed136d830c755ac22991d9941fa62f67dd76cc
Author: Martin Wiesner <ma...@users.noreply.github.com>
AuthorDate: Mon Dec 26 17:59:32 2022 +0100
OPENNLP-1422 Enhance JavaDoc in opennlp.tools.formats sub-packages (#468)
---
.../Internal.java} | 35 +++++-----
.../tools/formats/AbstractSampleStreamFactory.java | 1 -
.../tools/formats/BioNLP2004NameSampleStream.java | 56 +++++++++++-----
.../formats/BioNLP2004NameSampleStreamFactory.java | 4 ++
.../tools/formats/ChunkerSampleStreamFactory.java | 1 +
.../tools/formats/Conll02NameSampleStream.java | 48 +++++++++-----
.../formats/Conll02NameSampleStreamFactory.java | 8 ++-
.../tools/formats/Conll03NameSampleStream.java | 35 +++++-----
.../formats/Conll03NameSampleStreamFactory.java | 4 ++
.../tools/formats/ConllXPOSSampleStream.java | 23 ++++++-
.../formats/ConllXPOSSampleStreamFactory.java | 8 ++-
.../formats/ConllXSentenceSampleStreamFactory.java | 6 +-
.../formats/ConllXTokenSampleStreamFactory.java | 6 +-
.../formats/DetokenizerSampleStreamFactory.java | 2 +-
.../tools/formats/DirectorySampleStream.java | 24 ++++---
.../tools/formats/DocumentSampleStreamFactory.java | 1 +
.../tools/formats/EvalitaNameSampleStream.java | 26 ++++----
.../formats/EvalitaNameSampleStreamFactory.java | 17 +++--
.../LanguageDetectorSampleStreamFactory.java | 1 +
.../formats/LemmatizerSampleStreamFactory.java | 1 +
.../formats/NameFinderCensus90NameStream.java | 25 +++++---
.../tools/formats/NameSampleDataStreamFactory.java | 1 +
.../tools/formats/ParseSampleStreamFactory.java | 1 +
.../tools/formats/SentenceSampleStreamFactory.java | 1 +
.../tools/formats/TokenSampleStreamFactory.java | 1 +
.../tools/formats/TwentyNewsgroupSampleStream.java | 6 +-
.../TwentyNewsgroupSampleStreamFactory.java | 2 +-
.../tools/formats/WordTagSampleStreamFactory.java | 6 +-
.../tools/formats/ad/ADChunkSampleStream.java | 2 +
.../formats/ad/ADChunkSampleStreamFactory.java | 2 +
.../formats/ad/ADNameSampleStreamFactory.java | 2 +
.../tools/formats/ad/ADPOSSampleStreamFactory.java | 2 +
.../formats/ad/ADSentenceSampleStreamFactory.java | 2 +
.../formats/ad/ADTokenSampleStreamFactory.java | 2 +
.../package-info.java} | 27 +-------
.../tools/formats/brat/BratAnnotationStream.java | 2 +-
.../opennlp/tools/formats/brat/BratDocument.java | 10 +++
.../tools/formats/brat/BratDocumentParser.java | 4 +-
.../tools/formats/brat/BratDocumentStream.java | 7 +-
.../formats/brat/BratNameSampleStreamFactory.java | 2 +-
.../package-info.java} | 27 +-------
.../formats/conllu/ConlluLemmaSampleStream.java | 9 ++-
.../conllu/ConlluLemmaSampleStreamFactory.java | 4 +-
.../formats/conllu/ConlluPOSSampleStream.java | 6 ++
.../conllu/ConlluPOSSampleStreamFactory.java | 4 +-
.../tools/formats/conllu/ConlluSentence.java | 6 +-
.../formats/conllu/ConlluSentenceSampleStream.java | 6 ++
.../conllu/ConlluSentenceSampleStreamFactory.java | 7 +-
.../opennlp/tools/formats/conllu/ConlluStream.java | 24 +++++--
.../formats/conllu/ConlluTokenSampleStream.java | 5 ++
.../conllu/ConlluTokenSampleStreamFactory.java | 7 +-
.../tools/formats/conllu/ConlluWordLine.java | 29 +++++----
.../package-info.java} | 27 +-------
.../convert/AbstractToSentenceSampleStream.java | 8 +++
.../convert/FileToByteArraySampleStream.java | 12 ++++
.../formats/convert/FileToStringSampleStream.java | 33 +++-------
.../convert/NameToSentenceSampleStream.java | 14 +++-
.../convert/NameToSentenceSampleStreamFactory.java | 8 ++-
.../formats/convert/NameToTokenSampleStream.java | 14 +++-
.../convert/NameToTokenSampleStreamFactory.java | 8 ++-
.../formats/convert/POSToSentenceSampleStream.java | 14 +++-
.../convert/POSToSentenceSampleStreamFactory.java | 8 ++-
.../formats/convert/POSToTokenSampleStream.java | 16 ++++-
.../convert/POSToTokenSampleStreamFactory.java | 8 ++-
.../formats/convert/ParseToPOSSampleStream.java | 13 +++-
.../convert/ParseToPOSSampleStreamFactory.java | 8 ++-
.../ParseToSentenceSampleStreamFactory.java | 6 +-
.../convert/ParseToTokenSampleStreamFactory.java | 6 +-
.../frenchtreebank/ConstitParseSampleStream.java | 13 +++-
.../ConstitParseSampleStreamFactory.java | 9 +++
.../package-info.java} | 27 +-------
.../IrishSentenceBankDocument.java | 75 ++++++++++++++++------
.../IrishSentenceBankSentenceStreamFactory.java | 3 +
.../IrishSentenceBankTokenSampleStream.java | 6 ++
.../IrishSentenceBankTokenSampleStreamFactory.java | 4 ++
.../package-info.java} | 27 +-------
.../leipzig/LeipzigLanguageSampleStream.java | 31 +++++++--
.../LeipzigLanguageSampleStreamFactory.java | 8 ++-
.../tools/formats/leipzig/SampleShuffleStream.java | 2 +-
.../tools/formats/leipzig/SampleSkipStream.java | 1 -
.../package-info.java} | 27 +-------
.../letsmt/DetokenizeSentenceSampleStream.java | 8 +++
.../tools/formats/letsmt/LetsmtDocument.java | 37 ++++++++---
.../letsmt/LetsmtSentenceStreamFactory.java | 6 +-
.../package-info.java} | 27 +-------
.../package-info.java} | 27 +-------
.../formats/moses/MosesSentenceSampleStream.java | 9 +++
.../moses/MosesSentenceSampleStreamFactory.java | 3 +-
.../tools/formats/muc/DocumentSplitterStream.java | 3 +-
.../formats/muc/Muc6NameSampleStreamFactory.java | 1 +
.../tools/formats/muc/MucNameContentHandler.java | 14 ++--
.../tools/formats/muc/MucNameSampleStream.java | 12 +++-
.../package-info.java} | 27 +-------
.../formats/nkjp/NKJPSegmentationDocument.java | 6 +-
.../tools/formats/nkjp/NKJPTextDocument.java | 34 +++++++---
.../package-info.java} | 27 +-------
.../formats/ontonotes/DocumentToLineStream.java | 10 ++-
.../ontonotes/OntoNotesNameSampleStream.java | 13 +++-
.../OntoNotesNameSampleStreamFactory.java | 1 +
.../ontonotes/OntoNotesPOSSampleStreamFactory.java | 3 +-
.../ontonotes/OntoNotesParseSampleStream.java | 9 +++
.../OntoNotesParseSampleStreamFactory.java | 1 +
.../package-info.java} | 27 +-------
103 files changed, 760 insertions(+), 529 deletions(-)
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/commons/Internal.java
similarity index 56%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/commons/Internal.java
index 5e1429b9..b6c9c633 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/commons/Internal.java
@@ -15,30 +15,25 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
+package opennlp.tools.commons;
-import opennlp.tools.cmdline.ObjectStreamFactory;
+import java.lang.annotation.Documented;
+import java.lang.annotation.Retention;
+import java.lang.annotation.RetentionPolicy;
/**
- * Base class for sample stream factories.
+ * Classes, fields, or methods annotated {@code @Internal} are for OpenNLP
+ * internal use only. Such elements are likely to be removed, have a different access level,
+ * or might experience a signature change in upcoming releases of OpenNLP.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
+@Documented
+@Retention(value = RetentionPolicy.RUNTIME)
+public @interface Internal {
- protected Class<P> params;
+ String value() default "";
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
+ /**
+ * The OpenNLP release when an element was first declared internal.
+ */
+ String since() default "";
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
index 5e1429b9..663980ac 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
@@ -37,7 +37,6 @@ public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFa
return "eng";
}
- // FIXME
public Class<P> getParameters() {
return params;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
index 8730c4c3..c586c980 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStream.java
@@ -19,11 +19,11 @@ package opennlp.tools.formats;
import java.io.IOException;
import java.io.PrintStream;
-import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
+import opennlp.tools.commons.Internal;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
@@ -32,15 +32,37 @@ import opennlp.tools.util.Span;
import opennlp.tools.util.StringUtil;
/**
- * Parser for the training files of the BioNLP/NLPBA 2004 shared task.
+ * A {@link ObjectStream sample stream} for the training files of the
+ * BioNLP/NLPBA 2004 shared task.
* <p>
- * The data contains five named entity types: DNA, RNA, protein, cell_type and cell_line.<br>
+ * The data contains five named entity types:
+ * <ul>
+ * <li>{@code DNA}</li>
+ * <li>{@code RNA}</li>
+ * <li>{@code protein}</li>
+ * <li>{@code cell_type}</li>
+ * <li>{@code cell_line}</li>
+ * </ul>
* <p>
- * Data can be found on this web site:<br>
- * http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html
+ * Data can be found on this
+ * <a href="http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004">website</a>,
+ * or in
+ * <a href="https://github.com/spyysalo/jnlpba">this repository</a>.
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * The BioNLP/NLPBA 2004 data were originally published here:
+ * <p>
+ * <a href="http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html">
+ * http://www-tsujii.is.s.u-tokyo.ac.jp/GENIA/ERtask/report.html</a>,
+ * <p>
+ * yet this page was gone when last checked in December 2022.
+ * <p>
+ * It looks like the repository linked above contains a copy of the data located on the original page.
+ * The BioNLP 2004 task seems to be related to http://www.geniaproject.org/shared-tasks/bionlp-jnlpba-shared-task-2004
+ * <p>
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class BioNLP2004NameSampleStream implements ObjectStream<NameSample> {
public static final int GENERATE_DNA_ENTITIES = 0x01;
@@ -53,19 +75,21 @@ public class BioNLP2004NameSampleStream implements ObjectStream<NameSample> {
private final ObjectStream<String> lineStream;
+ /**
+ * Initializes a {@link BioNLP2004NameSampleStream}.
+ *
+ * @param in The {@link InputStreamFactory} to use.
+ * @param types The types to use.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public BioNLP2004NameSampleStream(InputStreamFactory in, int types) throws IOException {
- try {
- this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8);
- System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8.name()));
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
-
+ this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8);
+ System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8));
this.types = types;
-
}
+ @Override
public NameSample read() throws IOException {
List<String> sentence = new ArrayList<>();
@@ -167,10 +191,12 @@ public class BioNLP2004NameSampleStream implements ObjectStream<NameSample> {
}
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
lineStream.reset();
}
+ @Override
public void close() throws IOException {
lineStream.close();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
index 2fd378df..422cd4c6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/BioNLP2004NameSampleStreamFactory.java
@@ -27,6 +27,9 @@ import opennlp.tools.cmdline.params.BasicFormatParams;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.ObjectStream;
+/**
+ * @see BioNLP2004NameSampleStream
+ */
public class BioNLP2004NameSampleStreamFactory<P> extends AbstractSampleStreamFactory<NameSample, P> {
interface Parameters extends BasicFormatParams {
@@ -43,6 +46,7 @@ public class BioNLP2004NameSampleStreamFactory<P> extends AbstractSampleStreamFa
super(params);
}
+ @Override
public ObjectStream<NameSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java
index 4b1dfb87..36f8b58e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ChunkerSampleStreamFactory.java
@@ -46,6 +46,7 @@ public class ChunkerSampleStreamFactory<P> extends AbstractSampleStreamFactory<C
super(params);
}
+ @Override
public ObjectStream<ChunkSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStream.java
index 5d4507a0..51887541 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStream.java
@@ -19,11 +19,11 @@ package opennlp.tools.formats;
import java.io.IOException;
import java.io.PrintStream;
-import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
+import opennlp.tools.commons.Internal;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.InvalidFormatException;
@@ -33,20 +33,22 @@ import opennlp.tools.util.Span;
import opennlp.tools.util.StringUtil;
/**
- * Parser for the dutch and spanish ner training files of the CONLL 2002 shared task.
+ * Parser for the Dutch and Spanish NER training files of the CONLL 2002 shared task.
* <p>
- * The dutch data has a -DOCSTART- tag to mark article boundaries,
+ * The Dutch data has a {@link #DOCSTART} tag to mark article boundaries,
* adaptive data in the feature generators will be cleared before every article.<br>
- * The spanish data does not contain article boundaries,
+ * The Spanish data does not contain article boundaries,
* adaptive data will be cleared for every sentence.
* <p>
* The data contains four named entity types: Person, Organization, Location and Misc.<br>
* <p>
- * Data can be found on this web site:<br>
- * http://www.cnts.ua.ac.be/conll2002/ner/
+ * Data can be found on this
+ * <a href="http://www.cnts.ua.ac.be/conll2002/ner/">web site</a>.
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class Conll02NameSampleStream implements ObjectStream<NameSample> {
public enum LANGUAGE {
@@ -66,22 +68,32 @@ public class Conll02NameSampleStream implements ObjectStream<NameSample> {
private final int types;
+ /**
+ * Initializes a {@link Conll02NameSampleStream}.
+ *
+ * @param lang The language of the CONLL 02 data.
+ * @param lineStream An {@link ObjectStream ObjectStream&lt;String&gt;} over the lines
+ * in the CONLL 02 data file.
+ * @param types The entity types to include in the Name Sample object stream.
+ */
public Conll02NameSampleStream(LANGUAGE lang, ObjectStream<String> lineStream, int types) {
this.lang = lang;
this.lineStream = lineStream;
this.types = types;
}
+ /**
+ * Initializes a {@link Conll02NameSampleStream}.
+ *
+ * @param lang The language of the CONLL 02 data.
+ * @param in The {@link InputStreamFactory} for the input file.
+ * @param types The entity types to include in the Name Sample object stream.
+ *
+ * @throws IOException Thrown if IO errors occurred.
+ */
public Conll02NameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException {
- this.lang = lang;
- try {
- this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8);
- System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8.name()));
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
- this.types = types;
+ this (lang, new PlainTextByLineStream(in, StandardCharsets.UTF_8), types);
+ System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8));
}
static Span extract(int begin, int end, String beginTag) throws InvalidFormatException {
@@ -108,7 +120,7 @@ public class Conll02NameSampleStream implements ObjectStream<NameSample> {
return new Span(begin, end, type);
}
-
+ @Override
public NameSample read() throws IOException {
List<String> sentence = new ArrayList<>();
@@ -208,10 +220,12 @@ public class Conll02NameSampleStream implements ObjectStream<NameSample> {
}
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
lineStream.reset();
}
+ @Override
public void close() throws IOException {
lineStream.close();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java
index 246d3dce..d417df0c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll02NameSampleStreamFactory.java
@@ -25,13 +25,18 @@ import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.Conll02NameSampleStream.LANGUAGE;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see Conll02NameSampleStream
*/
+@Internal
public class Conll02NameSampleStreamFactory<P> extends LanguageSampleStreamFactory<NameSample, P> {
interface Parameters extends BasicFormatParams {
@@ -51,6 +56,7 @@ public class Conll02NameSampleStreamFactory<P> extends LanguageSampleStreamFacto
super(params);
}
+ @Override
public ObjectStream<NameSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStream.java
index 2d0b9231..5ec77d88 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStream.java
@@ -19,7 +19,6 @@ package opennlp.tools.formats;
import java.io.IOException;
import java.io.PrintStream;
-import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
@@ -47,10 +46,12 @@ public class Conll03NameSampleStream implements ObjectStream<NameSample> {
private final int types;
/**
- *
- * @param lang the language of the CONLL 03 data
- * @param lineStream an Object Stream over the lines in the CONLL 03 data file
- * @param types the entity types to include in the Name Sample object stream
+ * Initializes a {@link Conll03NameSampleStream}.
+ *
+ * @param lang The language of the CONLL 03 data.
+ * @param lineStream An {@link ObjectStream ObjectStream&lt;String&gt;} over the lines
+ * in the CONLL 03 data file.
+ * @param types The entity types to include in the Name Sample object stream.
*/
public Conll03NameSampleStream(LANGUAGE lang, ObjectStream<String> lineStream, int types) {
this.lang = lang;
@@ -58,19 +59,21 @@ public class Conll03NameSampleStream implements ObjectStream<NameSample> {
this.types = types;
}
+ /**
+ * Initializes a {@link Conll03NameSampleStream}.
+ *
+ * @param lang The language of the CONLL 03 data.
+ * @param in The {@link InputStreamFactory} for the input file.
+ * @param types The entity types to include in the Name Sample object stream.
+ *
+ * @throws IOException Thrown if IO errors occurred.
+ */
public Conll03NameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException {
-
- this.lang = lang;
- try {
- this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8);
- System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8.name()));
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
- this.types = types;
+ this(lang, new PlainTextByLineStream(in, StandardCharsets.UTF_8), types);
+ System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8));
}
+ @Override
public NameSample read() throws IOException {
List<String> sentence = new ArrayList<>();
@@ -192,10 +195,12 @@ public class Conll03NameSampleStream implements ObjectStream<NameSample> {
}
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
lineStream.reset();
}
+ @Override
public void close() throws IOException {
lineStream.close();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java
index 75d4b082..dae580cb 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/Conll03NameSampleStreamFactory.java
@@ -29,6 +29,9 @@ import opennlp.tools.formats.Conll03NameSampleStream.LANGUAGE;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.ObjectStream;
+/**
+ * @see Conll03NameSampleStream
+ */
public class Conll03NameSampleStreamFactory<P> extends LanguageSampleStreamFactory<NameSample, P> {
interface Parameters extends BasicFormatParams {
@@ -48,6 +51,7 @@ public class Conll03NameSampleStreamFactory<P> extends LanguageSampleStreamFacto
super(params);
}
+ @Override
public ObjectStream<NameSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStream.java
index 9525ab60..3bd0f9b3 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStream.java
@@ -24,6 +24,7 @@ import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.List;
+import opennlp.tools.commons.Internal;
import opennlp.tools.postag.POSSample;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.InputStreamFactory;
@@ -35,21 +36,37 @@ import opennlp.tools.util.PlainTextByLineStream;
/**
* Parses the data from the CONLL 06 shared task into POS Samples.
* <p>
- * More information about the data format can be found here:<br>
- * http://www.cnts.ua.ac.be/conll2006/
+ * More information about the data format can be found
+ * <a href="http://www.cnts.ua.ac.be/conll2006/">here</a>.
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class ConllXPOSSampleStream extends FilterObjectStream<String, POSSample> {
+ /**
+ * Initializes a {@link ConllXPOSSampleStream}.
+ *
+ * @param lineStream A {@link ObjectStream line stream} of strings representing the input.
+ */
public ConllXPOSSampleStream(ObjectStream<String> lineStream) {
super(new ParagraphStream(lineStream));
}
+ /**
+ * Initializes a {@link ConllXPOSSampleStream}.
+ *
+ * @param in The {@link InputStreamFactory} to use.
+ * @param charset The {@link Charset} to interpret characters with.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public ConllXPOSSampleStream(InputStreamFactory in, Charset charset) throws IOException {
super(new ParagraphStream(new PlainTextByLineStream(in, charset)));
}
+ @Override
public POSSample read() throws IOException {
// The CONLL-X data has a word per line and each line is tab separated
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java
index d60507dc..cae6c1c2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXPOSSampleStreamFactory.java
@@ -27,13 +27,18 @@ import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.postag.POSSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see ConllXPOSSampleStream
*/
+@Internal
public class ConllXPOSSampleStreamFactory<P> extends AbstractSampleStreamFactory<POSSample, P> {
public static final String CONLLX_FORMAT = "conllx";
@@ -50,6 +55,7 @@ public class ConllXPOSSampleStreamFactory<P> extends AbstractSampleStreamFactory
super(params);
}
+ @Override
public ObjectStream<POSSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java
index 06af7989..505f94f2 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXSentenceSampleStreamFactory.java
@@ -20,14 +20,17 @@ package opennlp.tools.formats;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.DetokenizerParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.convert.POSToSentenceSampleStream;
import opennlp.tools.postag.POSSample;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class ConllXSentenceSampleStreamFactory<P> extends
DetokenizerSampleStreamFactory<SentenceSample, P> {
@@ -45,6 +48,7 @@ public class ConllXSentenceSampleStreamFactory<P> extends
super(params);
}
+ @Override
public ObjectStream<SentenceSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java
index ef9b3fc6..c894be0a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ConllXTokenSampleStreamFactory.java
@@ -20,14 +20,17 @@ package opennlp.tools.formats;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.DetokenizerParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.convert.POSToTokenSampleStream;
import opennlp.tools.postag.POSSample;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class ConllXTokenSampleStreamFactory<P> extends DetokenizerSampleStreamFactory<TokenSample, P> {
interface Parameters extends ConllXPOSSampleStreamFactory.Parameters, DetokenizerParameter {
@@ -42,6 +45,7 @@ public class ConllXTokenSampleStreamFactory<P> extends DetokenizerSampleStreamFa
super(params);
}
+ @Override
public ObjectStream<TokenSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/DetokenizerSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/DetokenizerSampleStreamFactory.java
index 33663dc8..0b018a2d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/DetokenizerSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/DetokenizerSampleStreamFactory.java
@@ -28,7 +28,7 @@ import opennlp.tools.tokenize.Detokenizer;
import opennlp.tools.tokenize.DictionaryDetokenizer;
/**
- * Base class for factories which need detokenizer.
+ * Base class for factories which need a {@link Detokenizer}.
*/
public abstract class DetokenizerSampleStreamFactory<T, P> extends AbstractSampleStreamFactory<T, P> {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
index 8aa99759..f2d2a9d8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
@@ -29,7 +29,7 @@ import java.util.Stack;
import opennlp.tools.util.ObjectStream;
/**
- * The directory sample stream allows for creating a stream
+ * The directory sample stream allows for creating an {@link ObjectStream ObjectStream&lt;File&gt;}
* from a directory listing of files.
*/
public class DirectorySampleStream implements ObjectStream<File> {
@@ -40,15 +40,18 @@ public class DirectorySampleStream implements ObjectStream<File> {
private final FileFilter fileFilter;
- private Stack<File> directories = new Stack<>();
+ private final Stack<File> directories = new Stack<>();
- private Stack<File> textFiles = new Stack<>();
+ private final Stack<File> textFiles = new Stack<>();
/**
- * Creates a new directory sample stream.
- * @param dirs The directories to read.
+ * Initializes a {@link DirectorySampleStream}.
+ *
+ * @param dirs The {@link File directories} to read.
* @param fileFilter The {@link FileFilter filter} to apply while enumerating files.
* @param recursive Enables or disables recursive file listing.
+ *
+ * @throws IllegalArgumentException Thrown if one element in {@code dirs} is not a directory.
*/
public DirectorySampleStream(File[] dirs, FileFilter fileFilter, boolean recursive) {
this.fileFilter = fileFilter;
@@ -67,15 +70,17 @@ public class DirectorySampleStream implements ObjectStream<File> {
}
inputDirectories = Collections.unmodifiableList(inputDirectoryList);
-
directories.addAll(inputDirectories);
}
/**
- * Creates a new directory sample stream.
- * @param dir The {@link File directory}.
+ * Initializes a {@link DirectorySampleStream}.
+ *
+ * @param dir The {@link File directory} to read.
* @param fileFilter The {@link FileFilter filter} to apply while enumerating files.
* @param recursive Enables or disables recursive file listing.
+ *
+ * @throws IllegalArgumentException Thrown if {@code dir} is not a directory.
*/
public DirectorySampleStream(File dir, FileFilter fileFilter, boolean recursive) {
this(new File[]{dir}, fileFilter, recursive);
@@ -126,8 +131,7 @@ public class DirectorySampleStream implements ObjectStream<File> {
/**
* {@inheritDoc}
- * Calling this function has no effect on
- * the stream.
+ * Calling this function has no effect on the stream.
*/
@Override
public void close() throws IOException {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java
index c359f1a6..2ecf5664 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/DocumentSampleStreamFactory.java
@@ -46,6 +46,7 @@ public class DocumentSampleStreamFactory<P> extends AbstractSampleStreamFactory<
super(params);
}
+ @Override
public ObjectStream<DocumentSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
index 4bcaf4b8..96ed3338 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStream.java
@@ -19,11 +19,11 @@ package opennlp.tools.formats;
import java.io.IOException;
import java.io.PrintStream;
-import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
+import opennlp.tools.commons.Internal;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.InvalidFormatException;
@@ -48,14 +48,16 @@ import opennlp.tools.util.StringUtil;
* GPE (for Geo-Political Entity), or LOC (for Location).
* <p>
* Each file consists of four columns separated by a blank, containing
- * respectively the token, the Elsnet PoS-tag, the Adige news story to
+ * respectively the token, the Elsnet PoS-tag, the Adige news story to
* which the token belongs, and the Named Entity tag.
* <p>
- * Data can be found on this web site:<br>
- * http://www.evalita.it
+ * Data can be found on this
+ * <a href="http://www.evalita.it">web site</a>.
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class EvalitaNameSampleStream implements ObjectStream<NameSample> {
public enum LANGUAGE {
@@ -81,15 +83,8 @@ public class EvalitaNameSampleStream implements ObjectStream<NameSample> {
}
public EvalitaNameSampleStream(LANGUAGE lang, InputStreamFactory in, int types) throws IOException {
- this.lang = lang;
- try {
- this.lineStream = new PlainTextByLineStream(in, StandardCharsets.UTF_8);
- System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8.name()));
- } catch (UnsupportedEncodingException e) {
- // UTF-8 is available on all JVMs, will never happen
- throw new IllegalStateException(e);
- }
- this.types = types;
+ this(lang, new PlainTextByLineStream(in, StandardCharsets.UTF_8),types);
+ System.setOut(new PrintStream(System.out, true, StandardCharsets.UTF_8));
}
private static Span extract(int begin, int end, String beginTag) throws InvalidFormatException {
@@ -117,6 +112,7 @@ public class EvalitaNameSampleStream implements ObjectStream<NameSample> {
}
+ @Override
public NameSample read() throws IOException {
List<String> sentence = new ArrayList<>();
@@ -221,10 +217,12 @@ public class EvalitaNameSampleStream implements ObjectStream<NameSample> {
}
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
lineStream.reset();
}
+ @Override
public void close() throws IOException {
lineStream.close();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
index 2b680511..7fa9db40 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/EvalitaNameSampleStreamFactory.java
@@ -25,13 +25,18 @@ import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.EvalitaNameSampleStream.LANGUAGE;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see EvalitaNameSampleStream
*/
+@Internal
public class EvalitaNameSampleStreamFactory<P> extends LanguageSampleStreamFactory<NameSample, P> {
interface Parameters extends BasicFormatParams {
@@ -51,6 +56,7 @@ public class EvalitaNameSampleStreamFactory<P> extends LanguageSampleStreamFacto
super(params);
}
+ @Override
public ObjectStream<NameSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
@@ -66,19 +72,20 @@ public class EvalitaNameSampleStreamFactory<P> extends LanguageSampleStreamFacto
int typesToGenerate = 0;
- if (params.getTypes().contains("per")) {
+ final String types = params.getTypes();
+ if (types.contains("per")) {
typesToGenerate = typesToGenerate |
EvalitaNameSampleStream.GENERATE_PERSON_ENTITIES;
}
- if (params.getTypes().contains("org")) {
+ if (types.contains("org")) {
typesToGenerate = typesToGenerate |
EvalitaNameSampleStream.GENERATE_ORGANIZATION_ENTITIES;
}
- if (params.getTypes().contains("loc")) {
+ if (types.contains("loc")) {
typesToGenerate = typesToGenerate |
EvalitaNameSampleStream.GENERATE_LOCATION_ENTITIES;
}
- if (params.getTypes().contains("gpe")) {
+ if (types.contains("gpe")) {
typesToGenerate = typesToGenerate |
EvalitaNameSampleStream.GENERATE_GPE_ENTITIES;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
index 43ad0d7b..bda9d482 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
@@ -49,6 +49,7 @@ public class LanguageDetectorSampleStreamFactory<P>
super(params);
}
+ @Override
public ObjectStream<LanguageSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java
index b91e87d6..dfb137e7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LemmatizerSampleStreamFactory.java
@@ -46,6 +46,7 @@ public class LemmatizerSampleStreamFactory<P> extends AbstractSampleStreamFactor
super(params);
}
+ @Override
public ObjectStream<LemmaSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/NameFinderCensus90NameStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/NameFinderCensus90NameStream.java
index 3c2fddaf..43fe6f38 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/NameFinderCensus90NameStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/NameFinderCensus90NameStream.java
@@ -21,6 +21,7 @@ import java.io.IOException;
import java.nio.charset.Charset;
import java.util.Locale;
+import opennlp.tools.commons.Internal;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
@@ -40,8 +41,10 @@ import opennlp.tools.util.StringUtil;
* <li>The last is a ranking.
* </ul>
* <p>
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class NameFinderCensus90NameStream implements ObjectStream<StringList> {
private final Locale locale;
@@ -49,10 +52,9 @@ public class NameFinderCensus90NameStream implements ObjectStream<StringList> {
private final ObjectStream<String> lineStream;
/**
- * This constructor takes an ObjectStream and initializes the class to handle
- * the stream.
+ * Initializes a {@link NameFinderCensus90NameStream} via {@link ObjectStream<String>}.
*
- * @param lineStream an <code>ObjectSteam<String></code> that represents the
+ * @param lineStream An {@link ObjectStream<String>} that represents the
* input file to be attached to this class.
*/
public NameFinderCensus90NameStream(ObjectStream<String> lineStream) {
@@ -63,12 +65,14 @@ public class NameFinderCensus90NameStream implements ObjectStream<StringList> {
}
/**
- * This constructor takes an <code>InputStream</code> and a <code>Charset</code>
- * and opens an associated stream object with the specified encoding specified.
+ * Initializes a {@link NameFinderCensus90NameStream} via an {@link InputStreamFactory}
+ * and a {@link Charset}.
+ * Opens an associated stream object with the specified encoding.
*
- * @param in an <code>InputStreamFactory</code> for the input file.
- * @param encoding the <code>Charset</code> to apply to the input stream.
- * @throws IOException
+ * @param in The {@link InputStreamFactory} for the input file.
+ * @param encoding The {@link Charset} to apply to the input stream.
+ *
+ * @throws IOException Thrown if IO errors occurred.
*/
public NameFinderCensus90NameStream(InputStreamFactory in, Charset encoding)
throws IOException {
@@ -77,6 +81,7 @@ public class NameFinderCensus90NameStream implements ObjectStream<StringList> {
this.lineStream = new PlainTextByLineStream(in, this.encoding);
}
+ @Override
public StringList read() throws IOException {
String line = lineStream.read();
StringList name = null;
@@ -107,10 +112,12 @@ public class NameFinderCensus90NameStream implements ObjectStream<StringList> {
return name;
}
+ @Override
public void reset() throws IOException, UnsupportedOperationException {
lineStream.reset();
}
+ @Override
public void close() throws IOException {
lineStream.close();
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java
index 5454b799..508359bb 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/NameSampleDataStreamFactory.java
@@ -46,6 +46,7 @@ public class NameSampleDataStreamFactory<P> extends AbstractSampleStreamFactory<
super(params);
}
+ @Override
public ObjectStream<NameSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java
index dab05057..6ed45743 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ParseSampleStreamFactory.java
@@ -46,6 +46,7 @@ public class ParseSampleStreamFactory<P> extends AbstractSampleStreamFactory<Par
super(params);
}
+ @Override
public ObjectStream<Parse> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java
index 51ba2ce4..e002bbb1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/SentenceSampleStreamFactory.java
@@ -47,6 +47,7 @@ public class SentenceSampleStreamFactory<P> extends AbstractSampleStreamFactory<
super(params);
}
+ @Override
public ObjectStream<SentenceSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java
index bad46bf9..ffbd1e6c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/TokenSampleStreamFactory.java
@@ -46,6 +46,7 @@ public class TokenSampleStreamFactory<P> extends LanguageSampleStreamFactory<Tok
super(params);
}
+ @Override
public ObjectStream<TokenSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java
index 98b38f1b..a5cd3af6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStream.java
@@ -31,15 +31,15 @@ import opennlp.tools.util.ObjectStream;
public class TwentyNewsgroupSampleStream implements ObjectStream<DocumentSample> {
- private Tokenizer tokenizer;
+ private final Tokenizer tokenizer;
- private Map<Path, String> catFileMap = new HashMap<>();
+ private final Map<Path, String> catFileMap = new HashMap<>();
private Iterator<Map.Entry<Path, String>> catFileTupleIterator;
TwentyNewsgroupSampleStream(Tokenizer tokenizer, Path dataDir) throws IOException {
this.tokenizer = tokenizer;
- for (Path dir : Files.newDirectoryStream(dataDir, entry -> Files.isDirectory(entry))) {
+ for (Path dir : Files.newDirectoryStream(dataDir, Files::isDirectory)) {
for (Path file : Files.newDirectoryStream(dir)) {
catFileMap.put(file, dir.getFileName().toString());
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java
index 827d2096..edf3d5d3 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/TwentyNewsgroupSampleStreamFactory.java
@@ -69,7 +69,7 @@ public class TwentyNewsgroupSampleStreamFactory<P> extends AbstractSampleStreamF
tokenizer = WhitespaceTokenizer.INSTANCE;
}
else {
- throw new TerminateToolException(-1, "Unkown tokenizer: " + tokenizerName);
+ throw new TerminateToolException(-1, "Unknown tokenizer: " + tokenizerName);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java
index c8d171d9..4972b4d1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/WordTagSampleStreamFactory.java
@@ -23,6 +23,7 @@ import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.postag.POSSample;
import opennlp.tools.postag.WordTagSampleStream;
import opennlp.tools.util.InputStreamFactory;
@@ -30,8 +31,10 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class WordTagSampleStreamFactory<P> extends AbstractSampleStreamFactory<POSSample, P> {
public interface Parameters extends BasicFormatParams {
@@ -46,6 +49,7 @@ public class WordTagSampleStreamFactory<P> extends AbstractSampleStreamFactory<P
super(params);
}
+ @Override
public ObjectStream<POSSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java
index 6b7471f2..cdcbd9d1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStream.java
@@ -22,6 +22,7 @@ import java.util.ArrayList;
import java.util.List;
import opennlp.tools.chunker.ChunkSample;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.ad.ADSentenceStream.Sentence;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Leaf;
import opennlp.tools.formats.ad.ADSentenceStream.SentenceParser.Node;
@@ -55,6 +56,7 @@ import opennlp.tools.util.StringUtil;
* <p>
* <b>Note:</b> Do not use this class, internal use only!
*/
+@Internal
public class ADChunkSampleStream implements ObjectStream<ChunkSample> {
protected final ObjectStream<ADSentenceStream.Sentence> adSentenceStream;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java
index fcd8a1ca..49922f8c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADChunkSampleStreamFactory.java
@@ -27,6 +27,7 @@ import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.LanguageSampleStreamFactory;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
@@ -39,6 +40,7 @@ import opennlp.tools.util.PlainTextByLineStream;
* <b>Note:</b>
* Do not use this class, internal use only!
*/
+@Internal
public class ADChunkSampleStreamFactory<P> extends LanguageSampleStreamFactory<ChunkSample, P> {
interface Parameters {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
index f813771a..525b4097 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADNameSampleStreamFactory.java
@@ -26,6 +26,7 @@ import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.LanguageSampleStreamFactory;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.util.InputStreamFactory;
@@ -39,6 +40,7 @@ import opennlp.tools.util.PlainTextByLineStream;
* <b>Note:</b>
* Do not use this class, internal use only!
*/
+@Internal
public class ADNameSampleStreamFactory<P> extends LanguageSampleStreamFactory<NameSample, P> {
interface Parameters {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java
index cdfbc30f..80dff476 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADPOSSampleStreamFactory.java
@@ -26,6 +26,7 @@ import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.LanguageSampleStreamFactory;
import opennlp.tools.postag.POSSample;
import opennlp.tools.util.InputStreamFactory;
@@ -36,6 +37,7 @@ import opennlp.tools.util.PlainTextByLineStream;
* <b>Note:</b>
* Do not use this class, internal use only!
*/
+@Internal
public class ADPOSSampleStreamFactory<P> extends
LanguageSampleStreamFactory<POSSample, P> {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java
index bb6ae661..48748c92 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADSentenceSampleStreamFactory.java
@@ -26,6 +26,7 @@ import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.LanguageSampleStreamFactory;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.InputStreamFactory;
@@ -36,6 +37,7 @@ import opennlp.tools.util.PlainTextByLineStream;
* <b>Note:</b>
* Do not use this class, internal use only!
*/
+@Internal
public class ADSentenceSampleStreamFactory<P> extends
LanguageSampleStreamFactory<SentenceSample, P> {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
index 1a99f40c..7a93006a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/ADTokenSampleStreamFactory.java
@@ -20,6 +20,7 @@ package opennlp.tools.formats.ad;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.DetokenizerParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.formats.convert.NameToTokenSampleStream;
import opennlp.tools.namefind.NameSample;
@@ -30,6 +31,7 @@ import opennlp.tools.util.ObjectStream;
* <b>Note:</b>
* Do not use this class, internal use only!
*/
+@Internal
public class ADTokenSampleStreamFactory<P> extends
DetokenizerSampleStreamFactory<TokenSample, P> {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/ad/package-info.java
index 5e1429b9..f1d6a677 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ad/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the {@code Arvores Deitadas corpus} format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.ad;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
index f876d519..7edf4d9d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratAnnotationStream.java
@@ -33,7 +33,7 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.Span;
/**
- * Reads the annotations from the brat .ann annotation file.
+ * Reads the annotations from the brat {@code .ann} annotation file.
*/
public class BratAnnotationStream implements ObjectStream<BratAnnotation> {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
index 7444ec6e..8f786749 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
@@ -31,6 +31,16 @@ import java.util.Map;
import opennlp.tools.util.ObjectStream;
+/**
+ * Brat (brat rapid annotation tool) is based on the stav visualiser
+ * which was originally made in order to visualise BioNLP'11 Shared Task data.
+ * <p>
+ * Data can be found on this
+ * <a href="https://brat.nlplab.org/examples.html#corpus-examples-brat">web site</a>.
+ * <p>
+ * Information about the format is found on this
+ * <a href="https://brat.nlplab.org/introduction.html">web site</a>.
+ */
public class BratDocument {
private final AnnotationConfiguration config;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
index 2e3ae563..bec36f8c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
@@ -33,8 +33,8 @@ import opennlp.tools.util.Span;
public class BratDocumentParser {
- private SentenceDetector sentDetector;
- private Tokenizer tokenizer;
+ private final SentenceDetector sentDetector;
+ private final Tokenizer tokenizer;
private final Set<String> nameTypes;
public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
index 67d11f97..5abaf0ea 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentStream.java
@@ -32,12 +32,12 @@ import opennlp.tools.util.ObjectStream;
public class BratDocumentStream implements ObjectStream<BratDocument> {
- private AnnotationConfiguration config;
+ private final AnnotationConfiguration config;
private List<String> documentIds = new LinkedList<>();
private Iterator<String> documentIdIterator;
/**
- * Creates a BratDocumentStream which reads the documents from the given input directory.
+ * Creates a {@link BratDocumentStream} which reads the documents from the given input directory.
*
* @param config the annotation.conf from the brat project as an Annotation Configuration object
* @param bratCorpusDirectory the directory containing all the brat training data files
@@ -86,6 +86,7 @@ public class BratDocumentStream implements ObjectStream<BratDocument> {
reset();
}
+ @Override
public BratDocument read() throws IOException {
BratDocument doc = null;
@@ -102,10 +103,12 @@ public class BratDocumentStream implements ObjectStream<BratDocument> {
return doc;
}
+ @Override
public void reset() {
documentIdIterator = documentIds.iterator();
}
+ @Override
public void close() {
// No longer needed, make the object unusable
documentIds = null;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
index e915b052..fd4582bf 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStreamFactory.java
@@ -151,7 +151,7 @@ public class BratNameSampleStreamFactory
tokenizer = WhitespaceTokenizer.INSTANCE;
}
else {
- throw new TerminateToolException(-1, "Unkown tokenizer: " + tokenizerName);
+ throw new TerminateToolException(-1, "Unknown tokenizer: " + tokenizerName);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/brat/package-info.java
index 5e1429b9..ad52f9b1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the corpus format used by the "brat rapid annotation tool" (brat).
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.brat;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
index 98ee48d9..2437db33 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
@@ -20,6 +20,7 @@ package opennlp.tools.formats.conllu;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import java.util.Objects;
import opennlp.tools.lemmatizer.LemmaSample;
import opennlp.tools.util.FilterObjectStream;
@@ -29,9 +30,15 @@ public class ConlluLemmaSampleStream extends FilterObjectStream<ConlluSentence,
private final ConlluTagset tagset;
+ /**
+ * Initializes a {@link ConlluLemmaSampleStream}.
+ *
+ * @param samples The {@link ObjectStream<ConlluSentence> samples} used as input.
+ * @param tagset The {@link ConlluTagset} to use. Must not be {@code null}.
+ */
public ConlluLemmaSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) {
super(samples);
- this.tagset = tagset;
+ this.tagset = Objects.requireNonNull(tagset);
}
@Override
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
index 0647b5f8..452ae15c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
@@ -24,6 +24,7 @@ import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.lemmatizer.LemmaSample;
import opennlp.tools.util.InputStreamFactory;
@@ -32,6 +33,7 @@ import opennlp.tools.util.ObjectStream;
/**
* <b>Note:</b> Do not use this class, internal use only!
*/
+@Internal
public class ConlluLemmaSampleStreamFactory<P> extends AbstractSampleStreamFactory<LemmaSample, P> {
interface Parameters extends BasicFormatParams {
@@ -64,7 +66,7 @@ public class ConlluLemmaSampleStreamFactory<P> extends AbstractSampleStreamFacto
tagset = ConlluTagset.X;
break;
default:
- throw new TerminateToolException(-1, "Unkown tagset parameter: " + params.getTagset());
+ throw new TerminateToolException(-1, "Unknown tagset parameter: " + params.getTagset());
}
InputStreamFactory inFactory =
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java
index 82e81437..f45eb172 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStream.java
@@ -30,6 +30,12 @@ public class ConlluPOSSampleStream extends FilterObjectStream<ConlluSentence, PO
private final ConlluTagset tagset;
+ /**
+ * Initializes a {@link ConlluPOSSampleStream}.
+ *
+ * @param samples The {@link ObjectStream samples} used as input.
+ * @param tagset The {@link ConlluTagset} to use. Must not be {@code null}.
+ */
public ConlluPOSSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) {
super(samples);
this.tagset = Objects.requireNonNull(tagset);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
index d8c64da2..76a3277c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamFactory.java
@@ -24,6 +24,7 @@ import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.postag.POSSample;
import opennlp.tools.util.InputStreamFactory;
@@ -32,6 +33,7 @@ import opennlp.tools.util.ObjectStream;
/**
* <b>Note:</b> Do not use this class, internal use only!
*/
+@Internal
public class ConlluPOSSampleStreamFactory<P> extends AbstractSampleStreamFactory<POSSample, P> {
public static final String CONLLU_FORMAT = "conllu";
@@ -65,7 +67,7 @@ public class ConlluPOSSampleStreamFactory<P> extends AbstractSampleStreamFactory
tagset = ConlluTagset.X;
break;
default:
- throw new TerminateToolException(-1, "Unkown tagset parameter: " + params.getTagset());
+ throw new TerminateToolException(-1, "Unknown tagset parameter: " + params.getTagset());
}
InputStreamFactory inFactory =
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
index 730da2f2..695534d1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
@@ -24,10 +24,10 @@ import java.util.Optional;
public class ConlluSentence {
- private List<ConlluWordLine> wordLines;
+ private final List<ConlluWordLine> wordLines;
- private String sentenceIdComment;
- private String textComment;
+ private final String sentenceIdComment;
+ private final String textComment;
private boolean newDocument;
private String documentId;
private boolean newParagraph;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
index f49e2050..852e71eb 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
@@ -30,6 +30,12 @@ public class ConlluSentenceSampleStream extends FilterObjectStream<ConlluSentenc
private final int sentencesPerSample;
+ /**
+ * Initializes a {@link ConlluSentenceSampleStream}.
+ *
+ * @param samples The {@link ObjectStream samples} used as input.
+ * @param sentencesPerSample The number of sentences per sample. Must not be negative.
+ */
public ConlluSentenceSampleStream(ObjectStream<ConlluSentence> samples, int sentencesPerSample) {
super(samples);
this.sentencesPerSample = sentencesPerSample;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
index f39eca11..3b1164ca 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
@@ -23,14 +23,19 @@ import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see ConlluSentenceSampleStream
*/
+@Internal
public class ConlluSentenceSampleStreamFactory<P> extends AbstractSampleStreamFactory<SentenceSample, P> {
interface Parameters extends BasicFormatParams {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
index 29634a18..67913f32 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
@@ -39,12 +39,21 @@ import opennlp.tools.util.ParagraphStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
- * The CoNNL-U Format is specified here:
- * http://universaldependencies.org/format.html
+ * The CoNLL-U Format is specified
+ * <a href="http://universaldependencies.org/format.html">here</a>.
*/
public class ConlluStream implements ObjectStream<ConlluSentence> {
private final ObjectStream<String> sentenceStream;
+ private static final Pattern regex = Pattern.compile("text_([a-z]{2,3})");
+
+ /**
+ * Initializes a {@link ConlluStream}.
+ *
+ * @param in The {@link InputStreamFactory} to use. Characters will be interpreted in UTF-8.
+ *
+ * @throws IOException Thrown if IO errors occurred during initialization.
+ */
public ConlluStream(InputStreamFactory in) throws IOException {
this.sentenceStream = new ParagraphStream(new PlainTextByLineStream(in, StandardCharsets.UTF_8));
}
@@ -180,10 +189,12 @@ public class ConlluStream implements ObjectStream<ConlluSentence> {
}
/**
- * Merges token level annotations
- * @param contraction the line that receives the annotation
- * @param expandedParts the lines to get annotation
- * @return the merged line
+ * Merges token level annotations.
+ *
+ * @param contraction The line that receives the annotation.
+ * @param expandedParts The lines to get annotation.
+ *
+ * @return The {@link ConlluWordLine merged line}.
*/
private ConlluWordLine mergeAnnotation(ConlluWordLine contraction,
List<ConlluWordLine> expandedParts) {
@@ -221,7 +232,6 @@ public class ConlluStream implements ObjectStream<ConlluSentence> {
Map<Locale, String> textLang) throws InvalidFormatException {
String lang = "";
try {
- Pattern regex = Pattern.compile("text_([a-z]{2,3})");
Matcher regexMatcher = regex.matcher(firstPart);
if (regexMatcher.find()) {
lang = regexMatcher.group(1);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
index bc6907b5..eb97f04f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
@@ -26,6 +26,11 @@ import opennlp.tools.util.StringUtil;
public class ConlluTokenSampleStream extends FilterObjectStream<ConlluSentence, TokenSample> {
+ /**
+ * Initializes a {@link ConlluTokenSampleStream}.
+ *
+ * @param samples The {@link ObjectStream samples} used as input.
+ */
public ConlluTokenSampleStream(ObjectStream<ConlluSentence> samples) {
super(samples);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
index 21d8c4af..5f813a65 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
@@ -23,14 +23,19 @@ import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.CmdLineUtil;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.InputStreamFactory;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see ConlluTokenSampleStream
*/
+@Internal
public class ConlluTokenSampleStreamFactory<P> extends AbstractSampleStreamFactory<TokenSample, P> {
interface Parameters extends BasicFormatParams {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
index 4aee16d2..0e3129f3 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
@@ -67,7 +67,7 @@ public class ConlluWordLine {
}
/**
- * Retrieves the word index. An Integer starting at 1 for each new sentence;
+ * @return Retrieves the word index. An Integer starting at {@code 1} for each new sentence;
* may be a range for multiword tokens; may be a decimal number for empty nodes.
*/
public String getId() {
@@ -75,24 +75,26 @@ public class ConlluWordLine {
}
/**
- * Retrieve the word form or punctuation symbol.
+ * @return Retrieves the word form or punctuation symbol.
*/
public String getForm() {
return form;
}
/**
- * Retrieve the lemma or stem of the word form.
+ * @return Retrieves the lemma or stem of the word form.
*/
public String getLemma() {
return lemma;
}
/**
- * Retrieve the Universal part-of-speech tag or the language-specific part-of-speech tag;
- * underscore if not available.
+ * @param tagset The {@link ConlluTagset type of tag} to retrieve, either universal
+ * ({@link ConlluTagset#U}) or language specific ({@link ConlluTagset#X}).
*
- * @param tagset the type of tag to retrieve, either universial (u) or language specific (x)
+ * @return Retrieves the Universal part-of-speech tag or the language-specific part-of-speech tag;
+ * underscore if not available.
+ * @throws IllegalStateException Thrown if a non-supported {@link ConlluTagset} was specified.
*/
public String getPosTag(ConlluTagset tagset) {
switch (tagset) {
@@ -106,37 +108,38 @@ public class ConlluWordLine {
}
/**
- * Retrieve list of morphological features from the universal feature inventory or from a
- * defined language-specific extension; underscore if not available.
+ * @return Retrieves morphological features from the universal feature inventory
+ * or from a defined language-specific extension; underscore if not available.
*/
public String getFeats() {
return feats;
}
/**
- * Head of the current word, which is either a value of ID or zero (0).
+ * @return Retrieves the head of the current word, which is either a value of ID or zero (0).
*/
public String getHead() {
return head;
}
/**
- * Universal dependency relation to the HEAD (root iff HEAD = 0) or a
- * defined language-specific subtype of one.
+ * @return Retrieves the Universal dependency relation to the HEAD (root iff HEAD = 0)
+ * or a defined language-specific subtype of one.
*/
public String getDeprel() {
return deprel;
}
/**
- * Enhanced dependency graph in the form of a list of head-deprel pairs.
+ * @return Retrieves the enhanced dependency graph in the form of a list of
+ * head-deprel pairs.
*/
public String getDeps() {
return deps;
}
/**
- * Retrieve any other annotation.
+ * @return Retrieves any other annotation.
*/
public String getMisc() {
return misc;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/conllu/package-info.java
index 5e1429b9..862ff624 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the CoNLL-U format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.conllu;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/AbstractToSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/AbstractToSentenceSampleStream.java
index 77468a8f..bab8c3fa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/AbstractToSentenceSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/AbstractToSentenceSampleStream.java
@@ -34,6 +34,13 @@ public abstract class AbstractToSentenceSampleStream<T> extends
private final int chunkSize;
+ /**
+ * @param detokenizer The {@link Detokenizer} to use. Must not be {@code null}.
+ * @param samples The {@link ObjectStream samples} as input. Must not be {@code null}.
+ * @param chunkSize The size of chunks. Must be equal to or greater than {@code 0}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
AbstractToSentenceSampleStream(Detokenizer detokenizer,
ObjectStream<T> samples, int chunkSize) {
super(samples);
@@ -54,6 +61,7 @@ public abstract class AbstractToSentenceSampleStream<T> extends
protected abstract String[] toSentence(T sample);
+ @Override
public SentenceSample read() throws IOException {
List<String[]> sentences = new ArrayList<>();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java
index b7dedbbd..428cfed0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToByteArraySampleStream.java
@@ -24,11 +24,22 @@ import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
+import opennlp.tools.commons.Internal;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
+/**
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ */
+@Internal
public class FileToByteArraySampleStream extends FilterObjectStream<File, byte[]> {
+ /**
+ * Initializes a {@link FileToByteArraySampleStream}.
+ *
+ * @param samples The {@link ObjectStream} containing the files.
+ */
public FileToByteArraySampleStream(ObjectStream<File> samples) {
super(samples);
}
@@ -48,6 +59,7 @@ public class FileToByteArraySampleStream extends FilterObjectStream<File, byte[]
return bytes.toByteArray();
}
+ @Override
public byte[] read() throws IOException {
File sampleFile = samples.read();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java
index 933d3b85..0b18b3a5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/FileToStringSampleStream.java
@@ -17,13 +17,11 @@
package opennlp.tools.formats.convert;
-import java.io.BufferedReader;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
-import java.io.InputStreamReader;
import java.io.Reader;
import java.nio.charset.Charset;
+import java.nio.file.Files;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
@@ -31,52 +29,42 @@ import opennlp.tools.util.ObjectStream;
/**
* Provides the ability to read the contents of files
* contained in an object stream of files.
- *
*/
public class FileToStringSampleStream extends FilterObjectStream<File, String> {
private final Charset encoding;
/**
- * Creates a new file-to-string sample stream.
+ * Initializes a {@link FileToStringSampleStream}.
+ *
* @param samples The {@link ObjectStream} containing the files.
* @param encoding The {@link Charset} encoding of the files.
*/
public FileToStringSampleStream(ObjectStream<File> samples, Charset encoding) {
super(samples);
-
this.encoding = encoding;
}
/**
- * Reads the contents of a file to a string.
+ * Reads the contents of a {@code textFile} to a string.
+ *
* @param textFile The {@link File} to read.
- * @param encoding The {@link Charset} for the file.
+ * @param encoding The {@link Charset} of the {@code textFile}.
+ *
* @return The string contents of the file.
- * @throws IOException Thrown if the file cannot be read.
+ * @throws IOException Thrown if the file could not be read.
*/
private static String readFile(File textFile, Charset encoding) throws IOException {
- Reader in = new BufferedReader(new InputStreamReader(new FileInputStream(textFile), encoding));
-
StringBuilder text = new StringBuilder();
-
- try {
+ try (Reader in = Files.newBufferedReader(textFile.toPath(), encoding)) {
char[] buffer = new char[1024];
int length;
while ((length = in.read(buffer, 0, buffer.length)) > 0) {
text.append(buffer, 0, length);
}
}
- finally {
- try {
- in.close();
- }
- catch (IOException e) {
- // sorry that this can fail!
- }
- }
-
+
return text.toString();
}
@@ -84,7 +72,6 @@ public class FileToStringSampleStream extends FilterObjectStream<File, String> {
public String read() throws IOException {
File sampleFile = samples.read();
-
if (sampleFile != null) {
return readFile(sampleFile, encoding);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStream.java
index b5f3196f..4f8db8b5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStream.java
@@ -17,15 +17,27 @@
package opennlp.tools.formats.convert;
+import opennlp.tools.commons.Internal;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.tokenize.Detokenizer;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class NameToSentenceSampleStream extends AbstractToSentenceSampleStream<NameSample> {
+ /**
+ * Initializes a {@link NameToSentenceSampleStream}.
+ *
+ * @param detokenizer The {@link Detokenizer} to use. Must not be {@code null}.
+ * @param samples The {@link ObjectStream samples} as input. Must not be {@code null}.
+ * @param chunkSize The size of chunks. Must be equal to or greater than {@code 0}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public NameToSentenceSampleStream(Detokenizer detokenizer,
ObjectStream<NameSample> samples, int chunkSize) {
super(detokenizer, samples, chunkSize);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java
index 5a387af8..ad5354e4 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToSentenceSampleStreamFactory.java
@@ -20,6 +20,7 @@ package opennlp.tools.formats.convert;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.DetokenizerParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.formats.NameSampleDataStreamFactory;
import opennlp.tools.namefind.NameSample;
@@ -27,8 +28,12 @@ import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see NameToSentenceSampleStream
*/
+@Internal
public class NameToSentenceSampleStreamFactory<P> extends DetokenizerSampleStreamFactory<SentenceSample, P> {
interface Parameters extends NameSampleDataStreamFactory.Parameters, DetokenizerParameter {
@@ -43,6 +48,7 @@ public class NameToSentenceSampleStreamFactory<P> extends DetokenizerSampleStrea
super(params);
}
+ @Override
public ObjectStream<SentenceSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStream.java
index cfcdd38f..4c3057f0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStream.java
@@ -19,6 +19,7 @@ package opennlp.tools.formats.convert;
import java.io.IOException;
+import opennlp.tools.commons.Internal;
import opennlp.tools.namefind.NameSample;
import opennlp.tools.tokenize.Detokenizer;
import opennlp.tools.tokenize.TokenSample;
@@ -26,18 +27,29 @@ import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class NameToTokenSampleStream extends FilterObjectStream<NameSample, TokenSample> {
private final Detokenizer detokenizer;
+ /**
+ * Initializes a {@link NameToTokenSampleStream}.
+ *
+ * @param detokenizer The {@link Detokenizer} to use. Must not be {@code null}.
+ * @param samples The {@link ObjectStream samples} as input. Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public NameToTokenSampleStream(Detokenizer detokenizer, ObjectStream<NameSample> samples) {
super(samples);
this.detokenizer = detokenizer;
}
+ @Override
public TokenSample read() throws IOException {
NameSample nameSample = samples.read();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java
index 667845ea..4d2c10bc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/NameToTokenSampleStreamFactory.java
@@ -20,6 +20,7 @@ package opennlp.tools.formats.convert;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.DetokenizerParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.formats.NameSampleDataStreamFactory;
import opennlp.tools.namefind.NameSample;
@@ -27,8 +28,12 @@ import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see NameToTokenSampleStream
*/
+@Internal
public class NameToTokenSampleStreamFactory<P> extends DetokenizerSampleStreamFactory<TokenSample, P> {
interface Parameters extends NameSampleDataStreamFactory.Parameters, DetokenizerParameter {
@@ -43,6 +48,7 @@ public class NameToTokenSampleStreamFactory<P> extends DetokenizerSampleStreamFa
super(params);
}
+ @Override
public ObjectStream<TokenSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStream.java
index 64fa0842..c4788c98 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStream.java
@@ -17,15 +17,27 @@
package opennlp.tools.formats.convert;
+import opennlp.tools.commons.Internal;
import opennlp.tools.postag.POSSample;
import opennlp.tools.tokenize.Detokenizer;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class POSToSentenceSampleStream extends AbstractToSentenceSampleStream<POSSample> {
+ /**
+ * Initializes a {@link POSToSentenceSampleStream}.
+ *
+ * @param detokenizer The {@link Detokenizer} to use. Must not be {@code null}.
+ * @param samples The {@link ObjectStream samples} as input. Must not be {@code null}.
+ * @param chunkSize The size of chunks. Must be equal to or greater than {@code 0}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public POSToSentenceSampleStream(Detokenizer detokenizer, ObjectStream<POSSample> samples,
int chunkSize) {
super(detokenizer, samples, chunkSize);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java
index f84331dc..25162683 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToSentenceSampleStreamFactory.java
@@ -20,6 +20,7 @@ package opennlp.tools.formats.convert;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.DetokenizerParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.formats.WordTagSampleStreamFactory;
import opennlp.tools.postag.POSSample;
@@ -27,8 +28,12 @@ import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see POSToSentenceSampleStream
*/
+@Internal
public class POSToSentenceSampleStreamFactory<P> extends DetokenizerSampleStreamFactory<SentenceSample, P> {
interface Parameters extends WordTagSampleStreamFactory.Parameters, DetokenizerParameter {
@@ -43,6 +48,7 @@ public class POSToSentenceSampleStreamFactory<P> extends DetokenizerSampleStream
super(params);
}
+ @Override
public ObjectStream<SentenceSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStream.java
index 05ca149b..c59aadc7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStream.java
@@ -20,6 +20,7 @@ package opennlp.tools.formats.convert;
import java.io.IOException;
import java.util.Objects;
+import opennlp.tools.commons.Internal;
import opennlp.tools.postag.POSSample;
import opennlp.tools.tokenize.Detokenizer;
import opennlp.tools.tokenize.TokenSample;
@@ -27,22 +28,31 @@ import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class POSToTokenSampleStream extends FilterObjectStream<POSSample, TokenSample> {
private final Detokenizer detokenizer;
+ /**
+ * Initializes a {@link POSToTokenSampleStream}.
+ *
+ * @param detokenizer The {@link Detokenizer} to use. Must not be {@code null}.
+ * @param samples The {@link ObjectStream samples} as input. Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public POSToTokenSampleStream(Detokenizer detokenizer, ObjectStream<POSSample> samples) {
super(samples);
-
this.detokenizer = Objects.requireNonNull(detokenizer, "detokenizer must not be null!");
}
+ @Override
public TokenSample read() throws IOException {
POSSample posSample = samples.read();
-
TokenSample tokenSample = null;
if (posSample != null ) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactory.java
index c8a7aa89..e5750a33 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/POSToTokenSampleStreamFactory.java
@@ -20,6 +20,7 @@ package opennlp.tools.formats.convert;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.DetokenizerParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.formats.WordTagSampleStreamFactory;
import opennlp.tools.postag.POSSample;
@@ -27,8 +28,12 @@ import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see POSToTokenSampleStream
*/
+@Internal
public class POSToTokenSampleStreamFactory<P> extends DetokenizerSampleStreamFactory<TokenSample, P> {
interface Parameters extends WordTagSampleStreamFactory.Parameters, DetokenizerParameter {
@@ -43,6 +48,7 @@ public class POSToTokenSampleStreamFactory<P> extends DetokenizerSampleStreamFac
super(params);
}
+ @Override
public ObjectStream<TokenSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStream.java
index d5b66759..60354850 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStream.java
@@ -21,20 +21,31 @@ import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
+import opennlp.tools.commons.Internal;
import opennlp.tools.parser.Parse;
import opennlp.tools.postag.POSSample;
import opennlp.tools.util.FilterObjectStream;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class ParseToPOSSampleStream extends FilterObjectStream<Parse, POSSample> {
+ /**
+ * Initializes a {@link ParseToPOSSampleStream}.
+ *
+ * @param samples The {@link ObjectStream} of {@link Parse samples} as input. Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public ParseToPOSSampleStream(ObjectStream<Parse> samples) {
super(samples);
}
+ @Override
public POSSample read() throws IOException {
Parse parse = samples.read();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactory.java
index 2db7aa16..73f12a3c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToPOSSampleStreamFactory.java
@@ -19,6 +19,7 @@ package opennlp.tools.formats.convert;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.LanguageSampleStreamFactory;
import opennlp.tools.formats.ParseSampleStreamFactory;
import opennlp.tools.parser.Parse;
@@ -26,8 +27,12 @@ import opennlp.tools.postag.POSSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see ParseToPOSSampleStream
*/
+@Internal
public class ParseToPOSSampleStreamFactory
extends LanguageSampleStreamFactory<POSSample, ParseSampleStreamFactory.Parameters> {
@@ -35,6 +40,7 @@ public class ParseToPOSSampleStreamFactory
super(ParseSampleStreamFactory.Parameters.class);
}
+ @Override
public ObjectStream<POSSample> create(String[] args) {
ParseSampleStreamFactory.Parameters params =
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactory.java
index 6b929741..cfa253b7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToSentenceSampleStreamFactory.java
@@ -20,6 +20,7 @@ package opennlp.tools.formats.convert;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.DetokenizerParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.formats.ParseSampleStreamFactory;
import opennlp.tools.parser.Parse;
@@ -27,8 +28,10 @@ import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class ParseToSentenceSampleStreamFactory extends DetokenizerSampleStreamFactory
<SentenceSample, ParseToSentenceSampleStreamFactory.Parameters> {
@@ -39,6 +42,7 @@ public class ParseToSentenceSampleStreamFactory extends DetokenizerSampleStreamF
super(Parameters.class);
}
+ @Override
public ObjectStream<SentenceSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactory.java
index b825aad6..1a39c2c1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/convert/ParseToTokenSampleStreamFactory.java
@@ -20,6 +20,7 @@ package opennlp.tools.formats.convert;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.DetokenizerParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.formats.ParseSampleStreamFactory;
import opennlp.tools.formats.WordTagSampleStreamFactory;
@@ -28,8 +29,10 @@ import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
*/
+@Internal
public class ParseToTokenSampleStreamFactory extends DetokenizerSampleStreamFactory<TokenSample,
ParseToTokenSampleStreamFactory.Parameters> {
@@ -40,6 +43,7 @@ public class ParseToTokenSampleStreamFactory extends DetokenizerSampleStreamFact
super(Parameters.class);
}
+ @Override
public ObjectStream<TokenSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java
index 51067eab..fef25d8b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStream.java
@@ -33,15 +33,23 @@ import opennlp.tools.util.XmlUtil;
public class ConstitParseSampleStream extends FilterObjectStream<byte[], Parse> {
- private SAXParser saxParser;
+ private final SAXParser saxParser;
- private List<Parse> parses = new ArrayList<>();
+ private final List<Parse> parses = new ArrayList<>();
+ /**
+ * Initializes a {@link ConstitParseSampleStream}.
+ *
+ * @param samples The {@link ObjectStream byte[] samples} as input. Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
protected ConstitParseSampleStream(ObjectStream<byte[]> samples) {
super(samples);
saxParser = XmlUtil.createSaxParser();
}
+ @Override
public Parse read() throws IOException {
if (parses.isEmpty()) {
byte[] xmlbytes = samples.read();
@@ -53,7 +61,6 @@ public class ConstitParseSampleStream extends FilterObjectStream<byte[], Parse>
saxParser.parse(new ByteArrayInputStream(xmlbytes),
new ConstitDocumentHandler(producedParses));
} catch (SAXException e) {
- //TODO update after Java6 upgrade
throw new IOException(e.getMessage(), e);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java
index bd39c16e..11cd0108 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/ConstitParseSampleStreamFactory.java
@@ -20,12 +20,20 @@ package opennlp.tools.formats.frenchtreebank;
import opennlp.tools.cmdline.ArgumentParser;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.formats.DirectorySampleStream;
import opennlp.tools.formats.convert.FileToByteArraySampleStream;
import opennlp.tools.parser.Parse;
import opennlp.tools.util.ObjectStream;
+/**
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see ConstitParseSampleStream
+ */
+@Internal
public class ConstitParseSampleStreamFactory
extends AbstractSampleStreamFactory<Parse, ConstitParseSampleStreamFactory.Parameters> {
@@ -37,6 +45,7 @@ public class ConstitParseSampleStreamFactory
super(Parameters.class);
}
+ @Override
public ObjectStream<Parse> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/package-info.java
index 5e1429b9..4edbd5bd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/frenchtreebank/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the French Treebank format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.frenchtreebank;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
index d4ea184f..1949eaa7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
@@ -18,10 +18,10 @@
package opennlp.tools.formats.irishsentencebank;
import java.io.File;
-import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.lang.StringBuilder;
+import java.nio.file.Files;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
@@ -44,8 +44,13 @@ import opennlp.tools.util.XmlUtil;
* A structure to hold an Irish Sentence Bank document, which is a collection
* of tokenized sentences.
* <p>
- * The sentence bank can be downloaded from, and is described
- * <a href="http://www.lexiconista.com/datasets/sentencebank-ga/">here</a>
+ * The sentence bank can be downloaded from this
+ * <a href="https://github.com/michmech/irish-sentence-bank">website</a>.
+ * <p>
+ * It was originally published and described
+ * <a href="http://www.lexiconista.com/datasets/sentencebank-ga/">
+ * http://www.lexiconista.com/datasets/sentencebank-ga/</a>, yet this
+ * page was gone when last checked in December 2022.
*/
public class IrishSentenceBankDocument {
@@ -65,11 +70,12 @@ public class IrishSentenceBankDocument {
}
public static class IrishSentenceBankSentence {
- private String source;
- private String translation;
- private String original;
- private Span[] tokens;
- private IrishSentenceBankFlex[] flex;
+ private final String source;
+ private final String translation;
+ private final String original;
+ private final Span[] tokens;
+ private final IrishSentenceBankFlex[] flex;
+
public String getSource() {
return source;
}
@@ -98,25 +104,35 @@ public class IrishSentenceBankDocument {
}
}
- private List<IrishSentenceBankSentence> sentences;
+ private final List<IrishSentenceBankSentence> sentences;
+ /**
+ * Initializes an empty {@link IrishSentenceBankDocument}.
+ */
public IrishSentenceBankDocument() {
- sentences = new ArrayList<IrishSentenceBankSentence>();
+ sentences = new ArrayList<>();
}
+ /**
+ * @param sent An {@link IrishSentenceBankSentence} to be added.
+ */
public void add(IrishSentenceBankSentence sent) {
this.sentences.add(sent);
}
+ /**
+ * @return An unmodifiable list of all {@link IrishSentenceBankSentence sentences}.
+ */
public List<IrishSentenceBankSentence> getSentences() {
return Collections.unmodifiableList(sentences);
}
/**
- * Helper to adjust the span of punctuation tokens: ignores spaces to the left of the string
- * @param s the string to check
- * @param start the offset of the start of the string
- * @return the offset adjusted to ignore spaces to the left
+ * Helper to adjust the span of punctuation tokens: ignores spaces to the left of the string.
+ * @param s The string to check.
+ * @param start The offset of the start of the string.
+ *
+ * @return The offset adjusted to ignore spaces to the left.
*/
private static int advanceLeft(String s, int start) {
int ret = start;
@@ -131,10 +147,11 @@ public class IrishSentenceBankDocument {
}
/**
- * Helper to adjust the span of punctuation tokens: ignores spaces to the right of the string
- * @param s the string to check
- * @param start the offset of the start of the string
- * @return the offset of the end of the string, adjusted to ignore spaces to the right
+ * Helper to adjust the span of punctuation tokens: ignores spaces to the right of the string.
+ * @param s The string to check.
+ * @param start The offset of the start of the string.
+ *
+ * @return The offset of the end of the string, adjusted to ignore spaces to the right.
*/
private static int advanceRight(String s, int start) {
int end = s.length() - 1;
@@ -149,6 +166,15 @@ public class IrishSentenceBankDocument {
return ret;
}
+ /**
+ * Parses the data provided via an {@link InputStream} into an
+ * {@link IrishSentenceBankDocument}.
+ *
+ * @param is A valid, open {@link InputStream} ready for use.
+ *
+ * @return A valid {@link IrishSentenceBankDocument}.
+ * @throws IOException Thrown if IO errors occurred.
+ */
public static IrishSentenceBankDocument parse(InputStream is) throws IOException {
IrishSentenceBankDocument document = new IrishSentenceBankDocument();
@@ -203,7 +229,7 @@ public class IrishSentenceBankDocument {
spans.add(new Span(last, last + tmptok.length()));
String slottmpb = orignl.item(k).getAttributes().getNamedItem("slot").getNodeValue();
- Integer tokslot = Integer.parseInt(slottmpb);
+ int tokslot = Integer.parseInt(slottmpb);
if (tokslot > flexes) {
flexes = tokslot;
}
@@ -265,8 +291,17 @@ public class IrishSentenceBankDocument {
}
}
+ /**
+ * Parses the data provided via a {@link File} into an
+ * {@link IrishSentenceBankDocument}.
+ *
+ * @param file A valid {@link File} that holds the data to process.
+ *
+ * @return A valid {@link IrishSentenceBankDocument}.
+ * @throws IOException Thrown if IO errors occurred.
+ */
static IrishSentenceBankDocument parse(File file) throws IOException {
- try (InputStream in = new FileInputStream(file)) {
+ try (InputStream in = Files.newInputStream(file.toPath())) {
return parse(in);
}
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
index 488e3b6f..3decd8e0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
@@ -27,6 +27,9 @@ import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.ObjectStream;
+/**
+ * @see IrishSentenceBankSentenceStream
+ */
public class IrishSentenceBankSentenceStreamFactory<P>
extends AbstractSampleStreamFactory<SentenceSample, P> {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
index 8cbfac24..ec9ddea3 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
@@ -20,9 +20,15 @@ package opennlp.tools.formats.irishsentencebank;
import java.io.IOException;
import java.util.Iterator;
+import opennlp.tools.commons.Internal;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;
+/**
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ */
+@Internal
class IrishSentenceBankTokenSampleStream implements ObjectStream<TokenSample> {
private final IrishSentenceBankDocument source;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
index 9b7429a6..213c0b8a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
@@ -27,6 +27,9 @@ import opennlp.tools.formats.DetokenizerSampleStreamFactory;
import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.util.ObjectStream;
+/**
+ * @see IrishSentenceBankTokenSampleStream
+ */
public class IrishSentenceBankTokenSampleStreamFactory<P>
extends DetokenizerSampleStreamFactory<TokenSample, P> {
@@ -43,6 +46,7 @@ public class IrishSentenceBankTokenSampleStreamFactory<P>
super(params);
}
+ @Override
public ObjectStream<TokenSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/package-info.java
index 5e1429b9..8f13cd71 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the Irish Sentence Bank format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.irishsentencebank;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
index 28b2f64c..14b1c544 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -47,8 +47,19 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
private final String lang;
- private Iterator<String> lineIterator;
-
+ private final Iterator<String> lineIterator;
+
+ /**
+ * Initializes a {@link LeipzigSentencesStream}.
+ *
+ * @param lang An ISO language code.
+ * @param sentencesFile The {@link File} which contains sentences to process.
+ * @param sentencesPerSample The number of sentences per sample.
+ * @param numberOfSamples The number of samples to process at maximum.
+ *
+ * @throws IOException Thrown if IO errors occurred.
+ * @throws InvalidFormatException Thrown if {@code sentencesFile} does not have enough lines to process.
+ */
LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples)
throws IOException {
@@ -106,7 +117,7 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
String line = lineIterator.next();
int textStart = line.indexOf('\t') + 1;
- sampleString.append(line.substring(textStart) + " ");
+ sampleString.append(line.substring(textStart)).append(" ");
count++;
}
@@ -121,14 +132,23 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
private final int sentencesPerSample;
- private Map<String, Integer> langSampleCounts;
- private File[] sentencesFiles;
+ private final Map<String, Integer> langSampleCounts;
+ private final File[] sentencesFiles;
private Iterator<File> sentencesFilesIt;
private ObjectStream<LanguageSample> sampleStream;
private final Random random;
+ /**
+ * Initializes a {@link LeipzigLanguageSampleStream}.
+ *
+ * @param leipzigFolder The {@link File directory} which contains files to process.
+ * @param sentencesPerSample The number of sentences per sample.
+ * @param samplesPerLanguage The number of samples per language to process at maximum.
+ *
+ * @throws IOException Thrown if IO errors occurred.
+ */
public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
final int samplesPerLanguage) throws IOException {
this.sentencesPerSample = sentencesPerSample;
@@ -155,6 +175,7 @@ public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample>
reset();
}
+ @Override
public LanguageSample read() throws IOException {
LanguageSample sample;
if (sampleStream != null && (sample = sampleStream.read()) != null) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
index ea6c5c3b..b344de74 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -26,13 +26,18 @@ import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
import opennlp.tools.cmdline.StreamFactoryRegistry;
import opennlp.tools.cmdline.TerminateToolException;
import opennlp.tools.cmdline.params.EncodingParameter;
+import opennlp.tools.commons.Internal;
import opennlp.tools.formats.AbstractSampleStreamFactory;
import opennlp.tools.langdetect.LanguageSample;
import opennlp.tools.util.ObjectStream;
/**
- * <b>Note:</b> Do not use this class, internal use only!
+ * <b>Note:</b>
+ * Do not use this class, internal use only!
+ *
+ * @see LeipzigLanguageSampleStream
*/
+@Internal
public class LeipzigLanguageSampleStreamFactory<P>
extends AbstractSampleStreamFactory<LanguageSample, P> {
@@ -64,6 +69,7 @@ public class LeipzigLanguageSampleStreamFactory<P>
"leipzig", new LeipzigLanguageSampleStreamFactory<>(Parameters.class));
}
+ @Override
public ObjectStream<LanguageSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java
index be81c12d..417d202e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleShuffleStream.java
@@ -28,7 +28,7 @@ import opennlp.tools.util.ObjectStream;
class SampleShuffleStream<T> implements ObjectStream<T> {
- private List<T> bufferedSamples = new ArrayList<>();
+ private final List<T> bufferedSamples = new ArrayList<>();
private Iterator<T> sampleIt;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleSkipStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleSkipStream.java
index 13472754..2bfff5b8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleSkipStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/SampleSkipStream.java
@@ -23,7 +23,6 @@ import opennlp.tools.util.ObjectStream;
class SampleSkipStream<T> implements ObjectStream<T> {
-
private final ObjectStream<T> samples;
private final int samplesToSkip;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/package-info.java
index 5e1429b9..832ea618 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the {@code Leipzig} corpus format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.leipzig;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/DetokenizeSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/DetokenizeSentenceSampleStream.java
index f5ae9271..a71fdad6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/DetokenizeSentenceSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/DetokenizeSentenceSampleStream.java
@@ -34,6 +34,14 @@ public class DetokenizeSentenceSampleStream
private final Detokenizer detokenizer;
+ /**
+ * Initializes a {@link DetokenizeSentenceSampleStream}.
+ *
+ * @param detokenizer The {@link Detokenizer} to use. Must not be {@code null}.
+ * @param samples The {@link ObjectStream} of {@link SentenceSample samples} as input. Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public DetokenizeSentenceSampleStream(Detokenizer detokenizer, ObjectStream<SentenceSample> samples) {
super(samples);
this.detokenizer = Objects.requireNonNull(detokenizer);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtDocument.java
index eb10b7cd..b1abd517 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtDocument.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtDocument.java
@@ -36,14 +36,18 @@ import org.xml.sax.helpers.DefaultHandler;
import opennlp.tools.util.XmlUtil;
/**
- * A structure to hold the letsmt document. The documents contains sentences and depending on the
+ * A structure to hold the letsmt document. The document contains sentences and depending on the
* source it either contains tokenized text (words) or an un-tokenized sentence string.
* <p>
* The format specification can be found
- * <a href="http://project.letsmt.eu/uploads/Deliverables/D2.1%20%20Specification%20of%20data%20formats%20v1%20final.pdf">here</a>.
+ * <a href="http://project.letsmt.eu/uploads/Deliverables/D2.1%20%20Specification%20of%20data%20formats%20v1%20final.pdf">
+ * here</a>.
*/
public class LetsmtDocument {
+ private static final String ORG_XML_FEATURES_DISALLOW_DOCTYPE_DECL =
+ "http://apache.org/xml/features/disallow-doctype-decl";
+
public static class LetsmtSentence {
private String nonTokenizedText;
private String[] tokens;
@@ -61,12 +65,14 @@ public class LetsmtDocument {
}
}
- // define a content handler to receive the sax events ...
+ /**
+ * A {@link DefaultHandler content handler} to receive and process SAX events.
+ */
public static class LetsmtDocumentHandler extends DefaultHandler {
- private List<LetsmtSentence> sentences = new ArrayList<>();
+ private final List<LetsmtSentence> sentences = new ArrayList<>();
- private StringBuilder chars = new StringBuilder();
+ private final StringBuilder chars = new StringBuilder();
private List<String> tokens = new ArrayList<>();
@Override
@@ -88,7 +94,7 @@ public class LetsmtDocument {
break;
// TODO: The sentence should contain the id, so it can be tracked back to the
- // place it came from
+ // place it came from
case "s":
LetsmtSentence sentence = new LetsmtSentence();
@@ -107,16 +113,25 @@ public class LetsmtDocument {
}
}
- private List<LetsmtSentence> sentences = new ArrayList<>();
+ private final List<LetsmtSentence> sentences;
private LetsmtDocument(List<LetsmtSentence> sentences) {
this.sentences = sentences;
}
+ /**
+ * @return The sentences of this {@link LetsmtDocument}.
+ */
public List<LetsmtSentence> getSentences() {
return Collections.unmodifiableList(sentences);
}
+ /**
+ * @param letsmtXmlIn The {@link InputStream} referencing the document to parse.
+ *
+ * @return A valid {@link LetsmtDocument} instance.
+ * @throws IOException Thrown if IO errors occurred during loading or parsing.
+ */
static LetsmtDocument parse(InputStream letsmtXmlIn) throws IOException {
SAXParser saxParser = XmlUtil.createSaxParser();
@@ -124,7 +139,7 @@ public class LetsmtDocument {
XMLReader xmlReader = saxParser.getXMLReader();
LetsmtDocumentHandler docHandler = new LetsmtDocumentHandler();
xmlReader.setContentHandler(docHandler);
- xmlReader.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
+ xmlReader.setFeature(ORG_XML_FEATURES_DISALLOW_DOCTYPE_DECL, true);
xmlReader.parse(new InputSource(letsmtXmlIn));
return new LetsmtDocument(docHandler.sentences);
} catch (SAXException e) {
@@ -132,6 +147,12 @@ public class LetsmtDocument {
}
}
+ /**
+ * @param file The {@link File} referencing the document to parse.
+ *
+ * @return A valid {@link LetsmtDocument} instance.
+ * @throws IOException Thrown if IO errors occurred during loading or parsing.
+ */
static LetsmtDocument parse(File file) throws IOException {
try (InputStream in = new FileInputStream(file)) {
return parse(in);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactory.java
index 8c83e2fc..e3474257 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/LetsmtSentenceStreamFactory.java
@@ -32,6 +32,9 @@ import opennlp.tools.tokenize.Detokenizer;
import opennlp.tools.tokenize.DictionaryDetokenizer;
import opennlp.tools.util.ObjectStream;
+/**
+ * @see LetsmtSentenceStream
+ */
public class LetsmtSentenceStreamFactory<P> extends AbstractSampleStreamFactory<SentenceSample, P> {
interface Parameters extends BasicFormatParams {
@@ -65,8 +68,7 @@ public class LetsmtSentenceStreamFactory<P> extends AbstractSampleStreamFactory<
CmdLineUtil.handleCreateObjectStreamError(ex);
}
- // TODO:
- // Implement a filter stream to remove splits which are not at an eos char
+ // TODO Implement a filter stream to remove splits which are not at an eos char
ObjectStream<SentenceSample> samples = new LetsmtSentenceStream(letsmtDoc);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/package-info.java
index 5e1429b9..3252a2ae 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/letsmt/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the {@code letsmt} corpus format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.letsmt;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/masc/package-info.java
index 5e1429b9..2e619c7c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/masc/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the {@code MASC} corpus format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.masc;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStream.java
index 496a7f62..7ca6291f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStream.java
@@ -29,10 +29,19 @@ import opennlp.tools.util.Span;
public class MosesSentenceSampleStream extends FilterObjectStream<String, SentenceSample> {
+ /**
+ * Initializes a {@link MosesSentenceSampleStream}.
+ *
+ * @param sentences The {@link ObjectStream<String> samples} as input.
+ * Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public MosesSentenceSampleStream(ObjectStream<String> sentences) {
super(new EmptyLinePreprocessorStream(sentences));
}
+ @Override
public SentenceSample read() throws IOException {
StringBuilder sentencesString = new StringBuilder();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactory.java
index a04c4028..6e942c93 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/moses/MosesSentenceSampleStreamFactory.java
@@ -30,7 +30,7 @@ import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
/**
- * Factory producing OpenNLP {@link MosesSentenceSampleStream}s.
+ * Factory producing OpenNLP {@link MosesSentenceSampleStream} objects.
*/
public class MosesSentenceSampleStreamFactory<P> extends AbstractSampleStreamFactory<SentenceSample, P> {
@@ -46,6 +46,7 @@ public class MosesSentenceSampleStreamFactory<P> extends AbstractSampleStreamFac
super(params);
}
+ @Override
public ObjectStream<SentenceSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
index 9ac95147..691ef6b6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/DocumentSplitterStream.java
@@ -30,12 +30,13 @@ class DocumentSplitterStream extends FilterObjectStream<String, String> {
private static final String DOC_START_ELEMENT = "<DOC>";
private static final String DOC_END_ELEMENT = "</DOC>";
- private List<String> docs = new ArrayList<>();
+ private final List<String> docs = new ArrayList<>();
DocumentSplitterStream(ObjectStream<String> samples) {
super(samples);
}
+ @Override
public String read() throws IOException {
if (docs.isEmpty()) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java
index 95957d00..28030beb 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/Muc6NameSampleStreamFactory.java
@@ -47,6 +47,7 @@ public class Muc6NameSampleStreamFactory
new Muc6NameSampleStreamFactory());
}
+ @Override
public ObjectStream<NameSample> create(String[] args) {
Parameters params = ArgumentParser.parse(args, Parameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameContentHandler.java b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameContentHandler.java
index e25d6741..c9536e86 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameContentHandler.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameContentHandler.java
@@ -70,10 +70,16 @@ public class MucNameContentHandler extends SgmlParser.ContentHandler {
private boolean isClearAdaptiveData = false;
private final Stack<Span> incompleteNames = new Stack<>();
- private List<Span> names = new ArrayList<>();
-
- public MucNameContentHandler(Tokenizer tokenizer,
- List<NameSample> storedSamples) {
+ private final List<Span> names = new ArrayList<>();
+
+ /**
+ * Initializes a {@link MucNameContentHandler}.
+ *
+ * @param tokenizer The {@link Tokenizer} to use. Must not be {@code null}.
+ * @param storedSamples The {@link List<NameSample> samples} as input.
+ * Must not be {@code null}.
+ */
+ public MucNameContentHandler(Tokenizer tokenizer, List<NameSample> storedSamples) {
this.tokenizer = tokenizer;
this.storedSamples = storedSamples;
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameSampleStream.java
index 281df5d6..a2ab8346 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/MucNameSampleStream.java
@@ -31,13 +31,23 @@ public class MucNameSampleStream extends FilterObjectStream<String, NameSample>
private final Tokenizer tokenizer;
- private List<NameSample> storedSamples = new ArrayList<>();
+ private final List<NameSample> storedSamples = new ArrayList<>();
+ /**
+ * Initializes a {@link MucNameSampleStream}.
+ *
+ * @param tokenizer The {@link Tokenizer} to use. Must not be {@code null}.
+ * @param samples The {@link ObjectStream<String> samples} as input.
+ * Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
protected MucNameSampleStream(Tokenizer tokenizer, ObjectStream<String> samples) {
super(samples);
this.tokenizer = tokenizer;
}
+ @Override
public NameSample read() throws IOException {
if (storedSamples.isEmpty()) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/muc/package-info.java
index 5e1429b9..43c49634 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/muc/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the {@code MUC} corpus format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.muc;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java
index 514626cf..b377460d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPSegmentationDocument.java
@@ -86,7 +86,7 @@ public class NKJPSegmentationDocument {
Map<String, Map<String, Pointer>> sentences = new LinkedHashMap<>();
try {
- DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();;
+ DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
Document doc = docBuilder.parse(is);
XPathFactory xPathfactory = XPathFactory.newInstance();
@@ -231,8 +231,8 @@ public class NKJPSegmentationDocument {
}
String docid = pieces[0];
- int offset = 0;
- int length = 0;
+ int offset;
+ int length;
if (pieces.length == 3) {
offset = Integer.parseInt(pieces[1]);
length = Integer.parseInt(pieces[2]);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java
index 53421f4d..1ed187d7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/NKJPTextDocument.java
@@ -39,11 +39,24 @@ import org.xml.sax.SAXException;
import opennlp.tools.util.XmlUtil;
+/**
+ * The National corpus of Polish (NKJP) format.
+ * <p>
+ * Information about the format can be found on this
+ * <a href="http://nkjp.pl/index.php?page=0%26lang=1">web site</a>.
+ * <p>
+ * A 1-million word corpus can be found on this
+ * <a href="http://nkjp.pl/index.php?page=14%26lang=1">
+ * web site</a>.
+ * <p>
+ * The NKJP schema can be found
+ * <a href="http://nlp.ipipan.waw.pl/TEI4NKJP/">here</a>.
+ */
public class NKJPTextDocument {
- Map<String, String> divtypes;
+ private Map<String, String> divtypes;
- Map<String, Map<String, Map<String, String>>> texts;
+ private Map<String, Map<String, Map<String, String>>> texts;
NKJPTextDocument() {
divtypes = new HashMap<>();
@@ -61,7 +74,7 @@ public class NKJPTextDocument {
Map<String, Map<String, Map<String, String>>> texts = new HashMap<>();
try {
- DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();;
+ DocumentBuilder docBuilder = XmlUtil.createDocumentBuilder();
Document doc = docBuilder.parse(is);
XPathFactory xPathfactory = XPathFactory.newInstance();
@@ -143,7 +156,8 @@ public class NKJPTextDocument {
* Segmentation etc. is done only in relation to the paragraph,
* which are unique within a document. This is to simplify
* working with the paragraphs within the document
- * @return a map of paragaph IDs and their text
+ *
+ * @return A map of paragraph IDs and their text.
*/
Map<String, String> getParagraphs() {
Map<String, String> paragraphs = new HashMap<>();
@@ -158,12 +172,14 @@ public class NKJPTextDocument {
}
/**
- * Helper method to get the value of an attribute
- * @param n The node being processed
- * @param attrib The name of the attribute
- * @param required Whether or not the attribute is required
+ * Helper method to get the value of an attribute.
+ *
+ * @param n The {@link Node} to be processed.
+ * @param attrib The name of the attribute.
+ * @param required Whether the attribute is required or not.
+ *
* @return The value of the attribute, or null if not required and not present
- * @throws Exception
+ * @throws IOException Thrown if IO errors occurred.
*/
private static String attrib(Node n, String attrib, boolean required) throws IOException {
if (required && (n.getAttributes() == null || n.getAttributes().getLength() == 0)) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/package-info.java
index 5e1429b9..ed08eb7c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/nkjp/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the National corpus of Polish {@code NKJP} format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.nkjp;
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java
index eb8b3e23..c00398bb 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/DocumentToLineStream.java
@@ -27,10 +27,18 @@ import opennlp.tools.formats.brat.SegmenterObjectStream;
import opennlp.tools.util.ObjectStream;
/**
- * Reads a plain text file and return each line as a <code>String</code> object.
+ * Reads a plain text file and returns each line as a {@link String} object.
*/
public class DocumentToLineStream extends SegmenterObjectStream<String, String> {
+ /**
+ * Initializes a {@link DocumentToLineStream}.
+ *
+ * @param samples The {@link ObjectStream<String> samples} as input.
+ * Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public DocumentToLineStream(ObjectStream<String> samples) {
super(samples);
}
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java
index af2b5c8e..bdafca83 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStream.java
@@ -42,8 +42,16 @@ public class OntoNotesNameSampleStream extends
private final Map<String, String> tokenConversionMap;
- private List<NameSample> nameSamples = new LinkedList<>();
-
+ private final List<NameSample> nameSamples = new LinkedList<>();
+
+ /**
+ * Initializes an {@link OntoNotesNameSampleStream}.
+ *
+ * @param samples The {@link ObjectStream<String> samples} as input.
+ * Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public OntoNotesNameSampleStream(ObjectStream<String> samples) {
super(samples);
@@ -84,6 +92,7 @@ public class OntoNotesNameSampleStream extends
return cleanedToken;
}
+ @Override
public NameSample read() throws IOException {
if (nameSamples.isEmpty()) {
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
index b552bdcf..f71e3042 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesNameSampleStreamFactory.java
@@ -35,6 +35,7 @@ public class OntoNotesNameSampleStreamFactory extends
super(OntoNotesFormatParameters.class);
}
+ @Override
public ObjectStream<NameSample> create(String[] args) {
OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java
index d5b9125a..a82c0d97 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesPOSSampleStreamFactory.java
@@ -27,13 +27,14 @@ import opennlp.tools.util.ObjectStream;
public class OntoNotesPOSSampleStreamFactory
extends AbstractSampleStreamFactory<POSSample, OntoNotesFormatParameters> {
- private OntoNotesParseSampleStreamFactory parseSampleStreamFactory =
+ private final OntoNotesParseSampleStreamFactory parseSampleStreamFactory =
new OntoNotesParseSampleStreamFactory();
protected OntoNotesPOSSampleStreamFactory() {
super(OntoNotesFormatParameters.class);
}
+ @Override
public ObjectStream<POSSample> create(String[] args) {
ObjectStream<Parse> parseSampleStream = parseSampleStreamFactory.create(args);
return new ParseToPOSSampleStream(parseSampleStream);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java
index 855f9121..84fd1a5a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStream.java
@@ -27,10 +27,19 @@ import opennlp.tools.util.ObjectStream;
// Should be possible with this one, to train the parser and pos tagger!
public class OntoNotesParseSampleStream extends FilterObjectStream<String, Parse> {
+ /**
+ * Initializes an {@link OntoNotesParseSampleStream}.
+ *
+ * @param samples The {@link ObjectStream<String> samples} as input.
+ * Must not be {@code null}.
+ *
+ * @throws IllegalArgumentException Thrown if parameters are invalid.
+ */
public OntoNotesParseSampleStream(ObjectStream<String> samples) {
super(samples);
}
+ @Override
public Parse read() throws IOException {
StringBuilder parseString = new StringBuilder();
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java
index ba94c716..5b9fa0b5 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/OntoNotesParseSampleStreamFactory.java
@@ -35,6 +35,7 @@ public class OntoNotesParseSampleStreamFactory
super(OntoNotesFormatParameters.class);
}
+ @Override
public ObjectStream<Parse> create(String[] args) {
OntoNotesFormatParameters params = ArgumentParser.parse(args, OntoNotesFormatParameters.class);
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/package-info.java
similarity index 61%
copy from opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
copy to opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/package-info.java
index 5e1429b9..912bd843 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/AbstractSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/ontonotes/package-info.java
@@ -15,30 +15,7 @@
* limitations under the License.
*/
-package opennlp.tools.formats;
-
-import opennlp.tools.cmdline.ObjectStreamFactory;
-
/**
- * Base class for sample stream factories.
+ * Experimental package related to the OntoNotes 4.0 format.
*/
-public abstract class AbstractSampleStreamFactory<T,P> implements ObjectStreamFactory<T,P> {
-
- protected Class<P> params;
-
- private AbstractSampleStreamFactory() {
- }
-
- protected AbstractSampleStreamFactory(Class<P> params) {
- this.params = params;
- }
-
- public String getLang() {
- return "eng";
- }
-
- // FIXME
- public Class<P> getParameters() {
- return params;
- }
-}
+package opennlp.tools.formats.ontonotes;