You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by sm...@apache.org on 2017/07/04 04:13:27 UTC
[opennlp] branch master updated: OPENNLP-1084: Documents language
detector usage and training
This is an automated email from the ASF dual-hosted git repository.
smarthi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git
The following commit(s) were added to refs/heads/master by this push:
new 2b13a14 OPENNLP-1084: Documents language detector usage and training
2b13a14 is described below
commit 2b13a14ca826fb1ad3e77380d9d091659f9e5f97
Author: William D C M SILVA <co...@apache.org>
AuthorDate: Tue Jul 4 00:54:47 2017 -0300
OPENNLP-1084: Documents language detector usage and training
---
opennlp-docs/src/docbkx/cli.xml | 950 +++++++++++++++++++++++++--------
opennlp-docs/src/docbkx/langdetect.xml | 226 ++++++++
opennlp-docs/src/docbkx/opennlp.xml | 1 +
3 files changed, 944 insertions(+), 233 deletions(-)
diff --git a/opennlp-docs/src/docbkx/cli.xml b/opennlp-docs/src/docbkx/cli.xml
index 1a8c326..f809029 100644
--- a/opennlp-docs/src/docbkx/cli.xml
+++ b/opennlp-docs/src/docbkx/cli.xml
@@ -247,6 +247,255 @@ Usage: opennlp DoccatConverter help|leipzig [help|options...]
</section>
+<section id='tools.cli.langdetect'>
+
+<title>Langdetect</title>
+
+<section id='tools.cli.langdetect.LanguageDetector'>
+
+<title>LanguageDetector</title>
+
+<para>Learned language detector</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetector model < documents
+
+]]>
+</screen>
+</section>
+
+<section id='tools.cli.langdetect.LanguageDetectorTrainer'>
+
+<title>LanguageDetectorTrainer</title>
+
+<para>Trainer for the learnable language detector</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetectorTrainer[.leipzig] -model modelFile [-params paramsFile] [-factory factoryName]
+ -data sampleData [-encoding charsetName]
+Arguments description:
+ -model modelFile
+ output model file.
+ -params paramsFile
+ training parameters file.
+ -factory factoryName
+ A sub-class of LanguageDetectorFactory where to get implementation and resources.
+ -data sampleData
+ data to be used, usually a file name.
+ -encoding charsetName
+ encoding for reading and writing text, if absent the system default is used.
+
+]]>
+</screen>
+<para>The supported formats and arguments are:</para>
+
+<informaltable frame='all'><tgroup cols='5' align='left' colsep='1' rowsep='1'>
+<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
+<tbody>
+<row>
+<entry morerows='3' valign='middle'>leipzig</entry>
+<entry>sentencesDir</entry>
+<entry>sentencesDir</entry>
+<entry>No</entry>
+<entry>Dir with Leipzig sentences to be used</entry>
+</row>
+<row>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>samplesPerLanguage</entry>
+<entry>samplesPerLanguage</entry>
+<entry>No</entry>
+<entry>Number of samples per language</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+</tbody>
+</tgroup></informaltable>
+
+</section>
+
+<section id='tools.cli.langdetect.LanguageDetectorConverter'>
+
+<title>LanguageDetectorConverter</title>
+
+<para>Converts leipzig data format to native OpenNLP format</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetectorConverter help|leipzig [help|options...]
+
+]]>
+</screen>
+<para>The supported formats and arguments are:</para>
+
+<informaltable frame='all'><tgroup cols='5' align='left' colsep='1' rowsep='1'>
+<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
+<tbody>
+<row>
+<entry morerows='3' valign='middle'>leipzig</entry>
+<entry>sentencesDir</entry>
+<entry>sentencesDir</entry>
+<entry>No</entry>
+<entry>Dir with Leipzig sentences to be used</entry>
+</row>
+<row>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>samplesPerLanguage</entry>
+<entry>samplesPerLanguage</entry>
+<entry>No</entry>
+<entry>Number of samples per language</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+</tbody>
+</tgroup></informaltable>
+
+</section>
+
+<section id='tools.cli.langdetect.LanguageDetectorCrossValidator'>
+
+<title>LanguageDetectorCrossValidator</title>
+
+<para>K-fold cross validator for the learnable Language Detector</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetectorCrossValidator[.leipzig] [-misclassified true|false] [-folds num] [-factory
+ factoryName] [-params paramsFile] [-reportOutputFile outputFile] -data sampleData [-encoding
+ charsetName]
+Arguments description:
+ -misclassified true|false
+ if true will print false negatives and false positives.
+ -folds num
+ number of folds, default is 10.
+ -factory factoryName
+ A sub-class of LanguageDetectorFactory where to get implementation and resources.
+ -params paramsFile
+ training parameters file.
+ -reportOutputFile outputFile
+ the path of the fine-grained report file.
+ -data sampleData
+ data to be used, usually a file name.
+ -encoding charsetName
+ encoding for reading and writing text, if absent the system default is used.
+
+]]>
+</screen>
+<para>The supported formats and arguments are:</para>
+
+<informaltable frame='all'><tgroup cols='5' align='left' colsep='1' rowsep='1'>
+<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
+<tbody>
+<row>
+<entry morerows='3' valign='middle'>leipzig</entry>
+<entry>sentencesDir</entry>
+<entry>sentencesDir</entry>
+<entry>No</entry>
+<entry>Dir with Leipzig sentences to be used</entry>
+</row>
+<row>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>samplesPerLanguage</entry>
+<entry>samplesPerLanguage</entry>
+<entry>No</entry>
+<entry>Number of samples per language</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+</tbody>
+</tgroup></informaltable>
+
+</section>
+
+<section id='tools.cli.langdetect.LanguageDetectorEvaluator'>
+
+<title>LanguageDetectorEvaluator</title>
+
+<para>Measures the performance of the Language Detector model with the reference data</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetectorEvaluator[.leipzig] -model model [-misclassified true|false]
+ [-reportOutputFile outputFile] -data sampleData [-encoding charsetName]
+Arguments description:
+ -model model
+ the model file to be evaluated.
+ -misclassified true|false
+ if true will print false negatives and false positives.
+ -reportOutputFile outputFile
+ the path of the fine-grained report file.
+ -data sampleData
+ data to be used, usually a file name.
+ -encoding charsetName
+ encoding for reading and writing text, if absent the system default is used.
+
+]]>
+</screen>
+<para>The supported formats and arguments are:</para>
+
+<informaltable frame='all'><tgroup cols='5' align='left' colsep='1' rowsep='1'>
+<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
+<tbody>
+<row>
+<entry morerows='3' valign='middle'>leipzig</entry>
+<entry>sentencesDir</entry>
+<entry>sentencesDir</entry>
+<entry>No</entry>
+<entry>Dir with Leipzig sentences to be used</entry>
+</row>
+<row>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>samplesPerLanguage</entry>
+<entry>samplesPerLanguage</entry>
+<entry>No</entry>
+<entry>Number of samples per language</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+</tbody>
+</tgroup></informaltable>
+
+</section>
+
+</section>
+
<section id='tools.cli.dictionary'>
<title>Dictionary</title>
@@ -315,9 +564,9 @@ Usage: opennlp TokenizerME model < sentences
<screen>
<![CDATA[
-Usage: opennlp TokenizerTrainer[.ad|.pos|.conllx|.namefinder|.parse] [-factory factoryName] [-abbDict path]
- [-alphaNumOpt isAlphaNumOpt] [-params paramsFile] -lang language -model modelFile -data sampleData
- [-encoding charsetName]
+Usage: opennlp TokenizerTrainer[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.conllu] [-factory
+ factoryName] [-abbDict path] [-alphaNumOpt isAlphaNumOpt] [-params paramsFile] -lang language -model
+ modelFile -data sampleData [-encoding charsetName]
Arguments description:
-factory factoryName
A sub-class of TokenizerFactory where to get implementation and resources.
@@ -344,17 +593,24 @@ Arguments description:
<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
<tbody>
<row>
-<entry morerows='4' valign='middle'>ad</entry>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
<entry>encoding</entry>
<entry>charsetName</entry>
-<entry>No</entry>
+<entry>Yes</entry>
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
+<entry morerows='4' valign='middle'>ad</entry>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>No</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
<entry>lang</entry>
@@ -369,6 +625,12 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
<entry>detokenizer</entry>
<entry>dictionary</entry>
<entry>No</entry>
@@ -450,6 +712,19 @@ Arguments description:
<entry>No</entry>
<entry>Specifies the file with detokenizer dictionary.</entry>
</row>
+<row>
+<entry morerows='1' valign='middle'>conllu</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
</tbody>
</tgroup></informaltable>
@@ -463,8 +738,8 @@ Arguments description:
<screen>
<![CDATA[
-Usage: opennlp TokenizerMEEvaluator[.ad|.pos|.conllx|.namefinder|.parse] -model model [-misclassified
- true|false] -data sampleData [-encoding charsetName]
+Usage: opennlp TokenizerMEEvaluator[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.conllu] -model
+ model [-misclassified true|false] -data sampleData [-encoding charsetName]
Arguments description:
-model model
the model file to be evaluated.
@@ -483,17 +758,24 @@ Arguments description:
<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
<tbody>
<row>
-<entry morerows='4' valign='middle'>ad</entry>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
<entry>encoding</entry>
<entry>charsetName</entry>
-<entry>No</entry>
+<entry>Yes</entry>
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
+<entry morerows='4' valign='middle'>ad</entry>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>No</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
<entry>lang</entry>
@@ -508,6 +790,12 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
<entry>detokenizer</entry>
<entry>dictionary</entry>
<entry>No</entry>
@@ -589,6 +877,19 @@ Arguments description:
<entry>No</entry>
<entry>Specifies the file with detokenizer dictionary.</entry>
</row>
+<row>
+<entry morerows='1' valign='middle'>conllu</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
</tbody>
</tgroup></informaltable>
@@ -602,9 +903,9 @@ Arguments description:
<screen>
<![CDATA[
-Usage: opennlp TokenizerCrossValidator[.ad|.pos|.conllx|.namefinder|.parse] [-misclassified true|false]
- [-folds num] [-factory factoryName] [-abbDict path] [-alphaNumOpt isAlphaNumOpt] [-params paramsFile]
- -lang language -data sampleData [-encoding charsetName]
+Usage: opennlp TokenizerCrossValidator[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.conllu]
+ [-misclassified true|false] [-folds num] [-factory factoryName] [-abbDict path] [-alphaNumOpt
+ isAlphaNumOpt] [-params paramsFile] -lang language -data sampleData [-encoding charsetName]
Arguments description:
-misclassified true|false
if true will print false negatives and false positives.
@@ -633,6 +934,19 @@ Arguments description:
<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
<tbody>
<row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry morerows='4' valign='middle'>ad</entry>
<entry>encoding</entry>
<entry>charsetName</entry>
@@ -640,31 +954,50 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
<entry>splitHyphenatedTokens</entry>
<entry>split</entry>
<entry>Yes</entry>
<entry>If true all hyphenated tokens will be separated (default true)</entry>
</row>
<row>
-<entry>lang</entry>
-<entry>language</entry>
+<entry>detokenizer</entry>
+<entry>dictionary</entry>
<entry>No</entry>
-<entry>Language which is being processed.</entry>
+<entry>Specifies the file with detokenizer dictionary.</entry>
</row>
<row>
+<entry morerows='2' valign='middle'>pos</entry>
<entry>data</entry>
<entry>sampleData</entry>
<entry>No</entry>
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry>detokenizer</entry>
<entry>dictionary</entry>
<entry>No</entry>
<entry>Specifies the file with detokenizer dictionary.</entry>
</row>
<row>
-<entry morerows='2' valign='middle'>pos</entry>
+<entry morerows='2' valign='middle'>conllx</entry>
<entry>data</entry>
<entry>sampleData</entry>
<entry>No</entry>
@@ -683,7 +1016,7 @@ Arguments description:
<entry>Specifies the file with detokenizer dictionary.</entry>
</row>
<row>
-<entry morerows='2' valign='middle'>conllx</entry>
+<entry morerows='2' valign='middle'>namefinder</entry>
<entry>data</entry>
<entry>sampleData</entry>
<entry>No</entry>
@@ -702,7 +1035,7 @@ Arguments description:
<entry>Specifies the file with detokenizer dictionary.</entry>
</row>
<row>
-<entry morerows='2' valign='middle'>namefinder</entry>
+<entry morerows='2' valign='middle'>parse</entry>
<entry>data</entry>
<entry>sampleData</entry>
<entry>No</entry>
@@ -721,7 +1054,7 @@ Arguments description:
<entry>Specifies the file with detokenizer dictionary.</entry>
</row>
<row>
-<entry morerows='2' valign='middle'>parse</entry>
+<entry morerows='1' valign='middle'>conllu</entry>
<entry>data</entry>
<entry>sampleData</entry>
<entry>No</entry>
@@ -733,12 +1066,6 @@ Arguments description:
<entry>Yes</entry>
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
-<row>
-<entry>detokenizer</entry>
-<entry>dictionary</entry>
-<entry>No</entry>
-<entry>Specifies the file with detokenizer dictionary.</entry>
-</row>
</tbody>
</tgroup></informaltable>
@@ -748,12 +1075,12 @@ Arguments description:
<title>TokenizerConverter</title>
-<para>Converts foreign data formats (ad,pos,conllx,namefinder,parse) to native OpenNLP format</para>
+<para>Converts foreign data formats (irishsentencebank,ad,pos,conllx,namefinder,parse,conllu) to native OpenNLP format</para>
<screen>
<![CDATA[
-Usage: opennlp TokenizerConverter help|ad|pos|conllx|namefinder|parse [help|options...]
-
+Usage: opennlp TokenizerConverter help|irishsentencebank|ad|pos|conllx|namefinder|parse|conllu
+ [help|options...]
]]>
</screen>
<para>The supported formats and arguments are:</para>
@@ -762,17 +1089,24 @@ Usage: opennlp TokenizerConverter help|ad|pos|conllx|namefinder|parse [help|opti
<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
<tbody>
<row>
-<entry morerows='4' valign='middle'>ad</entry>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
<entry>encoding</entry>
<entry>charsetName</entry>
-<entry>No</entry>
+<entry>Yes</entry>
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
+<entry morerows='4' valign='middle'>ad</entry>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>No</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
<entry>lang</entry>
@@ -787,6 +1121,12 @@ Usage: opennlp TokenizerConverter help|ad|pos|conllx|namefinder|parse [help|opti
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
<entry>detokenizer</entry>
<entry>dictionary</entry>
<entry>No</entry>
@@ -868,6 +1208,19 @@ Usage: opennlp TokenizerConverter help|ad|pos|conllx|namefinder|parse [help|opti
<entry>No</entry>
<entry>Specifies the file with detokenizer dictionary.</entry>
</row>
+<row>
+<entry morerows='1' valign='middle'>conllu</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
</tbody>
</tgroup></informaltable>
@@ -915,16 +1268,17 @@ Usage: opennlp SentenceDetector model < sentences
<screen>
<![CDATA[
-Usage: opennlp SentenceDetectorTrainer[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] [-factory
- factoryName] [-abbDict path] [-eosChars string] [-params paramsFile] -lang language -model modelFile
- -data sampleData [-encoding charsetName]
+Usage: opennlp
+ SentenceDetectorTrainer[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.moses|.conllu|.letsmt]
+ [-factory factoryName] [-eosChars string] [-abbDict path] [-params paramsFile] -lang language -model
+ modelFile -data sampleData [-encoding charsetName]
Arguments description:
-factory factoryName
A sub-class of SentenceDetectorFactory where to get implementation and resources.
- -abbDict path
- abbreviation dictionary in XML format.
-eosChars string
EOS characters.
+ -abbDict path
+ abbreviation dictionary in XML format.
-params paramsFile
training parameters file.
-lang language
@@ -944,6 +1298,19 @@ Arguments description:
<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
<tbody>
<row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry morerows='3' valign='middle'>ad</entry>
<entry>encoding</entry>
<entry>charsetName</entry>
@@ -951,12 +1318,6 @@ Arguments description:
<entry>Encoding for reading and writing text.</entry>
</row>
<row>
-<entry>includeTitles</entry>
-<entry>includeTitles</entry>
-<entry>Yes</entry>
-<entry>If true will include sentences marked as headlines.</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -969,6 +1330,12 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>includeTitles</entry>
+<entry>includeTitles</entry>
+<entry>Yes</entry>
+<entry>If true will include sentences marked as headlines.</entry>
+</row>
+<row>
<entry morerows='2' valign='middle'>pos</entry>
<entry>data</entry>
<entry>sampleData</entry>
@@ -1058,6 +1425,25 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry morerows='2' valign='middle'>letsmt</entry>
<entry>detokenizer</entry>
<entry>dictionary</entry>
@@ -1089,8 +1475,9 @@ Arguments description:
<screen>
<![CDATA[
-Usage: opennlp SentenceDetectorEvaluator[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] -model model
- [-misclassified true|false] -data sampleData [-encoding charsetName]
+Usage: opennlp
+ SentenceDetectorEvaluator[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.moses|.conllu|.letsmt]
+ -model model [-misclassified true|false] -data sampleData [-encoding charsetName]
Arguments description:
-model model
the model file to be evaluated.
@@ -1109,6 +1496,19 @@ Arguments description:
<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
<tbody>
<row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry morerows='3' valign='middle'>ad</entry>
<entry>encoding</entry>
<entry>charsetName</entry>
@@ -1116,12 +1516,6 @@ Arguments description:
<entry>Encoding for reading and writing text.</entry>
</row>
<row>
-<entry>includeTitles</entry>
-<entry>includeTitles</entry>
-<entry>Yes</entry>
-<entry>If true will include sentences marked as headlines.</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -1134,6 +1528,12 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>includeTitles</entry>
+<entry>includeTitles</entry>
+<entry>Yes</entry>
+<entry>If true will include sentences marked as headlines.</entry>
+</row>
+<row>
<entry morerows='2' valign='middle'>pos</entry>
<entry>data</entry>
<entry>sampleData</entry>
@@ -1223,6 +1623,25 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry morerows='2' valign='middle'>letsmt</entry>
<entry>detokenizer</entry>
<entry>dictionary</entry>
@@ -1254,16 +1673,17 @@ Arguments description:
<screen>
<![CDATA[
-Usage: opennlp SentenceDetectorCrossValidator[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] [-factory
- factoryName] [-abbDict path] [-eosChars string] [-params paramsFile] -lang language [-misclassified
- true|false] [-folds num] -data sampleData [-encoding charsetName]
+Usage: opennlp
+ SentenceDetectorCrossValidator[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.moses|.conllu|.letsmt]
+ [-factory factoryName] [-eosChars string] [-abbDict path] [-params paramsFile] -lang language
+ [-misclassified true|false] [-folds num] -data sampleData [-encoding charsetName]
Arguments description:
-factory factoryName
A sub-class of SentenceDetectorFactory where to get implementation and resources.
- -abbDict path
- abbreviation dictionary in XML format.
-eosChars string
EOS characters.
+ -abbDict path
+ abbreviation dictionary in XML format.
-params paramsFile
training parameters file.
-lang language
@@ -1285,6 +1705,19 @@ Arguments description:
<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
<tbody>
<row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry morerows='3' valign='middle'>ad</entry>
<entry>encoding</entry>
<entry>charsetName</entry>
@@ -1292,12 +1725,6 @@ Arguments description:
<entry>Encoding for reading and writing text.</entry>
</row>
<row>
-<entry>includeTitles</entry>
-<entry>includeTitles</entry>
-<entry>Yes</entry>
-<entry>If true will include sentences marked as headlines.</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -1310,6 +1737,12 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>includeTitles</entry>
+<entry>includeTitles</entry>
+<entry>Yes</entry>
+<entry>If true will include sentences marked as headlines.</entry>
+</row>
+<row>
<entry morerows='2' valign='middle'>pos</entry>
<entry>data</entry>
<entry>sampleData</entry>
@@ -1399,6 +1832,25 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry morerows='2' valign='middle'>letsmt</entry>
<entry>detokenizer</entry>
<entry>dictionary</entry>
@@ -1426,12 +1878,12 @@ Arguments description:
<title>SentenceDetectorConverter</title>
-<para>Converts foreign data formats (ad,pos,conllx,namefinder,parse,moses,letsmt) to native OpenNLP format</para>
+<para>Converts foreign data formats (irishsentencebank,ad,pos,conllx,namefinder,parse,moses,conllu,letsmt) to native OpenNLP format</para>
<screen>
<![CDATA[
-Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|moses|letsmt [help|options...]
-
+Usage: opennlp SentenceDetectorConverter
+ help|irishsentencebank|ad|pos|conllx|namefinder|parse|moses|conllu|letsmt [help|options...]
]]>
</screen>
<para>The supported formats and arguments are:</para>
@@ -1440,6 +1892,19 @@ Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|mos
<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
<tbody>
<row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry morerows='3' valign='middle'>ad</entry>
<entry>encoding</entry>
<entry>charsetName</entry>
@@ -1447,12 +1912,6 @@ Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|mos
<entry>Encoding for reading and writing text.</entry>
</row>
<row>
-<entry>includeTitles</entry>
-<entry>includeTitles</entry>
-<entry>Yes</entry>
-<entry>If true will include sentences marked as headlines.</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -1465,6 +1924,12 @@ Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|mos
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>includeTitles</entry>
+<entry>includeTitles</entry>
+<entry>Yes</entry>
+<entry>If true will include sentences marked as headlines.</entry>
+</row>
+<row>
<entry morerows='2' valign='middle'>pos</entry>
<entry>data</entry>
<entry>sampleData</entry>
@@ -1554,6 +2019,25 @@ Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|mos
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
<entry morerows='2' valign='middle'>letsmt</entry>
<entry>detokenizer</entry>
<entry>dictionary</entry>
@@ -1642,14 +2126,14 @@ Arguments description:
<tbody>
<row>
<entry morerows='3' valign='middle'>evalita</entry>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -1673,12 +2157,6 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -1691,15 +2169,21 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
<entry morerows='3' valign='middle'>conll03</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>eng|deu</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -1736,14 +2220,14 @@ Arguments description:
</row>
<row>
<entry morerows='3' valign='middle'>conll02</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>spa|nld</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -1863,14 +2347,14 @@ Arguments description:
<tbody>
<row>
<entry morerows='3' valign='middle'>evalita</entry>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -1894,12 +2378,6 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -1912,15 +2390,21 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
<entry morerows='3' valign='middle'>conll03</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>eng|deu</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -1957,14 +2441,14 @@ Arguments description:
</row>
<row>
<entry morerows='3' valign='middle'>conll02</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>spa|nld</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -2101,14 +2585,14 @@ Arguments description:
<tbody>
<row>
<entry morerows='3' valign='middle'>evalita</entry>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -2132,12 +2616,6 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -2150,15 +2628,21 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
<entry morerows='3' valign='middle'>conll03</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>eng|deu</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -2195,14 +2679,14 @@ Arguments description:
</row>
<row>
<entry morerows='3' valign='middle'>conll02</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>spa|nld</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -2305,14 +2789,14 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
<tbody>
<row>
<entry morerows='3' valign='middle'>evalita</entry>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -2336,12 +2820,6 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -2354,15 +2832,21 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
<entry morerows='3' valign='middle'>conll03</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>eng|deu</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -2399,14 +2883,14 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
</row>
<row>
<entry morerows='3' valign='middle'>conll02</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>spa|nld</entry>
<entry>No</entry>
<entry></entry>
</row>
<row>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
<entry>No</entry>
<entry></entry>
</row>
@@ -2539,19 +3023,19 @@ Usage: opennlp POSTagger model < sentences
<screen>
<![CDATA[
Usage: opennlp POSTaggerTrainer[.ad|.conllx|.parse|.ontonotes|.conllu] [-factory factoryName] [-resources
- resourcesDir] [-featuregen featuregenFile] [-dict dictionaryPath] [-tagDictCutoff tagDictCutoff]
+ resourcesDir] [-tagDictCutoff tagDictCutoff] [-featuregen featuregenFile] [-dict dictionaryPath]
[-params paramsFile] -lang language -model modelFile -data sampleData [-encoding charsetName]
Arguments description:
-factory factoryName
A sub-class of POSTaggerFactory where to get implementation and resources.
-resources resourcesDir
The resources directory
+ -tagDictCutoff tagDictCutoff
+ TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
-featuregen featuregenFile
The feature generator descriptor file
-dict dictionaryPath
The XML tag dictionary file
- -tagDictCutoff tagDictCutoff
- TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
-params paramsFile
training parameters file.
-lang language
@@ -2578,18 +3062,6 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>expandME</entry>
-<entry>expandME</entry>
-<entry>Yes</entry>
-<entry>Expand multiword expressions.</entry>
-</row>
-<row>
-<entry>includeFeatures</entry>
-<entry>includeFeatures</entry>
-<entry>Yes</entry>
-<entry>Combine POS Tags with word features, like number and gender.</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -2602,6 +3074,18 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>expandME</entry>
+<entry>expandME</entry>
+<entry>Yes</entry>
+<entry>Expand multiword expressions.</entry>
+</row>
+<row>
+<entry>includeFeatures</entry>
+<entry>includeFeatures</entry>
+<entry>Yes</entry>
+<entry>Combine POS Tags with word features, like number and gender.</entry>
+</row>
+<row>
<entry morerows='1' valign='middle'>conllx</entry>
<entry>data</entry>
<entry>sampleData</entry>
@@ -2695,18 +3179,6 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>expandME</entry>
-<entry>expandME</entry>
-<entry>Yes</entry>
-<entry>Expand multiword expressions.</entry>
-</row>
-<row>
-<entry>includeFeatures</entry>
-<entry>includeFeatures</entry>
-<entry>Yes</entry>
-<entry>Combine POS Tags with word features, like number and gender.</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -2719,6 +3191,18 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>expandME</entry>
+<entry>expandME</entry>
+<entry>Yes</entry>
+<entry>Expand multiword expressions.</entry>
+</row>
+<row>
+<entry>includeFeatures</entry>
+<entry>includeFeatures</entry>
+<entry>Yes</entry>
+<entry>Combine POS Tags with word features, like number and gender.</entry>
+</row>
+<row>
<entry morerows='1' valign='middle'>conllx</entry>
<entry>data</entry>
<entry>sampleData</entry>
@@ -2784,9 +3268,9 @@ Arguments description:
<screen>
<![CDATA[
Usage: opennlp POSTaggerCrossValidator[.ad|.conllx|.parse|.ontonotes|.conllu] [-misclassified true|false]
- [-folds num] [-factory factoryName] [-resources resourcesDir] [-featuregen featuregenFile] [-dict
- dictionaryPath] [-tagDictCutoff tagDictCutoff] [-params paramsFile] -lang language [-reportOutputFile
- outputFile] -data sampleData [-encoding charsetName]
+ [-folds num] [-factory factoryName] [-resources resourcesDir] [-tagDictCutoff tagDictCutoff]
+ [-featuregen featuregenFile] [-dict dictionaryPath] [-params paramsFile] -lang language
+ [-reportOutputFile outputFile] -data sampleData [-encoding charsetName]
Arguments description:
-misclassified true|false
if true will print false negatives and false positives.
@@ -2796,12 +3280,12 @@ Arguments description:
A sub-class of POSTaggerFactory where to get implementation and resources.
-resources resourcesDir
The resources directory
+ -tagDictCutoff tagDictCutoff
+ TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
-featuregen featuregenFile
The feature generator descriptor file
-dict dictionaryPath
The XML tag dictionary file
- -tagDictCutoff tagDictCutoff
- TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
-params paramsFile
training parameters file.
-lang language
@@ -2828,18 +3312,6 @@ Arguments description:
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>expandME</entry>
-<entry>expandME</entry>
-<entry>Yes</entry>
-<entry>Expand multiword expressions.</entry>
-</row>
-<row>
-<entry>includeFeatures</entry>
-<entry>includeFeatures</entry>
-<entry>Yes</entry>
-<entry>Combine POS Tags with word features, like number and gender.</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -2852,6 +3324,18 @@ Arguments description:
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>expandME</entry>
+<entry>expandME</entry>
+<entry>Yes</entry>
+<entry>Expand multiword expressions.</entry>
+</row>
+<row>
+<entry>includeFeatures</entry>
+<entry>includeFeatures</entry>
+<entry>Yes</entry>
+<entry>Combine POS Tags with word features, like number and gender.</entry>
+</row>
+<row>
<entry morerows='1' valign='middle'>conllx</entry>
<entry>data</entry>
<entry>sampleData</entry>
@@ -2933,18 +3417,6 @@ Usage: opennlp POSTaggerConverter help|ad|conllx|parse|ontonotes|conllu [help|op
<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
</row>
<row>
-<entry>expandME</entry>
-<entry>expandME</entry>
-<entry>Yes</entry>
-<entry>Expand multiword expressions.</entry>
-</row>
-<row>
-<entry>includeFeatures</entry>
-<entry>includeFeatures</entry>
-<entry>Yes</entry>
-<entry>Combine POS Tags with word features, like number and gender.</entry>
-</row>
-<row>
<entry>lang</entry>
<entry>language</entry>
<entry>No</entry>
@@ -2957,6 +3429,18 @@ Usage: opennlp POSTaggerConverter help|ad|conllx|parse|ontonotes|conllu [help|op
<entry>Data to be used, usually a file name.</entry>
</row>
<row>
+<entry>expandME</entry>
+<entry>expandME</entry>
+<entry>Yes</entry>
+<entry>Expand multiword expressions.</entry>
+</row>
+<row>
+<entry>includeFeatures</entry>
+<entry>includeFeatures</entry>
+<entry>Yes</entry>
+<entry>Combine POS Tags with word features, like number and gender.</entry>
+</row>
+<row>
<entry morerows='1' valign='middle'>conllx</entry>
<entry>data</entry>
<entry>sampleData</entry>
@@ -3206,18 +3690,18 @@ Arguments description:
<entry>Language which is being processed.</entry>
</row>
<row>
-<entry>start</entry>
-<entry>start</entry>
-<entry>Yes</entry>
-<entry>Index of first sentence</entry>
-</row>
-<row>
<entry>end</entry>
<entry>end</entry>
<entry>Yes</entry>
<entry>Index of last sentence</entry>
</row>
<row>
+<entry>start</entry>
+<entry>start</entry>
+<entry>Yes</entry>
+<entry>Index of first sentence</entry>
+</row>
+<row>
<entry>data</entry>
<entry>sampleData</entry>
<entry>No</entry>
@@ -3271,18 +3755,18 @@ Arguments description:
<entry>Language which is being processed.</entry>
</row>
<row>
-<entry>start</entry>
-<entry>start</entry>
-<entry>Yes</entry>
-<entry>Index of first sentence</entry>
-</row>
-<row>
<entry>end</entry>
<entry>end</entry>
<entry>Yes</entry>
<entry>Index of last sentence</entry>
</row>
<row>
+<entry>start</entry>
+<entry>start</entry>
+<entry>Yes</entry>
+<entry>Index of first sentence</entry>
+</row>
+<row>
<entry>data</entry>
<entry>sampleData</entry>
<entry>No</entry>
@@ -3343,18 +3827,18 @@ Arguments description:
<entry>Language which is being processed.</entry>
</row>
<row>
-<entry>start</entry>
-<entry>start</entry>
-<entry>Yes</entry>
-<entry>Index of first sentence</entry>
-</row>
-<row>
<entry>end</entry>
<entry>end</entry>
<entry>Yes</entry>
<entry>Index of last sentence</entry>
</row>
<row>
+<entry>start</entry>
+<entry>start</entry>
+<entry>Yes</entry>
+<entry>Index of first sentence</entry>
+</row>
+<row>
<entry>data</entry>
<entry>sampleData</entry>
<entry>No</entry>
@@ -3396,18 +3880,18 @@ Usage: opennlp ChunkerConverter help|ad [help|options...]
<entry>Language which is being processed.</entry>
</row>
<row>
-<entry>start</entry>
-<entry>start</entry>
-<entry>Yes</entry>
-<entry>Index of first sentence</entry>
-</row>
-<row>
<entry>end</entry>
<entry>end</entry>
<entry>Yes</entry>
<entry>Index of last sentence</entry>
</row>
<row>
+<entry>start</entry>
+<entry>start</entry>
+<entry>Yes</entry>
+<entry>Index of first sentence</entry>
+</row>
+<row>
<entry>data</entry>
<entry>sampleData</entry>
<entry>No</entry>
diff --git a/opennlp-docs/src/docbkx/langdetect.xml b/opennlp-docs/src/docbkx/langdetect.xml
new file mode 100644
index 0000000..9f170ce
--- /dev/null
+++ b/opennlp-docs/src/docbkx/langdetect.xml
@@ -0,0 +1,226 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+]>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<chapter id="tools.langdetect">
+<title>Language Detector</title>
+ <section id="tools.langdetect.classifying">
+ <title>Classifying</title>
+ <para>
+ The OpenNLP Language Detector classifies a document into ISO-639-3 languages according to the model capabilities.
+ A model can be trained with the Maxent, Perceptron or Naive Bayes algorithms. By default the text is normalized and
+ the context generator extracts n-grams of size 1, 2 and 3. The n-gram sizes, the normalization and the
+ context generator can be customized by extending the LanguageDetectorFactory.
+
+ </para>
+ <para>
+ The default normalizers are:
+
+ <table>
+ <title>Normalizers</title>
+ <tgroup cols="2">
+ <colspec colname="c1"/>
+ <colspec colname="c2"/>
+ <thead>
+ <row>
+ <entry>Normalizer</entry>
+ <entry>Description</entry>
+ </row>
+ </thead>
+ <tbody>
+ <row>
+ <entry>EmojiCharSequenceNormalizer</entry>
+ <entry>Replaces emojis by blank space</entry>
+ </row>
+ <row>
+ <entry>UrlCharSequenceNormalizer</entry>
+ <entry>Replaces URLs and E-Mails by a blank space.</entry>
+ </row>
+ <row>
+ <entry>TwitterCharSequenceNormalizer</entry>
+ <entry>Replaces hashtags and Twitter user names by blank spaces.</entry>
+ </row>
+ <row>
+ <entry>NumberCharSequenceNormalizer</entry>
+ <entry>Replaces number sequences by blank spaces</entry>
+ </row>
+ <row>
+ <entry>ShrinkCharSequenceNormalizer</entry>
+ <entry>Shrinks characters that repeat three or more times to only two repetitions.</entry>
+ </row>
+ </tbody>
+ </tgroup>
+ </table>
+ </para>
+ </section>
+
+ <section id="tools.langdetect.classifying.cmdline">
+ <title>Language Detector Tool</title>
+ <para>
+ The easiest way to try out the language detector is the command line tool. The tool is only
+ intended for demonstration and testing. The following command shows how to use the language detector tool.
+ <screen>
+ <![CDATA[
+$ bin/opennlp LanguageDetector model]]>
+ </screen>
+ The input is read from standard input and output is written to standard output, unless they are redirected
+ or piped.
+ </para>
+ </section>
+ <section id="tools.langdetect.classifying.api">
+ <title>Language Detector API</title>
+ <para>
+ To perform classification you will need a machine learning model -
+ these are encapsulated in the LanguageDetectorModel class of OpenNLP tools.
+ </para>
+ <para>
+ First you need to grab the bytes from the serialized model on an InputStream -
+ we'll leave it to you to do that, since you were the one who serialized it to begin with. Now for the easy part:
+ <programlisting language="java">
+ <![CDATA[
+InputStream is = ...
+LanguageDetectorModel m = new LanguageDetectorModel(is);]]>
+ </programlisting>
+ With the LanguageDetectorModel in hand we are just about there:
+ <programlisting language="java">
+ <![CDATA[
+String inputText = ...
+LanguageDetector myCategorizer = new LanguageDetectorME(m);
+
+// Get the most probable language
+Language bestLanguage = myCategorizer.predictLanguage(inputText);
+System.out.println("Best language: " + bestLanguage.getLang());
+System.out.println("Best language confidence: " + bestLanguage.getConfidence());
+
+// Get an array with the most probable languages
+Language[] languages = myCategorizer.predictLanguages(inputText);]]>
+ </programlisting>
+
+ Note that both the API and the CLI will consider the complete text when choosing the most probable languages.
+ To handle mixed-language documents, one can analyze smaller chunks of text to find language regions.
+ </para>
+ </section>
+ <section id="tools.langdetect.training">
+ <title>Training</title>
+ <para>
+ The Language Detector can be trained on annotated training material. The data
+ can be in OpenNLP Language Detector training format. This is one document per line,
+ containing the ISO-639-3 language code and text separated by a tab. Other formats can also be
+ available.
+ The following sample shows the sample from above in the required format.
+ <screen>
+ <![CDATA[
+spa A la fecha tres calles bonaerenses recuerdan su nombre (en Ituzaingó, Merlo y Campana). A la fecha, unas 50 \
+ naves y 20 aviones se han perdido en esa área particular del océano Atlántico.
+deu Alle Jahre wieder: Millionen Spanier haben am Dienstag die Auslosung in der größten Lotterie der Welt verfolgt.\
+ Alle Jahre wieder: So gelingt der stressfreie Geschenke-Umtausch Artikel per E-Mail empfehlen So gelingt der \
+ stressfre ie Geschenke-Umtausch Nicht immer liegt am Ende das unter dem Weihnachtsbaum, was man sich gewünscht hat.
+srp Већина становника боравила је кућама од блата или шаторима, како би радили на својим удаљеним пољима у долини \
+ Јордана и напасали своје стадо оваца и коза. Већина становника говори оба језика.
+lav Egija Tri-Active procedūru īpaši iesaka izmantot siltākajos gadalaikos, jo ziemā aukstums var šķist arī \
+ nepatīkams. Valdība vienojās, ka izmaiņas nodokļu politikā tiek konceptuāli atbalstītas, tomēr deva \
+ nedēļu laika Ekonomikas ministrijai, Finanšu ministrijai un Labklājības ministrijai, lai ar vienotu \
+ pozīciju atgrieztos pie jautājuma izskatīšanas.]]>
+ </screen>
+ Note: The line breaks marked with a backslash are just inserted for formatting purposes and must not be
+ included in the training data.
+ </para>
+ <section id="tools.langdetect.training.tool">
+ <title>Training Tool</title>
+ <para>
+ The following command will train the language detector and write the model to langdetect.bin:
+ <screen>
+ <![CDATA[
+$ bin/opennlp LanguageDetectorTrainer[.leipzig] -model modelFile [-params paramsFile] [-factory factoryName] -data sampleData [-encoding charsetName]
+]]>
+ </screen>
+ Note: To customize the language detector, extend the class opennlp.tools.langdetect.LanguageDetectorFactory
+ add it to the classpath and pass it in the -factory argument.
+ </para>
+ </section>
+ <section id="tools.langdetect.training.leipzig">
+ <title>Training with Leipzig</title>
+ <para>
+ The Leipzig Corpora collection presents corpora in different languages. Each corpus is a collection
+ of individual sentences collected from the web and newspapers. The corpora are available as plain text
+ and as MySQL database tables. The OpenNLP integration can only use the plain text version. More
+ information about the corpora and how to download them can be found in the
+ <link linkend="tools.corpora.leipzig">Corpora section</link>.
+ </para>
+ <para>
+ This corpus is especially well suited to train the Language Detector, and a converter is provided. First, you need to
+ download the files that compose the Leipzig Corpora collection to a folder. Apache OpenNLP Language
+ Detector supports training, evaluation and cross validation using the Leipzig Corpora. For example,
+ the following command shows how to train a model.
+
+ <screen>
+ <![CDATA[
+$ bin/opennlp LanguageDetectorTrainer.leipzig -model modelFile [-params paramsFile] [-factory factoryName] \
+ -sentencesDir sentencesDir -sentencesPerSample sentencesPerSample -samplesPerLanguage samplesPerLanguage \
+ [-encoding charsetName]
+]]>
+ </screen>
+
+ </para>
+ <para>
+ The following sequence of commands shows how to convert the Leipzig Corpora collection at folder
+ leipzig-train/ to the default Language Detector format, by creating groups of 5 sentences as documents
+ and limiting to 10000 documents per language. Then, it shuffles the result and selects the first
+ 100000 lines as the training corpus and the last 20000 as the evaluation corpus:
+ <screen>
+ <![CDATA[
+$ bin/opennlp LanguageDetectorConverter leipzig -sentencesDir leipzig-train/ -sentencesPerSample 5 -samplesPerLanguage 10000 > leipzig.txt
+$ perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' < leipzig.txt > leipzig_shuf.txt
+$ head -100000 < leipzig_shuf.txt > leipzig.train
+$ tail -20000 < leipzig_shuf.txt > leipzig.eval
+]]>
+ </screen>
+ </para>
+ </section>
+ <section id="tools.langdetect.training.api">
+ <title>Training API</title>
+ <para>
+ The following example shows how to train a model from API.
+ <programlisting language="java">
+ <![CDATA[
+InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File("corpus.txt"));
+
+ObjectStream<String> lineStream =
+ new PlainTextByLineStream(inputStreamFactory, "UTF-8");
+ObjectStream<LanguageSample> sampleStream = new LanguageDetectorSampleStream(lineStream);
+
+TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+params.put(TrainingParameters.ALGORITHM_PARAM,
+ PerceptronTrainer.PERCEPTRON_VALUE);
+params.put(TrainingParameters.CUTOFF_PARAM, 0);
+
+LanguageDetectorFactory factory = new LanguageDetectorFactory();
+
+LanguageDetectorModel model = LanguageDetectorME.train(sampleStream, params, factory);
+model.serialize(new File("langdetect.bin"));
+
+]]>
+ </programlisting>
+ </para>
+ </section>
+ </section>
+</chapter>
\ No newline at end of file
diff --git a/opennlp-docs/src/docbkx/opennlp.xml b/opennlp-docs/src/docbkx/opennlp.xml
index 172d06c..2f7e2fa 100644
--- a/opennlp-docs/src/docbkx/opennlp.xml
+++ b/opennlp-docs/src/docbkx/opennlp.xml
@@ -76,6 +76,7 @@ under the License.
<title>Apache OpenNLP Developer Documentation</title>
<toc/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./introduction.xml"/>
+ <xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./langdetect.xml" />
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./sentdetect.xml"/>
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./tokenizer.xml" />
<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./namefinder.xml" />
--
To stop receiving notification emails like this one, please contact
['"commits@opennlp.apache.org" <co...@opennlp.apache.org>'].