You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by sm...@apache.org on 2017/07/04 04:13:27 UTC

[opennlp] branch master updated: OPENNLP-1084: Documents language detector usage and training

This is an automated email from the ASF dual-hosted git repository.

smarthi pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp.git


The following commit(s) were added to refs/heads/master by this push:
     new 2b13a14  OPENNLP-1084: Documents language detector usage and training
2b13a14 is described below

commit 2b13a14ca826fb1ad3e77380d9d091659f9e5f97
Author: William D C M SILVA <co...@apache.org>
AuthorDate: Tue Jul 4 00:54:47 2017 -0300

    OPENNLP-1084: Documents language detector usage and training
---
 opennlp-docs/src/docbkx/cli.xml        | 950 +++++++++++++++++++++++++--------
 opennlp-docs/src/docbkx/langdetect.xml | 226 ++++++++
 opennlp-docs/src/docbkx/opennlp.xml    |   1 +
 3 files changed, 944 insertions(+), 233 deletions(-)

diff --git a/opennlp-docs/src/docbkx/cli.xml b/opennlp-docs/src/docbkx/cli.xml
index 1a8c326..f809029 100644
--- a/opennlp-docs/src/docbkx/cli.xml
+++ b/opennlp-docs/src/docbkx/cli.xml
@@ -247,6 +247,255 @@ Usage: opennlp DoccatConverter help|leipzig [help|options...]
 
 </section>
 
+<section id='tools.cli.langdetect'>
+
+<title>Langdetect</title>
+
+<section id='tools.cli.langdetect.LanguageDetector'>
+
+<title>LanguageDetector</title>
+
+<para>Learned language detector</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetector model < documents
+
+]]>
+</screen> 
+</section>
+
+<section id='tools.cli.langdetect.LanguageDetectorTrainer'>
+
+<title>LanguageDetectorTrainer</title>
+
+<para>Trainer for the learnable language detector</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetectorTrainer[.leipzig] -model modelFile [-params paramsFile] [-factory factoryName] 
+        -data sampleData [-encoding charsetName] 
+Arguments description:
+	-model modelFile
+		output model file.
+	-params paramsFile
+		training parameters file.
+	-factory factoryName
+		A sub-class of LanguageDetectorFactory where to get implementation and resources.
+	-data sampleData
+		data to be used, usually a file name.
+	-encoding charsetName
+		encoding for reading and writing text, if absent the system default is used.
+
+]]>
+</screen> 
+<para>The supported formats and arguments are:</para>
+
+<informaltable frame='all'><tgroup cols='5' align='left' colsep='1' rowsep='1'>
+<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
+<tbody>
+<row>
+<entry morerows='3' valign='middle'>leipzig</entry>
+<entry>sentencesDir</entry>
+<entry>sentencesDir</entry>
+<entry>No</entry>
+<entry>Dir with Leipzig sentences to be used</entry>
+</row>
+<row>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>samplesPerLanguage</entry>
+<entry>samplesPerLanguage</entry>
+<entry>No</entry>
+<entry>Number of samples per language</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+</tbody>
+</tgroup></informaltable>
+
+</section>
+
+<section id='tools.cli.langdetect.LanguageDetectorConverter'>
+
+<title>LanguageDetectorConverter</title>
+
+<para>Converts leipzig data format to native OpenNLP format</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetectorConverter help|leipzig [help|options...]
+
+]]>
+</screen> 
+<para>The supported formats and arguments are:</para>
+
+<informaltable frame='all'><tgroup cols='5' align='left' colsep='1' rowsep='1'>
+<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
+<tbody>
+<row>
+<entry morerows='3' valign='middle'>leipzig</entry>
+<entry>sentencesDir</entry>
+<entry>sentencesDir</entry>
+<entry>No</entry>
+<entry>Dir with Leipzig sentences to be used</entry>
+</row>
+<row>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>samplesPerLanguage</entry>
+<entry>samplesPerLanguage</entry>
+<entry>No</entry>
+<entry>Number of samples per language</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+</tbody>
+</tgroup></informaltable>
+
+</section>
+
+<section id='tools.cli.langdetect.LanguageDetectorCrossValidator'>
+
+<title>LanguageDetectorCrossValidator</title>
+
+<para>K-fold cross validator for the learnable Language Detector</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetectorCrossValidator[.leipzig] [-misclassified true|false] [-folds num] [-factory 
+        factoryName] [-params paramsFile] [-reportOutputFile outputFile] -data sampleData [-encoding 
+        charsetName] 
+Arguments description:
+	-misclassified true|false
+		if true will print false negatives and false positives.
+	-folds num
+		number of folds, default is 10.
+	-factory factoryName
+		A sub-class of LanguageDetectorFactory where to get implementation and resources.
+	-params paramsFile
+		training parameters file.
+	-reportOutputFile outputFile
+		the path of the fine-grained report file.
+	-data sampleData
+		data to be used, usually a file name.
+	-encoding charsetName
+		encoding for reading and writing text, if absent the system default is used.
+
+]]>
+</screen> 
+<para>The supported formats and arguments are:</para>
+
+<informaltable frame='all'><tgroup cols='5' align='left' colsep='1' rowsep='1'>
+<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
+<tbody>
+<row>
+<entry morerows='3' valign='middle'>leipzig</entry>
+<entry>sentencesDir</entry>
+<entry>sentencesDir</entry>
+<entry>No</entry>
+<entry>Dir with Leipzig sentences to be used</entry>
+</row>
+<row>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>samplesPerLanguage</entry>
+<entry>samplesPerLanguage</entry>
+<entry>No</entry>
+<entry>Number of samples per language</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+</tbody>
+</tgroup></informaltable>
+
+</section>
+
+<section id='tools.cli.langdetect.LanguageDetectorEvaluator'>
+
+<title>LanguageDetectorEvaluator</title>
+
+<para>Measures the performance of the Language Detector model with the reference data</para>
+
+<screen>
+<![CDATA[
+Usage: opennlp LanguageDetectorEvaluator[.leipzig] -model model [-misclassified true|false] 
+        [-reportOutputFile outputFile] -data sampleData [-encoding charsetName] 
+Arguments description:
+	-model model
+		the model file to be evaluated.
+	-misclassified true|false
+		if true will print false negatives and false positives.
+	-reportOutputFile outputFile
+		the path of the fine-grained report file.
+	-data sampleData
+		data to be used, usually a file name.
+	-encoding charsetName
+		encoding for reading and writing text, if absent the system default is used.
+
+]]>
+</screen> 
+<para>The supported formats and arguments are:</para>
+
+<informaltable frame='all'><tgroup cols='5' align='left' colsep='1' rowsep='1'>
+<thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
+<tbody>
+<row>
+<entry morerows='3' valign='middle'>leipzig</entry>
+<entry>sentencesDir</entry>
+<entry>sentencesDir</entry>
+<entry>No</entry>
+<entry>Dir with Leipzig sentences to be used</entry>
+</row>
+<row>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>samplesPerLanguage</entry>
+<entry>samplesPerLanguage</entry>
+<entry>No</entry>
+<entry>Number of samples per language</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+</tbody>
+</tgroup></informaltable>
+
+</section>
+
+</section>
+
 <section id='tools.cli.dictionary'>
 
 <title>Dictionary</title>
@@ -315,9 +564,9 @@ Usage: opennlp TokenizerME model < sentences
 
 <screen>
 <![CDATA[
-Usage: opennlp TokenizerTrainer[.ad|.pos|.conllx|.namefinder|.parse] [-factory factoryName] [-abbDict path] 
-        [-alphaNumOpt isAlphaNumOpt] [-params paramsFile] -lang language -model modelFile -data sampleData 
-        [-encoding charsetName] 
+Usage: opennlp TokenizerTrainer[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.conllu] [-factory 
+        factoryName] [-abbDict path] [-alphaNumOpt isAlphaNumOpt] [-params paramsFile] -lang language -model 
+        modelFile -data sampleData [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
 		A sub-class of TokenizerFactory where to get implementation and resources.
@@ -344,17 +593,24 @@ Arguments description:
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
 <row>
-<entry morerows='4' valign='middle'>ad</entry>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
 <entry>encoding</entry>
 <entry>charsetName</entry>
-<entry>No</entry>
+<entry>Yes</entry>
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
+<entry morerows='4' valign='middle'>ad</entry>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>No</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
 <entry>lang</entry>
@@ -369,6 +625,12 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
 <entry>detokenizer</entry>
 <entry>dictionary</entry>
 <entry>No</entry>
@@ -450,6 +712,19 @@ Arguments description:
 <entry>No</entry>
 <entry>Specifies the file with detokenizer dictionary.</entry>
 </row>
+<row>
+<entry morerows='1' valign='middle'>conllu</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -463,8 +738,8 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp TokenizerMEEvaluator[.ad|.pos|.conllx|.namefinder|.parse] -model model [-misclassified 
-        true|false] -data sampleData [-encoding charsetName] 
+Usage: opennlp TokenizerMEEvaluator[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.conllu] -model 
+        model [-misclassified true|false] -data sampleData [-encoding charsetName] 
 Arguments description:
 	-model model
 		the model file to be evaluated.
@@ -483,17 +758,24 @@ Arguments description:
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
 <row>
-<entry morerows='4' valign='middle'>ad</entry>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
 <entry>encoding</entry>
 <entry>charsetName</entry>
-<entry>No</entry>
+<entry>Yes</entry>
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
+<entry morerows='4' valign='middle'>ad</entry>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>No</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
 <entry>lang</entry>
@@ -508,6 +790,12 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
 <entry>detokenizer</entry>
 <entry>dictionary</entry>
 <entry>No</entry>
@@ -589,6 +877,19 @@ Arguments description:
 <entry>No</entry>
 <entry>Specifies the file with detokenizer dictionary.</entry>
 </row>
+<row>
+<entry morerows='1' valign='middle'>conllu</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -602,9 +903,9 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp TokenizerCrossValidator[.ad|.pos|.conllx|.namefinder|.parse] [-misclassified true|false] 
-        [-folds num] [-factory factoryName] [-abbDict path] [-alphaNumOpt isAlphaNumOpt] [-params paramsFile] 
-        -lang language -data sampleData [-encoding charsetName] 
+Usage: opennlp TokenizerCrossValidator[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.conllu] 
+        [-misclassified true|false] [-folds num] [-factory factoryName] [-abbDict path] [-alphaNumOpt 
+        isAlphaNumOpt] [-params paramsFile] -lang language -data sampleData [-encoding charsetName] 
 Arguments description:
 	-misclassified true|false
 		if true will print false negatives and false positives.
@@ -633,6 +934,19 @@ Arguments description:
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
 <row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry morerows='4' valign='middle'>ad</entry>
 <entry>encoding</entry>
 <entry>charsetName</entry>
@@ -640,31 +954,50 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
 <entry>splitHyphenatedTokens</entry>
 <entry>split</entry>
 <entry>Yes</entry>
 <entry>If true all hyphenated tokens will be separated (default true)</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
+<entry>detokenizer</entry>
+<entry>dictionary</entry>
 <entry>No</entry>
-<entry>Language which is being processed.</entry>
+<entry>Specifies the file with detokenizer dictionary.</entry>
 </row>
 <row>
+<entry morerows='2' valign='middle'>pos</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry>detokenizer</entry>
 <entry>dictionary</entry>
 <entry>No</entry>
 <entry>Specifies the file with detokenizer dictionary.</entry>
 </row>
 <row>
-<entry morerows='2' valign='middle'>pos</entry>
+<entry morerows='2' valign='middle'>conllx</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -683,7 +1016,7 @@ Arguments description:
 <entry>Specifies the file with detokenizer dictionary.</entry>
 </row>
 <row>
-<entry morerows='2' valign='middle'>conllx</entry>
+<entry morerows='2' valign='middle'>namefinder</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -702,7 +1035,7 @@ Arguments description:
 <entry>Specifies the file with detokenizer dictionary.</entry>
 </row>
 <row>
-<entry morerows='2' valign='middle'>namefinder</entry>
+<entry morerows='2' valign='middle'>parse</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -721,7 +1054,7 @@ Arguments description:
 <entry>Specifies the file with detokenizer dictionary.</entry>
 </row>
 <row>
-<entry morerows='2' valign='middle'>parse</entry>
+<entry morerows='1' valign='middle'>conllu</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -733,12 +1066,6 @@ Arguments description:
 <entry>Yes</entry>
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
-<row>
-<entry>detokenizer</entry>
-<entry>dictionary</entry>
-<entry>No</entry>
-<entry>Specifies the file with detokenizer dictionary.</entry>
-</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -748,12 +1075,12 @@ Arguments description:
 
 <title>TokenizerConverter</title>
 
-<para>Converts foreign data formats (ad,pos,conllx,namefinder,parse) to native OpenNLP format</para>
+<para>Converts foreign data formats (irishsentencebank,ad,pos,conllx,namefinder,parse,conllu) to native OpenNLP format</para>
 
 <screen>
 <![CDATA[
-Usage: opennlp TokenizerConverter help|ad|pos|conllx|namefinder|parse [help|options...]
-
+Usage: opennlp TokenizerConverter help|irishsentencebank|ad|pos|conllx|namefinder|parse|conllu 
+        [help|options...] 
 ]]>
 </screen> 
 <para>The supported formats and arguments are:</para>
@@ -762,17 +1089,24 @@ Usage: opennlp TokenizerConverter help|ad|pos|conllx|namefinder|parse [help|opti
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
 <row>
-<entry morerows='4' valign='middle'>ad</entry>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
 <entry>encoding</entry>
 <entry>charsetName</entry>
-<entry>No</entry>
+<entry>Yes</entry>
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
+<entry morerows='4' valign='middle'>ad</entry>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>No</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
 <entry>lang</entry>
@@ -787,6 +1121,12 @@ Usage: opennlp TokenizerConverter help|ad|pos|conllx|namefinder|parse [help|opti
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
 <entry>detokenizer</entry>
 <entry>dictionary</entry>
 <entry>No</entry>
@@ -868,6 +1208,19 @@ Usage: opennlp TokenizerConverter help|ad|pos|conllx|namefinder|parse [help|opti
 <entry>No</entry>
 <entry>Specifies the file with detokenizer dictionary.</entry>
 </row>
+<row>
+<entry morerows='1' valign='middle'>conllu</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -915,16 +1268,17 @@ Usage: opennlp SentenceDetector model < sentences
 
 <screen>
 <![CDATA[
-Usage: opennlp SentenceDetectorTrainer[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] [-factory 
-        factoryName] [-abbDict path] [-eosChars string] [-params paramsFile] -lang language -model modelFile 
-        -data sampleData [-encoding charsetName] 
+Usage: opennlp 
+        SentenceDetectorTrainer[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.moses|.conllu|.letsmt] 
+        [-factory factoryName] [-eosChars string] [-abbDict path] [-params paramsFile] -lang language -model 
+        modelFile -data sampleData [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
 		A sub-class of SentenceDetectorFactory where to get implementation and resources.
-	-abbDict path
-		abbreviation dictionary in XML format.
 	-eosChars string
 		EOS characters.
+	-abbDict path
+		abbreviation dictionary in XML format.
 	-params paramsFile
 		training parameters file.
 	-lang language
@@ -944,6 +1298,19 @@ Arguments description:
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
 <row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry morerows='3' valign='middle'>ad</entry>
 <entry>encoding</entry>
 <entry>charsetName</entry>
@@ -951,12 +1318,6 @@ Arguments description:
 <entry>Encoding for reading and writing text.</entry>
 </row>
 <row>
-<entry>includeTitles</entry>
-<entry>includeTitles</entry>
-<entry>Yes</entry>
-<entry>If true will include sentences marked as headlines.</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -969,6 +1330,12 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>includeTitles</entry>
+<entry>includeTitles</entry>
+<entry>Yes</entry>
+<entry>If true will include sentences marked as headlines.</entry>
+</row>
+<row>
 <entry morerows='2' valign='middle'>pos</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
@@ -1058,6 +1425,25 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry morerows='2' valign='middle'>letsmt</entry>
 <entry>detokenizer</entry>
 <entry>dictionary</entry>
@@ -1089,8 +1475,9 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp SentenceDetectorEvaluator[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] -model model 
-        [-misclassified true|false] -data sampleData [-encoding charsetName] 
+Usage: opennlp 
+        SentenceDetectorEvaluator[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.moses|.conllu|.letsmt] 
+        -model model [-misclassified true|false] -data sampleData [-encoding charsetName] 
 Arguments description:
 	-model model
 		the model file to be evaluated.
@@ -1109,6 +1496,19 @@ Arguments description:
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
 <row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry morerows='3' valign='middle'>ad</entry>
 <entry>encoding</entry>
 <entry>charsetName</entry>
@@ -1116,12 +1516,6 @@ Arguments description:
 <entry>Encoding for reading and writing text.</entry>
 </row>
 <row>
-<entry>includeTitles</entry>
-<entry>includeTitles</entry>
-<entry>Yes</entry>
-<entry>If true will include sentences marked as headlines.</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -1134,6 +1528,12 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>includeTitles</entry>
+<entry>includeTitles</entry>
+<entry>Yes</entry>
+<entry>If true will include sentences marked as headlines.</entry>
+</row>
+<row>
 <entry morerows='2' valign='middle'>pos</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
@@ -1223,6 +1623,25 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry morerows='2' valign='middle'>letsmt</entry>
 <entry>detokenizer</entry>
 <entry>dictionary</entry>
@@ -1254,16 +1673,17 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp SentenceDetectorCrossValidator[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] [-factory 
-        factoryName] [-abbDict path] [-eosChars string] [-params paramsFile] -lang language [-misclassified 
-        true|false] [-folds num] -data sampleData [-encoding charsetName] 
+Usage: opennlp 
+        SentenceDetectorCrossValidator[.irishsentencebank|.ad|.pos|.conllx|.namefinder|.parse|.moses|.conllu|.letsmt] 
+        [-factory factoryName] [-eosChars string] [-abbDict path] [-params paramsFile] -lang language 
+        [-misclassified true|false] [-folds num] -data sampleData [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
 		A sub-class of SentenceDetectorFactory where to get implementation and resources.
-	-abbDict path
-		abbreviation dictionary in XML format.
 	-eosChars string
 		EOS characters.
+	-abbDict path
+		abbreviation dictionary in XML format.
 	-params paramsFile
 		training parameters file.
 	-lang language
@@ -1285,6 +1705,19 @@ Arguments description:
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
 <row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry morerows='3' valign='middle'>ad</entry>
 <entry>encoding</entry>
 <entry>charsetName</entry>
@@ -1292,12 +1725,6 @@ Arguments description:
 <entry>Encoding for reading and writing text.</entry>
 </row>
 <row>
-<entry>includeTitles</entry>
-<entry>includeTitles</entry>
-<entry>Yes</entry>
-<entry>If true will include sentences marked as headlines.</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -1310,6 +1737,12 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>includeTitles</entry>
+<entry>includeTitles</entry>
+<entry>Yes</entry>
+<entry>If true will include sentences marked as headlines.</entry>
+</row>
+<row>
 <entry morerows='2' valign='middle'>pos</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
@@ -1399,6 +1832,25 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry morerows='2' valign='middle'>letsmt</entry>
 <entry>detokenizer</entry>
 <entry>dictionary</entry>
@@ -1426,12 +1878,12 @@ Arguments description:
 
 <title>SentenceDetectorConverter</title>
 
-<para>Converts foreign data formats (ad,pos,conllx,namefinder,parse,moses,letsmt) to native OpenNLP format</para>
+<para>Converts foreign data formats (irishsentencebank,ad,pos,conllx,namefinder,parse,moses,conllu,letsmt) to native OpenNLP format</para>
 
 <screen>
 <![CDATA[
-Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|moses|letsmt [help|options...]
-
+Usage: opennlp SentenceDetectorConverter 
+        help|irishsentencebank|ad|pos|conllx|namefinder|parse|moses|conllu|letsmt [help|options...] 
 ]]>
 </screen> 
 <para>The supported formats and arguments are:</para>
@@ -1440,6 +1892,19 @@ Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|mos
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
 <row>
+<entry morerows='1' valign='middle'>irishsentencebank</entry>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry morerows='3' valign='middle'>ad</entry>
 <entry>encoding</entry>
 <entry>charsetName</entry>
@@ -1447,12 +1912,6 @@ Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|mos
 <entry>Encoding for reading and writing text.</entry>
 </row>
 <row>
-<entry>includeTitles</entry>
-<entry>includeTitles</entry>
-<entry>Yes</entry>
-<entry>If true will include sentences marked as headlines.</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -1465,6 +1924,12 @@ Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|mos
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>includeTitles</entry>
+<entry>includeTitles</entry>
+<entry>Yes</entry>
+<entry>If true will include sentences marked as headlines.</entry>
+</row>
+<row>
 <entry morerows='2' valign='middle'>pos</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
@@ -1554,6 +2019,25 @@ Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|mos
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>sentencesPerSample</entry>
+<entry>sentencesPerSample</entry>
+<entry>No</entry>
+<entry>Number of sentences per sample</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
+<row>
 <entry morerows='2' valign='middle'>letsmt</entry>
 <entry>detokenizer</entry>
 <entry>dictionary</entry>
@@ -1642,14 +2126,14 @@ Arguments description:
 <tbody>
 <row>
 <entry morerows='3' valign='middle'>evalita</entry>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1673,12 +2157,6 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -1691,15 +2169,21 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
 <entry morerows='3' valign='middle'>conll03</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>eng|deu</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1736,14 +2220,14 @@ Arguments description:
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll02</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>spa|nld</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1863,14 +2347,14 @@ Arguments description:
 <tbody>
 <row>
 <entry morerows='3' valign='middle'>evalita</entry>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1894,12 +2378,6 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -1912,15 +2390,21 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
 <entry morerows='3' valign='middle'>conll03</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>eng|deu</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1957,14 +2441,14 @@ Arguments description:
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll02</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>spa|nld</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2101,14 +2585,14 @@ Arguments description:
 <tbody>
 <row>
 <entry morerows='3' valign='middle'>evalita</entry>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2132,12 +2616,6 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -2150,15 +2628,21 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
 <entry morerows='3' valign='middle'>conll03</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>eng|deu</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2195,14 +2679,14 @@ Arguments description:
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll02</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>spa|nld</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2305,14 +2789,14 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
 <tbody>
 <row>
 <entry morerows='3' valign='middle'>evalita</entry>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2336,12 +2820,6 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>splitHyphenatedTokens</entry>
-<entry>split</entry>
-<entry>Yes</entry>
-<entry>If true all hyphenated tokens will be separated (default true)</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -2354,15 +2832,21 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>splitHyphenatedTokens</entry>
+<entry>split</entry>
+<entry>Yes</entry>
+<entry>If true all hyphenated tokens will be separated (default true)</entry>
+</row>
+<row>
 <entry morerows='3' valign='middle'>conll03</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>eng|deu</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2399,14 +2883,14 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll02</entry>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>spa|nld</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2539,19 +3023,19 @@ Usage: opennlp POSTagger model < sentences
 <screen>
 <![CDATA[
 Usage: opennlp POSTaggerTrainer[.ad|.conllx|.parse|.ontonotes|.conllu] [-factory factoryName] [-resources 
-        resourcesDir] [-featuregen featuregenFile] [-dict dictionaryPath] [-tagDictCutoff tagDictCutoff] 
+        resourcesDir] [-tagDictCutoff tagDictCutoff] [-featuregen featuregenFile] [-dict dictionaryPath] 
         [-params paramsFile] -lang language -model modelFile -data sampleData [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
 		A sub-class of POSTaggerFactory where to get implementation and resources.
 	-resources resourcesDir
 		The resources directory
+	-tagDictCutoff tagDictCutoff
+		TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
 	-featuregen featuregenFile
 		The feature generator descriptor file
 	-dict dictionaryPath
 		The XML tag dictionary file
-	-tagDictCutoff tagDictCutoff
-		TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
 	-params paramsFile
 		training parameters file.
 	-lang language
@@ -2578,18 +3062,6 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>expandME</entry>
-<entry>expandME</entry>
-<entry>Yes</entry>
-<entry>Expand multiword expressions.</entry>
-</row>
-<row>
-<entry>includeFeatures</entry>
-<entry>includeFeatures</entry>
-<entry>Yes</entry>
-<entry>Combine POS Tags with word features, like number and gender.</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -2602,6 +3074,18 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>expandME</entry>
+<entry>expandME</entry>
+<entry>Yes</entry>
+<entry>Expand multiword expressions.</entry>
+</row>
+<row>
+<entry>includeFeatures</entry>
+<entry>includeFeatures</entry>
+<entry>Yes</entry>
+<entry>Combine POS Tags with word features, like number and gender.</entry>
+</row>
+<row>
 <entry morerows='1' valign='middle'>conllx</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
@@ -2695,18 +3179,6 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>expandME</entry>
-<entry>expandME</entry>
-<entry>Yes</entry>
-<entry>Expand multiword expressions.</entry>
-</row>
-<row>
-<entry>includeFeatures</entry>
-<entry>includeFeatures</entry>
-<entry>Yes</entry>
-<entry>Combine POS Tags with word features, like number and gender.</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -2719,6 +3191,18 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>expandME</entry>
+<entry>expandME</entry>
+<entry>Yes</entry>
+<entry>Expand multiword expressions.</entry>
+</row>
+<row>
+<entry>includeFeatures</entry>
+<entry>includeFeatures</entry>
+<entry>Yes</entry>
+<entry>Combine POS Tags with word features, like number and gender.</entry>
+</row>
+<row>
 <entry morerows='1' valign='middle'>conllx</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
@@ -2784,9 +3268,9 @@ Arguments description:
 <screen>
 <![CDATA[
 Usage: opennlp POSTaggerCrossValidator[.ad|.conllx|.parse|.ontonotes|.conllu] [-misclassified true|false] 
-        [-folds num] [-factory factoryName] [-resources resourcesDir] [-featuregen featuregenFile] [-dict 
-        dictionaryPath] [-tagDictCutoff tagDictCutoff] [-params paramsFile] -lang language [-reportOutputFile 
-        outputFile] -data sampleData [-encoding charsetName] 
+        [-folds num] [-factory factoryName] [-resources resourcesDir] [-tagDictCutoff tagDictCutoff] 
+        [-featuregen featuregenFile] [-dict dictionaryPath] [-params paramsFile] -lang language 
+        [-reportOutputFile outputFile] -data sampleData [-encoding charsetName] 
 Arguments description:
 	-misclassified true|false
 		if true will print false negatives and false positives.
@@ -2796,12 +3280,12 @@ Arguments description:
 		A sub-class of POSTaggerFactory where to get implementation and resources.
 	-resources resourcesDir
 		The resources directory
+	-tagDictCutoff tagDictCutoff
+		TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
 	-featuregen featuregenFile
 		The feature generator descriptor file
 	-dict dictionaryPath
 		The XML tag dictionary file
-	-tagDictCutoff tagDictCutoff
-		TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
 	-params paramsFile
 		training parameters file.
 	-lang language
@@ -2828,18 +3312,6 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>expandME</entry>
-<entry>expandME</entry>
-<entry>Yes</entry>
-<entry>Expand multiword expressions.</entry>
-</row>
-<row>
-<entry>includeFeatures</entry>
-<entry>includeFeatures</entry>
-<entry>Yes</entry>
-<entry>Combine POS Tags with word features, like number and gender.</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -2852,6 +3324,18 @@ Arguments description:
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>expandME</entry>
+<entry>expandME</entry>
+<entry>Yes</entry>
+<entry>Expand multiword expressions.</entry>
+</row>
+<row>
+<entry>includeFeatures</entry>
+<entry>includeFeatures</entry>
+<entry>Yes</entry>
+<entry>Combine POS Tags with word features, like number and gender.</entry>
+</row>
+<row>
 <entry morerows='1' valign='middle'>conllx</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
@@ -2933,18 +3417,6 @@ Usage: opennlp POSTaggerConverter help|ad|conllx|parse|ontonotes|conllu [help|op
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>expandME</entry>
-<entry>expandME</entry>
-<entry>Yes</entry>
-<entry>Expand multiword expressions.</entry>
-</row>
-<row>
-<entry>includeFeatures</entry>
-<entry>includeFeatures</entry>
-<entry>Yes</entry>
-<entry>Combine POS Tags with word features, like number and gender.</entry>
-</row>
-<row>
 <entry>lang</entry>
 <entry>language</entry>
 <entry>No</entry>
@@ -2957,6 +3429,18 @@ Usage: opennlp POSTaggerConverter help|ad|conllx|parse|ontonotes|conllu [help|op
 <entry>Data to be used, usually a file name.</entry>
 </row>
 <row>
+<entry>expandME</entry>
+<entry>expandME</entry>
+<entry>Yes</entry>
+<entry>Expand multiword expressions.</entry>
+</row>
+<row>
+<entry>includeFeatures</entry>
+<entry>includeFeatures</entry>
+<entry>Yes</entry>
+<entry>Combine POS Tags with word features, like number and gender.</entry>
+</row>
+<row>
 <entry morerows='1' valign='middle'>conllx</entry>
 <entry>data</entry>
 <entry>sampleData</entry>
@@ -3206,18 +3690,18 @@ Arguments description:
 <entry>Language which is being processed.</entry>
 </row>
 <row>
-<entry>start</entry>
-<entry>start</entry>
-<entry>Yes</entry>
-<entry>Index of first sentence</entry>
-</row>
-<row>
 <entry>end</entry>
 <entry>end</entry>
 <entry>Yes</entry>
 <entry>Index of last sentence</entry>
 </row>
 <row>
+<entry>start</entry>
+<entry>start</entry>
+<entry>Yes</entry>
+<entry>Index of first sentence</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -3271,18 +3755,18 @@ Arguments description:
 <entry>Language which is being processed.</entry>
 </row>
 <row>
-<entry>start</entry>
-<entry>start</entry>
-<entry>Yes</entry>
-<entry>Index of first sentence</entry>
-</row>
-<row>
 <entry>end</entry>
 <entry>end</entry>
 <entry>Yes</entry>
 <entry>Index of last sentence</entry>
 </row>
 <row>
+<entry>start</entry>
+<entry>start</entry>
+<entry>Yes</entry>
+<entry>Index of first sentence</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -3343,18 +3827,18 @@ Arguments description:
 <entry>Language which is being processed.</entry>
 </row>
 <row>
-<entry>start</entry>
-<entry>start</entry>
-<entry>Yes</entry>
-<entry>Index of first sentence</entry>
-</row>
-<row>
 <entry>end</entry>
 <entry>end</entry>
 <entry>Yes</entry>
 <entry>Index of last sentence</entry>
 </row>
 <row>
+<entry>start</entry>
+<entry>start</entry>
+<entry>Yes</entry>
+<entry>Index of first sentence</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -3396,18 +3880,18 @@ Usage: opennlp ChunkerConverter help|ad [help|options...]
 <entry>Language which is being processed.</entry>
 </row>
 <row>
-<entry>start</entry>
-<entry>start</entry>
-<entry>Yes</entry>
-<entry>Index of first sentence</entry>
-</row>
-<row>
 <entry>end</entry>
 <entry>end</entry>
 <entry>Yes</entry>
 <entry>Index of last sentence</entry>
 </row>
 <row>
+<entry>start</entry>
+<entry>start</entry>
+<entry>Yes</entry>
+<entry>Index of first sentence</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
diff --git a/opennlp-docs/src/docbkx/langdetect.xml b/opennlp-docs/src/docbkx/langdetect.xml
new file mode 100644
index 0000000..9f170ce
--- /dev/null
+++ b/opennlp-docs/src/docbkx/langdetect.xml
@@ -0,0 +1,226 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!DOCTYPE chapter PUBLIC "-//OASIS//DTD DocBook XML V4.4//EN"
+"http://www.oasis-open.org/docbook/xml/4.4/docbookx.dtd"[
+]>
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<chapter id="tools.langdetect">
+<title>Language Detector</title>
+	<section id="tools.langdetect.classifying">
+		<title>Classifying</title>
+		<para>
+		The OpenNLP Language Detector classifies a document in ISO-639-3 languages according to the model capabilities.
+		A model can be trained with Maxent, Perceptron or Naive Bayes algorithms. By default, the text is normalized and
+			the context generator extracts n-grams of size 1, 2 and 3. The n-gram sizes, the normalization and the
+			context generator can be customized by extending the LanguageDetectorFactory.
+
+		</para>
+		<para>
+			The default normalizers are:
+
+			<table>
+				<title>Normalizers</title>
+				<tgroup cols="2">
+					<colspec colname="c1"/>
+					<colspec colname="c2"/>
+					<thead>
+						<row>
+							<entry>Normalizer</entry>
+							<entry>Description</entry>
+						</row>
+					</thead>
+					<tbody>
+						<row>
+							<entry>EmojiCharSequenceNormalizer</entry>
+							<entry>Replaces emojis by a blank space.</entry>
+						</row>
+						<row>
+							<entry>UrlCharSequenceNormalizer</entry>
+							<entry>Replaces URLs and E-Mails by a blank space.</entry>
+						</row>
+						<row>
+							<entry>TwitterCharSequenceNormalizer</entry>
+							<entry>Replaces hashtags and Twitter user names by blank spaces.</entry>
+						</row>
+						<row>
+							<entry>NumberCharSequenceNormalizer</entry>
+							<entry>Replaces number sequences by blank spaces</entry>
+						</row>
+						<row>
+							<entry>ShrinkCharSequenceNormalizer</entry>
+							<entry>Shrinks characters that repeat three or more times to only two repetitions.</entry>
+						</row>
+					</tbody>
+				</tgroup>
+			</table>
+		</para>
+	</section>
+	
+	<section id="tools.langdetect.classifying.cmdline">
+		<title>Language Detector Tool</title>
+		<para>
+		The easiest way to try out the language detector is the command line tool. The tool is only
+		intended for demonstration and testing. The following command shows how to use the language detector tool.
+		  <screen>
+			<![CDATA[
+$ bin/opennlp LanguageDetector model]]>
+		 </screen>
+		 The input is read from standard input and output is written to standard output, unless they are redirected
+		 or piped.
+		</para>
+ 	 </section>
+  	<section id="tools.langdetect.classifying.api">
+		<title>Language Detector API</title>
+		<para>
+			To perform classification you will need a machine learning model -
+			these are encapsulated in the LanguageDetectorModel class of OpenNLP tools.
+		</para>
+		<para>
+			First you need to grab the bytes from the serialized model on an InputStream - 
+			we'll leave it to you to do that, since you were the one who serialized it to begin with. Now for the easy part:
+						<programlisting language="java">
+				<![CDATA[
+InputStream is = ...
+LanguageDetectorModel m = new LanguageDetectorModel(is);]]>
+				</programlisting>
+				With the LanguageDetectorModel in hand we are just about there:
+						<programlisting language="java">
+				<![CDATA[
+String inputText = ...
+LanguageDetector myCategorizer = new LanguageDetectorME(m);
+
+// Get the most probable language
+Language bestLanguage = myCategorizer.predictLanguage(inputText);
+System.out.println("Best language: " + bestLanguage.getLang());
+System.out.println("Best language confidence: " + bestLanguage.getConfidence());
+
+// Get an array with the most probable languages
+Language[] languages = myCategorizer.predictLanguages(inputText);]]>
+				</programlisting>
+
+			Note that both the API and the CLI will consider the complete text to choose the most probable languages.
+			To handle mixed languages, one can analyze smaller chunks of text to find language regions.
+		</para>
+	</section>
+	<section id="tools.langdetect.training">
+		<title>Training</title>
+		<para>
+			The Language Detector can be trained on annotated training material. The data
+			can be in OpenNLP Language Detector training format. This is one document per line,
+			containing the ISO-639-3 language code and text separated by a tab. Other formats may also be
+			available.
+			The following sample shows the sample from above in the required format.
+			<screen>
+				<![CDATA[
+spa     A la fecha tres calles bonaerenses recuerdan su nombre (en Ituzaingó, Merlo y Campana). A la fecha, unas 50 \
+		naves y 20 aviones se han perdido en esa área particular del océano Atlántico.
+deu     Alle Jahre wieder: Millionen Spanier haben am Dienstag die Auslosung in der größten Lotterie der Welt verfolgt.\
+ 		Alle Jahre wieder: So gelingt der stressfreie Geschenke-Umtausch Artikel per E-Mail empfehlen So gelingt der \
+ 		stressfre ie Geschenke-Umtausch Nicht immer liegt am Ende das unter dem Weihnachtsbaum, was man sich gewünscht hat.
+srp     Већина становника боравила је кућама од блата или шаторима, како би радили на својим удаљеним пољима у долини \
+		Јордана и напасали своје стадо оваца и коза. Већина становника говори оба језика.
+lav     Egija Tri-Active procedūru īpaši iesaka izmantot siltākajos gadalaikos, jo ziemā aukstums var šķist arī \
+		nepatīkams. Valdība vienojās, ka izmaiņas nodokļu politikā tiek konceptuāli atbalstītas, tomēr deva \
+		nedēļu laika Ekonomikas ministrijai, Finanšu ministrijai un Labklājības ministrijai, lai ar vienotu \
+		pozīciju atgrieztos pie jautājuma izskatīšanas.]]>
+			</screen>
+			Note: The line breaks marked with a backslash are just inserted for formatting purposes and must not be
+			included in the training data.
+		</para>
+		<section id="tools.langdetect.training.tool">
+			<title>Training Tool</title>
+			<para>
+				The following command will train the language detector and write the model to langdetect.bin:
+				<screen>
+					<![CDATA[
+$ bin/opennlp LanguageDetectorTrainer[.leipzig] -model modelFile [-params paramsFile] [-factory factoryName] -data sampleData [-encoding charsetName]
+]]>
+				</screen>
+				Note: To customize the language detector, extend the class opennlp.tools.langdetect.LanguageDetectorFactory
+				add it to the classpath and pass it in the -factory argument.
+			</para>
+		</section>
+		<section id="tools.langdetect.training.leipzig">
+			<title>Training with Leipzig</title>
+			<para>
+				The Leipzig Corpora collection presents corpora in different languages. Each corpus is a collection
+				of individual sentences collected from the web and newspapers. The corpora are available as plain text
+				and as MySQL database tables. The OpenNLP integration can only use the plain text version. More
+				information about the corpora and how to download can be found in the
+				<link linkend="tools.corpora.leipzig">Corpora section</link>.
+			</para>
+			<para>
+				This corpus is especially well suited for training the Language Detector, and a converter is provided. First, you need to
+				download the files that compose the Leipzig Corpora collection to a folder. Apache OpenNLP Language
+				Detector supports training, evaluation and cross validation using the Leipzig Corpora. For example,
+				the following command shows how to train a model.
+
+				<screen>
+					<![CDATA[
+$ bin/opennlp LanguageDetectorTrainer.leipzig -model modelFile [-params paramsFile] [-factory factoryName] \
+	-sentencesDir sentencesDir -sentencesPerSample sentencesPerSample -samplesPerLanguage samplesPerLanguage \
+	[-encoding charsetName]
+]]>
+				</screen>
+
+			</para>
+			<para>
+				The following sequence of commands shows how to convert the Leipzig Corpora collection at folder
+				leipzig-train/ to the default Language Detector format, by creating groups of 5 sentences as documents
+				and limiting to 10000 documents per language. Then, it shuffles the result and selects the first
+				100000 lines as train corpus and the last 20000 as evaluation corpus:
+				<screen>
+					<![CDATA[
+$ bin/opennlp LanguageDetectorConverter leipzig -sentencesDir leipzig-train/ -sentencesPerSample 5 -samplesPerLanguage 10000 > leipzig.txt
+$ perl -MList::Util=shuffle -e 'print shuffle(<STDIN>);' < leipzig.txt > leipzig_shuf.txt
+$ head -100000 < leipzig_shuf.txt > leipzig.train
+$ tail -20000 < leipzig_shuf.txt > leipzig.eval
+]]>
+				</screen>
+		</para>
+		</section>
+		<section id="tools.langdetect.training.api">
+		<title>Training API</title>
+		<para>
+		The following example shows how to train a model from API.
+		<programlisting language="java">
+						<![CDATA[
+InputStreamFactory inputStreamFactory = new MarkableFileInputStreamFactory(new File("corpus.txt"));
+
+ObjectStream<String> lineStream =
+  new PlainTextByLineStream(inputStreamFactory, "UTF-8");
+ObjectStream<LanguageSample> sampleStream = new LanguageDetectorSampleStream(lineStream);
+
+TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+params.put(TrainingParameters.ALGORITHM_PARAM,
+  PerceptronTrainer.PERCEPTRON_VALUE);
+params.put(TrainingParameters.CUTOFF_PARAM, 0);
+
+LanguageDetectorFactory factory = new LanguageDetectorFactory();
+
+LanguageDetectorModel model = LanguageDetectorME.train(sampleStream, params, factory);
+model.serialize(new File("langdetect.bin"));
+
+]]>
+	</programlisting>
+		</para>
+		</section>
+	</section>
+</chapter>
\ No newline at end of file
diff --git a/opennlp-docs/src/docbkx/opennlp.xml b/opennlp-docs/src/docbkx/opennlp.xml
index 172d06c..2f7e2fa 100644
--- a/opennlp-docs/src/docbkx/opennlp.xml
+++ b/opennlp-docs/src/docbkx/opennlp.xml
@@ -76,6 +76,7 @@ under the License.
 	<title>Apache OpenNLP Developer Documentation</title>
 	<toc/>
 	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./introduction.xml"/>
+	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./langdetect.xml" />
 	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./sentdetect.xml"/>
 	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./tokenizer.xml" />
 	<xi:include xmlns:xi="http://www.w3.org/2001/XInclude" href="./namefinder.xml" />

-- 
To stop receiving notification emails like this one, please contact
['"commits@opennlp.apache.org" <co...@opennlp.apache.org>'].