You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/06 10:10:05 UTC

[21/21] opennlp git commit: OPENNLP-788: Add LanguageDetector tool

OPENNLP-788: Add LanguageDetector tool


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/a9853284
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/a9853284
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/a9853284

Branch: refs/heads/LangDetect
Commit: a985328464c130bf516d19eace49a1b8e3095022
Parents: 15ac7bd
Author: William D C M SILVA <co...@apache.org>
Authored: Wed May 17 13:34:21 2017 -0300
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Tue Jun 6 12:07:05 2017 +0200

----------------------------------------------------------------------
 NOTICE                                          |   7 +
 .../main/java/opennlp/tools/cmdline/CLI.java    |  12 +
 .../cmdline/FineGrainedReportListener.java      |  13 +-
 .../tools/cmdline/StreamFactoryRegistry.java    |   4 +
 .../LanguageDetectorConverterTool.java          |  28 ++
 .../LanguageDetectorCrossValidatorTool.java     | 123 ++++++++
 ...LanguageDetectorEvaluationErrorListener.java |  54 ++++
 .../LanguageDetectorEvaluatorTool.java          | 139 +++++++++
 ...nguageDetectorFineGrainedReportListener.java |  70 +++++
 .../langdetect/LanguageDetectorModelLoader.java |  42 +++
 .../langdetect/LanguageDetectorTool.java        |  88 ++++++
 .../langdetect/LanguageDetectorTrainerTool.java |  83 ++++++
 .../cmdline/langdetect/TrainingParams.java      |  40 +++
 .../LanguageDetectorSampleStreamFactory.java    |  66 +++++
 .../formats/LeipzigDoccatSampleStream.java      |   5 +-
 .../LeipzigDocumentSampleStreamFactory.java     |   3 +
 .../leipzig/LeipzigLanguageSampleStream.java    | 136 +++++++++
 .../LeipzigLanguageSampleStreamFactory.java     |  74 +++++
 .../java/opennlp/tools/langdetect/Language.java |  73 +++++
 .../tools/langdetect/LanguageDetector.java      |  31 ++
 .../LanguageDetectorContextGenerator.java       |  80 +++++
 .../LanguageDetectorCrossValidator.java         | 107 +++++++
 .../LanguageDetectorEvaluationMonitor.java      |  28 ++
 .../langdetect/LanguageDetectorEvaluator.java   |  99 +++++++
 .../langdetect/LanguageDetectorEventStream.java |  69 +++++
 .../langdetect/LanguageDetectorFactory.java     |  53 ++++
 .../tools/langdetect/LanguageDetectorME.java    |  97 ++++++
 .../tools/langdetect/LanguageDetectorModel.java |  82 +++++
 .../LanguageDetectorSampleStream.java           |  55 ++++
 .../tools/langdetect/LanguageSample.java        |  68 +++++
 .../AggregateCharSequenceNormalizer.java        |  39 +++
 .../util/normalizer/CharSequenceNormalizer.java |  23 ++
 .../normalizer/EmojiCharSequenceNormalizer.java |  38 +++
 .../NumberCharSequenceNormalizer.java           |  36 +++
 .../ShrinkCharSequenceNormalizer.java           |  40 +++
 .../TwitterCharSequenceNormalizer.java          |  50 ++++
 .../UnicodeCharSequenceNormalizer.java          | 297 +++++++++++++++++++
 .../normalizer/UrlCharSequenceNormalizer.java   |  40 +++
 .../normalizer/unicode_normalizer.properties    | 154 ++++++++++
 .../opennlp/tools/langdetect/DummyFactory.java  |  33 +++
 .../LanguageDetectorContextGeneratorTest.java   |  43 +++
 .../LanguageDetectorCrossValidatorTest.java     |  64 ++++
 .../LanguageDetectorEvaluatorTest.java          |  68 +++++
 .../langdetect/LanguageDetectorFactoryTest.java |  75 +++++
 .../langdetect/LanguageDetectorMETest.java      | 114 +++++++
 .../tools/langdetect/LanguageSampleTest.java    |  89 ++++++
 .../opennlp/tools/langdetect/LanguageTest.java  | 101 +++++++
 .../EmojiCharSequenceNormalizerTest.java        |  43 +++
 .../NumberCharSequenceNormalizerTest.java       |  32 ++
 .../ShrinkCharSequenceNormalizerTest.java       |  41 +++
 .../TwitterCharSequenceNormalizerTest.java      |  62 ++++
 .../UnicodeCharSequenceNormalizerTest.java      | 263 ++++++++++++++++
 .../UrlCharSequenceNormalizerTest.java          |  47 +++
 53 files changed, 3618 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/NOTICE
----------------------------------------------------------------------
diff --git a/NOTICE b/NOTICE
index c0b8394..36d90e2 100644
--- a/NOTICE
+++ b/NOTICE
@@ -10,3 +10,10 @@ opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
 were developed by Martin Porter and Richard Boulton.
 The full snowball package is available from
 http://snowball.tartarus.org/
+
+
+The Language Detector normalizer in
+opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer.java
+and its tests and resources were developed by Shuyo Nakatani.
+The full Language Detector package is available from
+https://github.com/shuyo/language-detection

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
index b575f71..c828e26 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
@@ -37,6 +37,11 @@ import opennlp.tools.cmdline.doccat.DoccatEvaluatorTool;
 import opennlp.tools.cmdline.doccat.DoccatTool;
 import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
 import opennlp.tools.cmdline.entitylinker.EntityLinkerTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorConverterTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorCrossValidatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorEvaluatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTrainerTool;
 import opennlp.tools.cmdline.languagemodel.NGramLanguageModelTool;
 import opennlp.tools.cmdline.lemmatizer.LemmatizerEvaluatorTool;
 import opennlp.tools.cmdline.lemmatizer.LemmatizerMETool;
@@ -90,6 +95,13 @@ public final class CLI {
     tools.add(new DoccatCrossValidatorTool());
     tools.add(new DoccatConverterTool());
 
+    // Language Detector
+    tools.add(new LanguageDetectorTool());
+    tools.add(new LanguageDetectorTrainerTool());
+    tools.add(new LanguageDetectorConverterTool());
+    tools.add(new LanguageDetectorCrossValidatorTool());
+    tools.add(new LanguageDetectorEvaluatorTool());
+
     // Dictionary Builder
     tools.add(new DictionaryBuilderTool());
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
index 714561a..75b84aa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
@@ -802,8 +802,8 @@ public abstract class FineGrainedReportListener {
       }
     }
 
-    public void add(String[] text, String ref, String pred) {
-      int length = text.length;
+    public void add(int length, String ref, String pred) {
+
       averageSentenceLength.add(length);
 
       if (minimalSentenceLength > length) {
@@ -820,7 +820,16 @@ public abstract class FineGrainedReportListener {
       updateTagFMeasure(refs, preds);
 
       commit("", ref, pred);
+    }
+
+    public void add(String[] text, String ref, String pred) {
+      int length = text.length;
+      this.add(length, ref, pred);
+    }
 
+    public void add(CharSequence text, String ref, String pred) {
+      int length = text.length();
+      this.add(length, ref, pred);
     }
 
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 3d68945..48b8025 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -29,6 +29,7 @@ import opennlp.tools.formats.ConllXSentenceSampleStreamFactory;
 import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
 import opennlp.tools.formats.DocumentSampleStreamFactory;
 import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
+import opennlp.tools.formats.LanguageDetectorSampleStreamFactory;
 import opennlp.tools.formats.LeipzigDocumentSampleStreamFactory;
 import opennlp.tools.formats.LemmatizerSampleStreamFactory;
 import opennlp.tools.formats.NameSampleDataStreamFactory;
@@ -56,6 +57,7 @@ import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
 import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
 import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory;
 import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
+import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
 import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
 import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -79,6 +81,7 @@ public final class StreamFactoryRegistry {
     TokenSampleStreamFactory.registerFactory();
     WordTagSampleStreamFactory.registerFactory();
     LemmatizerSampleStreamFactory.registerFactory();
+    LanguageDetectorSampleStreamFactory.registerFactory();
 
     NameToSentenceSampleStreamFactory.registerFactory();
     NameToTokenSampleStreamFactory.registerFactory();
@@ -124,6 +127,7 @@ public final class StreamFactoryRegistry {
 
     IrishSentenceBankSentenceStreamFactory.registerFactory();
     IrishSentenceBankTokenSampleStreamFactory.registerFactory();
+    LeipzigLanguageSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
new file mode 100644
index 0000000..69d9db7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.AbstractConverterTool;
+import opennlp.tools.langdetect.LanguageSample;
+
+public class LanguageDetectorConverterTool extends AbstractConverterTool<LanguageSample> {
+
+  public LanguageDetectorConverterTool() {
+    super(LanguageSample.class);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
new file mode 100644
index 0000000..bf68fbb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractCrossValidatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.CVParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorCrossValidator;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+import opennlp.tools.util.model.ModelUtil;
+
+public final class LanguageDetectorCrossValidatorTool extends
+    AbstractCrossValidatorTool<LanguageSample,
+        LanguageDetectorCrossValidatorTool.CVToolParams> {
+
+  interface CVToolParams extends CVParams, TrainingParams, FineGrainedEvaluatorParams {
+  }
+
+  public LanguageDetectorCrossValidatorTool() {
+    super(LanguageSample.class, CVToolParams.class);
+  }
+
+  public String getShortDescription() {
+    return "K-fold cross validator for the learnable Language Detector";
+  }
+
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new LanguageDetectorEvaluationErrorListener());
+    }
+
+    LanguageDetectorFineGrainedReportListener reportListener = null;
+    File reportFile = params.getReportOutputFile();
+    OutputStream reportOutputStream = null;
+    if (reportFile != null) {
+      CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+      try {
+        reportOutputStream = new FileOutputStream(reportFile);
+        reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+        listeners.add(reportListener);
+      } catch (FileNotFoundException e) {
+        throw createTerminationIOException(e);
+      }
+    }
+
+    LanguageDetectorEvaluationMonitor[] listenersArr = listeners
+        .toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]);
+
+    LanguageDetectorCrossValidator validator;
+    try {
+      LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory());
+      validator = new LanguageDetectorCrossValidator(mlParams,
+          factory, listenersArr);
+
+      validator.evaluate(sampleStream, params.getFolds());
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "IO error while reading training data or indexing data: " + e.getMessage(), e);
+    } finally {
+      try {
+        sampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    System.out.println("done");
+
+    if (reportListener != null) {
+      System.out.println("Writing fine-grained report to "
+          + params.getReportOutputFile().getAbsolutePath());
+      reportListener.writeReport();
+
+      try {
+        // TODO: is it a problem to close the stream now?
+        reportOutputStream.close();
+      } catch (IOException e) {
+        // nothing to do
+      }
+    }
+
+    System.out.println();
+
+    System.out.println("Accuracy: " + validator.getDocumentAccuracy() + "\n" +
+        "Number of documents: " + validator.getDocumentCount());
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
new file mode 100644
index 0000000..073ef31
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.EvaluationErrorPrinter;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * A default implementation of {@link EvaluationMonitor} that prints to an
+ * output stream.
+ *
+ */
+public class LanguageDetectorEvaluationErrorListener extends
+    EvaluationErrorPrinter<LanguageSample> implements LanguageDetectorEvaluationMonitor {
+
+  /**
+   * Creates a listener that will print to System.err
+   */
+  public LanguageDetectorEvaluationErrorListener() {
+    super(System.err);
+  }
+
+  /**
+   * Creates a listener that will print to a given {@link OutputStream}
+   */
+  public LanguageDetectorEvaluationErrorListener(OutputStream outputStream) {
+    super(outputStream);
+  }
+
+  @Override
+  public void missclassified(LanguageSample reference, LanguageSample prediction) {
+    printError(reference, prediction);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
new file mode 100644
index 0000000..fb929bf
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractEvaluatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EvaluatorParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorEvaluator;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+public final class LanguageDetectorEvaluatorTool extends
+    AbstractEvaluatorTool<LanguageSample, LanguageDetectorEvaluatorTool.EvalToolParams> {
+
+  interface EvalToolParams extends EvaluatorParams, FineGrainedEvaluatorParams {
+  }
+
+  public LanguageDetectorEvaluatorTool() {
+    super(LanguageSample.class, EvalToolParams.class);
+  }
+
+  public String getShortDescription() {
+    return "Measures the performance of the Language Detector model with the reference data";
+  }
+
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    LanguageDetectorModel model = new LanguageDetectorModelLoader().load(params.getModel());
+
+    List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new LanguageDetectorEvaluationErrorListener());
+    }
+
+    LanguageDetectorFineGrainedReportListener reportListener = null;
+    File reportFile = params.getReportOutputFile();
+    OutputStream reportOutputStream = null;
+    if (reportFile != null) {
+      CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+      try {
+        reportOutputStream = new FileOutputStream(reportFile);
+        reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+        listeners.add(reportListener);
+      } catch (FileNotFoundException e) {
+        throw new TerminateToolException(-1,
+            "IO error while creating LanguageDetector fine-grained report file: "
+                + e.getMessage(), e);
+      }
+    }
+
+    LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+        new LanguageDetectorME(model),
+        listeners.toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]));
+
+    final PerformanceMonitor monitor = new PerformanceMonitor("doc");
+
+    ObjectStream<LanguageSample> measuredSampleStream = new ObjectStream<LanguageSample>() {
+
+      public LanguageSample read() throws IOException {
+        monitor.incrementCounter();
+        return sampleStream.read();
+      }
+
+      public void reset() throws IOException {
+        sampleStream.reset();
+      }
+
+      public void close() throws IOException {
+        sampleStream.close();
+      }
+    };
+
+    monitor.startAndPrintThroughput();
+
+    try {
+      evaluator.evaluate(measuredSampleStream);
+    } catch (IOException e) {
+      System.err.println("failed");
+      throw new TerminateToolException(-1, "IO error while reading test data: "
+          + e.getMessage(), e);
+    } finally {
+      try {
+        measuredSampleStream.close();
+      } catch (IOException e) {
+        // closing is best effort; a failure here is deliberately ignored
+      }
+    }
+
+    monitor.stopAndPrintFinalResult();
+
+    System.out.println();
+
+    System.out.println(evaluator);
+
+    if (reportListener != null) {
+      System.out.println("Writing fine-grained report to "
+          + params.getReportOutputFile().getAbsolutePath());
+      reportListener.writeReport();
+
+      try {
+        // TODO: is it a problem to close the stream now?
+        reportOutputStream.close();
+      } catch (IOException e) {
+        // nothing to do
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
new file mode 100644
index 0000000..70bf3eb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.FineGrainedReportListener;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+
+/**
+ * Generates a detailed report for the Language Detector.
+ * <p>
+ * It is possible to use it from an API and access the statistics using the
+ * provided getters
+ */
+public class LanguageDetectorFineGrainedReportListener
+    extends FineGrainedReportListener implements LanguageDetectorEvaluationMonitor {
+
+  /**
+   * Creates a listener that will print to {@link System#err}
+   */
+  public LanguageDetectorFineGrainedReportListener() {
+    this(System.err);
+  }
+
+  /**
+   * Creates a listener that prints to a given {@link OutputStream}
+   */
+  public LanguageDetectorFineGrainedReportListener(OutputStream outputStream) {
+    super(outputStream);
+  }
+
+  // methods inherited from EvaluationMonitor
+
+  public void missclassified(LanguageSample reference, LanguageSample prediction) {
+    statsAdd(reference, prediction);
+  }
+
+  public void correctlyClassified(LanguageSample reference, LanguageSample prediction) {
+    statsAdd(reference, prediction);
+  }
+
+  private void statsAdd(LanguageSample reference, LanguageSample prediction) {
+    getStats().add(reference.getContext(),
+        reference.getLanguage().getLang(), prediction.getLanguage().getLang());
+  }
+
+  public void writeReport() {
+    printGeneralStatistics();
+    printTagsErrorRank();
+    printGeneralConfusionTable();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
new file mode 100644
index 0000000..c8700fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.cmdline.ModelLoader;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+
+/**
+ * Loads a Language Detector Model for the command line tools.
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LanguageDetectorModelLoader extends ModelLoader<LanguageDetectorModel> {
+
+  public LanguageDetectorModelLoader() {
+    super("Language Detector");
+  }
+
+  @Override
+  protected LanguageDetectorModel loadModel(InputStream modelIn) throws IOException {
+    return new LanguageDetectorModel(modelIn);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
new file mode 100644
index 0000000..6175fe3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.SystemInputStreamFactory;
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetector;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ParagraphStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+public class LanguageDetectorTool extends BasicCmdLineTool {
+
+  @Override
+  public String getShortDescription() {
+    return "learned language detector";
+  }
+
+  @Override
+  public String getHelp() {
+    return "Usage: " + CLI.CMD + " " + getName() + " model < documents";
+  }
+
+  @Override
+  public void run(String[] args) {
+
+    if (0 == args.length) {
+      System.out.println(getHelp());
+    } else {
+
+      LanguageDetectorModel model = new LanguageDetectorModelLoader().load(new File(args[0]));
+
+      LanguageDetector langDetectME = new LanguageDetectorME(model);
+
+      /*
+       * moved initialization to the try block to catch new IOException
+       */
+      ObjectStream<String> documentStream;
+
+      PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc");
+      perfMon.start();
+
+      try {
+        documentStream = new ParagraphStream(new PlainTextByLineStream(
+            new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
+        String document;
+        while ((document = documentStream.read()) != null) {
+
+          Language lang = langDetectME.predictLanguage(document);
+
+          LanguageSample sample = new LanguageSample(lang, document);
+          System.out.println(sample.toString());
+
+          perfMon.incrementCounter();
+        }
+      } catch (IOException e) {
+        CmdLineUtil.handleStdinIoError(e);
+      }
+
+      perfMon.stopAndPrintFinalResult();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
new file mode 100644
index 0000000..6735293
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.AbstractTrainerTool;
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.model.ModelUtil;
+
+public class LanguageDetectorTrainerTool
+    extends AbstractTrainerTool<LanguageSample, LanguageDetectorTrainerTool.TrainerToolParams> {
+
+  interface TrainerToolParams extends TrainingParams {
+    @ArgumentParser.ParameterDescription(valueName = "modelFile", description = "output model file.")
+    File getModel();
+
+    @ArgumentParser.ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+    @ArgumentParser.OptionalParameter()
+    String getParams();
+  }
+
+  public LanguageDetectorTrainerTool() {
+    super(LanguageSample.class, TrainerToolParams.class);
+  }
+
+  @Override
+  public String getShortDescription() {
+    return "trainer for the learnable language detector";
+  }
+
+  @Override
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    File modelOutFile = params.getModel();
+
+    CmdLineUtil.checkOutputFile("language detector model", modelOutFile);
+
+    LanguageDetectorModel model;
+    try {
+      LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory());
+      model = LanguageDetectorME.train(sampleStream, mlParams, factory);
+    } catch (IOException e) {
+      throw createTerminationIOException(e);
+    }
+    finally {
+      try {
+        sampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    CmdLineUtil.writeModel("language detector", modelOutFile, model);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
new file mode 100644
index 0000000..2937c3d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+
+/**
+ * TrainingParams for Language Detector.
+ *
+ * Note: Do not use this class, internal use only!
+ */
+interface TrainingParams {
+
+  @ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+  @OptionalParameter()
+  String getParams();
+
+  @ParameterDescription(valueName = "factoryName",
+      description = "A sub-class of LanguageDetectorFactory" +
+          " where to get implementation and resources.")
+  @OptionalParameter
+  String getFactory();
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
new file mode 100644
index 0000000..ef60063
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.langdetect.LanguageDetectorSampleStream;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Factory producing OpenNLP {@link LanguageDetectorSampleStream}s.
+ */
+public class LanguageDetectorSampleStreamFactory
+    extends AbstractSampleStreamFactory<LanguageSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LanguageSample.class,
+            StreamFactoryRegistry.DEFAULT_FORMAT,
+            new LanguageDetectorSampleStreamFactory(Parameters.class));
+  }
+
+  protected <P> LanguageDetectorSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  public ObjectStream<LanguageSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+    InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
+    ObjectStream<String> lineStream = null;
+    try {
+      lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new LanguageDetectorSampleStream(lineStream);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 8ed0036..7059e21 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -39,8 +39,11 @@ import opennlp.tools.util.PlainTextByLineStream;
  * <p>
  * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
  * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.ø
+ * exactly the same tokenization during testing and training.
+ *
+ * @deprecated will be removed, use the language detector instead
  */
+@Deprecated
 public class LeipzigDoccatSampleStream extends
     FilterObjectStream<String, DocumentSample> {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
index bd2453b..d6ff9ba 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
@@ -33,7 +33,10 @@ import opennlp.tools.util.ObjectStreamUtils;
 
 /**
  * <b>Note:</b> Do not use this class, internal use only!
+ *
+ * @deprecated will be removed, use the language detector instead
  */
+@Deprecated
 public class LeipzigDocumentSampleStreamFactory
     extends AbstractSampleStreamFactory<DocumentSample> {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
new file mode 100644
index 0000000..6c4d009
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> {
+
+  private class LeipzigSentencesStream implements ObjectStream<LanguageSample> {
+    private final String lang;
+    private int sentencesPerSample;
+    private int numberOfSamples;
+
+    private ObjectStream<String> lineStream;
+    private int sampleCount;
+
+    LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples)
+        throws IOException {
+      this.lang = sentencesFile.getName().substring(0, 3);
+      this.sentencesPerSample = sentencesPerSample;
+      this.numberOfSamples = numberOfSamples;
+
+      lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile),
+          StandardCharsets.UTF_8);
+    }
+
+    @Override
+    public LanguageSample read() throws IOException {
+
+      if (sampleCount < numberOfSamples) {
+        StringBuilder sampleString = new StringBuilder();
+
+        int count = 0;
+        String line;
+        while (count < sentencesPerSample && (line = lineStream.read()) != null) {
+
+          int textStart = line.indexOf('\t') + 1;
+
+          // TODO: Should this be changed to contain an array of sample strings?
+          sampleString.append(line.substring(textStart) + " ");
+
+          count++;
+        }
+
+        if (sampleString.length() > 0) {
+          sampleCount++;
+          return new LanguageSample(new Language(lang), sampleString);
+        }
+      }
+      return null;
+    }
+  }
+
+  private final int sentencesPerSample;
+
+  private Map<String, Integer> langSampleCounts;
+  private File[] sentencesFiles;
+
+  private Iterator<File> sentencesFilesIt;
+  private ObjectStream<LanguageSample> sampleStream;
+
+  public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
+                                     final int samplesPerLanguage) throws IOException {
+    this.sentencesPerSample = sentencesPerSample;
+    // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored
+    sentencesFiles = leipzigFolder.listFiles();
+    Arrays.sort(sentencesFiles);
+
+    Map<String, Integer> langCounts = Arrays.stream(sentencesFiles)
+        .map(file -> file.getName().substring(0, 3))
+        .collect(Collectors.groupingBy(String::toString, Collectors.summingInt(v -> 1)));
+
+    langSampleCounts = langCounts.entrySet().stream()
+        .collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue()));
+
+    reset();
+  }
+
+  public LanguageSample read() throws IOException {
+    LanguageSample sample;
+    if (sampleStream != null && (sample = sampleStream.read()) != null) {
+      return sample;
+    }
+    else {
+      if (sentencesFilesIt.hasNext()) {
+        File sentencesFile = sentencesFilesIt.next();
+        System.out.println(sentencesFile);
+        String lang = sentencesFile.getName().substring(0, 3);
+
+        sampleStream = new LeipzigSentencesStream(lang, sentencesFile,
+            sentencesPerSample, langSampleCounts.get(lang));
+
+        return read();
+      }
+    }
+    return null;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    sentencesFilesIt = Arrays.asList(sentencesFiles).iterator();
+    sampleStream = null;
+  }
+
+  public static void main(String[] args) throws Exception {
+    new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"),
+        10, 100000);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
new file mode 100644
index 0000000..59a7551
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EncodingParameter;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LeipzigLanguageSampleStreamFactory
+    extends AbstractSampleStreamFactory<LanguageSample> {
+
+  interface Parameters extends EncodingParameter {
+    @ParameterDescription(valueName = "sentencesDir",
+        description = "dir with Leipig sentences to be used")
+    File getSentencesDir();
+
+    @ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+
+    @ParameterDescription(valueName = "samplesPerLanguage",
+        description = "number of samples per language")
+    String getSamplesPerLanguage();
+  }
+
+  protected <P> LeipzigLanguageSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LanguageSample.class,
+        "leipzig", new LeipzigLanguageSampleStreamFactory(Parameters.class));
+  }
+
+  public ObjectStream<LanguageSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    File sentencesFileDir = params.getSentencesDir();
+
+    try {
+      return new LeipzigLanguageSampleStream(sentencesFileDir,
+          Integer.parseInt(params.getSentencesPerSample()),
+          Integer.parseInt(params.getSamplesPerLanguage()));
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "IO error while opening sample data.", e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
new file mode 100644
index 0000000..f780759
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class for holding the document language and its confidence
+ */
+public class Language {
+  private final String lang;
+  private final double confidence;
+
+  public Language(String lang) {
+    this(lang, 0);
+  }
+
+  public Language(String lang, double confidence) {
+    Objects.requireNonNull(lang, "lang must not be null");
+    this.lang = lang;
+    this.confidence = confidence;
+  }
+
+  public String getLang() {
+    return lang;
+  }
+
+  public double getConfidence() {
+    return confidence;
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(getLang()).append(" (").append(this.confidence).append(")");
+    return sb.toString();
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getLang(), getConfidence());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof Language) {
+      Language a = (Language) obj;
+
+      return getLang().equals(a.getLang());
+    }
+
+    return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
new file mode 100644
index 0000000..0004494
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+/**
+ * The interface for language detectors, which predict the {@link Language} of the given content.
+ */
+public interface LanguageDetector {
+
+  Language[] predictLanguages(CharSequence content);
+
+  Language predictLanguage(CharSequence content);
+
+  String[] getSupportedLanguages();
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
new file mode 100644
index 0000000..b28c601
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
+
+/**
+ * Context generator for the language detector.
+ */
+class LanguageDetectorContextGenerator {
+
+  private final int minLength;
+  private final int maxLength;
+  private final CharSequenceNormalizer normalizer;
+
+  LanguageDetectorContextGenerator(int minLength, int maxLength) {
+    this.minLength = minLength;
+    this.maxLength = maxLength;
+
+    this.normalizer = new AggregateCharSequenceNormalizer(
+        EmojiCharSequenceNormalizer.getInstance(),
+        UrlCharSequenceNormalizer.getInstance(),
+        TwitterCharSequenceNormalizer.getInstance(),
+        NumberCharSequenceNormalizer.getInstance(),
+        UnicodeCharSequenceNormalizer.getInstance(),
+        ShrinkCharSequenceNormalizer.getInstance()
+    );
+  }
+
+  /**
+   * Initializes the current instance with a minimum ngram length of 2 and a maximum of 3.
+   */
+  LanguageDetectorContextGenerator() {
+    this(2, 3);
+  }
+
+  public String[] getContext(String document) {
+
+    Collection<String> context = new LinkedList<>();
+
+    NGramModel model = new NGramModel();
+    String normalized = normalizer.normalize(document).toString();
+    model.add(normalized, minLength, maxLength);
+
+    for (StringList tokenList : model) {
+      if (tokenList.size() > 0) {
+        context.add(StringUtil.toLowerCase(tokenList.getToken(0)));
+      }
+    }
+    return context.toArray(new String[context.size()]);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
new file mode 100644
index 0000000..ce1823a
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.eval.CrossValidationPartitioner;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * Cross validator for language detector
+ */
+public class LanguageDetectorCrossValidator {
+
+  private final TrainingParameters params;
+
+  private Mean documentAccuracy = new Mean();
+
+  private LanguageDetectorEvaluationMonitor[] listeners;
+
+  private LanguageDetectorFactory factory;
+
+
+  /**
+   * Creates a {@link LanguageDetectorCrossValidator} with the given training
+   * parameters, {@link LanguageDetectorFactory} and evaluation listeners.
+   */
+  public LanguageDetectorCrossValidator(TrainingParameters mlParams,
+                                        LanguageDetectorFactory factory,
+                                        LanguageDetectorEvaluationMonitor ... listeners) {
+    this.params = mlParams;
+    this.listeners = listeners;
+    this.factory = factory;
+  }
+
+  /**
+   * Starts the evaluation.
+   *
+   * @param samples
+   *          the data to train and test
+   * @param nFolds
+   *          number of folds
+   *
+   * @throws IOException
+   */
+  public void evaluate(ObjectStream<LanguageSample> samples, int nFolds)
+      throws IOException {
+
+    CrossValidationPartitioner<LanguageSample> partitioner =
+        new CrossValidationPartitioner<>(samples, nFolds);
+
+    while (partitioner.hasNext()) {
+
+      CrossValidationPartitioner.TrainingSampleStream<LanguageSample> trainingSampleStream =
+          partitioner.next();
+
+      LanguageDetectorModel model = LanguageDetectorME.train(
+          trainingSampleStream, params, factory);
+
+      LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+          new LanguageDetectorME(model), listeners);
+
+      evaluator.evaluate(trainingSampleStream.getTestSampleStream());
+
+      documentAccuracy.add(evaluator.getAccuracy(),
+          evaluator.getDocumentCount());
+
+    }
+  }
+
+  /**
+   * Retrieves the accuracy for all iterations.
+   *
+   * @return the document accuracy
+   */
+  public double getDocumentAccuracy() {
+    return documentAccuracy.mean();
+  }
+
+  /**
+   * Retrieves the number of documents which were validated over all iterations.
+   * The result is the sum of the document counts reported by each fold's evaluation.
+   *
+   * @return the document count
+   */
+  public long getDocumentCount() {
+    return documentAccuracy.count();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
new file mode 100644
index 0000000..30f3313
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * {@link EvaluationMonitor} for Language Detector.
+ */
+public interface LanguageDetectorEvaluationMonitor extends
+    EvaluationMonitor<LanguageSample> {
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
new file mode 100644
index 0000000..bbf73c3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.doccat.DocumentCategorizer;
+import opennlp.tools.util.eval.Evaluator;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * The {@link LanguageDetectorEvaluator} measures the performance of
+ * the given {@link LanguageDetector} with the provided reference
+ * {@link LanguageSample}s.
+ *
+ * @see LanguageDetector
+ * @see LanguageSample
+ */
+public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> {
+
+  private LanguageDetector languageDetector;
+
+  private Mean accuracy = new Mean();
+
+  /**
+   * Initializes the current instance.
+   *
+   * @param langDetect the language detector instance
+   */
+  public LanguageDetectorEvaluator(LanguageDetector langDetect,
+                                   LanguageDetectorEvaluationMonitor ... listeners) {
+    super(listeners);
+    this.languageDetector = langDetect;
+  }
+
+  /**
+   * Evaluates the given reference {@link LanguageSample} object.
+   *
+   * This is done by detecting the language of the document from the provided
+   * {@link LanguageSample}. The detected language is then used
+   * to calculate and update the score.
+   *
+   * @param sample the reference {@link LanguageSample}.
+   */
+  public LanguageSample processSample(LanguageSample sample) {
+
+    CharSequence document = sample.getContext();
+
+    Language predicted = languageDetector.predictLanguage(document);
+
+
+
+    if (sample.getLanguage().getLang().equals(predicted.getLang())) {
+      accuracy.add(1);
+    }
+    else {
+      accuracy.add(0);
+    }
+
+    return new LanguageSample(predicted, sample.getContext());
+  }
+
+  /**
+   * Retrieves the accuracy of the provided {@link LanguageDetector}.
+   *
+   * accuracy = correctly categorized documents / total documents
+   *
+   * @return the accuracy
+   */
+  public double getAccuracy() {
+    return accuracy.mean();
+  }
+
+  public long getDocumentCount() {
+    return accuracy.count();
+  }
+
+  /**
+   * Represents this object as a human-readable {@link String}.
+   */
+  @Override
+  public String toString() {
+    return "Accuracy: " + accuracy.mean() + "\n" +
+        "Number of documents: " + accuracy.count();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
new file mode 100644
index 0000000..b556a4d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Iterator;
+
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.AbstractEventStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Iterator-like class for modeling language detector events.
+ */
+public class LanguageDetectorEventStream extends AbstractEventStream<LanguageSample> {
+
+  private LanguageDetectorContextGenerator mContextGenerator;
+
+  /**
+   * Initializes the current instance with the given samples; a default context generator is used.
+   *
+   * @param data {@link ObjectStream} of {@link LanguageSample}s
+   */
+  public LanguageDetectorEventStream(ObjectStream<LanguageSample> data) {
+    super(data);
+
+    mContextGenerator =
+        new LanguageDetectorContextGenerator();
+  }
+
+  @Override
+  protected Iterator<Event> createEvents(final LanguageSample sample) {
+
+    return new Iterator<Event>() {
+
+      private boolean isVirgin = true;
+
+      public boolean hasNext() {
+        return isVirgin;
+      }
+
+      public Event next() {
+
+        isVirgin = false;
+
+        return new Event(sample.getLanguage().getLang(),
+            mContextGenerator.getContext(sample.getContext().toString()));
+      }
+
+      public void remove() {
+        throw new UnsupportedOperationException();
+      }
+    };
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
new file mode 100644
index 0000000..5cebbba
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+
+public class LanguageDetectorFactory extends BaseToolFactory {
+
+  public static LanguageDetectorFactory create(String subclassName)
+      throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new LanguageDetectorFactory();
+    }
+    try {
+      LanguageDetectorFactory theFactory = ExtensionLoader.instantiateExtension(
+          LanguageDetectorFactory.class, subclassName);
+      theFactory.init();
+      return theFactory;
+    } catch (Exception e) {
+      String msg = "Could not instantiate the " + subclassName
+          + ". The initialization throw an exception.";
+      throw new InvalidFormatException(msg, e);
+    }
+  }
+
+  public void init() {
+    // nothing to do
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // nothing to validate
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
new file mode 100644
index 0000000..74a1cea
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.ml.AbstractEventTrainer;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Implements learnable Language Detector
+ */
+public class LanguageDetectorME implements LanguageDetector {
+
+  private LanguageDetectorModel model;
+  private LanguageDetectorContextGenerator mContextGenerator;
+
+  /**
+   * Initializes the current instance with a language detector model. Default feature
+   * generation is used.
+   *
+   * @param model the language detector model
+   */
+  public LanguageDetectorME(LanguageDetectorModel model) {
+    this.model = model;
+    this.mContextGenerator = new LanguageDetectorContextGenerator();
+  }
+
+  @Override
+  public Language[] predictLanguages(CharSequence content) {
+    double[] eval = model.getMaxentModel().eval(mContextGenerator.getContext(content.toString()));
+    Language[] arr = new Language[eval.length];
+    for (int i = 0; i < eval.length; i++) {
+      arr[i] = new Language(model.getMaxentModel().getOutcome(i), eval[i]);
+    }
+
+    Arrays.sort(arr, (o1, o2) -> Double.compare(o2.getConfidence(), o1.getConfidence()));
+    return arr;
+  }
+
+  @Override
+  public Language predictLanguage(CharSequence content) {
+    return predictLanguages(content)[0];
+  }
+
+  @Override
+  public String[] getSupportedLanguages() {
+    int numberLanguages = model.getMaxentModel().getNumOutcomes();
+    String[] languages = new String[numberLanguages];
+    for (int i = 0; i < numberLanguages; i++) {
+      languages[i] = model.getMaxentModel().getOutcome(i);
+    }
+    return languages;
+  }
+
+
+  public static LanguageDetectorModel train(ObjectStream<LanguageSample> samples,
+                                            TrainingParameters mlParams,
+                                            LanguageDetectorFactory factory)
+      throws IOException {
+
+    Map<String, String> manifestInfoEntries = new HashMap<>();
+
+    mlParams.putIfAbsent(AbstractEventTrainer.DATA_INDEXER_PARAM,
+        AbstractEventTrainer.DATA_INDEXER_ONE_PASS_VALUE);
+
+    EventTrainer trainer = TrainerFactory.getEventTrainer(
+        mlParams, manifestInfoEntries);
+
+    MaxentModel model = trainer.train(
+        new LanguageDetectorEventStream(samples));
+
+    return new LanguageDetectorModel(model, manifestInfoEntries, factory);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
new file mode 100644
index 0000000..c0d9703
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * A model for language detection
+ */
+public class LanguageDetectorModel extends BaseModel {
+
+  private static final String COMPONENT_NAME = "LanguageDetectorME";
+  private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model";
+
+  public LanguageDetectorModel(MaxentModel langdetectModel,
+                               Map<String, String> manifestInfoEntries,
+                               LanguageDetectorFactory factory) {
+    super(COMPONENT_NAME, "und", manifestInfoEntries, factory);
+
+    artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel);
+    checkArtifactMap();
+  }
+
+  public LanguageDetectorModel(InputStream in) throws IOException {
+    super(COMPONENT_NAME, in);
+  }
+
+  public LanguageDetectorModel(File modelFile) throws IOException {
+    super(COMPONENT_NAME, modelFile);
+  }
+
+  public LanguageDetectorModel(URL modelURL) throws IOException {
+    super(COMPONENT_NAME, modelURL);
+  }
+
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    if (!(artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
+      throw new InvalidFormatException("Language detector model is incomplete!");
+    }
+  }
+
+  public LanguageDetectorFactory getFactory() {
+    return (LanguageDetectorFactory) this.toolFactory;
+  }
+
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return LanguageDetectorFactory.class;
+  }
+
+  public MaxentModel getMaxentModel() {
+    return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
new file mode 100644
index 0000000..2a407f7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * This class reads in string encoded training samples, parses them and
+ * outputs {@link LanguageSample} objects.
+ * <p>
+ * Format:<br>
+ * Each line contains one sample document.<br>
+ * The language is the first string in the line followed by a tab and the document content.<br>
+ * Sample line: language-string tab-char document line-break-char(s)<br>
+ */
+public class LanguageDetectorSampleStream
+    extends FilterObjectStream<String, LanguageSample> {
+
+  public LanguageDetectorSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  public LanguageSample read() throws IOException {
+    String sampleString;
+    while ((sampleString = samples.read()) != null) {
+      int tabIndex = sampleString.indexOf("\t");
+      if (tabIndex > 0) {
+        String lang = sampleString.substring(0, tabIndex);
+        String context = sampleString.substring(tabIndex + 1);
+
+        return new LanguageSample(new Language(lang), context);
+      }
+    }
+
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
new file mode 100644
index 0000000..f454864
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class which holds a classified document and its {@link Language}.
+ */
+public class LanguageSample {
+
+  private final Language language;
+  private final CharSequence context;
+
+  public LanguageSample(Language language, CharSequence context) {
+    this.language = Objects.requireNonNull(language, "language must not be null");
+    this.context = Objects.requireNonNull(context, "context must not be null");
+  }
+
+  public Language getLanguage() {
+    return language;
+  }
+
+  public CharSequence getContext() {
+    return context;
+  }
+
+  @Override
+  public String toString() {
+    return language.getLang() + '\t' +  context;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getContext(), getLanguage());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof LanguageSample) {
+      LanguageSample a = (LanguageSample) obj;
+
+      return getLanguage().equals(a.getLanguage())
+          && getContext().equals(a.getContext());
+    }
+
+    return false;
+  }
+}