You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/16 09:54:29 UTC

[1/2] opennlp git commit: OPENNLP-788: Add LanguageDetector tool

Repository: opennlp
Updated Branches:
  refs/heads/master 8d7e1c3c5 -> 560c48438


http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
new file mode 100644
index 0000000..771be19
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that applies a fixed sequence of other
+ * normalizers, feeding the output of each one into the next.
+ */
+public class AggregateCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private final CharSequenceNormalizer[] normalizers;
+
+  /**
+   * Creates a normalizer that chains the given normalizers in argument order.
+   *
+   * @param normalizers the normalizers to apply, first to last; must not be {@code null}
+   */
+  public AggregateCharSequenceNormalizer(CharSequenceNormalizer ... normalizers) {
+    // Copy the array so later changes made by the caller cannot alter the chain.
+    this.normalizers = normalizers.clone();
+  }
+
+  /**
+   * Runs the text through every configured normalizer in order.
+   *
+   * @param text the text to normalize
+   * @return the output of the last normalizer, or {@code text} itself when no
+   *     normalizers were configured
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+
+    // The loop variable used to be called "normalizers" as well, shadowing the field.
+    for (CharSequenceNormalizer normalizer : normalizers) {
+      text = normalizer.normalize(text);
+    }
+
+    return text;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
new file mode 100644
index 0000000..b5c1f3f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+/**
+ * Contract for components that turn a piece of text into a normalized form.
+ */
+public interface CharSequenceNormalizer {
+  /**
+   * Normalizes the given text.
+   *
+   * @param text the text to normalize
+   * @return the normalized text
+   */
+  CharSequence normalize(CharSequence text);
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
new file mode 100644
index 0000000..d1c161c
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * A {@link CharSequenceNormalizer} that replaces every run of characters matched
+ * by {@code EMOJI_REGEX} with a single space; intended to strip emoji from text.
+ */
+public class EmojiCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  // NOTE(review): the escapes \uDBFF\uDC00 are adjacent, and java.util.regex
+  // documents that two consecutive escapes forming a surrogate pair denote one
+  // supplementary character. The class may therefore be read as a code point
+  // range starting at U+D83C rather than as two surrogate code unit ranges,
+  // which would also cover BMP characters above U+D83C - confirm intended coverage.
+  private static final Pattern EMOJI_REGEX =
+      Pattern.compile("[\\uD83C-\\uDBFF\\uDC00-\\uDFFF]+");
+
+  // Declared after the pattern so all static state exists before the instance is built.
+  private static final EmojiCharSequenceNormalizer INSTANCE = new EmojiCharSequenceNormalizer();
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static EmojiCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Replaces each run of matched characters with one space.
+   *
+   * @param text the text to normalize
+   * @return a new string with every match of the emoji pattern replaced by {@code " "}
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    return EMOJI_REGEX.matcher(text).replaceAll(" ");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
new file mode 100644
index 0000000..6b0452d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * A {@link CharSequenceNormalizer} that replaces every run of digits with a
+ * single space.
+ */
+public class NumberCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  // One or more consecutive digits; the whole run becomes one space.
+  private static final Pattern NUMBER_REGEX = Pattern.compile("\\d+");
+
+  private static final NumberCharSequenceNormalizer INSTANCE = new NumberCharSequenceNormalizer();
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static NumberCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Replaces each run of digits with one space.
+   *
+   * @param text the text to normalize
+   * @return a new string with every digit run replaced by {@code " "}
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    return NUMBER_REGEX.matcher(text).replaceAll(" ");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
new file mode 100644
index 0000000..6183367
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * A {@link CharSequenceNormalizer} that shrinks repeated characters: runs of
+ * whitespace collapse to a single space and a character repeated three or more
+ * times in a row is reduced to two occurrences.
+ */
+public class ShrinkCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  // A character followed by at least two more copies of itself, ignoring case.
+  private static final Pattern REPEATED_CHAR_REGEX = Pattern.compile("(.)\\1{2,}",
+      Pattern.CASE_INSENSITIVE);
+  // Two or more whitespace characters. Whitespace has no case, so the
+  // CASE_INSENSITIVE flag previously passed here had no effect and was dropped.
+  private static final Pattern SPACE_REGEX = Pattern.compile("\\s{2,}");
+
+  private static final ShrinkCharSequenceNormalizer INSTANCE = new ShrinkCharSequenceNormalizer();
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static ShrinkCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Collapses whitespace runs, shortens repeated characters and trims the result.
+   *
+   * @param text the text to normalize
+   * @return the shrunk text without leading or trailing whitespace
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    // Whitespace goes first, so a long run of blanks ends up as one space.
+    text = SPACE_REGEX.matcher(text).replaceAll(" ");
+    // "$1$1" keeps two copies of the first character of each repeated run.
+    return REPEATED_CHAR_REGEX.matcher(text).replaceAll("$1$1").trim();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
new file mode 100644
index 0000000..b5a8625
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * A {@link CharSequenceNormalizer} for tweet-like text: it blanks out hashtags,
+ * user mentions, retweet markers and simple emoticons, and shortens repeated
+ * laughter such as "hahaha".
+ */
+public class TwitterCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  // '#' or '@' followed by one or more non-whitespace characters.
+  private static final Pattern HASH_USER_REGEX =
+      Pattern.compile("[#@]\\S+");
+
+  // One or more "rt " / "rt:" markers starting at a word boundary, any case.
+  private static final Pattern RT_REGEX =
+      Pattern.compile("\\b(rt[ :])+", Pattern.CASE_INSENSITIVE);
+
+  // Emoticons such as ":)", ";-D" or "xP".
+  // NOTE(review): the pattern has no word boundaries, so it also matches letter
+  // pairs inside ordinary words (for example "xp" in "explain") - confirm intended.
+  private static final Pattern FACE_REGEX =
+      Pattern.compile("[:;x]-?[()dop]", Pattern.CASE_INSENSITIVE);
+
+  // A consonant/vowel laugh syllable ([hj] then a vowel) repeated at least twice.
+  private static final Pattern LAUGH_REGEX =
+      Pattern.compile("([hj])+([aieou])+(\\1+\\2+)+", Pattern.CASE_INSENSITIVE);
+
+  private static final TwitterCharSequenceNormalizer INSTANCE = new TwitterCharSequenceNormalizer();
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static TwitterCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Applies the four replacements in a fixed order: hashtags and mentions,
+   * retweet markers, emoticons, then laughter.
+   *
+   * @param text the text to normalize
+   * @return the text with the first three kinds of match replaced by a space
+   *     and each laugh reduced to two syllables
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    String modified = HASH_USER_REGEX.matcher(text).replaceAll(" ");
+    modified = RT_REGEX.matcher(modified).replaceAll(" ");
+    modified = FACE_REGEX.matcher(modified).replaceAll(" ");
+    // "$1$2$1$2" rewrites the whole laugh as exactly two syllables, e.g. "haha".
+    modified = LAUGH_REGEX.matcher(modified).replaceAll("$1$2$1$2");
+    return modified;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
new file mode 100644
index 0000000..4be9b63
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * A {@link CharSequenceNormalizer} that replaces http(s) URLs and e-mail
+ * addresses with a single space.
+ */
+public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  // "http://" or "https://" followed by characters from a fixed set. The set
+  // contains neither ':' nor '%', so a port number or a percent escape ends the match.
+  private static final Pattern URL_REGEX =
+      Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
+  // local-part '@' domain, built from ASCII letters, digits, '-', '_' and '.'.
+  private static final Pattern MAIL_REGEX =
+      Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
+
+  private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer();
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static UrlCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Replaces URLs first and e-mail addresses second, each with one space.
+   *
+   * @param text the text to normalize
+   * @return a new string with every URL and e-mail match replaced by {@code " "}
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+    String modified = URL_REGEX.matcher(text).replaceAll(" ");
+    return MAIL_REGEX.matcher(modified).replaceAll(" ");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
new file mode 100644
index 0000000..1aae887
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+/**
+ * Test-only {@link LanguageDetectorFactory} that supplies a custom context
+ * generator, used to check that a factory subclass is honoured.
+ */
+public class DummyFactory extends LanguageDetectorFactory {
+
+
+  public DummyFactory() {
+    super();
+  }
+
+  @Override
+  public void init() {
+    super.init();
+  }
+
+  /**
+   * @return a {@link MyContectGenerator} configured with the bounds 2 and 5 and
+   *     an {@link UpperCaseNormalizer} as its only normalizer
+   */
+  @Override
+  public LanguageDetectorContextGenerator getContextGenerator() {
+    return new DummyFactory.MyContectGenerator(2, 5,
+        new DummyFactory.UpperCaseNormalizer());
+  }
+
+  /**
+   * Normalizer that converts the text to upper case.
+   */
+  public class UpperCaseNormalizer implements CharSequenceNormalizer {
+    @Override
+    public CharSequence normalize(CharSequence text) {
+      return text.toString().toUpperCase();
+    }
+  }
+
+  /**
+   * Context generator that extends the inherited features with token n-gram
+   * features prefixed by {@code "tg="}.
+   */
+  public class MyContectGenerator extends LanguageDetectorContextGenerator {
+
+    public MyContectGenerator(int min, int max, CharSequenceNormalizer... normalizers) {
+      super(min, max, normalizers);
+    }
+
+    /**
+     * Returns the features produced by the superclass followed by one
+     * {@code "tg=" + ngram} feature for every token n-gram of size 1 to 3
+     * found in the normalized document.
+     *
+     * @param document the text to extract features from
+     * @return the combined feature array
+     */
+    @Override
+    public String[] getContext(String document) {
+      String[] superContext = super.getContext(document);
+
+      // Parameterized instead of the raw ArrayList, so no unchecked conversion.
+      List<String> context = new ArrayList<String>(Arrays.asList(superContext));
+
+      document = this.normalizer.normalize(document).toString();
+
+      SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+      String[] words = tokenizer.tokenize(document);
+      NGramModel tokenNgramModel = new NGramModel();
+      if (words.length > 0) {
+        tokenNgramModel.add(new StringList(words), 1, 3);
+        // Typed iterator removes the cast the raw Iterator required.
+        Iterator<StringList> tokenNgramIterator = tokenNgramModel.iterator();
+
+        while (tokenNgramIterator.hasNext()) {
+          StringList tokenList = tokenNgramIterator.next();
+          if (tokenList.size() > 0) {
+            context.add("tg=" + tokenList.toString());
+          }
+        }
+      }
+
+      return context.toArray(new String[context.size()]);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
new file mode 100644
index 0000000..dc6ca26
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageDetectorContextGeneratorTest {
+
+  @Test
+  public void extractContext() throws Exception {
+    String doc = "abcde fghijk";
+
+    LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator(1, 3);
+
+    Collection<String> features = Arrays.asList(cg.getContext(doc));
+
+    Assert.assertEquals(33, features.size());
+    Assert.assertTrue(features.contains("ab"));
+    Assert.assertTrue(features.contains("abc"));
+    Assert.assertTrue(features.contains("e f"));
+    Assert.assertTrue(features.contains(" fg"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
new file mode 100644
index 0000000..520fc71
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Runs a two-fold cross validation with the default factory and checks the
+ * reported document count and accuracy.
+ */
+public class LanguageDetectorCrossValidatorTest {
+
+  @Test
+  public void evaluate() throws Exception {
+
+    TrainingParameters trainingParams = new TrainingParameters();
+    trainingParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    trainingParams.put(TrainingParameters.CUTOFF_PARAM, 5);
+    trainingParams.put("PrintMessages", false);
+
+    // The counters are fed by the monitor below; this test does not assert on them.
+    final AtomicInteger hits = new AtomicInteger();
+    final AtomicInteger misses = new AtomicInteger();
+
+    LanguageDetectorCrossValidator crossValidator = new LanguageDetectorCrossValidator(
+        trainingParams, new LanguageDetectorFactory(), new LanguageDetectorEvaluationMonitor() {
+          @Override
+          public void correctlyClassified(LanguageSample reference,
+                                          LanguageSample prediction) {
+            hits.incrementAndGet();
+          }
+
+          @Override
+          public void missclassified(LanguageSample reference,
+                                     LanguageSample prediction) {
+            misses.incrementAndGet();
+          }
+        });
+
+    LanguageDetectorSampleStream samples = LanguageDetectorMETest.createSampleStream();
+
+    // Second argument is the number of folds.
+    crossValidator.evaluate(samples, 2);
+
+    Assert.assertEquals(99, crossValidator.getDocumentCount());
+    Assert.assertEquals(0.98989898989899, crossValidator.getDocumentAccuracy(), 0.01);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
new file mode 100644
index 0000000..8bdd71b
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Evaluates one sentence against three reference labels and checks the
+ * monitor callbacks, the document count and the accuracy.
+ */
+public class LanguageDetectorEvaluatorTest {
+
+  @Test
+  public void processSample() throws Exception {
+    LanguageDetectorME detector = new LanguageDetectorME(LanguageDetectorMETest.trainModel());
+
+    final AtomicInteger hits = new AtomicInteger();
+    final AtomicInteger misses = new AtomicInteger();
+
+    LanguageDetectorEvaluationMonitor countingMonitor = new LanguageDetectorEvaluationMonitor() {
+      @Override
+      public void correctlyClassified(LanguageSample reference,
+                                      LanguageSample prediction) {
+        hits.incrementAndGet();
+      }
+
+      @Override
+      public void missclassified(LanguageSample reference,
+                                 LanguageSample prediction) {
+        misses.incrementAndGet();
+      }
+    };
+
+    LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(detector, countingMonitor);
+
+    // Same sentence each time; only the reference label changes.
+    String sentence = "escreve e faz palestras pelo mundo inteiro sobre anjos";
+
+    evaluator.evaluateSample(new LanguageSample(new Language("pob"), sentence));
+    evaluator.evaluateSample(new LanguageSample(new Language("fra"), sentence));
+    evaluator.evaluateSample(new LanguageSample(new Language("fra"), sentence));
+
+    // One of the three reference labels is expected to match the prediction.
+    Assert.assertEquals(1, hits.get());
+    Assert.assertEquals(2, misses.get());
+
+    Assert.assertEquals(3, evaluator.getDocumentCount());
+    Assert.assertEquals(0.33, evaluator.getAccuracy(), 0.01);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
new file mode 100644
index 0000000..60afef2
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Trains a model with {@link DummyFactory} and checks that the custom factory
+ * survives serialization and that its context generator is the one in use.
+ */
+public class LanguageDetectorFactoryTest {
+
+
+  private static LanguageDetectorModel model;
+
+  /**
+   * Trains the shared model once for all tests in this class.
+   */
+  @BeforeClass
+  public static void train() throws Exception {
+
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8");
+
+    LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+    params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
+
+    model = LanguageDetectorME.train(sampleStream, params, new DummyFactory());
+  }
+
+  /**
+   * A model read back from its serialized form must report a {@link DummyFactory}.
+   */
+  @Test
+  public void testCorrectFactory() throws IOException {
+    byte[] serialized = LanguageDetectorMETest.serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+
+  }
+
+  /**
+   * Performs the same round-trip check as {@link #testCorrectFactory()}.
+   */
+  @Test
+  public void testDummyFactory() throws Exception {
+    byte[] serialized = LanguageDetectorMETest.serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+  }
+
+  /**
+   * The context generator obtained from the model must produce the features
+   * specific to {@link DummyFactory}.
+   */
+  @Test
+  public void testDummyFactoryContextGenerator() throws Exception {
+    LanguageDetectorContextGenerator cg = model.getFactory().getContextGenerator();
+    String[] context = cg.getContext(
+        "a dummy text phrase to test if the context generator works!!!!!!!!!!!!");
+
+    // Parameterized instead of the raw HashSet, so no unchecked conversion.
+    Set<String> set = new HashSet<String>(Arrays.asList(context));
+
+    Assert.assertTrue(set.contains("!!!!!")); // default normalizer would remove the repeated !
+    Assert.assertTrue(set.contains("a dum"));
+    Assert.assertTrue(set.contains("tg=[THE,CONTEXT,GENERATOR]"));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
new file mode 100644
index 0000000..e5ee8aa
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.langdetect;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+
+/** Trains a detector from the DoccatSample.txt resource and checks prediction and loading. */
+public class LanguageDetectorMETest {
+
+  private LanguageDetectorModel model;
+
+  /** Trains a new model with a plain {@link LanguageDetectorFactory} before each test. */
+  @Before
+  public void init() throws Exception {
+    model = trainModel();
+  }
+
+  /** All four languages are expected back, ordered as asserted below. */
+  @Test
+  public void testPredictLanguages() {
+    Language[] ranked =
+        new LanguageDetectorME(model).predictLanguages("estava em uma marcenaria na Rua Bruno");
+
+    Assert.assertEquals(4, ranked.length);
+    Assert.assertEquals("pob", ranked[0].getLang());
+    Assert.assertEquals("ita", ranked[1].getLang());
+    Assert.assertEquals("spa", ranked[2].getLang());
+    Assert.assertEquals("fra", ranked[3].getLang());
+  }
+
+  /** The single best prediction for the sample sentence is expected to be "ita". */
+  @Test
+  public void testPredictLanguage() {
+    LanguageDetector detector = new LanguageDetectorME(model);
+    Language best = detector.predictLanguage("Dove è meglio che giochi");
+
+    Assert.assertEquals("ita", best.getLang());
+  }
+
+  /** The detector is expected to report four supported languages. */
+  @Test
+  public void testSupportedLanguages() {
+    LanguageDetector detector = new LanguageDetectorME(model);
+    String[] supported = detector.getSupportedLanguages();
+
+    Assert.assertEquals(4, supported.length);
+  }
+
+  /** A model must be constructible from its own serialized bytes. */
+  @Test
+  public void testLoadFromSerialized() throws IOException {
+    ByteArrayInputStream modelIn = new ByteArrayInputStream(serializeModel(model));
+
+    Assert.assertNotNull(new LanguageDetectorModel(modelIn));
+  }
+
+  /** Serializes the model into an in-memory buffer and returns the written bytes. */
+  protected static byte[] serializeModel(LanguageDetectorModel model) throws IOException {
+    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+    model.serialize(buffer);
+    return buffer.toByteArray();
+  }
+
+  /** Trains a model with a plain {@link LanguageDetectorFactory}. */
+  public static LanguageDetectorModel trainModel() throws Exception {
+    return trainModel(new LanguageDetectorFactory());
+  }
+
+  /** Trains a model on the shared sample stream with the supplied factory. */
+  public static LanguageDetectorModel trainModel(LanguageDetectorFactory factory) throws Exception {
+    LanguageDetectorSampleStream samples = createSampleStream();
+
+    TrainingParameters settings = new TrainingParameters();
+    settings.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    settings.put(TrainingParameters.CUTOFF_PARAM, 5);
+    settings.put("DataIndexer", "TwoPass");
+    settings.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
+
+    return LanguageDetectorME.train(samples, settings, factory);
+  }
+
+  /** Wraps the DoccatSample.txt classpath resource, read as UTF-8 lines, in a sample stream. */
+  public static LanguageDetectorSampleStream createSampleStream() throws IOException {
+    ResourceAsStreamFactory resource = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lines = new PlainTextByLineStream(resource, "UTF-8");
+
+    return new LanguageDetectorSampleStream(lines);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
new file mode 100644
index 0000000..7d12581
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageSampleTest {
+
+  /** Both constructor arguments must come back unchanged from the getters. */
+  @Test
+  public void testConstructor() {
+    Language expectedLang = new Language("aLang");
+    CharSequence expectedContext = "aContext";
+    LanguageSample sample = new LanguageSample(expectedLang, expectedContext);
+
+    Assert.assertEquals(expectedLang, sample.getLanguage());
+    Assert.assertEquals(expectedContext, sample.getContext());
+  }
+
+  /** A null language must be rejected with a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void testNullLang() throws Exception {
+    CharSequence context = "aContext";
+
+    new LanguageSample(null, context);
+  }
+
+  /** A null context must be rejected with a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void testNullContext() {
+    new LanguageSample(new Language("aLang"), null);
+  }
+
+  /** toString() must yield the language code, a tab, and then the context. */
+  @Test
+  public void testToString() {
+    Language lang = new Language("aLang");
+    CharSequence context = "aContext";
+    String rendered = new LanguageSample(lang, context).toString();
+
+    Assert.assertEquals(lang.getLang() + "\t" + context, rendered);
+  }
+
+  /** Changing either the language or the context is expected to change the hash code. */
+  @Test
+  public void testHash() {
+    LanguageSample base = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample otherLang = new LanguageSample(new Language("bLang"), "aContext");
+    LanguageSample otherContext = new LanguageSample(new Language("aLang"), "bContext");
+
+    Assert.assertNotEquals(base.hashCode(), otherLang.hashCode());
+    Assert.assertNotEquals(base.hashCode(), otherContext.hashCode());
+    Assert.assertNotEquals(otherLang.hashCode(), otherContext.hashCode());
+  }
+
+  /** equals() is expected to consider language and context, and to reject other types. */
+  @Test
+  public void testEquals() throws Exception {
+    LanguageSample base = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample sameAsBase = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample otherLang = new LanguageSample(new Language("bLang"), "aContext");
+    LanguageSample otherContext = new LanguageSample(new Language("aLang"), "bContext");
+
+    Assert.assertEquals(base, base);
+    Assert.assertEquals(base, sameAsBase);
+    Assert.assertNotEquals(base, otherLang);
+    Assert.assertNotEquals(base, otherContext);
+    Assert.assertNotEquals(otherLang, otherContext);
+    Assert.assertFalse(base.equals("something else"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
new file mode 100644
index 0000000..dc25bc6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageTest {
+
+  /** Without an explicit confidence the reported value is expected to be zero. */
+  @Test
+  public void emptyConfidence() throws Exception {
+    String code = "aLanguage";
+    Language lang = new Language(code);
+
+    Assert.assertEquals(code, lang.getLang());
+    Assert.assertEquals(0, lang.getConfidence(), 0);
+  }
+
+  /** An explicit confidence is expected to be returned exactly as given. */
+  @Test
+  public void nonEmptyConfidence() throws Exception {
+    String code = "aLanguage";
+    double expectedConfidence = 0.05;
+    Language lang = new Language(code, expectedConfidence);
+
+    Assert.assertEquals(code, lang.getLang());
+    Assert.assertEquals(expectedConfidence, lang.getConfidence(), 0);
+  }
+
+  /** A null language code must be rejected by the single-argument constructor. */
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguage() throws Exception {
+    new Language(null);
+  }
+
+  /** A null language code must be rejected even when a confidence is supplied. */
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguageConfidence() throws Exception {
+    new Language(null, 0.05);
+  }
+
+  /** toString() is expected to print the code followed by the confidence in parentheses. */
+  @Test
+  public void testToString() {
+    Language withoutConfidence = new Language("aLang");
+    Language withConfidence = new Language("aLang", 0.0886678);
+
+    Assert.assertEquals("aLang (0.0)", withoutConfidence.toString());
+    Assert.assertEquals("aLang (0.0886678)", withConfidence.toString());
+  }
+
+  /** Equal languages must share a hash; a different code or confidence should change it. */
+  @Test
+  public void testHash() {
+    int hashA = new Language("aLang").hashCode();
+    int hashAAgain = new Language("aLang").hashCode();
+    int hashB = new Language("BLang").hashCode();
+    int hashA5 = new Language("aLang", 5.0).hashCode();
+    int hashB6 = new Language("BLang", 6.0).hashCode();
+
+    Assert.assertEquals(hashA, hashAAgain);
+    Assert.assertNotEquals(hashA, hashB);
+    Assert.assertNotEquals(hashA, hashA5);
+    Assert.assertNotEquals(hashB, hashA5);
+    Assert.assertNotEquals(hashA5, hashB6);
+  }
+
+  /** Languages with the same code but different confidence are expected to be equal. */
+  @Test
+  public void testEquals() {
+    Language langA = new Language("langA");
+    Language langB = new Language("langB");
+    Language confidence5 = new Language("langA5", 5.0);
+    Language confidence6 = new Language("langA5", 6.0);
+
+    Assert.assertEquals(langA, langA);
+    Assert.assertEquals(confidence5, confidence5);
+
+    Assert.assertNotEquals(langA, confidence5);
+    Assert.assertNotEquals(langA, langB);
+    Assert.assertEquals(confidence6, confidence5);
+    Assert.assertNotEquals(langA, "something else");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..0f8dfe7
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/** Checks how {@link EmojiCharSequenceNormalizer} treats emoji code points. */
+public class EmojiCharSequenceNormalizerTest {
+
+  public EmojiCharSequenceNormalizer normalizer = EmojiCharSequenceNormalizer.getInstance();
+
+  /** Emoji appended after the text must not survive normalization. */
+  @Test
+  public void normalizeEmoji() throws Exception {
+    StringBuilder input = new StringBuilder("Any funny text goes here ");
+    // three U+1F606 in a row, then a space and a single U+1F61B
+    for (int i = 0; i < 3; i++) {
+      input.appendCodePoint(0x1F606);
+    }
+    input.append(" ").appendCodePoint(0x1F61B);
+
+    String expected = "Any funny text goes here    ";
+    Assert.assertEquals(expected, normalizer.normalize(input.toString()));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..50b1f0c
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class NumberCharSequenceNormalizerTest {
+
+  public NumberCharSequenceNormalizer normalizer = NumberCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalize() throws Exception {
+    CharSequence actual = normalizer.normalize("absc 123,0123 abcd"); // digit runs -> one space
+    Assert.assertEquals("absc  ,  abcd", actual);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..95cf300
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class ShrinkCharSequenceNormalizerTest {
+
+  public ShrinkCharSequenceNormalizer normalizer = ShrinkCharSequenceNormalizer.getInstance();
+
+  /** A run of several spaces is expected to collapse into one. */
+  @Test
+  public void normalizeSpace() throws Exception {
+    CharSequence shrunk = normalizer.normalize("a text    extra space");
+    Assert.assertEquals("a text extra space", shrunk);
+  }
+
+  @Test
+  public void normalizeChar() throws Exception {
+    Assert.assertEquals("Helloo", normalizer.normalize("Helllllloooooo")); // long runs -> pairs
+    Assert.assertEquals("Hello", normalizer.normalize("Hello")); // a pair stays untouched
+    Assert.assertEquals("HHello", normalizer.normalize("HHello")); // so do two separate pairs
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..f0bd517
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TwitterCharSequenceNormalizerTest {
+
+  public TwitterCharSequenceNormalizer normalizer = TwitterCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeHashtag() throws Exception {
+    Assert.assertEquals("asdf   2nnfdf", normalizer.normalize("asdf #hasdk23 2nnfdf"));
+  }
+
+  @Test
+  public void normalizeUser() throws Exception {
+    Assert.assertEquals("asdf   2nnfdf", normalizer.normalize("asdf @hasdk23 2nnfdf"));
+  }
+
+  /** A run of leading "RT " markers is expected to shrink to a single space. */
+  @Test
+  public void normalizeRT() throws Exception {
+    Assert.assertEquals(" 2nnfdf", normalizer.normalize("RT RT RT 2nnfdf"));
+  }
+
+  /** Each {input, expected} pair is a laughter pattern and its expected shortened form. */
+  @Test
+  public void normalizeLaugh() throws Exception {
+    String[][] cases = {{"ahahahah", "ahahah"}, {"hahha", "haha"}, {"hahaa", "haha"},
+        {"ahahahahhahahhahahaaaa", "ahaha"}, {"jajjajajaja", "jaja"}};
+    for (String[] pair : cases) {
+      Assert.assertEquals(pair[1], normalizer.normalize(pair[0]));
+    }
+  }
+
+  /** Each emoticon is expected to be replaced by a single space. */
+  @Test
+  public void normalizeFace() throws Exception {
+    Assert.assertEquals("hello   hello", normalizer.normalize("hello :-) hello"));
+    Assert.assertEquals("hello   hello", normalizer.normalize("hello ;) hello"));
+    Assert.assertEquals("  hello", normalizer.normalize(":) hello"));
+    Assert.assertEquals("hello  ", normalizer.normalize("hello :P"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..72eb83a
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class UrlCharSequenceNormalizerTest {
+
+  public UrlCharSequenceNormalizer normalizer = UrlCharSequenceNormalizer.getInstance();
+
+  /** Every URL in the text, including a trailing one, is expected to become one space. */
+  @Test
+  public void normalizeUrl() throws Exception {
+    String url = "http://asdf.com/dfa/cxs";
+
+    Assert.assertEquals("asdf   2nnfdf", normalizer.normalize("asdf " + url + " 2nnfdf"));
+    Assert.assertEquals(
+        "asdf   2nnfdf  ", normalizer.normalize("asdf " + url + " 2nnfdf " + url));
+  }
+
+  /** Every e-mail address, including a trailing one, is expected to become one space. */
+  @Test
+  public void normalizeEmail() throws Exception {
+    String email = "asd.fdfa@hasdk23.com.br";
+
+    Assert.assertEquals("asdf   2nnfdf", normalizer.normalize("asdf " + email + " 2nnfdf"));
+    Assert.assertEquals(
+        "asdf   2nnfdf  ", normalizer.normalize("asdf " + email + " 2nnfdf " + email));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 363bd7c..8183c06 100644
--- a/pom.xml
+++ b/pom.xml
@@ -227,6 +227,7 @@
 					<artifactId>maven-surefire-plugin</artifactId>
 					<version>${maven.surefire.plugin}</version>
 					<configuration>
+						<argLine>-Xmx2048m</argLine>
 						<forkCount>${opennlp.forkCount}</forkCount>
 						<failIfNoSpecifiedTests>false</failIfNoSpecifiedTests>
 						<excludes>
@@ -435,6 +436,7 @@
 						<artifactId>maven-surefire-plugin</artifactId>
 						<version>${maven.surefire.plugin}</version>
 						<configuration>
+							<argLine>-Xmx4g</argLine>
 							<includes>
 								<include>**/*Test.java</include>
 								<include>**/*Eval.java</include>


[2/2] opennlp git commit: OPENNLP-788: Add LanguageDetector tool

Posted by jo...@apache.org.
OPENNLP-788: Add LanguageDetector tool


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/560c4843
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/560c4843
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/560c4843

Branch: refs/heads/master
Commit: 560c484387eea32a00afc1de8ef96a81ce304ef3
Parents: 8d7e1c3
Author: William D C M SILVA <co...@apache.org>
Authored: Wed May 17 13:34:21 2017 -0300
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Fri Jun 16 10:12:29 2017 +0200

----------------------------------------------------------------------
 .../main/java/opennlp/tools/cmdline/CLI.java    |  12 ++
 .../cmdline/FineGrainedReportListener.java      |  13 +-
 .../tools/cmdline/StreamFactoryRegistry.java    |   4 +
 .../LanguageDetectorConverterTool.java          |  28 ++++
 .../LanguageDetectorCrossValidatorTool.java     | 123 ++++++++++++++++
 ...LanguageDetectorEvaluationErrorListener.java |  54 +++++++
 .../LanguageDetectorEvaluatorTool.java          | 139 +++++++++++++++++++
 ...nguageDetectorFineGrainedReportListener.java |  70 ++++++++++
 .../langdetect/LanguageDetectorModelLoader.java |  42 ++++++
 .../langdetect/LanguageDetectorTool.java        |  88 ++++++++++++
 .../langdetect/LanguageDetectorTrainerTool.java |  83 +++++++++++
 .../cmdline/langdetect/TrainingParams.java      |  40 ++++++
 .../LanguageDetectorSampleStreamFactory.java    |  66 +++++++++
 .../formats/LeipzigDoccatSampleStream.java      |   5 +-
 .../LeipzigDocumentSampleStreamFactory.java     |   3 +
 .../leipzig/LeipzigLanguageSampleStream.java    | 136 ++++++++++++++++++
 .../LeipzigLanguageSampleStreamFactory.java     |  74 ++++++++++
 .../java/opennlp/tools/langdetect/Language.java |  73 ++++++++++
 .../tools/langdetect/LanguageDetector.java      |  31 +++++
 .../LanguageDetectorContextGenerator.java       |  70 ++++++++++
 .../LanguageDetectorCrossValidator.java         | 107 ++++++++++++++
 .../LanguageDetectorEvaluationMonitor.java      |  28 ++++
 .../langdetect/LanguageDetectorEvaluator.java   |  99 +++++++++++++
 .../langdetect/LanguageDetectorEventStream.java |  69 +++++++++
 .../langdetect/LanguageDetectorFactory.java     |  67 +++++++++
 .../tools/langdetect/LanguageDetectorME.java    |  97 +++++++++++++
 .../tools/langdetect/LanguageDetectorModel.java |  82 +++++++++++
 .../LanguageDetectorSampleStream.java           |  55 ++++++++
 .../tools/langdetect/LanguageSample.java        |  68 +++++++++
 .../AggregateCharSequenceNormalizer.java        |  39 ++++++
 .../util/normalizer/CharSequenceNormalizer.java |  23 +++
 .../normalizer/EmojiCharSequenceNormalizer.java |  38 +++++
 .../NumberCharSequenceNormalizer.java           |  36 +++++
 .../ShrinkCharSequenceNormalizer.java           |  40 ++++++
 .../TwitterCharSequenceNormalizer.java          |  50 +++++++
 .../normalizer/UrlCharSequenceNormalizer.java   |  40 ++++++
 .../opennlp/tools/langdetect/DummyFactory.java  |  88 ++++++++++++
 .../LanguageDetectorContextGeneratorTest.java   |  43 ++++++
 .../LanguageDetectorCrossValidatorTest.java     |  64 +++++++++
 .../LanguageDetectorEvaluatorTest.java          |  68 +++++++++
 .../langdetect/LanguageDetectorFactoryTest.java |  90 ++++++++++++
 .../langdetect/LanguageDetectorMETest.java      | 116 ++++++++++++++++
 .../tools/langdetect/LanguageSampleTest.java    |  89 ++++++++++++
 .../opennlp/tools/langdetect/LanguageTest.java  | 101 ++++++++++++++
 .../EmojiCharSequenceNormalizerTest.java        |  43 ++++++
 .../NumberCharSequenceNormalizerTest.java       |  32 +++++
 .../ShrinkCharSequenceNormalizerTest.java       |  41 ++++++
 .../TwitterCharSequenceNormalizerTest.java      |  62 +++++++++
 .../UrlCharSequenceNormalizerTest.java          |  47 +++++++
 pom.xml                                         |   2 +
 50 files changed, 2975 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
index b575f71..c828e26 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
@@ -37,6 +37,11 @@ import opennlp.tools.cmdline.doccat.DoccatEvaluatorTool;
 import opennlp.tools.cmdline.doccat.DoccatTool;
 import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
 import opennlp.tools.cmdline.entitylinker.EntityLinkerTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorConverterTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorCrossValidatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorEvaluatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTrainerTool;
 import opennlp.tools.cmdline.languagemodel.NGramLanguageModelTool;
 import opennlp.tools.cmdline.lemmatizer.LemmatizerEvaluatorTool;
 import opennlp.tools.cmdline.lemmatizer.LemmatizerMETool;
@@ -90,6 +95,13 @@ public final class CLI {
     tools.add(new DoccatCrossValidatorTool());
     tools.add(new DoccatConverterTool());
 
+    // Language Detector
+    tools.add(new LanguageDetectorTool());
+    tools.add(new LanguageDetectorTrainerTool());
+    tools.add(new LanguageDetectorConverterTool());
+    tools.add(new LanguageDetectorCrossValidatorTool());
+    tools.add(new LanguageDetectorEvaluatorTool());
+
     // Dictionary Builder
     tools.add(new DictionaryBuilderTool());
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
index 714561a..75b84aa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
@@ -802,8 +802,8 @@ public abstract class FineGrainedReportListener {
       }
     }
 
-    public void add(String[] text, String ref, String pred) {
-      int length = text.length;
+    public void add(int length, String ref, String pred) {
+
       averageSentenceLength.add(length);
 
       if (minimalSentenceLength > length) {
@@ -820,7 +820,16 @@ public abstract class FineGrainedReportListener {
       updateTagFMeasure(refs, preds);
 
       commit("", ref, pred);
+    }
+
+    public void add(String[] text, String ref, String pred) {
+      int length = text.length;
+      this.add(length, ref, pred);
+    }
 
+    public void add(CharSequence text, String ref, String pred) {
+      int length = text.length();
+      this.add(length, ref, pred);
     }
 
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 3d68945..48b8025 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -29,6 +29,7 @@ import opennlp.tools.formats.ConllXSentenceSampleStreamFactory;
 import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
 import opennlp.tools.formats.DocumentSampleStreamFactory;
 import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
+import opennlp.tools.formats.LanguageDetectorSampleStreamFactory;
 import opennlp.tools.formats.LeipzigDocumentSampleStreamFactory;
 import opennlp.tools.formats.LemmatizerSampleStreamFactory;
 import opennlp.tools.formats.NameSampleDataStreamFactory;
@@ -56,6 +57,7 @@ import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
 import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
 import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory;
 import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
+import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
 import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
 import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -79,6 +81,7 @@ public final class StreamFactoryRegistry {
     TokenSampleStreamFactory.registerFactory();
     WordTagSampleStreamFactory.registerFactory();
     LemmatizerSampleStreamFactory.registerFactory();
+    LanguageDetectorSampleStreamFactory.registerFactory();
 
     NameToSentenceSampleStreamFactory.registerFactory();
     NameToTokenSampleStreamFactory.registerFactory();
@@ -124,6 +127,7 @@ public final class StreamFactoryRegistry {
 
     IrishSentenceBankSentenceStreamFactory.registerFactory();
     IrishSentenceBankTokenSampleStreamFactory.registerFactory();
+    LeipzigLanguageSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
new file mode 100644
index 0000000..69d9db7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.AbstractConverterTool;
+import opennlp.tools.langdetect.LanguageSample;
+
+public class LanguageDetectorConverterTool extends AbstractConverterTool<LanguageSample> {
+
+  public LanguageDetectorConverterTool() {
+    super(LanguageSample.class);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
new file mode 100644
index 0000000..bf68fbb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractCrossValidatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.CVParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorCrossValidator;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+import opennlp.tools.util.model.ModelUtil;
+
+public final class LanguageDetectorCrossValidatorTool extends
+    AbstractCrossValidatorTool<LanguageSample,
+        LanguageDetectorCrossValidatorTool.CVToolParams> {
+
+  interface CVToolParams extends CVParams, TrainingParams, FineGrainedEvaluatorParams {
+  }
+
+  public LanguageDetectorCrossValidatorTool() {
+    super(LanguageSample.class, CVToolParams.class);
+  }
+
+  public String getShortDescription() {
+    return "K-fold cross validator for the learnable Language Detector";
+  }
+
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new LanguageDetectorEvaluationErrorListener());
+    }
+
+    LanguageDetectorFineGrainedReportListener reportListener = null;
+    File reportFile = params.getReportOutputFile();
+    OutputStream reportOutputStream = null;
+    if (reportFile != null) {
+      CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+      try {
+        reportOutputStream = new FileOutputStream(reportFile);
+        reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+        listeners.add(reportListener);
+      } catch (FileNotFoundException e) {
+        throw createTerminationIOException(e);
+      }
+    }
+
+    LanguageDetectorEvaluationMonitor[] listenersArr = listeners
+        .toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]);
+
+    LanguageDetectorCrossValidator validator;
+    try {
+      LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory());
+      validator = new LanguageDetectorCrossValidator(mlParams,
+          factory, listenersArr);
+
+      validator.evaluate(sampleStream, params.getFolds());
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "IO error while reading training data or indexing data: " + e.getMessage(), e);
+    } finally {
+      try {
+        sampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    System.out.println("done");
+
+    if (reportListener != null) {
+      System.out.println("Writing fine-grained report to "
+          + params.getReportOutputFile().getAbsolutePath());
+      reportListener.writeReport();
+
+      try {
+        // TODO: is it a problem to close the stream now?
+        reportOutputStream.close();
+      } catch (IOException e) {
+        // nothing to do
+      }
+    }
+
+    System.out.println();
+
+    System.out.println("Accuracy: " + validator.getDocumentAccuracy() + "\n" +
+        "Number of documents: " + validator.getDocumentCount());
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
new file mode 100644
index 0000000..073ef31
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.EvaluationErrorPrinter;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * A default implementation of {@link EvaluationMonitor} that prints to an
+ * output stream.
+ *
+ */
+public class LanguageDetectorEvaluationErrorListener extends
+    EvaluationErrorPrinter<LanguageSample> implements LanguageDetectorEvaluationMonitor {
+
+  /**
+   * Creates a listener that will print to System.err
+   */
+  public LanguageDetectorEvaluationErrorListener() {
+    super(System.err);
+  }
+
+  /**
+   * Creates a listener that will print to a given {@link OutputStream}
+   */
+  public LanguageDetectorEvaluationErrorListener(OutputStream outputStream) {
+    super(outputStream);
+  }
+
+  @Override
+  public void missclassified(LanguageSample reference, LanguageSample prediction) {
+    printError(reference, prediction);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
new file mode 100644
index 0000000..fb929bf
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractEvaluatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EvaluatorParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorEvaluator;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+public final class LanguageDetectorEvaluatorTool extends
+    AbstractEvaluatorTool<LanguageSample, LanguageDetectorEvaluatorTool.EvalToolParams> {
+
+  interface EvalToolParams extends EvaluatorParams, FineGrainedEvaluatorParams {
+  }
+
+  public LanguageDetectorEvaluatorTool() {
+    super(LanguageSample.class, EvalToolParams.class);
+  }
+
+  public String getShortDescription() {
+    return "Measures the performance of the Language Detector model with the reference data";
+  }
+
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    LanguageDetectorModel model = new LanguageDetectorModelLoader().load(params.getModel());
+
+    List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new LanguageDetectorEvaluationErrorListener());
+    }
+
+    LanguageDetectorFineGrainedReportListener reportListener = null;
+    File reportFile = params.getReportOutputFile();
+    OutputStream reportOutputStream = null;
+    if (reportFile != null) {
+      CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+      try {
+        reportOutputStream = new FileOutputStream(reportFile);
+        reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+        listeners.add(reportListener);
+      } catch (FileNotFoundException e) {
+        throw new TerminateToolException(-1,
+            "IO error while creating LanguageDetector fine-grained report file: "
+                + e.getMessage(), e); // pass the cause along, as the evaluate() handler does
+      }
+    }
+
+    LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+        new LanguageDetectorME(model),
+        listeners.toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]));
+
+    final PerformanceMonitor monitor = new PerformanceMonitor("doc");
+
+    ObjectStream<LanguageSample> measuredSampleStream = new ObjectStream<LanguageSample>() {
+
+      public LanguageSample read() throws IOException {
+        monitor.incrementCounter();
+        return sampleStream.read();
+      }
+
+      public void reset() throws IOException {
+        sampleStream.reset();
+      }
+
+      public void close() throws IOException {
+        sampleStream.close();
+      }
+    };
+
+    monitor.startAndPrintThroughput();
+
+    try {
+      evaluator.evaluate(measuredSampleStream);
+    } catch (IOException e) {
+      System.err.println("failed");
+      throw new TerminateToolException(-1, "IO error while reading test data: "
+          + e.getMessage(), e);
+    } finally {
+      try {
+        measuredSampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    monitor.stopAndPrintFinalResult();
+
+    System.out.println();
+
+    System.out.println(evaluator);
+
+    if (reportListener != null) {
+      System.out.println("Writing fine-grained report to "
+          + params.getReportOutputFile().getAbsolutePath());
+      reportListener.writeReport();
+
+      try {
+        // TODO: is it a problem to close the stream now?
+        reportOutputStream.close();
+      } catch (IOException e) {
+        // nothing to do
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
new file mode 100644
index 0000000..70bf3eb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.FineGrainedReportListener;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+
+/**
+ * Generates a detailed report for the Language Detector.
+ * <p>
+ * It is possible to use it from an API and access the statistics using the
+ * provided getters
+ */
+public class LanguageDetectorFineGrainedReportListener
+    extends FineGrainedReportListener implements LanguageDetectorEvaluationMonitor {
+
+  /**
+   * Creates a listener that will print to {@link System#err}
+   */
+  public LanguageDetectorFineGrainedReportListener() {
+    this(System.err);
+  }
+
+  /**
+   * Creates a listener that prints to a given {@link OutputStream}
+   */
+  public LanguageDetectorFineGrainedReportListener(OutputStream outputStream) {
+    super(outputStream);
+  }
+
+  // methods inherited from EvaluationMonitor
+
+  public void missclassified(LanguageSample reference, LanguageSample prediction) {
+    statsAdd(reference, prediction);
+  }
+
+  public void correctlyClassified(LanguageSample reference, LanguageSample prediction) {
+    statsAdd(reference, prediction);
+  }
+
+  private void statsAdd(LanguageSample reference, LanguageSample prediction) {
+    getStats().add(reference.getContext(),
+        reference.getLanguage().getLang(), prediction.getLanguage().getLang());
+  }
+
+  public void writeReport() {
+    printGeneralStatistics();
+    printTagsErrorRank();
+    printGeneralConfusionTable();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
new file mode 100644
index 0000000..c8700fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.cmdline.ModelLoader;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+
+/**
+ * Loads a Language Detector Model for the command line tools.
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LanguageDetectorModelLoader extends ModelLoader<LanguageDetectorModel> {
+
+  public LanguageDetectorModelLoader() {
+    super("Language Detector");
+  }
+
+  @Override
+  protected LanguageDetectorModel loadModel(InputStream modelIn) throws IOException {
+    return new LanguageDetectorModel(modelIn);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
new file mode 100644
index 0000000..6175fe3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.SystemInputStreamFactory;
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetector;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ParagraphStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+public class LanguageDetectorTool extends BasicCmdLineTool {
+
+  @Override
+  public String getShortDescription() {
+    return "learned language detector";
+  }
+
+  @Override
+  public String getHelp() {
+    return "Usage: " + CLI.CMD + " " + getName() + " model < documents";
+  }
+
+  @Override
+  public void run(String[] args) {
+
+    if (0 == args.length) {
+      System.out.println(getHelp());
+    } else {
+
+      LanguageDetectorModel model = new LanguageDetectorModelLoader().load(new File(args[0]));
+
+      LanguageDetector langDetectME = new LanguageDetectorME(model);
+
+      /*
+       * moved initialization to the try block to catch new IOException
+       */
+      ObjectStream<String> documentStream;
+
+      PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc");
+      perfMon.start();
+
+      try {
+        documentStream = new ParagraphStream(new PlainTextByLineStream(
+            new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
+        String document;
+        while ((document = documentStream.read()) != null) {
+
+          Language lang = langDetectME.predictLanguage(document);
+
+          LanguageSample sample = new LanguageSample(lang, document);
+          System.out.println(sample.toString());
+
+          perfMon.incrementCounter();
+        }
+      } catch (IOException e) {
+        CmdLineUtil.handleStdinIoError(e);
+      }
+
+      perfMon.stopAndPrintFinalResult();
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
new file mode 100644
index 0000000..6735293
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.AbstractTrainerTool;
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.model.ModelUtil;
+
+public class LanguageDetectorTrainerTool
+    extends AbstractTrainerTool<LanguageSample, LanguageDetectorTrainerTool.TrainerToolParams> {
+
+  interface TrainerToolParams extends TrainingParams {
+    @ArgumentParser.ParameterDescription(valueName = "modelFile", description = "output model file.")
+    File getModel();
+
+    @ArgumentParser.ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+    @ArgumentParser.OptionalParameter()
+    String getParams();
+  }
+
+  public LanguageDetectorTrainerTool() {
+    super(LanguageSample.class, TrainerToolParams.class);
+  }
+
+  @Override
+  public String getShortDescription() {
+    return "trainer for the learnable language detector";
+  }
+
+  @Override
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    File modelOutFile = params.getModel();
+
+    CmdLineUtil.checkOutputFile("language detector model", modelOutFile);
+
+    LanguageDetectorModel model;
+    try {
+      LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory());
+      model = LanguageDetectorME.train(sampleStream, mlParams, factory);
+    } catch (IOException e) {
+      throw createTerminationIOException(e);
+    }
+    finally {
+      try {
+        sampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    CmdLineUtil.writeModel("language detector", modelOutFile, model);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
new file mode 100644
index 0000000..2937c3d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+
+/**
+ * TrainingParams for Language Detector.
+ *
+ * Note: Do not use this class, internal use only!
+ */
+interface TrainingParams {
+
+  @ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+  @OptionalParameter()
+  String getParams();
+
+  @ParameterDescription(valueName = "factoryName",
+      description = "A sub-class of LanguageDetectorFactory" +
+          " where to get implementation and resources.")
+  @OptionalParameter
+  String getFactory();
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
new file mode 100644
index 0000000..ef60063
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.langdetect.LanguageDetectorSampleStream;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Factory producing OpenNLP {@link LanguageDetectorSampleStream}s, which are
+ * created on top of a {@link PlainTextByLineStream} over the configured data file.
+ *
+ * @see DocumentSampleStream
+ */
+public class LanguageDetectorSampleStreamFactory
+    extends AbstractSampleStreamFactory<LanguageSample> {
+
+  /**
+   * The command line parameters of this format; nothing is added to the
+   * inherited {@link BasicFormatParams}.
+   */
+  interface Parameters extends BasicFormatParams {
+  }
+
+  /**
+   * Registers an instance of this factory for {@link LanguageSample}s under
+   * {@link StreamFactoryRegistry#DEFAULT_FORMAT}.
+   */
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LanguageSample.class,
+            StreamFactoryRegistry.DEFAULT_FORMAT,
+            new LanguageDetectorSampleStreamFactory(Parameters.class));
+  }
+
+  protected <P> LanguageDetectorSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  /**
+   * Creates the sample stream for the data file and encoding named in {@code args}.
+   *
+   * @param args the command line arguments, parsed as {@link Parameters}
+   * @return a {@link LanguageDetectorSampleStream} reading from the data file
+   */
+  @Override
+  public ObjectStream<LanguageSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+    InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
+    ObjectStream<String> lineStream = null;
+    try {
+      lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
+    } catch (IOException ex) {
+      // NOTE(review): presumably handleCreateObjectStreamError always throws; if it
+      // ever returned, a null line stream would be handed to the sample stream below.
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new LanguageDetectorSampleStream(lineStream);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 8ed0036..7059e21 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -39,8 +39,11 @@ import opennlp.tools.util.PlainTextByLineStream;
  * <p>
  * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
  * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.ø
+ * exactly the same tokenization during testing and training.
+ *
+ * @deprecated will be removed, use the language detector instead
  */
+@Deprecated
 public class LeipzigDoccatSampleStream extends
     FilterObjectStream<String, DocumentSample> {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
index bd2453b..d6ff9ba 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
@@ -33,7 +33,10 @@ import opennlp.tools.util.ObjectStreamUtils;
 
 /**
  * <b>Note:</b> Do not use this class, internal use only!
+ *
+ * @deprecated will be removed, use the language detector instead
  */
+@Deprecated
 public class LeipzigDocumentSampleStreamFactory
     extends AbstractSampleStreamFactory<DocumentSample> {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
new file mode 100644
index 0000000..6c4d009
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Reads {@link LanguageSample}s from a folder of Leipzig sentences files.
+ * <p>
+ * The first three characters of a file name are taken as the language of the file.
+ * The files are processed in sorted order, one after the other. The number of samples
+ * read from a file is {@code samplesPerLanguage} divided by the number of files which
+ * share its language prefix.
+ */
+public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> {
+
+  /**
+   * Turns the lines of one sentences file into at most {@code numberOfSamples}
+   * samples, each built from up to {@code sentencesPerSample} lines.
+   */
+  private static class LeipzigSentencesStream implements ObjectStream<LanguageSample> {
+    private final String lang;
+    private final int sentencesPerSample;
+    private final int numberOfSamples;
+
+    private final ObjectStream<String> lineStream;
+    private int sampleCount;
+
+    LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples)
+        throws IOException {
+      // The caller derives lang from the name of sentencesFile, so it is used as passed in.
+      this.lang = lang;
+      this.sentencesPerSample = sentencesPerSample;
+      this.numberOfSamples = numberOfSamples;
+
+      lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile),
+          StandardCharsets.UTF_8);
+    }
+
+    /**
+     * Builds the next sample from the following lines of the file.
+     *
+     * @return the next sample, or {@code null} once the sample limit is reached
+     *     or no further line could be read
+     */
+    @Override
+    public LanguageSample read() throws IOException {
+
+      if (sampleCount >= numberOfSamples) {
+        return null;
+      }
+
+      StringBuilder sampleString = new StringBuilder();
+
+      int count = 0;
+      String line;
+      while (count < sentencesPerSample && (line = lineStream.read()) != null) {
+
+        // Only the text after the first tab is kept; a line without a tab is kept whole.
+        int textStart = line.indexOf('\t') + 1;
+
+        // TODO: It should it be changed to contain an array of sample strings ?!
+        sampleString.append(line, textStart, line.length()).append(' ');
+
+        count++;
+      }
+
+      if (sampleString.length() > 0) {
+        sampleCount++;
+        return new LanguageSample(new Language(lang), sampleString);
+      }
+
+      return null;
+    }
+
+    /**
+     * Closes the underlying line stream and with it the sentences file.
+     */
+    @Override
+    public void close() throws IOException {
+      lineStream.close();
+    }
+  }
+
+  private final int sentencesPerSample;
+
+  private final Map<String, Integer> langSampleCounts;
+  private final File[] sentencesFiles;
+
+  private Iterator<File> sentencesFilesIt;
+  private ObjectStream<LanguageSample> sampleStream;
+
+  /**
+   * Initializes the stream over all files found in the given folder.
+   *
+   * @param leipzigFolder the folder containing the sentences files
+   * @param sentencesPerSample the maximal number of sentences joined into one sample
+   * @param samplesPerLanguage the number of samples per language, it is divided evenly
+   *     among the files of that language
+   * @throws IOException if the files of the folder cannot be listed
+   */
+  public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
+                                     final int samplesPerLanguage) throws IOException {
+    this.sentencesPerSample = sentencesPerSample;
+    // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored
+    File[] files = leipzigFolder.listFiles();
+
+    // listFiles() answers null when the path is not a directory or cannot be read
+    if (files == null) {
+      throw new IOException("Unable to list the sentences files in: " + leipzigFolder);
+    }
+
+    Arrays.sort(files);
+    sentencesFiles = files;
+
+    Map<String, Integer> langCounts = Arrays.stream(sentencesFiles)
+        .map(file -> file.getName().substring(0, 3))
+        .collect(Collectors.groupingBy(String::toString, Collectors.summingInt(v -> 1)));
+
+    langSampleCounts = langCounts.entrySet().stream()
+        .collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue()));
+
+    reset();
+  }
+
+  /**
+   * Reads the next sample, moving on to the next sentences file whenever the
+   * current one has no more samples to offer.
+   *
+   * @return the next sample or {@code null} after the last file is used up
+   */
+  @Override
+  public LanguageSample read() throws IOException {
+    while (true) {
+      if (sampleStream != null) {
+        LanguageSample sample = sampleStream.read();
+        if (sample != null) {
+          return sample;
+        }
+
+        // The file is used up, release its handle before the next one is opened.
+        closeSampleStream();
+      }
+
+      if (!sentencesFilesIt.hasNext()) {
+        return null;
+      }
+
+      File sentencesFile = sentencesFilesIt.next();
+      String lang = sentencesFile.getName().substring(0, 3);
+
+      sampleStream = new LeipzigSentencesStream(lang, sentencesFile,
+          sentencesPerSample, langSampleCounts.get(lang));
+    }
+  }
+
+  /**
+   * Starts over with the first sentences file.
+   */
+  @Override
+  public void reset() throws IOException {
+    closeSampleStream();
+    sentencesFilesIt = Arrays.asList(sentencesFiles).iterator();
+  }
+
+  /**
+   * Closes the sentences file which is currently read, if any.
+   */
+  @Override
+  public void close() throws IOException {
+    closeSampleStream();
+  }
+
+  // Closes and forgets the stream of the current sentences file, if one is open.
+  private void closeSampleStream() throws IOException {
+    if (sampleStream != null) {
+      try {
+        sampleStream.close();
+      } finally {
+        sampleStream = null;
+      }
+    }
+  }
+
+  // NOTE(review): looks like a leftover developer entry point with a machine specific
+  // path; it only constructs the stream and never reads from it. Kept unchanged.
+  public static void main(String[] args) throws Exception {
+    new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"),
+        10, 100000);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
new file mode 100644
index 0000000..59a7551
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EncodingParameter;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LeipzigLanguageSampleStreamFactory
+    extends AbstractSampleStreamFactory<LanguageSample> {
+
+  interface Parameters extends EncodingParameter {
+    @ParameterDescription(valueName = "sentencesDir",
+        description = "dir with Leipig sentences to be used")
+    File getSentencesDir();
+
+    @ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+
+    @ParameterDescription(valueName = "samplesPerLanguage",
+        description = "number of samples per language")
+    String getSamplesPerLanguage();
+  }
+
+  protected <P> LeipzigLanguageSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LanguageSample.class,
+        "leipzig", new LeipzigLanguageSampleStreamFactory(Parameters.class));
+  }
+
+  public ObjectStream<LanguageSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    File sentencesFileDir = params.getSentencesDir();
+
+    try {
+      return new LeipzigLanguageSampleStream(sentencesFileDir,
+          Integer.parseInt(params.getSentencesPerSample()),
+          Integer.parseInt(params.getSamplesPerLanguage()));
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "IO error while opening sample data.", e);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
new file mode 100644
index 0000000..f780759
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class for holding the document language and its confidence.
+ * <p>
+ * Two instances are equal when their language is equal, the confidence is
+ * not part of the identity and is therefore left out of the hash code as well.
+ */
+public class Language {
+  private final String lang;
+  private final double confidence;
+
+  /**
+   * Creates a language with a confidence of zero.
+   *
+   * @param lang the language, must not be {@code null}
+   */
+  public Language(String lang) {
+    this(lang, 0);
+  }
+
+  /**
+   * Creates a language with the given confidence.
+   *
+   * @param lang the language, must not be {@code null}
+   * @param confidence the confidence assigned to the language
+   * @throws NullPointerException if {@code lang} is {@code null}
+   */
+  public Language(String lang, double confidence) {
+    this.lang = Objects.requireNonNull(lang, "lang must not be null");
+    this.confidence = confidence;
+  }
+
+  public String getLang() {
+    return lang;
+  }
+
+  public double getConfidence() {
+    return confidence;
+  }
+
+  /**
+   * Represents this object as {@code "<lang> (<confidence>)"}.
+   */
+  @Override
+  public String toString() {
+    return getLang() + " (" + this.confidence + ")";
+  }
+
+  /**
+   * Hashes the language only, so that instances which are equal according
+   * to {@link #equals(Object)} always share the same hash code.
+   */
+  @Override
+  public int hashCode() {
+    return getLang().hashCode();
+  }
+
+  /**
+   * Compares the language of both instances, the confidence is ignored.
+   */
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof Language) {
+      Language a = (Language) obj;
+
+      return getLang().equals(a.getLang());
+    }
+
+    return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
new file mode 100644
index 0000000..0004494
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+/**
+ * The interface for a LanguageDetector, which provides the {@link Language}
+ * predicted for a given content.
+ */
+public interface LanguageDetector {
+
+  /**
+   * Predicts languages for the given content.
+   *
+   * @param content the text to predict the languages for
+   * @return the predicted languages; the order of the array and whether every
+   *     supported language is contained is up to the implementation (TODO confirm)
+   */
+  Language[] predictLanguages(CharSequence content);
+
+  /**
+   * Predicts a single language for the given content.
+   *
+   * @param content the text to predict the language for
+   * @return the predicted language, presumably the best of
+   *     {@link #predictLanguages(CharSequence)} (verify against the implementation)
+   */
+  Language predictLanguage(CharSequence content);
+
+  /**
+   * @return the languages supported by this detector, as strings
+   */
+  String[] getSupportedLanguages();
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
new file mode 100644
index 0000000..1ec42fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.ArrayList;
+import java.util.Collection;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+/**
+ * A context generator for language detector.
+ * <p>
+ * The context of a document are the character ngrams of its normalized text.
+ */
+class LanguageDetectorContextGenerator {
+
+  protected final int minLength;
+  protected final int maxLength;
+  protected final CharSequenceNormalizer normalizer;
+
+  /**
+   * Creates a customizable {@link LanguageDetectorContextGenerator} that computes ngrams from text
+   * @param minLength min ngrams chars
+   * @param maxLength max ngrams chars
+   * @param normalizers zero or more normalizers to
+   *                    be applied to the text before extracting ngrams
+   */
+  public LanguageDetectorContextGenerator(int minLength, int maxLength,
+                                          CharSequenceNormalizer... normalizers) {
+    this.minLength = minLength;
+    this.maxLength = maxLength;
+
+    this.normalizer = new AggregateCharSequenceNormalizer(normalizers);
+  }
+
+  /**
+   * Generates the context for a document using character ngrams.
+   * <p>
+   * The configured normalizers are applied to the document first, the ngrams
+   * are then extracted from the normalized text.
+   *
+   * @param document document to extract context from
+   * @return the generated context
+   */
+  public String[] getContext(String document) {
+    Collection<String> context = new ArrayList<>();
+
+    // The normalizers were collected by the constructor but never applied before,
+    // which left them without any effect on the generated ngrams.
+    String normalized = normalizer.normalize(document).toString();
+
+    NGramModel model = new NGramModel();
+    model.add(normalized, minLength, maxLength);
+
+    for (StringList tokenList : model) {
+      if (tokenList.size() > 0) {
+        context.add(tokenList.getToken(0));
+      }
+    }
+    return context.toArray(new String[context.size()]);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
new file mode 100644
index 0000000..ce1823a
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.eval.CrossValidationPartitioner;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * Cross validator for language detector
+ */
+public class LanguageDetectorCrossValidator {
+
+  private final TrainingParameters params;
+
+  private Mean documentAccuracy = new Mean();
+
+  private LanguageDetectorEvaluationMonitor[] listeners;
+
+  private LanguageDetectorFactory factory;
+
+
+  /**
+   * Creates a {@link LanguageDetectorCrossValidator} with the given
+   * {@link FeatureGenerator}s.
+   */
+  public LanguageDetectorCrossValidator(TrainingParameters mlParams,
+                                        LanguageDetectorFactory factory,
+                                        LanguageDetectorEvaluationMonitor ... listeners) {
+    this.params = mlParams;
+    this.listeners = listeners;
+    this.factory = factory;
+  }
+
+  /**
+   * Starts the evaluation.
+   *
+   * @param samples
+   *          the data to train and test
+   * @param nFolds
+   *          number of folds
+   *
+   * @throws IOException
+   */
+  public void evaluate(ObjectStream<LanguageSample> samples, int nFolds)
+      throws IOException {
+
+    CrossValidationPartitioner<LanguageSample> partitioner =
+        new CrossValidationPartitioner<>(samples, nFolds);
+
+    while (partitioner.hasNext()) {
+
+      CrossValidationPartitioner.TrainingSampleStream<LanguageSample> trainingSampleStream =
+          partitioner.next();
+
+      LanguageDetectorModel model = LanguageDetectorME.train(
+          trainingSampleStream, params, factory);
+
+      LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+          new LanguageDetectorME(model), listeners);
+
+      evaluator.evaluate(trainingSampleStream.getTestSampleStream());
+
+      documentAccuracy.add(evaluator.getAccuracy(),
+          evaluator.getDocumentCount());
+
+    }
+  }
+
+  /**
+   * Retrieves the accuracy for all iterations.
+   *
+   * @return the word accuracy
+   */
+  public double getDocumentAccuracy() {
+    return documentAccuracy.mean();
+  }
+
+  /**
+   * Retrieves the number of words which where validated over all iterations.
+   * The result is the amount of folds multiplied by the total number of words.
+   *
+   * @return the word count
+   */
+  public long getDocumentCount() {
+    return documentAccuracy.count();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
new file mode 100644
index 0000000..30f3313
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * {@link EvaluationMonitor} for Language Detector.
+ * <p>
+ * Fixes the monitored sample type to {@link LanguageSample} and declares no
+ * members of its own.
+ */
+public interface LanguageDetectorEvaluationMonitor extends
+    EvaluationMonitor<LanguageSample> {
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
new file mode 100644
index 0000000..bbf73c3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.doccat.DocumentCategorizer;
+import opennlp.tools.util.eval.Evaluator;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * The {@link LanguageDetectorEvaluator} measures the performance of
+ * the given {@link LanguageDetector} with the provided reference
+ * {@link LanguageSample}s.
+ *
+ * @see LanguageDetector
+ * @see LanguageSample
+ */
+public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> {
+
+  private LanguageDetector languageDetector;
+
+  private Mean accuracy = new Mean();
+
+  /**
+   * Initializes the current instance.
+   *
+   * @param langDetect the language detector instance
+   */
+  public LanguageDetectorEvaluator(LanguageDetector langDetect,
+                                   LanguageDetectorEvaluationMonitor ... listeners) {
+    super(listeners);
+    this.languageDetector = langDetect;
+  }
+
+  /**
+   * Evaluates the given reference {@link LanguageSample} object.
+   *
+   * This is done by categorizing the document from the provided
+   * {@link LanguageSample}. The detected language is then used
+   * to calculate and update the score.
+   *
+   * @param sample the reference {@link LanguageSample}.
+   */
+  public LanguageSample processSample(LanguageSample sample) {
+
+    CharSequence document = sample.getContext();
+
+    Language predicted = languageDetector.predictLanguage(document);
+
+
+
+    if (sample.getLanguage().getLang().equals(predicted.getLang())) {
+      accuracy.add(1);
+    }
+    else {
+      accuracy.add(0);
+    }
+
+    return new LanguageSample(predicted, sample.getContext());
+  }
+
+  /**
+   * Retrieves the accuracy of provided {@link DocumentCategorizer}.
+   *
+   * accuracy = correctly categorized documents / total documents
+   *
+   * @return the accuracy
+   */
+  public double getAccuracy() {
+    return accuracy.mean();
+  }
+
+  public long getDocumentCount() {
+    return accuracy.count();
+  }
+
+  /**
+   * Represents this objects as human readable {@link String}.
+   */
+  @Override
+  public String toString() {
+    return "Accuracy: " + accuracy.mean() + "\n" +
+        "Number of documents: " + accuracy.count();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
new file mode 100644
index 0000000..19e6d46
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Iterator;
+
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.AbstractEventStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Iterator-like class for modeling language detector events.
+ */
+public class LanguageDetectorEventStream extends AbstractEventStream<LanguageSample> {
+
+  private LanguageDetectorContextGenerator mContextGenerator;
+
+  /**
+   * Initializes the current instance via samples and a context generator.
+   *
+   * @param data {@link ObjectStream} of {@link LanguageSample}s
+   */
+  public LanguageDetectorEventStream(ObjectStream<LanguageSample> data,
+                                     LanguageDetectorContextGenerator contextGenerator) {
+    super(data);
+
+    mContextGenerator = contextGenerator;
+  }
+
+  @Override
+  protected Iterator<Event> createEvents(final LanguageSample sample) {
+
+    return new Iterator<Event>() {
+
+      private boolean isVirgin = true;
+
+      public boolean hasNext() {
+        return isVirgin;
+      }
+
+      public Event next() {
+
+        isVirgin = false;
+
+        return new Event(sample.getLanguage().getLang(),
+            mContextGenerator.getContext(sample.getContext().toString()));
+      }
+
+      public void remove() {
+        throw new UnsupportedOperationException();
+      }
+    };
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
new file mode 100644
index 0000000..11357ec
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
+
+
+public class LanguageDetectorFactory extends BaseToolFactory {
+
+  public LanguageDetectorContextGenerator getContextGenerator() {
+    return new LanguageDetectorContextGenerator(1, 3,
+        EmojiCharSequenceNormalizer.getInstance(),
+        UrlCharSequenceNormalizer.getInstance(),
+        TwitterCharSequenceNormalizer.getInstance(),
+        NumberCharSequenceNormalizer.getInstance(),
+        ShrinkCharSequenceNormalizer.getInstance());
+  }
+
+  public static LanguageDetectorFactory create(String subclassName)
+      throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new LanguageDetectorFactory();
+    }
+    try {
+      LanguageDetectorFactory theFactory = ExtensionLoader.instantiateExtension(
+          LanguageDetectorFactory.class, subclassName);
+      theFactory.init();
+      return theFactory;
+    } catch (Exception e) {
+      String msg = "Could not instantiate the " + subclassName
+          + ". The initialization throw an exception.";
+      throw new InvalidFormatException(msg, e);
+    }
+  }
+
+  public void init() {
+    // nothing to do
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // nothing to validate
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
new file mode 100644
index 0000000..3af6afd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.ml.AbstractEventTrainer;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Implements a learnable language detector.
+ */
+public class LanguageDetectorME implements LanguageDetector {
+
+  private LanguageDetectorModel model;
+  private LanguageDetectorContextGenerator mContextGenerator;
+
+  /**
+   * Initializes the current instance with a language detector model. The context
+   * generator is obtained from the model's factory.
+   *
+   * @param model the language detector model
+   */
+  public LanguageDetectorME(LanguageDetectorModel model) {
+    this.model = model;
+    this.mContextGenerator = model.getFactory().getContextGenerator();
+  }
+
+  @Override
+  public Language[] predictLanguages(CharSequence content) {
+    double[] eval = model.getMaxentModel().eval(mContextGenerator.getContext(content.toString()));
+    Language[] arr = new Language[eval.length];
+    for (int i = 0; i < eval.length; i++) {
+      arr[i] = new Language(model.getMaxentModel().getOutcome(i), eval[i]);
+    }
+
+    Arrays.sort(arr, (o1, o2) -> Double.compare(o2.getConfidence(), o1.getConfidence()));
+    return arr;
+  }
+
+  @Override
+  public Language predictLanguage(CharSequence content) {
+    return predictLanguages(content)[0];
+  }
+
+  @Override
+  public String[] getSupportedLanguages() {
+    int numberLanguages = model.getMaxentModel().getNumOutcomes();
+    String[] languages = new String[numberLanguages];
+    for (int i = 0; i < numberLanguages; i++) {
+      languages[i] = model.getMaxentModel().getOutcome(i);
+    }
+    return languages;
+  }
+
+
+  public static LanguageDetectorModel train(ObjectStream<LanguageSample> samples,
+                                            TrainingParameters mlParams,
+                                            LanguageDetectorFactory factory)
+      throws IOException {
+
+    Map<String, String> manifestInfoEntries = new HashMap<>();
+
+    mlParams.putIfAbsent(AbstractEventTrainer.DATA_INDEXER_PARAM,
+        AbstractEventTrainer.DATA_INDEXER_ONE_PASS_VALUE);
+
+    EventTrainer trainer = TrainerFactory.getEventTrainer(
+        mlParams, manifestInfoEntries);
+
+    MaxentModel model = trainer.train(
+        new LanguageDetectorEventStream(samples, factory.getContextGenerator()));
+
+    return new LanguageDetectorModel(model, manifestInfoEntries, factory);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
new file mode 100644
index 0000000..c0d9703
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * A model for language detection
+ */
+public class LanguageDetectorModel extends BaseModel {
+
+  private static final String COMPONENT_NAME = "LanguageDetectorME";
+  private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model";
+
+  public LanguageDetectorModel(MaxentModel langdetectModel,
+                               Map<String, String> manifestInfoEntries,
+                               LanguageDetectorFactory factory) {
+    super(COMPONENT_NAME, "und", manifestInfoEntries, factory);
+
+    artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel);
+    checkArtifactMap();
+  }
+
+  public LanguageDetectorModel(InputStream in) throws IOException {
+    super(COMPONENT_NAME, in);
+  }
+
+  public LanguageDetectorModel(File modelFile) throws IOException {
+    super(COMPONENT_NAME, modelFile);
+  }
+
+  public LanguageDetectorModel(URL modelURL) throws IOException {
+    super(COMPONENT_NAME, modelURL);
+  }
+
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    if (!(artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
+      throw new InvalidFormatException("Language detector model is incomplete!");
+    }
+  }
+
+  public LanguageDetectorFactory getFactory() {
+    return (LanguageDetectorFactory) this.toolFactory;
+  }
+
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return LanguageDetectorFactory.class;
+  }
+
+  public MaxentModel getMaxentModel() {
+    return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
new file mode 100644
index 0000000..2a407f7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * This class reads in string encoded training samples, parses them and
+ * outputs {@link LanguageSample} objects.
+ * <p>
+ * Format:<br>
+ * Each line contains one sample document.<br>
+ * The language is the first string in the line followed by a tab and the document content.<br>
+ * Sample line: language-string tab-char document line-break-char(s)<br>
+ */
+public class LanguageDetectorSampleStream
+    extends FilterObjectStream<String, LanguageSample> {
+
+  public LanguageDetectorSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  public LanguageSample read() throws IOException {
+    String sampleString;
+    while ((sampleString = samples.read()) != null) {
+      int tabIndex = sampleString.indexOf("\t");
+      if (tabIndex > 0) {
+        String lang = sampleString.substring(0, tabIndex);
+        String context = sampleString.substring(tabIndex + 1);
+
+        return new LanguageSample(new Language(lang), context);
+      }
+    }
+
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
new file mode 100644
index 0000000..f454864
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class which holds a classified document and its {@link Language}.
+ */
+public class LanguageSample {
+
+  private final Language language;
+  private final CharSequence context;
+
+  public LanguageSample(Language language, CharSequence context) {
+    this.language = Objects.requireNonNull(language, "language must not be null");
+    this.context = Objects.requireNonNull(context, "context must not be null");
+  }
+
+  public Language getLanguage() {
+    return language;
+  }
+
+  public CharSequence getContext() {
+    return context;
+  }
+
+  @Override
+  public String toString() {
+    return language.getLang() + '\t' +  context;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getContext(), getLanguage());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof LanguageSample) {
+      LanguageSample a = (LanguageSample) obj;
+
+      return getLanguage().equals(a.getLanguage())
+          && getContext().equals(a.getContext());
+    }
+
+    return false;
+  }
+}