You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/14 15:26:30 UTC

[3/4] opennlp git commit: OPENNLP-788: Add LanguageDetector tool

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
new file mode 100644
index 0000000..771be19
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that chains several normalizers together.
+ * <p>
+ * The normalizers are applied in the order they were passed to the constructor;
+ * the output of each one is the input of the next.
+ */
+public class AggregateCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private final CharSequenceNormalizer[] normalizers;
+
+  /**
+   * @param normalizers the normalizers to apply, in application order
+   */
+  public AggregateCharSequenceNormalizer(CharSequenceNormalizer ... normalizers) {
+    this.normalizers = normalizers;
+  }
+
+  /**
+   * Runs {@code text} through every configured normalizer in turn.
+   *
+   * @param text the text to normalize
+   * @return the output of the last normalizer, or {@code text} itself when no
+   *     normalizers were configured
+   */
+  @Override
+  public CharSequence normalize(CharSequence text) {
+
+    // The loop variable is singular so that it does not shadow the
+    // 'normalizers' field being iterated.
+    for (CharSequenceNormalizer normalizer : normalizers) {
+      text = normalizer.normalize(text);
+    }
+
+    return text;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
new file mode 100644
index 0000000..b5c1f3f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+/**
+ * Contract for components that turn a piece of text into a normalized form.
+ */
+public interface CharSequenceNormalizer {
+  /**
+   * @param text the text to normalize
+   * @return the normalized text
+   */
+  CharSequence normalize(CharSequence text);
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
new file mode 100644
index 0000000..d1c161c
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Replaces every run of characters matched by {@code EMOJI_REGEX} with a single space.
+ */
+public class EmojiCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  // Written as two ranges over UTF-16 surrogate values.
+  // NOTE(review): java.util.regex appears to merge the adjacent DBFF/DC00 escapes
+  // into one supplementary code point, which would make this class cover
+  // U+D83C..U+10FC00 plus a literal '-' and U+DFFF instead of two surrogate ranges.
+  // Confirm before relying on exactly which characters are removed.
+  private static final Pattern EMOJI_REGEX =
+      Pattern.compile("[\\uD83C-\\uDBFF\\uDC00-\\uDFFF]+");
+
+  private static final EmojiCharSequenceNormalizer INSTANCE = new EmojiCharSequenceNormalizer();
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static EmojiCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * @param text the text to normalize
+   * @return a string in which each matched run is replaced by one space
+   */
+  public CharSequence normalize (CharSequence text) {
+    return EMOJI_REGEX.matcher(text).replaceAll(" ");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
new file mode 100644
index 0000000..6b0452d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Replaces every run of digits in the text with a single space.
+ */
+public class NumberCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final NumberCharSequenceNormalizer INSTANCE = new NumberCharSequenceNormalizer();
+
+  // One or more consecutive digits.
+  private static final Pattern NUMBER_REGEX = Pattern.compile("\\d+");
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static NumberCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * @param text the text to normalize
+   * @return a string in which each digit run is replaced by one space
+   */
+  public CharSequence normalize (CharSequence text) {
+    String withoutNumbers = NUMBER_REGEX.matcher(text).replaceAll(" ");
+    return withoutNumbers;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
new file mode 100644
index 0000000..6183367
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Shrinks repetition in text: whitespace runs become one space and characters
+ * repeated three or more times in a row are cut down to two.
+ */
+public class ShrinkCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final ShrinkCharSequenceNormalizer INSTANCE = new ShrinkCharSequenceNormalizer();
+
+  // Two or more consecutive whitespace characters.
+  private static final Pattern SPACE_REGEX = Pattern.compile("\\s{2,}",
+      Pattern.CASE_INSENSITIVE);
+
+  // A character followed by at least two more copies of itself, ignoring case.
+  private static final Pattern REPEATED_CHAR_REGEX = Pattern.compile("(.)\\1{2,}",
+      Pattern.CASE_INSENSITIVE);
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static ShrinkCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Collapses whitespace runs first, then shortens repeated characters to two
+   * copies of the first character of the run, and finally trims the result.
+   *
+   * @param text the text to normalize
+   * @return the shrunk and trimmed string
+   */
+  public CharSequence normalize (CharSequence text) {
+    String singleSpaced = SPACE_REGEX.matcher(text).replaceAll(" ");
+    String shrunk = REPEATED_CHAR_REGEX.matcher(singleSpaced).replaceAll("$1$1");
+    return shrunk.trim();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
new file mode 100644
index 0000000..b5a8625
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Removes Twitter-specific noise from text: hashtags and user mentions,
+ * "RT" markers and simple emoticons are replaced by a space, and laughter-like
+ * sequences such as "hahaha" are shortened.
+ */
+public class TwitterCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final TwitterCharSequenceNormalizer INSTANCE = new TwitterCharSequenceNormalizer();
+
+  // '#' or '@' followed by one or more non-whitespace characters.
+  private static final Pattern HASH_USER_REGEX =
+      Pattern.compile("[#@]\\S+");
+
+  // One or more "rt" markers, each followed by a space or a colon, ignoring case.
+  private static final Pattern RT_REGEX =
+      Pattern.compile("\\b(rt[ :])+", Pattern.CASE_INSENSITIVE);
+
+  // Emoticons such as ":)", ";-(" or "xD", ignoring case.
+  // NOTE(review): there is no word boundary, so "xp", "xo" and "xd" are also
+  // matched inside ordinary words (e.g. "expect") -- confirm this is intended.
+  private static final Pattern FACE_REGEX =
+      Pattern.compile("[:;x]-?[()dop]", Pattern.CASE_INSENSITIVE);
+
+  // An h/j run and a vowel run, followed by at least one more repetition of them.
+  private static final Pattern LAUGH_REGEX =
+      Pattern.compile("([hj])+([aieou])+(\\1+\\2+)+", Pattern.CASE_INSENSITIVE);
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static TwitterCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Applies the four patterns in a fixed order: hashtags/mentions, retweet
+   * markers, emoticons, then laughter.
+   *
+   * @param text the text to normalize
+   * @return the normalized string
+   */
+  public CharSequence normalize (CharSequence text) {
+    String withoutTags = HASH_USER_REGEX.matcher(text).replaceAll(" ");
+    String withoutRetweets = RT_REGEX.matcher(withoutTags).replaceAll(" ");
+    String withoutFaces = FACE_REGEX.matcher(withoutRetweets).replaceAll(" ");
+    // Keep two repetitions of the captured consonant/vowel pair.
+    return LAUGH_REGEX.matcher(withoutFaces).replaceAll("$1$2$1$2");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
new file mode 100644
index 0000000..4be9b63
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Replaces http/https URLs and e-mail-like tokens in the text with a space.
+ */
+public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer();
+
+  // "http://" or "https://" followed by characters from a fixed URL alphabet.
+  private static final Pattern URL_REGEX =
+      Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
+
+  // local-part '@' domain, both built from letters, digits, '-', '_' and '.'.
+  private static final Pattern MAIL_REGEX =
+      Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static UrlCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Removes URLs first and e-mail-like tokens second.
+   *
+   * @param text the text to normalize
+   * @return a string in which each match is replaced by one space
+   */
+  public CharSequence normalize (CharSequence text) {
+    String withoutUrls = URL_REGEX.matcher(text).replaceAll(" ");
+    String withoutMails = MAIL_REGEX.matcher(withoutUrls).replaceAll(" ");
+    return withoutMails;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
new file mode 100644
index 0000000..7c31598
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+/**
+ * Test-only {@link LanguageDetectorFactory} that supplies its own context
+ * generator and its own normalizer instead of the defaults.
+ */
+public class DummyFactory extends LanguageDetectorFactory {
+
+
+  public DummyFactory() {
+    super();
+  }
+
+  @Override
+  public void init() {
+    super.init();
+  }
+
+  /**
+   * @return a {@link MyContectGenerator} constructed with the bounds 1 and 5 and
+   *     an {@link UpperCaseNormalizer}
+   */
+  @Override
+  public LanguageDetectorContextGenerator getContextGenerator() {
+    return new DummyFactory.MyContectGenerator(1, 5,
+        new DummyFactory.UpperCaseNormalizer());
+  }
+
+  /**
+   * Normalizer that upper-cases the whole text.
+   */
+  public class UpperCaseNormalizer implements CharSequenceNormalizer {
+    @Override
+    public CharSequence normalize(CharSequence text) {
+      return text.toString().toUpperCase();
+    }
+  }
+
+  /**
+   * Context generator that appends token n-gram features, prefixed with
+   * {@code "tg="}, to the context produced by the superclass.
+   */
+  public class MyContectGenerator extends LanguageDetectorContextGenerator {
+
+    public MyContectGenerator(int min, int max, CharSequenceNormalizer... normalizers) {
+      super(min, max, normalizers);
+    }
+
+    /**
+     * @param document the text to extract features from
+     * @return the superclass context followed by one {@code "tg=..."} entry for
+     *     every non-empty token n-gram of the normalized document
+     */
+    @Override
+    public String[] getContext(String document) {
+      String[] superContext = super.getContext(document);
+
+      // Copy into a growable, properly typed list (Arrays.asList is fixed-size).
+      List<String> context = new ArrayList<>(Arrays.asList(superContext));
+
+      document = this.normalizer.normalize(document).toString();
+
+      SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+      String[] words = tokenizer.tokenize(document);
+      NGramModel tokenNgramModel = new NGramModel();
+      if (words.length > 0) {
+        // 1 and 3 are the n-gram length bounds handed to NGramModel.add.
+        tokenNgramModel.add(new StringList(words), 1, 3);
+        Iterator<StringList> tokenNgramIterator = tokenNgramModel.iterator();
+
+        while (tokenNgramIterator.hasNext()) {
+          StringList tokenList = tokenNgramIterator.next();
+          if (tokenList.size() > 0) {
+            context.add("tg=" + tokenList.toString());
+          }
+        }
+      }
+
+      return context.toArray(new String[context.size()]);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
new file mode 100644
index 0000000..dc6ca26
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Checks the context produced by a {@link LanguageDetectorContextGenerator}
+ * constructed with the bounds 1 and 3.
+ */
+public class LanguageDetectorContextGeneratorTest {
+
+  @Test
+  public void extractContext() throws Exception {
+    LanguageDetectorContextGenerator generator = new LanguageDetectorContextGenerator(1, 3);
+
+    String[] context = generator.getContext("abcde fghijk");
+    Collection<String> features = Arrays.asList(context);
+
+    Assert.assertEquals(33, features.size());
+
+    // Spot-check a few entries, including ones that span the space between the words.
+    for (String expected : new String[] {"ab", "abc", "e f", " fg"}) {
+      Assert.assertTrue(features.contains(expected));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
new file mode 100644
index 0000000..520fc71
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Runs a two-fold cross validation over the shared sample stream and checks
+ * the reported document count and accuracy.
+ */
+public class LanguageDetectorCrossValidatorTest {
+
+  @Test
+  public void evaluate() throws Exception {
+
+    final AtomicInteger correctCount = new AtomicInteger();
+    final AtomicInteger incorrectCount = new AtomicInteger();
+
+    // Counts the callbacks it receives; the counters are not asserted on below.
+    LanguageDetectorEvaluationMonitor countingMonitor = new LanguageDetectorEvaluationMonitor() {
+      @Override
+      public void correctlyClassified(LanguageSample reference,
+                                      LanguageSample prediction) {
+        correctCount.incrementAndGet();
+      }
+
+      @Override
+      public void missclassified(LanguageSample reference,
+                                 LanguageSample prediction) {
+        incorrectCount.incrementAndGet();
+      }
+    };
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    params.put(TrainingParameters.CUTOFF_PARAM, 5);
+    params.put("PrintMessages", false);
+
+    LanguageDetectorCrossValidator cv = new LanguageDetectorCrossValidator(params,
+        new LanguageDetectorFactory(), countingMonitor);
+
+    LanguageDetectorSampleStream sampleStream = LanguageDetectorMETest.createSampleStream();
+
+    // Second argument is the number of folds.
+    cv.evaluate(sampleStream, 2);
+
+    Assert.assertEquals(99, cv.getDocumentCount());
+    Assert.assertEquals(0.98989898989899, cv.getDocumentAccuracy(), 0.01);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
new file mode 100644
index 0000000..8bdd71b
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Feeds three labelled samples to a {@link LanguageDetectorEvaluator} and checks
+ * the monitor callbacks, the document count and the accuracy.
+ */
+public class LanguageDetectorEvaluatorTest {
+
+  @Test
+  public void processSample() throws Exception {
+    LanguageDetectorME langdetector = new LanguageDetectorME(LanguageDetectorMETest.trainModel());
+
+    final AtomicInteger correctCount = new AtomicInteger();
+    final AtomicInteger incorrectCount = new AtomicInteger();
+
+    // Counts how often each callback is invoked.
+    LanguageDetectorEvaluationMonitor countingMonitor = new LanguageDetectorEvaluationMonitor() {
+      @Override
+      public void correctlyClassified(LanguageSample reference,
+                                      LanguageSample prediction) {
+        correctCount.incrementAndGet();
+      }
+
+      @Override
+      public void missclassified(LanguageSample reference,
+                                 LanguageSample prediction) {
+        incorrectCount.incrementAndGet();
+      }
+    };
+
+    LanguageDetectorEvaluator evaluator =
+        new LanguageDetectorEvaluator(langdetector, countingMonitor);
+
+    // The same sentence is submitted once labelled "pob" and twice labelled "fra".
+    String text = "escreve e faz palestras pelo mundo inteiro sobre anjos";
+    evaluator.evaluateSample(new LanguageSample(new Language("pob"), text));
+    evaluator.evaluateSample(new LanguageSample(new Language("fra"), text));
+    evaluator.evaluateSample(new LanguageSample(new Language("fra"), text));
+
+    Assert.assertEquals(1, correctCount.get());
+    Assert.assertEquals(2, incorrectCount.get());
+
+    Assert.assertEquals(3, evaluator.getDocumentCount());
+    Assert.assertEquals(0.33, evaluator.getAccuracy(), 0.01);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
new file mode 100644
index 0000000..c696ec1
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Trains a model with {@link DummyFactory} and checks that the custom factory
+ * and its context generator are the ones in effect.
+ */
+public class LanguageDetectorFactoryTest {
+
+
+  private static LanguageDetectorModel model;
+
+  /**
+   * Trains the shared model once for all tests in this class.
+   */
+  @BeforeClass
+  public static void train() throws Exception {
+
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8");
+
+    LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+    params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
+
+    model = LanguageDetectorME.train(sampleStream, params, new DummyFactory());
+  }
+
+  @Test
+  public void testCorrectFactory() throws IOException {
+    LanguageDetectorModel myModel = serializeAndReload(model);
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+
+  }
+
+  @Test
+  public void testDummyFactory() throws Exception {
+    LanguageDetectorModel myModel = serializeAndReload(model);
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+  }
+
+  @Test
+  public void testDummyFactoryContextGenerator() throws Exception {
+    LanguageDetectorContextGenerator cg = model.getFactory().getContextGenerator();
+    String[] context = cg.getContext(
+        "a dummy text phrase to test if the context generator works!!!!!!!!!!!!");
+
+    Set<String> set = new HashSet<>(Arrays.asList(context));
+
+    Assert.assertTrue(set.contains("!!!!!")); // default normalizer would remove the repeated !
+    Assert.assertTrue(set.contains("a dum"));
+    Assert.assertTrue(set.contains("tg=[THE,CONTEXT,GENERATOR]"));
+  }
+
+  /**
+   * Serializes the given model to bytes and reads a new model back from them.
+   */
+  private static LanguageDetectorModel serializeAndReload(LanguageDetectorModel original)
+      throws IOException {
+    byte[] serialized = LanguageDetectorMETest.serializeModel(original);
+    return new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
new file mode 100644
index 0000000..beb7589
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.langdetect;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+
+public class LanguageDetectorMETest {
+
+  private LanguageDetectorModel model;
+
+  @Before
+  public void init() throws Exception {
+    // Train a fresh model before each test; every test method works on its own instance.
+    this.model = trainModel();
+
+  }
+
+  @Test
+  public void testPredictLanguages() {
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");
+    // All four languages are expected back, in exactly the order asserted below.
+    Assert.assertEquals(4, languages.length);
+    Assert.assertEquals("pob", languages[0].getLang());
+    Assert.assertEquals("ita", languages[1].getLang());
+    Assert.assertEquals("spa", languages[2].getLang());
+    Assert.assertEquals("fra", languages[3].getLang());
+  }
+
+  @Test
+  public void testPredictLanguage() {
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    Language language = ld.predictLanguage("Dove è meglio che giochi");
+    // Italian sample sentence; its accented letter is U+00E8 and must not be mis-decoded.
+    Assert.assertEquals("ita", language.getLang());
+  }
+
+  @Test
+  public void testSupportedLanguages() {
+    // The model produced by trainModel() is expected to report four languages.
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    String[] supportedLanguages = ld.getSupportedLanguages();
+
+    Assert.assertEquals(4, supportedLanguages.length);
+  }
+
+  @Test
+  public void testLoadFromSerialized() throws IOException {
+    byte[] serialized = serializeModel(model);
+    // Reading the model back from the raw bytes must succeed.
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertNotNull(myModel);
+
+  }
+
+  protected static byte[] serializeModel(LanguageDetectorModel model) throws IOException {
+    // Serialize into an in-memory buffer and hand back its content.
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    model.serialize(out);
+    return out.toByteArray();
+  }
+
+  public static LanguageDetectorModel trainModel() throws Exception {
+    return trainModel(new LanguageDetectorFactory()); // default factory
+  }
+
+  public static LanguageDetectorModel trainModel(LanguageDetectorFactory factory) throws Exception {
+    // Naive Bayes, 100 iterations, cutoff 2, trained on the stream from createSampleStream().
+
+    LanguageDetectorSampleStream sampleStream = createSampleStream();
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "2");
+    params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
+
+    return LanguageDetectorME.train(sampleStream, params, factory);
+  }
+
+  public static LanguageDetectorSampleStream createSampleStream() throws IOException {
+    // The sample file is loaded through ResourceAsStreamFactory and read line by line as UTF-8.
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8");
+
+    return new LanguageDetectorSampleStream(lineStream);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
new file mode 100644
index 0000000..7d12581
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageSampleTest {
+
+  @Test
+  public void testConstructor() {
+    // The getters must hand back exactly what was passed to the constructor.
+    Language language = new Language("aLang");
+    CharSequence text = "aContext";
+    LanguageSample sample = new LanguageSample(language, text);
+
+    Assert.assertEquals(language, sample.getLanguage());
+    Assert.assertEquals(text, sample.getContext());
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void testNullLang() throws Exception {
+    CharSequence text = "aContext";
+    // A null language must be rejected at construction time.
+    new LanguageSample(null, text);
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void testNullContext() {
+    Language language = new Language("aLang");
+    // A null context must be rejected at construction time.
+    new LanguageSample(language, null);
+  }
+
+  @Test
+  public void testToString() {
+    Language language = new Language("aLang");
+    CharSequence text = "aContext";
+    // Expected layout: language code, a tab, then the context.
+    String expected = language.getLang() + "\t" + text;
+
+    Assert.assertEquals(expected, new LanguageSample(language, text).toString());
+  }
+
+  @Test
+  public void testHash() {
+    // Samples that differ in language or in context are expected to hash differently.
+    int aLangAContext = new LanguageSample(new Language("aLang"), "aContext").hashCode();
+    int bLangAContext = new LanguageSample(new Language("bLang"), "aContext").hashCode();
+    int aLangBContext = new LanguageSample(new Language("aLang"), "bContext").hashCode();
+
+    Assert.assertNotEquals(aLangAContext, bLangAContext);
+    Assert.assertNotEquals(aLangAContext, aLangBContext);
+    Assert.assertNotEquals(bLangAContext, aLangBContext);
+  }
+
+  @Test
+  public void testEquals() throws Exception {
+    // "same" duplicates "base"; the other two samples each differ from it in one component.
+    LanguageSample base = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample same = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample otherLanguage = new LanguageSample(new Language("bLang"), "aContext");
+    LanguageSample otherContext = new LanguageSample(new Language("aLang"), "bContext");
+
+    Assert.assertEquals(base, base);
+    Assert.assertEquals(base, same);
+    Assert.assertNotEquals(base, otherLanguage);
+    Assert.assertNotEquals(base, otherContext);
+    Assert.assertNotEquals(otherLanguage, otherContext);
+    Assert.assertFalse(base.equals("something else"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
new file mode 100644
index 0000000..dc25bc6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageTest {
+
+
+  @Test
+  public void emptyConfidence() throws Exception {
+    // Without an explicit confidence the reported value is expected to be 0.
+    String code = "aLanguage";
+    Language language = new Language(code);
+    Assert.assertEquals(code, language.getLang());
+    Assert.assertEquals(0, language.getConfidence(), 0);
+  }
+
+  @Test
+  public void nonEmptyConfidence() throws Exception {
+    // An explicitly supplied confidence must be returned unchanged.
+    String code = "aLanguage";
+    double expectedConfidence = 0.05;
+    Language language = new Language(code, expectedConfidence);
+    Assert.assertEquals(code, language.getLang());
+    Assert.assertEquals(expectedConfidence, language.getConfidence(), 0);
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguage() throws Exception {
+    new Language(null); // a null language code must be rejected
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguageConfidence() throws Exception {
+    new Language(null, 0.05); // a confidence does not make a null code acceptable
+  }
+
+  @Test
+  public void testToString() {
+    // Expected format: the language code followed by the confidence in parentheses.
+    Language withoutConfidence = new Language("aLang");
+    Assert.assertEquals("aLang (0.0)", withoutConfidence.toString());
+
+    // Same code, this time with an explicit confidence.
+    Language withConfidence = new Language("aLang", 0.0886678);
+    Assert.assertEquals("aLang (0.0886678)", withConfidence.toString());
+  }
+
+
+  @Test
+  public void testHash() {
+    // Two languages built from the same code must agree on their hash ...
+    int plainA = new Language("aLang").hashCode();
+    int plainAAgain = new Language("aLang").hashCode();
+    int plainB = new Language("BLang").hashCode();
+    int aWithFive = new Language("aLang", 5.0).hashCode();
+    int bWithSix = new Language("BLang", 6.0).hashCode();
+    Assert.assertEquals(plainA, plainAAgain);
+    // ... while a different code or confidence is expected to change it.
+    Assert.assertNotEquals(plainA, plainB);
+    Assert.assertNotEquals(plainA, aWithFive);
+    Assert.assertNotEquals(plainB, aWithFive);
+    Assert.assertNotEquals(aWithFive, bWithSix);
+  }
+
+  @Test
+  public void testEquals() {
+    Language first = new Language("langA");
+    Language second = new Language("langB");
+    Language confidenceFive = new Language("langA5", 5.0);
+    Language confidenceSix = new Language("langA5", 6.0);
+
+    // Every instance equals itself.
+    Assert.assertEquals(first, first);
+    Assert.assertEquals(confidenceFive, confidenceFive);
+    // Different language codes are expected to compare unequal.
+    Assert.assertNotEquals(first, confidenceFive);
+    Assert.assertNotEquals(first, second);
+    // Same code with a different confidence is still expected to be equal.
+    Assert.assertEquals(confidenceSix, confidenceFive);
+    Assert.assertNotEquals(first, "something else");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..0f8dfe7
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class EmojiCharSequenceNormalizerTest {
+
+  public EmojiCharSequenceNormalizer normalizer = EmojiCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeEmoji() throws Exception {
+    // Input: plain text, three U+1F606 code points, a space, then one U+1F61B.
+    StringBuilder input = new StringBuilder("Any funny text goes here ");
+    for (int i = 0; i < 3; i++) {
+      input.appendCodePoint(0x1F606);
+    }
+    input.append(" ");
+    input.appendCodePoint(0x1F61B);
+
+    // The expected value corresponds to each emoji run becoming a single space.
+    CharSequence normalized = normalizer.normalize(input.toString());
+    Assert.assertEquals("Any funny text goes here    ", normalized);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..50b1f0c
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class NumberCharSequenceNormalizerTest {
+
+  public NumberCharSequenceNormalizer normalizer = NumberCharSequenceNormalizer.getInstance();
+
+
+  @Test
+  public void normalize() throws Exception {
+    Assert.assertEquals("absc  ,  abcd", normalizer.normalize("absc 123,0123 abcd"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..95cf300
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class ShrinkCharSequenceNormalizerTest {
+
+  public ShrinkCharSequenceNormalizer normalizer = ShrinkCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeSpace() throws Exception {
+    Assert.assertEquals(
+        "a text extra space", normalizer.normalize("a text    extra space"));
+  }
+
+  @Test
+  public void normalizeChar() throws Exception {
+    Assert.assertEquals("Helloo", normalizer.normalize("Helllllloooooo"));
+    Assert.assertEquals("Hello", normalizer.normalize("Hello"));
+    Assert.assertEquals("HHello", normalizer.normalize("HHello"));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..f0bd517
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TwitterCharSequenceNormalizerTest {
+
+  public TwitterCharSequenceNormalizer normalizer = TwitterCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeHashtag() throws Exception {
+    Assert.assertEquals("asdf   2nnfdf", normalizer.normalize("asdf #hasdk23 2nnfdf"));
+  }
+
+  @Test
+  public void normalizeUser() throws Exception {
+    Assert.assertEquals("asdf   2nnfdf", normalizer.normalize("asdf @hasdk23 2nnfdf"));
+  }
+
+  @Test
+  public void normalizeRT() throws Exception {
+    Assert.assertEquals(" 2nnfdf", normalizer.normalize("RT RT RT 2nnfdf"));
+  }
+
+  @Test
+  public void normalizeLaugh() throws Exception {
+    Assert.assertEquals("ahahah", normalizer.normalize("ahahahah"));
+    Assert.assertEquals("haha", normalizer.normalize("hahha"));
+    Assert.assertEquals("haha", normalizer.normalize("hahaa"));
+    Assert.assertEquals("ahaha", normalizer.normalize("ahahahahhahahhahahaaaa"));
+    Assert.assertEquals("jaja", normalizer.normalize("jajjajajaja"));
+  }
+
+
+
+  @Test
+  public void normalizeFace() throws Exception {
+    Assert.assertEquals("hello   hello", normalizer.normalize("hello :-) hello"));
+    Assert.assertEquals("hello   hello", normalizer.normalize("hello ;) hello"));
+    Assert.assertEquals("  hello", normalizer.normalize(":) hello"));
+    Assert.assertEquals("hello  ", normalizer.normalize("hello :P"));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..72eb83a
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class UrlCharSequenceNormalizerTest {
+
+  public UrlCharSequenceNormalizer normalizer = UrlCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeUrl() throws Exception {
+    Assert.assertEquals(
+        "asdf   2nnfdf", normalizer.normalize("asdf http://asdf.com/dfa/cxs 2nnfdf"));
+
+
+    Assert.assertEquals(
+        "asdf   2nnfdf  ", normalizer.normalize("asdf http://asdf.com/dfa/cx" +
+            "s 2nnfdf http://asdf.com/dfa/cxs"));
+  }
+
+  @Test
+  public void normalizeEmail() throws Exception {
+    Assert.assertEquals(
+        "asdf   2nnfdf", normalizer.normalize("asdf asd.fdfa@hasdk23.com.br 2nnfdf"));
+    Assert.assertEquals(
+        "asdf   2nnfdf  ", normalizer.normalize("asdf asd.fdfa@hasdk23.com.br" +
+            " 2nnfdf asd.fdfa@hasdk23.com.br"));
+  }
+}