You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/14 15:26:30 UTC
[3/4] opennlp git commit: OPENNLP-788: Add LanguageDetector tool
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
new file mode 100644
index 0000000..771be19
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+/** Chains {@link CharSequenceNormalizer}s and applies them in the order they were given. */
+public class AggregateCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private final CharSequenceNormalizer[] normalizers;
+
+  /** @param normalizers the normalizers to run, first to last; the array is kept as passed */
+  public AggregateCharSequenceNormalizer(CharSequenceNormalizer ... normalizers) {
+    this.normalizers = normalizers;
+  }
+
+  /** Passes {@code text} through each normalizer in turn and returns the last result. */
+  @Override
+  public CharSequence normalize (CharSequence text) {
+    for (CharSequenceNormalizer normalizer : normalizers) {
+      text = normalizer.normalize(text);
+    }
+    return text;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
new file mode 100644
index 0000000..b5c1f3f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+/** A text normalization step: takes a character sequence and returns a character sequence. */
+public interface CharSequenceNormalizer {
+ CharSequence normalize(CharSequence text);
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
new file mode 100644
index 0000000..d1c161c
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/** Replaces every run of characters matched by the emoji pattern with one space. */
+public class EmojiCharSequenceNormalizer implements CharSequenceNormalizer {
+  // NOTE(review): spelled with UTF-16 surrogate escapes; confirm which code points actually match.
+  private static final Pattern EMOJI_REGEX = Pattern.compile("[\\uD83C-\\uDBFF\\uDC00-\\uDFFF]+");
+  private static final EmojiCharSequenceNormalizer INSTANCE = new EmojiCharSequenceNormalizer();
+
+  /** Returns the shared instance. */
+  public static EmojiCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /** Returns {@code text} with each pattern match replaced by a single space. */
+  public CharSequence normalize (CharSequence text) {
+    return EMOJI_REGEX.matcher(text).replaceAll(" ");
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
new file mode 100644
index 0000000..6b0452d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/** Replaces every run of digits ({@code \d+}) with a single space. */
+public class NumberCharSequenceNormalizer implements CharSequenceNormalizer {
+  private static final NumberCharSequenceNormalizer INSTANCE = new NumberCharSequenceNormalizer();
+  private static final Pattern NUMBER_REGEX = Pattern.compile("\\d+");
+
+  public static NumberCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /** Returns {@code text} with each digit run swapped for one space. */
+  public CharSequence normalize (CharSequence text) {
+    return NUMBER_REGEX.matcher(text).replaceAll(" ");
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
new file mode 100644
index 0000000..6183367
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/** Collapses whitespace runs and over-long character repetitions, then trims the result. */
+public class ShrinkCharSequenceNormalizer implements CharSequenceNormalizer {
+  private static final ShrinkCharSequenceNormalizer INSTANCE = new ShrinkCharSequenceNormalizer();
+
+  private static final Pattern SPACE_REGEX = Pattern.compile("\\s{2,}", Pattern.CASE_INSENSITIVE);
+  private static final Pattern REPEATED_CHAR_REGEX =
+      Pattern.compile("(.)\\1{2,}", Pattern.CASE_INSENSITIVE);
+
+  public static ShrinkCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /** Squeezes 2+ whitespace chars to one space, 3+ repeats of a char to two, then trims. */
+  public CharSequence normalize (CharSequence text) {
+    String spacesShrunk = SPACE_REGEX.matcher(text).replaceAll(" ");
+    return REPEATED_CHAR_REGEX.matcher(spacesShrunk).replaceAll("$1$1").trim();
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
new file mode 100644
index 0000000..b5a8625
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/** Strips Twitter artefacts: hashtags, mentions, retweet markers, text smileys, long laughs. */
+public class TwitterCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final TwitterCharSequenceNormalizer INSTANCE = new TwitterCharSequenceNormalizer();
+
+  // '#' or '@' followed by everything up to the next whitespace.
+  private static final Pattern HASH_USER_REGEX = Pattern.compile("[#@]\\S+");
+  // One or more "rt " / "rt:" markers starting at a word boundary, any case.
+  private static final Pattern RT_REGEX = Pattern.compile("\\b(rt[ :])+", Pattern.CASE_INSENSITIVE);
+  // NOTE(review): not anchored, so letter pairs such as "xp" inside a word also match - confirm.
+  private static final Pattern FACE_REGEX =
+      Pattern.compile("[:;x]-?[()dop]", Pattern.CASE_INSENSITIVE);
+  // Repeated consonant+vowel laughter such as "hahaha" or "jajaja".
+  private static final Pattern LAUGH_REGEX =
+      Pattern.compile("([hj])+([aieou])+(\\1+\\2+)+", Pattern.CASE_INSENSITIVE);
+
+  public static TwitterCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /** Applies the four patterns in order; laughs shrink to two syllables, the rest to a space. */
+  public CharSequence normalize (CharSequence text) {
+    String result = HASH_USER_REGEX.matcher(text).replaceAll(" ");
+    result = RT_REGEX.matcher(result).replaceAll(" ");
+    result = FACE_REGEX.matcher(result).replaceAll(" ");
+    return LAUGH_REGEX.matcher(result).replaceAll("$1$2$1$2");
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
new file mode 100644
index 0000000..4be9b63
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/** Blanks out http(s) URLs and e-mail-like tokens. */
+public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {
+  private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer();
+
+  private static final Pattern MAIL_REGEX =
+      Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
+  private static final Pattern URL_REGEX = Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
+
+  public static UrlCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /** Replaces each URL match with a space first, then each mail match with a space. */
+  public CharSequence normalize (CharSequence text) {
+    final CharSequence withoutUrls = URL_REGEX.matcher(text).replaceAll(" ");
+    return MAIL_REGEX.matcher(withoutUrls).replaceAll(" ");
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
new file mode 100644
index 0000000..7c31598
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+/** Test factory whose context generator upper-cases the text and adds token n-gram features. */
+public class DummyFactory extends LanguageDetectorFactory {
+
+  public DummyFactory() {
+    super();
+  }
+
+  @Override
+  public void init() {
+    super.init();
+  }
+
+  /** Builds a generator constructed with (1, 5) and an upper-casing normalizer. */
+  @Override
+  public LanguageDetectorContextGenerator getContextGenerator() {
+    return new DummyFactory.MyContectGenerator(1, 5,
+        new DummyFactory.UpperCaseNormalizer());
+  }
+
+  /** Normalizer that upper-cases the whole text. */
+  public class UpperCaseNormalizer implements CharSequenceNormalizer {
+    @Override
+    public CharSequence normalize(CharSequence text) {
+      // Locale.ROOT keeps the result independent of the JVM default locale (e.g. Turkish "i").
+      return text.toString().toUpperCase(java.util.Locale.ROOT);
+    }
+  }
+
+  /** Adds "tg="-prefixed token 1..3-grams to the features produced by the superclass. */
+  public class MyContectGenerator extends LanguageDetectorContextGenerator {
+
+    public MyContectGenerator(int min, int max, CharSequenceNormalizer... normalizers) {
+      super(min, max, normalizers);
+    }
+
+    @Override
+    public String[] getContext(String document) {
+      List<String> context = new ArrayList<String>(Arrays.asList(super.getContext(document)));
+
+      String normalized = this.normalizer.normalize(document).toString();
+      String[] words = SimpleTokenizer.INSTANCE.tokenize(normalized);
+      if (words.length > 0) {
+        NGramModel tokenNgramModel = new NGramModel();
+        tokenNgramModel.add(new StringList(words), 1, 3);
+        // Typed iterator replaces the raw Iterator plus cast.
+        Iterator<StringList> tokenNgrams = tokenNgramModel.iterator();
+        while (tokenNgrams.hasNext()) {
+          StringList tokenList = tokenNgrams.next();
+          if (tokenList.size() > 0) {
+            context.add("tg=" + tokenList.toString());
+          }
+        }
+      }
+
+      return context.toArray(new String[context.size()]);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
new file mode 100644
index 0000000..dc6ca26
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Checks the features {@code LanguageDetectorContextGenerator(1, 3)} yields for one document. */
+public class LanguageDetectorContextGeneratorTest {
+
+ @Test
+ public void extractContext() throws Exception {
+ String doc = "abcde fghijk";
+
+ LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator(1, 3);
+
+ Collection<String> features = Arrays.asList(cg.getContext(doc));
+ // 33 = 12 + 11 + 10, presumably the 1-, 2- and 3-character grams of the 12-char text; confirm.
+ Assert.assertEquals(33, features.size());
+ Assert.assertTrue(features.contains("ab"));
+ Assert.assertTrue(features.contains("abc"));
+ Assert.assertTrue(features.contains("e f"));
+ Assert.assertTrue(features.contains(" fg"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
new file mode 100644
index 0000000..520fc71
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.TrainingParameters;
+/** Runs LanguageDetectorCrossValidator on the sample stream and checks count and accuracy. */
+public class LanguageDetectorCrossValidatorTest {
+
+ @Test
+ public void evaluate() throws Exception {
+
+ TrainingParameters params = new TrainingParameters();
+ params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+ params.put(TrainingParameters.CUTOFF_PARAM, 5);
+ params.put("PrintMessages", false);
+
+ // Counters fed by the monitor below. NOTE(review): they are never asserted in this test.
+ final AtomicInteger correctCount = new AtomicInteger();
+ final AtomicInteger incorrectCount = new AtomicInteger();
+
+ LanguageDetectorCrossValidator cv = new LanguageDetectorCrossValidator(params,
+ new LanguageDetectorFactory(), new LanguageDetectorEvaluationMonitor() {
+ @Override
+ public void correctlyClassified(LanguageSample reference,
+ LanguageSample prediction) {
+ correctCount.incrementAndGet();
+ }
+
+ @Override
+ public void missclassified(LanguageSample reference,
+ LanguageSample prediction) {
+ incorrectCount.incrementAndGet();
+ }
+ });
+
+ LanguageDetectorSampleStream sampleStream = LanguageDetectorMETest.createSampleStream();
+ // The second argument is presumably the number of folds; confirm against the validator.
+ cv.evaluate(sampleStream, 2);
+ // The accuracy is compared with a tolerance of 0.01.
+ Assert.assertEquals(99, cv.getDocumentCount());
+ Assert.assertEquals(0.98989898989899, cv.getDocumentAccuracy(), 0.01);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
new file mode 100644
index 0000000..8bdd71b
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/** Feeds three labelled samples to LanguageDetectorEvaluator and checks its tallies. */
+public class LanguageDetectorEvaluatorTest {
+
+ @Test
+ public void processSample() throws Exception {
+ LanguageDetectorModel model = LanguageDetectorMETest.trainModel();
+ LanguageDetectorME langdetector = new LanguageDetectorME(model);
+
+ final AtomicInteger correctCount = new AtomicInteger();
+ final AtomicInteger incorrectCount = new AtomicInteger();
+
+ LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(langdetector,
+ new LanguageDetectorEvaluationMonitor() {
+ @Override
+ public void correctlyClassified(LanguageSample reference,
+ LanguageSample prediction) {
+ correctCount.incrementAndGet();
+ }
+
+ @Override
+ public void missclassified(LanguageSample reference,
+ LanguageSample prediction) {
+ incorrectCount.incrementAndGet();
+ }
+ });
+ // The same text is evaluated three times: labelled "pob" once and "fra" twice.
+ evaluator.evaluateSample(new LanguageSample(new Language("pob"),
+ "escreve e faz palestras pelo mundo inteiro sobre anjos"));
+
+ evaluator.evaluateSample(new LanguageSample(new Language("fra"),
+ "escreve e faz palestras pelo mundo inteiro sobre anjos"));
+
+ evaluator.evaluateSample(new LanguageSample(new Language("fra"),
+ "escreve e faz palestras pelo mundo inteiro sobre anjos"));
+
+ // Expected tallies: 1 correct, 2 incorrect, 3 documents, accuracy about 1/3.
+ Assert.assertEquals(1, correctCount.get());
+ Assert.assertEquals(2, incorrectCount.get());
+
+ Assert.assertEquals(3, evaluator.getDocumentCount());
+ Assert.assertEquals(0.33, evaluator.getAccuracy(), 0.01);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
new file mode 100644
index 0000000..c696ec1
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+/** Trains a model with {@link DummyFactory} and checks the factory survives serialization. */
+public class LanguageDetectorFactoryTest {
+
+  private static LanguageDetectorModel model;
+
+  @BeforeClass
+  public static void train() throws Exception {
+    // Trained once for the whole class from the Doccat sample resource.
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8");
+
+    LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+    params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
+
+    model = LanguageDetectorME.train(sampleStream, params, new DummyFactory());
+  }
+
+  @Test
+  public void testCorrectFactory() throws IOException {
+    byte[] serialized = LanguageDetectorMETest.serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+
+  }
+
+  @Test
+  public void testDummyFactory() throws Exception {
+    byte[] serialized = LanguageDetectorMETest.serializeModel(model);
+    // NOTE(review): this test has the same body as testCorrectFactory.
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+  }
+
+  @Test
+  public void testDummyFactoryContextGenerator() throws Exception {
+    LanguageDetectorContextGenerator cg = model.getFactory().getContextGenerator();
+    String[] context = cg.getContext(
+        "a dummy text phrase to test if the context generator works!!!!!!!!!!!!");
+
+    Set<String> set = new HashSet<String>(Arrays.asList(context));
+
+    Assert.assertTrue(set.contains("!!!!!")); // default normalizer would remove the repeated !
+    Assert.assertTrue(set.contains("a dum"));
+    Assert.assertTrue(set.contains("tg=[THE,CONTEXT,GENERATOR]"));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
new file mode 100644
index 0000000..beb7589
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.langdetect;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+
+/** Tests LanguageDetectorME on a model trained from the Doccat sample resource. */
+public class LanguageDetectorMETest {
+  private LanguageDetectorModel model;
+
+  @Before
+  public void init() throws Exception {
+    // A fresh model is trained before every test.
+    this.model = trainModel();
+
+  }
+
+  @Test
+  public void testPredictLanguages() {
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");
+
+    Assert.assertEquals(4, languages.length);
+    Assert.assertEquals("pob", languages[0].getLang());
+    Assert.assertEquals("ita", languages[1].getLang());
+    Assert.assertEquals("spa", languages[2].getLang());
+    Assert.assertEquals("fra", languages[3].getLang());
+  }
+
+  @Test
+  public void testPredictLanguage() {
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    // \u00e8 is the Italian e-grave; the archived text had it mis-decoded as two Thai letters.
+    Language language = ld.predictLanguage("Dove \u00e8 meglio che giochi");
+    Assert.assertEquals("ita", language.getLang());
+  }
+
+  @Test
+  public void testSupportedLanguages() {
+
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    String[] supportedLanguages = ld.getSupportedLanguages();
+
+    Assert.assertEquals(4, supportedLanguages.length);
+  }
+
+  @Test
+  public void testLoadFromSerialized() throws IOException {
+    byte[] serialized = serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertNotNull(myModel);
+
+  }
+
+  protected static byte[] serializeModel(LanguageDetectorModel model) throws IOException {
+    // Serializes the model into an in-memory buffer and returns its bytes.
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    model.serialize(out);
+    return out.toByteArray();
+  }
+
+  public static LanguageDetectorModel trainModel() throws Exception {
+    return trainModel(new LanguageDetectorFactory());
+  }
+
+  public static LanguageDetectorModel trainModel(LanguageDetectorFactory factory) throws Exception {
+
+    // Trains on the shared sample stream with the parameters set below.
+    LanguageDetectorSampleStream sampleStream = createSampleStream();
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "2");
+    params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
+
+    return LanguageDetectorME.train(sampleStream, params, factory);
+  }
+
+  public static LanguageDetectorSampleStream createSampleStream() throws IOException {
+    // Reads the classpath resource line by line as UTF-8.
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8");
+
+    return new LanguageDetectorSampleStream(lineStream);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
new file mode 100644
index 0000000..7d12581
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/** Unit tests for the constructor, toString, hashCode and equals of {@link LanguageSample}. */
+public class LanguageSampleTest {
+
+  /** The getters return the language and context the sample was built from. */
+  @Test
+  public void testConstructor() {
+    CharSequence text = "aContext";
+    Language language = new Language("aLang");
+    LanguageSample sample = new LanguageSample(language, text);
+
+    Assert.assertEquals(language, sample.getLanguage());
+    Assert.assertEquals(text, sample.getContext());
+  }
+
+  /** A missing language is expected to fail with a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void testNullLang() throws Exception {
+    CharSequence text = "aContext";
+    new LanguageSample(null, text);
+  }
+
+  /** A missing context is expected to fail with a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void testNullContext() {
+    new LanguageSample(new Language("aLang"), null);
+  }
+
+  /** toString is expected to be the language code, a tab, then the context. */
+  @Test
+  public void testToString() {
+    Language language = new Language("aLang");
+    CharSequence text = "aContext";
+    String expected = language.getLang() + "\t" + text;
+
+    Assert.assertEquals(expected, new LanguageSample(language, text).toString());
+  }
+
+  /** Changing either the language or the context is expected to change the hash code. */
+  @Test
+  public void testHash() {
+    int baseHash = new LanguageSample(new Language("aLang"), "aContext").hashCode();
+    int otherLangHash = new LanguageSample(new Language("bLang"), "aContext").hashCode();
+    int otherContextHash = new LanguageSample(new Language("aLang"), "bContext").hashCode();
+
+    Assert.assertNotEquals(baseHash, otherLangHash);
+    Assert.assertNotEquals(baseHash, otherContextHash);
+    Assert.assertNotEquals(otherLangHash, otherContextHash);
+  }
+
+  /** Samples are expected to be equal only when both language and context match. */
+  @Test
+  public void testEquals() throws Exception {
+    LanguageSample base = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample sameAsBase = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample otherLang = new LanguageSample(new Language("bLang"), "aContext");
+    LanguageSample otherContext = new LanguageSample(new Language("aLang"), "bContext");
+
+    Assert.assertEquals(base, base);
+    Assert.assertEquals(base, sameAsBase);
+    Assert.assertNotEquals(base, otherLang);
+    Assert.assertNotEquals(base, otherContext);
+    Assert.assertNotEquals(otherLang, otherContext);
+    Assert.assertFalse(base.equals("something else"));
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
new file mode 100644
index 0000000..dc25bc6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/** Unit tests for the accessors, toString, hashCode and equals of {@link Language}. */
+public class LanguageTest {
+
+  /** A language created from a code alone is expected to report zero confidence. */
+  @Test
+  public void emptyConfidence() throws Exception {
+    String code = "aLanguage";
+    Language language = new Language(code);
+    Assert.assertEquals(code, language.getLang());
+    Assert.assertEquals(0, language.getConfidence(), 0);
+  }
+
+  /** A language created with an explicit confidence is expected to report that value. */
+  @Test
+  public void nonEmptyConfidence() throws Exception {
+    String code = "aLanguage";
+    double score = 0.05;
+    Language language = new Language(code, score);
+    Assert.assertEquals(code, language.getLang());
+    Assert.assertEquals(score, language.getConfidence(), 0);
+  }
+
+  /** The one-argument constructor is expected to reject a null code. */
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguage() throws Exception {
+    new Language(null);
+  }
+
+  /** The two-argument constructor is expected to reject a null code. */
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguageConfidence() throws Exception {
+    new Language(null, 0.05);
+  }
+
+  /** toString is expected to be the code followed by the confidence in parentheses. */
+  @Test
+  public void testToString() {
+    Language withoutScore = new Language("aLang");
+    Language withScore = new Language("aLang", 0.0886678);
+
+    Assert.assertEquals("aLang (0.0)", withoutScore.toString());
+    Assert.assertEquals("aLang (0.0886678)", withScore.toString());
+  }
+
+  /** Equal inputs are expected to hash alike; a different code or confidence to hash apart. */
+  @Test
+  public void testHash() {
+    // NOTE(review): testEquals expects equality to ignore confidence, yet here a differing
+    // confidence must change the hash; confirm Language honours the equals/hashCode contract.
+    int hashA = new Language("aLang").hashCode();
+    int hashAAgain = new Language("aLang").hashCode();
+    int hashB = new Language("BLang").hashCode();
+    int hashA5 = new Language("aLang", 5.0).hashCode();
+    int hashB6 = new Language("BLang", 6.0).hashCode();
+
+    Assert.assertEquals(hashA, hashAAgain);
+    Assert.assertNotEquals(hashA, hashB);
+    Assert.assertNotEquals(hashA, hashA5);
+    Assert.assertNotEquals(hashB, hashA5);
+    Assert.assertNotEquals(hashA5, hashB6);
+  }
+
+  /** Languages with the same code are expected to be equal even when confidence differs. */
+  @Test
+  public void testEquals() {
+    Language first = new Language("langA");
+    Language other = new Language("langB");
+    Language lowScore = new Language("langA5", 5.0);
+    Language highScore = new Language("langA5", 6.0);
+
+    Assert.assertEquals(first, first);
+    Assert.assertEquals(lowScore, lowScore);
+    Assert.assertNotEquals(first, lowScore);
+    Assert.assertNotEquals(first, other);
+    Assert.assertEquals(highScore, lowScore);
+    Assert.assertNotEquals(first, "something else");
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..0f8dfe7
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/** Checks that {@link EmojiCharSequenceNormalizer} output no longer contains the emoji. */
+public class EmojiCharSequenceNormalizerTest {
+
+  public EmojiCharSequenceNormalizer normalizer = EmojiCharSequenceNormalizer.getInstance();
+
+  /** Three U+1F606 code points, a space and one U+1F61B are appended to plain text. */
+  @Test
+  public void normalizeEmoji() throws Exception {
+    StringBuilder input = new StringBuilder("Any funny text goes here ");
+    for (int i = 0; i < 3; i++) {
+      input.appendCodePoint(0x1F606);
+    }
+    input.append(" ").appendCodePoint(0x1F61B);
+
+    // NOTE(review): whitespace runs look collapsed in this archived copy, so the expected
+    // literal may be missing spaces; verify it against the repository.
+    CharSequence normalized = normalizer.normalize(input.toString());
+    Assert.assertEquals("Any funny text goes here ", normalized);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..50b1f0c
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class NumberCharSequenceNormalizerTest {
+
+ public NumberCharSequenceNormalizer normalizer = NumberCharSequenceNormalizer.getInstance();
+
+
+ @Test
+ public void normalize() throws Exception {
+ Assert.assertEquals("absc , abcd", normalizer.normalize("absc 123,0123 abcd"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..95cf300
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class ShrinkCharSequenceNormalizerTest {
+
+ public ShrinkCharSequenceNormalizer normalizer = ShrinkCharSequenceNormalizer.getInstance();
+
+ @Test
+ public void normalizeSpace() throws Exception {
+ Assert.assertEquals(
+ "a text extra space", normalizer.normalize("a text extra space"));
+ }
+
+ @Test
+ public void normalizeChar() throws Exception {
+ Assert.assertEquals("Helloo", normalizer.normalize("Helllllloooooo"));
+ Assert.assertEquals("Hello", normalizer.normalize("Hello"));
+ Assert.assertEquals("HHello", normalizer.normalize("HHello"));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..f0bd517
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TwitterCharSequenceNormalizerTest {
+
+ public TwitterCharSequenceNormalizer normalizer = TwitterCharSequenceNormalizer.getInstance();
+
+ @Test
+ public void normalizeHashtag() throws Exception {
+ Assert.assertEquals("asdf 2nnfdf", normalizer.normalize("asdf #hasdk23 2nnfdf"));
+ }
+
+ @Test
+ public void normalizeUser() throws Exception {
+ Assert.assertEquals("asdf 2nnfdf", normalizer.normalize("asdf @hasdk23 2nnfdf"));
+ }
+
+ @Test
+ public void normalizeRT() throws Exception {
+ Assert.assertEquals(" 2nnfdf", normalizer.normalize("RT RT RT 2nnfdf"));
+ }
+
+ @Test
+ public void normalizeLaugh() throws Exception {
+ Assert.assertEquals("ahahah", normalizer.normalize("ahahahah"));
+ Assert.assertEquals("haha", normalizer.normalize("hahha"));
+ Assert.assertEquals("haha", normalizer.normalize("hahaa"));
+ Assert.assertEquals("ahaha", normalizer.normalize("ahahahahhahahhahahaaaa"));
+ Assert.assertEquals("jaja", normalizer.normalize("jajjajajaja"));
+ }
+
+
+
+ @Test
+ public void normalizeFace() throws Exception {
+ Assert.assertEquals("hello hello", normalizer.normalize("hello :-) hello"));
+ Assert.assertEquals("hello hello", normalizer.normalize("hello ;) hello"));
+ Assert.assertEquals(" hello", normalizer.normalize(":) hello"));
+ Assert.assertEquals("hello ", normalizer.normalize("hello :P"));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/6b689681/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..72eb83a
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class UrlCharSequenceNormalizerTest {
+
+ public UrlCharSequenceNormalizer normalizer = UrlCharSequenceNormalizer.getInstance();
+
+ @Test
+ public void normalizeUrl() throws Exception {
+ Assert.assertEquals(
+ "asdf 2nnfdf", normalizer.normalize("asdf http://asdf.com/dfa/cxs 2nnfdf"));
+
+
+ Assert.assertEquals(
+ "asdf 2nnfdf ", normalizer.normalize("asdf http://asdf.com/dfa/cx" +
+ "s 2nnfdf http://asdf.com/dfa/cxs"));
+ }
+
+ @Test
+ public void normalizeEmail() throws Exception {
+ Assert.assertEquals(
+ "asdf 2nnfdf", normalizer.normalize("asdf asd.fdfa@hasdk23.com.br 2nnfdf"));
+ Assert.assertEquals(
+ "asdf 2nnfdf ", normalizer.normalize("asdf asd.fdfa@hasdk23.com.br" +
+ " 2nnfdf asd.fdfa@hasdk23.com.br"));
+ }
+}