You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/16 09:54:29 UTC
[1/2] opennlp git commit: OPENNLP-788: Add LanguageDetector tool
Repository: opennlp
Updated Branches:
refs/heads/master 8d7e1c3c5 -> 560c48438
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
new file mode 100644
index 0000000..771be19
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+/**
+ * A {@link CharSequenceNormalizer} that chains several normalizers together.
+ * <p>
+ * The normalizers are applied in the order in which they were passed to the
+ * constructor; each one receives the output of the one before it.
+ */
+public class AggregateCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private final CharSequenceNormalizer[] normalizers;
+
+  /**
+   * @param normalizers the normalizers to apply, in order. The array is stored
+   *                    as passed in; no copy is made.
+   */
+  public AggregateCharSequenceNormalizer(CharSequenceNormalizer ... normalizers) {
+    this.normalizers = normalizers;
+  }
+
+  /**
+   * Runs {@code text} through every configured normalizer in sequence.
+   *
+   * @param text the text to normalize
+   * @return the output of the last normalizer, or {@code text} itself when no
+   *         normalizers were configured
+   */
+  @Override
+  public CharSequence normalize (CharSequence text) {
+
+    // The loop variable has its own name so it cannot be confused with the
+    // normalizers field it iterates over.
+    for (CharSequenceNormalizer normalizer : normalizers) {
+      text = normalizer.normalize(text);
+    }
+
+    return text;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
new file mode 100644
index 0000000..b5c1f3f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+/**
+ * Transforms a piece of text into a normalized form.
+ * <p>
+ * The implementations added alongside this interface replace or collapse parts
+ * of the text (URLs, numbers, emojis, repeated characters, ...).
+ */
+public interface CharSequenceNormalizer {
+ /**
+  * Normalizes the given text.
+  *
+  * @param text the text to normalize
+  * @return the normalized text
+  */
+ CharSequence normalize(CharSequence text);
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
new file mode 100644
index 0000000..d1c161c
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Replaces every run of characters matched by the emoji pattern with a single space.
+ */
+public class EmojiCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  // Character class expressed through UTF-16 surrogate escapes.
+  private static final Pattern EMOJI_REGEX =
+      Pattern.compile("[\\uD83C-\\uDBFF\\uDC00-\\uDFFF]+");
+
+  private static final EmojiCharSequenceNormalizer INSTANCE = new EmojiCharSequenceNormalizer();
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static EmojiCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * @param text the text to normalize
+   * @return {@code text} with each match of the emoji pattern replaced by one space
+   */
+  public CharSequence normalize (CharSequence text) {
+    return EMOJI_REGEX.matcher(text).replaceAll(" ");
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
new file mode 100644
index 0000000..6b0452d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+public class NumberCharSequenceNormalizer implements CharSequenceNormalizer {
+
+ private static final Pattern NUMBER_REGEX = Pattern.compile("\\d+");
+
+ private static final NumberCharSequenceNormalizer INSTANCE = new NumberCharSequenceNormalizer();
+
+ public static NumberCharSequenceNormalizer getInstance() {
+ return INSTANCE;
+ }
+
+ public CharSequence normalize (CharSequence text) {
+ return NUMBER_REGEX.matcher(text).replaceAll(" ");
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
new file mode 100644
index 0000000..6183367
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+public class ShrinkCharSequenceNormalizer implements CharSequenceNormalizer {
+
+ private static final Pattern REPEATED_CHAR_REGEX = Pattern.compile("(.)\\1{2,}",
+ Pattern.CASE_INSENSITIVE);
+ private static final Pattern SPACE_REGEX = Pattern.compile("\\s{2,}",
+ Pattern.CASE_INSENSITIVE);
+
+ private static final ShrinkCharSequenceNormalizer INSTANCE = new ShrinkCharSequenceNormalizer();
+
+ public static ShrinkCharSequenceNormalizer getInstance() {
+ return INSTANCE;
+ }
+
+ public CharSequence normalize (CharSequence text) {
+ text = SPACE_REGEX.matcher(text).replaceAll(" ");
+ return REPEATED_CHAR_REGEX.matcher(text).replaceAll("$1$1").trim();
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
new file mode 100644
index 0000000..b5a8625
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Removes Twitter-specific noise from a text: hashtags, user mentions,
+ * retweet markers and text emoticons are replaced with a space, and repeated
+ * laughter is shortened.
+ */
+public class TwitterCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  // '#' or '@' followed by non-whitespace characters.
+  private static final Pattern HASH_USER_REGEX =
+      Pattern.compile("[#@]\\S+");
+
+  // One or more "rt" markers, each followed by a space or a colon.
+  private static final Pattern RT_REGEX =
+      Pattern.compile("\\b(rt[ :])+", Pattern.CASE_INSENSITIVE);
+
+  // Text emoticons such as ":)" or ";-D". The pattern is not anchored to word
+  // boundaries, so it also matches inside words (e.g. the "xp" in "export").
+  private static final Pattern FACE_REGEX =
+      Pattern.compile("[:;x]-?[()dop]", Pattern.CASE_INSENSITIVE);
+
+  // Repeated consonant/vowel laughter such as "hahaha" or "jejeje".
+  private static final Pattern LAUGH_REGEX =
+      Pattern.compile("([hj])+([aieou])+(\\1+\\2+)+", Pattern.CASE_INSENSITIVE);
+
+  private static final TwitterCharSequenceNormalizer INSTANCE = new TwitterCharSequenceNormalizer();
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static TwitterCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Applies the hashtag/mention, retweet, emoticon and laughter patterns, in that order.
+   *
+   * @param text the text to normalize
+   * @return the normalized text
+   */
+  public CharSequence normalize (CharSequence text) {
+    String withoutTags = HASH_USER_REGEX.matcher(text).replaceAll(" ");
+    String withoutRetweets = RT_REGEX.matcher(withoutTags).replaceAll(" ");
+    String withoutFaces = FACE_REGEX.matcher(withoutRetweets).replaceAll(" ");
+    return LAUGH_REGEX.matcher(withoutFaces).replaceAll("$1$2$1$2");
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
new file mode 100644
index 0000000..4be9b63
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+/**
+ * Replaces http(s) URLs and e-mail addresses with a single space.
+ */
+public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer();
+
+  // "http://" or "https://" followed by characters from a fixed URL alphabet.
+  private static final Pattern URL_REGEX =
+      Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
+
+  // local-part '@' domain, both drawn from letters, digits, '-', '_' and '.'.
+  private static final Pattern MAIL_REGEX =
+      Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
+
+  /**
+   * @return the shared instance of this normalizer
+   */
+  public static UrlCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  /**
+   * Removes URLs first and e-mail addresses second.
+   *
+   * @param text the text to normalize
+   * @return {@code text} with each URL and e-mail address replaced by one space
+   */
+  public CharSequence normalize (CharSequence text) {
+    String withoutUrls = URL_REGEX.matcher(text).replaceAll(" ");
+    String withoutMails = MAIL_REGEX.matcher(withoutUrls).replaceAll(" ");
+    return withoutMails;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
new file mode 100644
index 0000000..1aae887
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.List;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.tokenize.SimpleTokenizer;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+/**
+ * Test-only {@link LanguageDetectorFactory} that plugs in a custom context
+ * generator: the context produced by the parent generator plus upper-cased
+ * token n-gram features prefixed with {@code "tg="}.
+ */
+public class DummyFactory extends LanguageDetectorFactory {
+
+
+  public DummyFactory() {
+    super();
+  }
+
+  @Override
+  public void init() {
+    super.init();
+  }
+
+  /**
+   * @return a {@link MyContectGenerator} configured with the bounds 2 and 5 and
+   *         an {@link UpperCaseNormalizer}
+   */
+  @Override
+  public LanguageDetectorContextGenerator getContextGenerator() {
+    return new DummyFactory.MyContectGenerator(2, 5,
+        new DummyFactory.UpperCaseNormalizer());
+  }
+
+  /**
+   * Normalizer that upper-cases the whole text.
+   */
+  public class UpperCaseNormalizer implements CharSequenceNormalizer {
+    @Override
+    public CharSequence normalize(CharSequence text) {
+      return text.toString().toUpperCase();
+    }
+  }
+
+  /**
+   * Context generator that appends token n-gram features to the context
+   * produced by {@link LanguageDetectorContextGenerator}.
+   */
+  public class MyContectGenerator extends LanguageDetectorContextGenerator {
+
+    public MyContectGenerator(int min, int max, CharSequenceNormalizer... normalizers) {
+      super(min, max, normalizers);
+    }
+
+    /**
+     * @param document the text to extract features from
+     * @return the parent's context followed by one {@code "tg=" + ngram} entry
+     *         for every non-empty token n-gram (sizes 1 to 3) of the normalized
+     *         document
+     */
+    @Override
+    public String[] getContext(String document) {
+      String[] superContext = super.getContext(document);
+
+      // Parameterized types instead of raw ArrayList/Iterator: no unchecked
+      // warnings and no cast when reading the n-grams back.
+      List<String> context = new ArrayList<String>(Arrays.asList(superContext));
+
+      String normalized = this.normalizer.normalize(document).toString();
+
+      SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
+      String[] words = tokenizer.tokenize(normalized);
+      NGramModel tokenNgramModel = new NGramModel();
+      if (words.length > 0) {
+        tokenNgramModel.add(new StringList(words), 1, 3);
+        Iterator<StringList> tokenNgramIterator = tokenNgramModel.iterator();
+
+        while (tokenNgramIterator.hasNext()) {
+          StringList tokenList = tokenNgramIterator.next();
+          if (tokenList.size() > 0) {
+            context.add("tg=" + tokenList.toString());
+          }
+        }
+      }
+
+      return context.toArray(new String[context.size()]);
+    }
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
new file mode 100644
index 0000000..dc6ca26
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageDetectorContextGeneratorTest {
+
+ @Test
+ public void extractContext() throws Exception {
+ String doc = "abcde fghijk";
+
+ LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator(1, 3);
+
+ Collection<String> features = Arrays.asList(cg.getContext(doc));
+
+ Assert.assertEquals(33, features.size());
+ Assert.assertTrue(features.contains("ab"));
+ Assert.assertTrue(features.contains("abc"));
+ Assert.assertTrue(features.contains("e f"));
+ Assert.assertTrue(features.contains(" fg"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
new file mode 100644
index 0000000..520fc71
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Runs a 2-fold cross validation over the sample corpus and checks the
+ * reported document count and accuracy.
+ */
+public class LanguageDetectorCrossValidatorTest {
+
+  @Test
+  public void evaluate() throws Exception {
+
+    final AtomicInteger correctCount = new AtomicInteger();
+    final AtomicInteger incorrectCount = new AtomicInteger();
+
+    // Monitor that only counts the outcomes reported by the cross validator.
+    LanguageDetectorEvaluationMonitor countingMonitor = new LanguageDetectorEvaluationMonitor() {
+      @Override
+      public void correctlyClassified(LanguageSample reference,
+                                      LanguageSample prediction) {
+        correctCount.incrementAndGet();
+      }
+
+      @Override
+      public void missclassified(LanguageSample reference,
+                                 LanguageSample prediction) {
+        incorrectCount.incrementAndGet();
+      }
+    };
+
+    TrainingParameters trainingParams = new TrainingParameters();
+    trainingParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    trainingParams.put(TrainingParameters.CUTOFF_PARAM, 5);
+    trainingParams.put("PrintMessages", false);
+
+    LanguageDetectorCrossValidator validator = new LanguageDetectorCrossValidator(
+        trainingParams, new LanguageDetectorFactory(), countingMonitor);
+
+    LanguageDetectorSampleStream samples = LanguageDetectorMETest.createSampleStream();
+    validator.evaluate(samples, 2);
+
+    Assert.assertEquals(99, validator.getDocumentCount());
+    Assert.assertEquals(0.98989898989899, validator.getDocumentAccuracy(), 0.01);
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
new file mode 100644
index 0000000..8bdd71b
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageDetectorEvaluatorTest {
+
+ @Test
+ public void processSample() throws Exception {
+ LanguageDetectorModel model = LanguageDetectorMETest.trainModel();
+ LanguageDetectorME langdetector = new LanguageDetectorME(model);
+
+ final AtomicInteger correctCount = new AtomicInteger();
+ final AtomicInteger incorrectCount = new AtomicInteger();
+
+ LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(langdetector,
+ new LanguageDetectorEvaluationMonitor() {
+ @Override
+ public void correctlyClassified(LanguageSample reference,
+ LanguageSample prediction) {
+ correctCount.incrementAndGet();
+ }
+
+ @Override
+ public void missclassified(LanguageSample reference,
+ LanguageSample prediction) {
+ incorrectCount.incrementAndGet();
+ }
+ });
+
+ evaluator.evaluateSample(new LanguageSample(new Language("pob"),
+ "escreve e faz palestras pelo mundo inteiro sobre anjos"));
+
+ evaluator.evaluateSample(new LanguageSample(new Language("fra"),
+ "escreve e faz palestras pelo mundo inteiro sobre anjos"));
+
+ evaluator.evaluateSample(new LanguageSample(new Language("fra"),
+ "escreve e faz palestras pelo mundo inteiro sobre anjos"));
+
+
+ Assert.assertEquals(1, correctCount.get());
+ Assert.assertEquals(2, incorrectCount.get());
+
+ Assert.assertEquals(3, evaluator.getDocumentCount());
+ Assert.assertEquals(0.33, evaluator.getAccuracy(), 0.01);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
new file mode 100644
index 0000000..60afef2
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
@@ -0,0 +1,90 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Verifies that a model trained with {@link DummyFactory} keeps that factory
+ * across serialization and that the factory's context generator is used.
+ */
+public class LanguageDetectorFactoryTest {
+
+
+  private static LanguageDetectorModel model;
+
+  /**
+   * Trains one model with {@link DummyFactory}; it is shared by all tests of this class.
+   */
+  @BeforeClass
+  public static void train() throws Exception {
+
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8");
+
+    LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+    params.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
+
+    model = LanguageDetectorME.train(sampleStream, params, new DummyFactory());
+  }
+
+  /**
+   * A model read back from its serialized form must report a {@link DummyFactory}.
+   */
+  @Test
+  public void testCorrectFactory() throws IOException {
+    byte[] serialized = LanguageDetectorMETest.serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+
+  }
+
+  /**
+   * Performs the same serialization round trip and factory check as
+   * {@link #testCorrectFactory()}.
+   */
+  @Test
+  public void testDummyFactory() throws Exception {
+    byte[] serialized = LanguageDetectorMETest.serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+  }
+
+  /**
+   * The context generator obtained from the model's factory must emit both the
+   * plain features and the {@code "tg="} token n-gram features.
+   */
+  @Test
+  public void testDummyFactoryContextGenerator() throws Exception {
+    LanguageDetectorContextGenerator cg = model.getFactory().getContextGenerator();
+    String[] context = cg.getContext(
+        "a dummy text phrase to test if the context generator works!!!!!!!!!!!!");
+
+    // Parameterized HashSet instead of the raw type, so the assignment is checked.
+    Set<String> set = new HashSet<String>(Arrays.asList(context));
+
+    Assert.assertTrue(set.contains("!!!!!")); // default normalizer would remove the repeated !
+    Assert.assertTrue(set.contains("a dum"));
+    Assert.assertTrue(set.contains("tg=[THE,CONTEXT,GENERATOR]"));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
new file mode 100644
index 0000000..e5ee8aa
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
@@ -0,0 +1,116 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.langdetect;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+
+/**
+ * Tests for {@link LanguageDetectorME}. Also provides the static training and
+ * serialization helpers used by the other language detector tests.
+ */
+public class LanguageDetectorMETest {
+
+  private LanguageDetectorModel model;
+
+  /**
+   * Trains a fresh model before each test.
+   */
+  @Before
+  public void init() throws Exception {
+    model = trainModel();
+  }
+
+  /**
+   * The detector must return the four languages in the expected order.
+   */
+  @Test
+  public void testPredictLanguages() {
+    LanguageDetector detector = new LanguageDetectorME(model);
+    Language[] ranked = detector.predictLanguages("estava em uma marcenaria na Rua Bruno");
+
+    String[] expectedOrder = {"pob", "ita", "spa", "fra"};
+
+    Assert.assertEquals(expectedOrder.length, ranked.length);
+    for (int i = 0; i < expectedOrder.length; i++) {
+      Assert.assertEquals(expectedOrder[i], ranked[i].getLang());
+    }
+  }
+
+  @Test
+  public void testPredictLanguage() {
+    LanguageDetector detector = new LanguageDetectorME(model);
+
+    Language best = detector.predictLanguage("Dove è meglio che giochi");
+
+    Assert.assertEquals("ita", best.getLang());
+  }
+
+  @Test
+  public void testSupportedLanguages() {
+    LanguageDetector detector = new LanguageDetectorME(model);
+
+    Assert.assertEquals(4, detector.getSupportedLanguages().length);
+  }
+
+  /**
+   * A serialized model must be loadable again.
+   */
+  @Test
+  public void testLoadFromSerialized() throws IOException {
+    ByteArrayInputStream in = new ByteArrayInputStream(serializeModel(model));
+
+    LanguageDetectorModel reloaded = new LanguageDetectorModel(in);
+
+    Assert.assertNotNull(reloaded);
+  }
+
+  /**
+   * Serializes the model into an in-memory byte array.
+   *
+   * @param model the model to serialize
+   * @return the serialized bytes
+   */
+  protected static byte[] serializeModel(LanguageDetectorModel model) throws IOException {
+    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
+    model.serialize(buffer);
+    return buffer.toByteArray();
+  }
+
+  /**
+   * Trains a model with a default {@link LanguageDetectorFactory}.
+   */
+  public static LanguageDetectorModel trainModel() throws Exception {
+    return trainModel(new LanguageDetectorFactory());
+  }
+
+  /**
+   * Trains a model on the sample corpus with the given factory.
+   *
+   * @param factory the factory handed to the trainer
+   * @return the trained model
+   */
+  public static LanguageDetectorModel trainModel(LanguageDetectorFactory factory) throws Exception {
+    LanguageDetectorSampleStream samples = createSampleStream();
+
+    TrainingParameters trainingParams = new TrainingParameters();
+    trainingParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    trainingParams.put(TrainingParameters.CUTOFF_PARAM, 5);
+    trainingParams.put("DataIndexer", "TwoPass");
+    trainingParams.put(TrainingParameters.ALGORITHM_PARAM, "NAIVEBAYES");
+
+    return LanguageDetectorME.train(samples, trainingParams, factory);
+  }
+
+  /**
+   * Opens the sample corpus from the test resources as a sample stream.
+   */
+  public static LanguageDetectorSampleStream createSampleStream() throws IOException {
+    ResourceAsStreamFactory resource = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    return new LanguageDetectorSampleStream(new PlainTextByLineStream(resource, "UTF-8"));
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
new file mode 100644
index 0000000..7d12581
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Tests construction, {@code toString}, {@code hashCode} and {@code equals} of
+ * {@link LanguageSample}.
+ */
+public class LanguageSampleTest {
+
+  /** A sample hands back the language and context it was constructed with. */
+  @Test
+  public void testConstructor() {
+    Language language = new Language("aLang");
+    CharSequence text = "aContext";
+
+    LanguageSample sample = new LanguageSample(language, text);
+
+    Assert.assertEquals(language, sample.getLanguage());
+    Assert.assertEquals(text, sample.getContext());
+  }
+
+  /** Passing {@code null} as the language is expected to raise a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void testNullLang() throws Exception {
+    CharSequence text = "aContext";
+
+    new LanguageSample(null, text);
+  }
+
+  /** Passing {@code null} as the context is expected to raise a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void testNullContext() {
+    Language language = new Language("aLang");
+
+    new LanguageSample(language, null);
+  }
+
+  /** {@code toString} is expected to be the language code, a tab, then the context. */
+  @Test
+  public void testToString() {
+    Language language = new Language("aLang");
+    CharSequence text = "aContext";
+
+    LanguageSample sample = new LanguageSample(language, text);
+    String expected = language.getLang() + "\t" + text;
+
+    Assert.assertEquals(expected, sample.toString());
+  }
+
+  /** Samples differing in language or in context are expected to hash differently. */
+  @Test
+  public void testHash() {
+
+    LanguageSample base = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample otherLanguage = new LanguageSample(new Language("bLang"), "aContext");
+    LanguageSample otherContext = new LanguageSample(new Language("aLang"), "bContext");
+
+    Assert.assertNotEquals(base.hashCode(), otherLanguage.hashCode());
+    Assert.assertNotEquals(base.hashCode(), otherContext.hashCode());
+    Assert.assertNotEquals(otherLanguage.hashCode(), otherContext.hashCode());
+  }
+
+  /** Equality is expected to require both the same language and the same context. */
+  @Test
+  public void testEquals() throws Exception {
+
+    LanguageSample base = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample duplicate = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample otherLanguage = new LanguageSample(new Language("bLang"), "aContext");
+    LanguageSample otherContext = new LanguageSample(new Language("aLang"), "bContext");
+
+    // Reflexive and value-based equality.
+    Assert.assertEquals(base, base);
+    Assert.assertEquals(base, duplicate);
+
+    // Any differing component breaks equality.
+    Assert.assertNotEquals(base, otherLanguage);
+    Assert.assertNotEquals(base, otherContext);
+    Assert.assertNotEquals(otherLanguage, otherContext);
+
+    // An object of an unrelated type is never equal.
+    Assert.assertFalse(base.equals("something else"));
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
new file mode 100644
index 0000000..dc25bc6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/**
+ * Tests construction, {@code toString}, {@code hashCode} and {@code equals} of {@link Language}.
+ */
+public class LanguageTest {
+
+  /** Without an explicit confidence the language is expected to report 0. */
+  @Test
+  public void emptyConfidence() throws Exception {
+    String code = "aLanguage";
+
+    Language language = new Language(code);
+
+    Assert.assertEquals(code, language.getLang());
+    Assert.assertEquals(0, language.getConfidence(), 0);
+  }
+
+  /** An explicit confidence is expected to be returned unchanged. */
+  @Test
+  public void nonEmptyConfidence() throws Exception {
+    String code = "aLanguage";
+    double score = 0.05;
+
+    Language language = new Language(code, score);
+
+    Assert.assertEquals(code, language.getLang());
+    Assert.assertEquals(score, language.getConfidence(), 0);
+  }
+
+  /** A {@code null} code is expected to raise a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguage() throws Exception {
+    new Language(null);
+  }
+
+  /** A {@code null} code with a confidence is expected to raise a NullPointerException. */
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguageConfidence() throws Exception {
+    new Language(null, 0.05);
+  }
+
+  /** {@code toString} is expected to render the code followed by the confidence in parentheses. */
+  @Test
+  public void testToString() {
+    Language withoutConfidence = new Language("aLang");
+    Assert.assertEquals("aLang (0.0)", withoutConfidence.toString());
+
+    Language withConfidence = new Language("aLang", 0.0886678);
+    Assert.assertEquals("aLang (0.0886678)", withConfidence.toString());
+  }
+
+  /** Checks which combinations of code and confidence are expected to share a hash code. */
+  @Test
+  public void testHash() {
+    int hashA = new Language("aLang").hashCode();
+    int hashADuplicate = new Language("aLang").hashCode();
+    int hashB = new Language("BLang").hashCode();
+    int hashA5 = new Language("aLang", 5.0).hashCode();
+    // Note: this one uses the code "BLang", so it differs from hashA5 in code and confidence.
+    int hashB6 = new Language("BLang", 6.0).hashCode();
+
+    Assert.assertEquals(hashA, hashADuplicate);
+
+    Assert.assertNotEquals(hashA, hashB);
+    Assert.assertNotEquals(hashA, hashA5);
+    Assert.assertNotEquals(hashB, hashA5);
+    Assert.assertNotEquals(hashA5, hashB6);
+  }
+
+  /** Checks the equality expectations between languages with various codes and confidences. */
+  @Test
+  public void testEquals() {
+    Language langA = new Language("langA");
+    Language langB = new Language("langB");
+    Language langA5 = new Language("langA5", 5.0);
+    // Same code as langA5, only the confidence differs.
+    Language langA5OtherConfidence = new Language("langA5", 6.0);
+
+    Assert.assertEquals(langA, langA);
+    Assert.assertEquals(langA5, langA5);
+
+    Assert.assertNotEquals(langA, langA5);
+    Assert.assertNotEquals(langA, langB);
+
+    // Two languages with the same code are expected to be equal despite different confidences.
+    Assert.assertEquals(langA5OtherConfidence, langA5);
+
+    Assert.assertNotEquals(langA, "something else");
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..0f8dfe7
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/** Tests {@link EmojiCharSequenceNormalizer}. */
+public class EmojiCharSequenceNormalizerTest {
+
+  public EmojiCharSequenceNormalizer normalizer = EmojiCharSequenceNormalizer.getInstance();
+
+  /** Text followed by supplementary code points is expected to come back without them. */
+  @Test
+  public void normalizeEmoji() throws Exception {
+
+    StringBuilder input = new StringBuilder("Any funny text goes here ");
+    // Three times U+1F606, a separator, then U+1F61B.
+    for (int i = 0; i < 3; i++) {
+      input.appendCodePoint(0x1F606);
+    }
+    input.append(" ");
+    input.appendCodePoint(0x1F61B);
+
+    String s = input.toString();
+
+    Assert.assertEquals("Any funny text goes here ", normalizer.normalize(s));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..50b1f0c
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/** Tests {@link NumberCharSequenceNormalizer}. */
+public class NumberCharSequenceNormalizerTest {
+
+  public NumberCharSequenceNormalizer normalizer = NumberCharSequenceNormalizer.getInstance();
+
+  /** Digits are expected to be dropped while the comma and the words are kept. */
+  @Test
+  public void normalize() throws Exception {
+    String input = "absc 123,0123 abcd";
+    String expected = "absc , abcd";
+
+    Assert.assertEquals(expected, normalizer.normalize(input));
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..95cf300
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/** Tests {@link ShrinkCharSequenceNormalizer}. */
+public class ShrinkCharSequenceNormalizerTest {
+
+  public ShrinkCharSequenceNormalizer normalizer = ShrinkCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeSpace() throws Exception {
+    assertNormalized("a text extra space", "a text extra space");
+  }
+
+  /** Long runs of one character are expected to shrink; runs of up to two stay untouched. */
+  @Test
+  public void normalizeChar() throws Exception {
+    assertNormalized("Helloo", "Helllllloooooo");
+    assertNormalized("Hello", "Hello");
+    assertNormalized("HHello", "HHello");
+  }
+
+  /** Asserts that normalizing {@code input} yields {@code expected}. */
+  private void assertNormalized(String expected, String input) {
+    Assert.assertEquals(expected, normalizer.normalize(input));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..f0bd517
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/** Tests {@link TwitterCharSequenceNormalizer}. */
+public class TwitterCharSequenceNormalizerTest {
+
+  public TwitterCharSequenceNormalizer normalizer = TwitterCharSequenceNormalizer.getInstance();
+
+  /** A {@code #hashtag} token is expected to be removed. */
+  @Test
+  public void normalizeHashtag() throws Exception {
+    assertNormalized("asdf 2nnfdf", "asdf #hasdk23 2nnfdf");
+  }
+
+  /** A {@code @user} mention is expected to be removed. */
+  @Test
+  public void normalizeUser() throws Exception {
+    assertNormalized("asdf 2nnfdf", "asdf @hasdk23 2nnfdf");
+  }
+
+  /** Repeated {@code RT} markers are expected to be removed. */
+  @Test
+  public void normalizeRT() throws Exception {
+    assertNormalized(" 2nnfdf", "RT RT RT 2nnfdf");
+  }
+
+  /** Drawn-out laughter is expected to collapse to a short canonical form. */
+  @Test
+  public void normalizeLaugh() throws Exception {
+    assertNormalized("ahahah", "ahahahah");
+    assertNormalized("haha", "hahha");
+    assertNormalized("haha", "hahaa");
+    assertNormalized("ahaha", "ahahahahhahahhahahaaaa");
+    assertNormalized("jaja", "jajjajajaja");
+  }
+
+  /** Text emoticons are expected to be removed wherever they appear. */
+  @Test
+  public void normalizeFace() throws Exception {
+    assertNormalized("hello hello", "hello :-) hello");
+    assertNormalized("hello hello", "hello ;) hello");
+    assertNormalized(" hello", ":) hello");
+    assertNormalized("hello ", "hello :P");
+  }
+
+  /** Asserts that normalizing {@code input} yields {@code expected}. */
+  private void assertNormalized(String expected, String input) {
+    Assert.assertEquals(expected, normalizer.normalize(input));
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..72eb83a
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+/** Tests {@link UrlCharSequenceNormalizer}. */
+public class UrlCharSequenceNormalizerTest {
+
+  public UrlCharSequenceNormalizer normalizer = UrlCharSequenceNormalizer.getInstance();
+
+  /** URLs are expected to be removed, both in the middle and at the end of the text. */
+  @Test
+  public void normalizeUrl() throws Exception {
+    assertNormalized("asdf 2nnfdf", "asdf http://asdf.com/dfa/cxs 2nnfdf");
+
+    assertNormalized("asdf 2nnfdf ",
+        "asdf http://asdf.com/dfa/cxs 2nnfdf http://asdf.com/dfa/cxs");
+  }
+
+  /** E-mail addresses are expected to be removed, both in the middle and at the end of the text. */
+  @Test
+  public void normalizeEmail() throws Exception {
+    assertNormalized("asdf 2nnfdf", "asdf asd.fdfa@hasdk23.com.br 2nnfdf");
+
+    assertNormalized("asdf 2nnfdf ",
+        "asdf asd.fdfa@hasdk23.com.br 2nnfdf asd.fdfa@hasdk23.com.br");
+  }
+
+  /** Asserts that normalizing {@code input} yields {@code expected}. */
+  private void assertNormalized(String expected, String input) {
+    Assert.assertEquals(expected, normalizer.normalize(input));
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 363bd7c..8183c06 100644
--- a/pom.xml
+++ b/pom.xml
@@ -227,6 +227,7 @@
<artifactId>maven-surefire-plugin</artifactId>
<version>${maven.surefire.plugin}</version>
<configuration>
+ <argLine>-Xmx2048m</argLine>
<forkCount>${opennlp.forkCount}</forkCount>
<failIfNoSpecifiedTests>false</failIfNoSpecifiedTests>
<excludes>
@@ -435,6 +436,7 @@
<artifactId>maven-surefire-plugin</artifactId>
<version>${maven.surefire.plugin}</version>
<configuration>
+ <argLine>-Xmx4g</argLine>
<includes>
<include>**/*Test.java</include>
<include>**/*Eval.java</include>
[2/2] opennlp git commit: OPENNLP-788: Add LanguageDetector tool
Posted by jo...@apache.org.
OPENNLP-788: Add LanguageDetector tool
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/560c4843
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/560c4843
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/560c4843
Branch: refs/heads/master
Commit: 560c484387eea32a00afc1de8ef96a81ce304ef3
Parents: 8d7e1c3
Author: William D C M SILVA <co...@apache.org>
Authored: Wed May 17 13:34:21 2017 -0300
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Fri Jun 16 10:12:29 2017 +0200
----------------------------------------------------------------------
.../main/java/opennlp/tools/cmdline/CLI.java | 12 ++
.../cmdline/FineGrainedReportListener.java | 13 +-
.../tools/cmdline/StreamFactoryRegistry.java | 4 +
.../LanguageDetectorConverterTool.java | 28 ++++
.../LanguageDetectorCrossValidatorTool.java | 123 ++++++++++++++++
...LanguageDetectorEvaluationErrorListener.java | 54 +++++++
.../LanguageDetectorEvaluatorTool.java | 139 +++++++++++++++++++
...nguageDetectorFineGrainedReportListener.java | 70 ++++++++++
.../langdetect/LanguageDetectorModelLoader.java | 42 ++++++
.../langdetect/LanguageDetectorTool.java | 88 ++++++++++++
.../langdetect/LanguageDetectorTrainerTool.java | 83 +++++++++++
.../cmdline/langdetect/TrainingParams.java | 40 ++++++
.../LanguageDetectorSampleStreamFactory.java | 66 +++++++++
.../formats/LeipzigDoccatSampleStream.java | 5 +-
.../LeipzigDocumentSampleStreamFactory.java | 3 +
.../leipzig/LeipzigLanguageSampleStream.java | 136 ++++++++++++++++++
.../LeipzigLanguageSampleStreamFactory.java | 74 ++++++++++
.../java/opennlp/tools/langdetect/Language.java | 73 ++++++++++
.../tools/langdetect/LanguageDetector.java | 31 +++++
.../LanguageDetectorContextGenerator.java | 70 ++++++++++
.../LanguageDetectorCrossValidator.java | 107 ++++++++++++++
.../LanguageDetectorEvaluationMonitor.java | 28 ++++
.../langdetect/LanguageDetectorEvaluator.java | 99 +++++++++++++
.../langdetect/LanguageDetectorEventStream.java | 69 +++++++++
.../langdetect/LanguageDetectorFactory.java | 67 +++++++++
.../tools/langdetect/LanguageDetectorME.java | 97 +++++++++++++
.../tools/langdetect/LanguageDetectorModel.java | 82 +++++++++++
.../LanguageDetectorSampleStream.java | 55 ++++++++
.../tools/langdetect/LanguageSample.java | 68 +++++++++
.../AggregateCharSequenceNormalizer.java | 39 ++++++
.../util/normalizer/CharSequenceNormalizer.java | 23 +++
.../normalizer/EmojiCharSequenceNormalizer.java | 38 +++++
.../NumberCharSequenceNormalizer.java | 36 +++++
.../ShrinkCharSequenceNormalizer.java | 40 ++++++
.../TwitterCharSequenceNormalizer.java | 50 +++++++
.../normalizer/UrlCharSequenceNormalizer.java | 40 ++++++
.../opennlp/tools/langdetect/DummyFactory.java | 88 ++++++++++++
.../LanguageDetectorContextGeneratorTest.java | 43 ++++++
.../LanguageDetectorCrossValidatorTest.java | 64 +++++++++
.../LanguageDetectorEvaluatorTest.java | 68 +++++++++
.../langdetect/LanguageDetectorFactoryTest.java | 90 ++++++++++++
.../langdetect/LanguageDetectorMETest.java | 116 ++++++++++++++++
.../tools/langdetect/LanguageSampleTest.java | 89 ++++++++++++
.../opennlp/tools/langdetect/LanguageTest.java | 101 ++++++++++++++
.../EmojiCharSequenceNormalizerTest.java | 43 ++++++
.../NumberCharSequenceNormalizerTest.java | 32 +++++
.../ShrinkCharSequenceNormalizerTest.java | 41 ++++++
.../TwitterCharSequenceNormalizerTest.java | 62 +++++++++
.../UrlCharSequenceNormalizerTest.java | 47 +++++++
pom.xml | 2 +
50 files changed, 2975 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
index b575f71..c828e26 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
@@ -37,6 +37,11 @@ import opennlp.tools.cmdline.doccat.DoccatEvaluatorTool;
import opennlp.tools.cmdline.doccat.DoccatTool;
import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
import opennlp.tools.cmdline.entitylinker.EntityLinkerTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorConverterTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorCrossValidatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorEvaluatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTrainerTool;
import opennlp.tools.cmdline.languagemodel.NGramLanguageModelTool;
import opennlp.tools.cmdline.lemmatizer.LemmatizerEvaluatorTool;
import opennlp.tools.cmdline.lemmatizer.LemmatizerMETool;
@@ -90,6 +95,13 @@ public final class CLI {
tools.add(new DoccatCrossValidatorTool());
tools.add(new DoccatConverterTool());
+ // Language Detector
+ tools.add(new LanguageDetectorTool());
+ tools.add(new LanguageDetectorTrainerTool());
+ tools.add(new LanguageDetectorConverterTool());
+ tools.add(new LanguageDetectorCrossValidatorTool());
+ tools.add(new LanguageDetectorEvaluatorTool());
+
// Dictionary Builder
tools.add(new DictionaryBuilderTool());
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
index 714561a..75b84aa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
@@ -802,8 +802,8 @@ public abstract class FineGrainedReportListener {
}
}
- public void add(String[] text, String ref, String pred) {
- int length = text.length;
+ public void add(int length, String ref, String pred) {
+
averageSentenceLength.add(length);
if (minimalSentenceLength > length) {
@@ -820,7 +820,16 @@ public abstract class FineGrainedReportListener {
updateTagFMeasure(refs, preds);
commit("", ref, pred);
+ }
+
+ public void add(String[] text, String ref, String pred) {
+ int length = text.length;
+ this.add(length, ref, pred);
+ }
+ public void add(CharSequence text, String ref, String pred) {
+ int length = text.length();
+ this.add(length, ref, pred);
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 3d68945..48b8025 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -29,6 +29,7 @@ import opennlp.tools.formats.ConllXSentenceSampleStreamFactory;
import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
import opennlp.tools.formats.DocumentSampleStreamFactory;
import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
+import opennlp.tools.formats.LanguageDetectorSampleStreamFactory;
import opennlp.tools.formats.LeipzigDocumentSampleStreamFactory;
import opennlp.tools.formats.LemmatizerSampleStreamFactory;
import opennlp.tools.formats.NameSampleDataStreamFactory;
@@ -56,6 +57,7 @@ import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory;
import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
+import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -79,6 +81,7 @@ public final class StreamFactoryRegistry {
TokenSampleStreamFactory.registerFactory();
WordTagSampleStreamFactory.registerFactory();
LemmatizerSampleStreamFactory.registerFactory();
+ LanguageDetectorSampleStreamFactory.registerFactory();
NameToSentenceSampleStreamFactory.registerFactory();
NameToTokenSampleStreamFactory.registerFactory();
@@ -124,6 +127,7 @@ public final class StreamFactoryRegistry {
IrishSentenceBankSentenceStreamFactory.registerFactory();
IrishSentenceBankTokenSampleStreamFactory.registerFactory();
+ LeipzigLanguageSampleStreamFactory.registerFactory();
}
public static final String DEFAULT_FORMAT = "opennlp";
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
new file mode 100644
index 0000000..69d9db7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.AbstractConverterTool;
+import opennlp.tools.langdetect.LanguageSample;
+
+/**
+ * Converter tool for {@link LanguageSample} data.
+ * <p>
+ * This class adds no behavior of its own; it only binds {@link AbstractConverterTool}
+ * to the {@link LanguageSample} type.
+ */
+public class LanguageDetectorConverterTool extends AbstractConverterTool<LanguageSample> {
+
+  /** Creates the tool, passing {@code LanguageSample.class} to the superclass constructor. */
+  public LanguageDetectorConverterTool() {
+    super(LanguageSample.class);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
new file mode 100644
index 0000000..bf68fbb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractCrossValidatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.CVParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorCrossValidator;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+import opennlp.tools.util.model.ModelUtil;
+
+/**
+ * Command line tool that cross validates the learnable language detector on
+ * the given samples and prints the accuracy and the number of documents.
+ * <p>
+ * Depending on the parsed parameters it also prints misclassified samples
+ * and writes a fine-grained report to a file.
+ */
+public final class LanguageDetectorCrossValidatorTool extends
+ AbstractCrossValidatorTool<LanguageSample,
+ LanguageDetectorCrossValidatorTool.CVToolParams> {
+
+ interface CVToolParams extends CVParams, TrainingParams, FineGrainedEvaluatorParams {
+ }
+
+ public LanguageDetectorCrossValidatorTool() {
+ super(LanguageSample.class, CVToolParams.class);
+ }
+
+ @Override
+ public String getShortDescription() {
+ return "K-fold cross validator for the learnable Language Detector";
+ }
+
+ /**
+ * Runs the cross validation and prints the result to {@link System#out}.
+ * <p>
+ * The sample stream and the optional report file stream are released in a
+ * single {@code finally} block, so neither of them leaks when parameter
+ * loading, report file creation or the evaluation itself fails.
+ *
+ * @param format the data format name, forwarded to the base class
+ * @param args the command line arguments, forwarded to the base class
+ * @throws TerminateToolException if reading or indexing the data fails
+ */
+ @Override
+ public void run(String format, String[] args) {
+ super.run(format, args);
+
+ OutputStream reportOutputStream = null;
+ try {
+ mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+ if (mlParams == null) {
+ mlParams = ModelUtil.createDefaultTrainingParameters();
+ }
+
+ List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+ if (params.getMisclassified()) {
+ listeners.add(new LanguageDetectorEvaluationErrorListener());
+ }
+
+ LanguageDetectorFineGrainedReportListener reportListener = null;
+ File reportFile = params.getReportOutputFile();
+ if (reportFile != null) {
+ CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+ try {
+ reportOutputStream = new FileOutputStream(reportFile);
+ reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+ listeners.add(reportListener);
+ } catch (FileNotFoundException e) {
+ throw createTerminationIOException(e);
+ }
+ }
+
+ LanguageDetectorEvaluationMonitor[] listenersArr = listeners
+ .toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]);
+
+ LanguageDetectorCrossValidator validator;
+ try {
+ LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory());
+ validator = new LanguageDetectorCrossValidator(mlParams,
+ factory, listenersArr);
+
+ validator.evaluate(sampleStream, params.getFolds());
+ } catch (IOException e) {
+ throw new TerminateToolException(-1,
+ "IO error while reading training data or indexing data: " + e.getMessage(), e);
+ }
+
+ System.out.println("done");
+
+ if (reportListener != null) {
+ System.out.println("Writing fine-grained report to "
+ + reportFile.getAbsolutePath());
+ reportListener.writeReport();
+ }
+
+ System.out.println();
+
+ System.out.println("Accuracy: " + validator.getDocumentAccuracy() + "\n" +
+ "Number of documents: " + validator.getDocumentCount());
+ } finally {
+ try {
+ sampleStream.close();
+ } catch (IOException e) {
+ // best effort: a failing close must not hide the actual outcome
+ }
+ closeQuietly(reportOutputStream);
+ }
+ }
+
+ /**
+ * Closes the report stream if one was opened, ignoring a failing close.
+ */
+ private static void closeQuietly(OutputStream stream) {
+ if (stream == null) {
+ return;
+ }
+ try {
+ stream.close();
+ } catch (IOException e) {
+ // best effort: nothing sensible can be done at this point
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
new file mode 100644
index 0000000..073ef31
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.EvaluationErrorPrinter;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * A default implementation of {@link EvaluationMonitor} for
+ * {@link LanguageSample}s that prints misclassified samples to an
+ * output stream.
+ * <p>
+ * The printing itself is delegated to {@link EvaluationErrorPrinter}.
+ */
+public class LanguageDetectorEvaluationErrorListener extends
+ EvaluationErrorPrinter<LanguageSample> implements LanguageDetectorEvaluationMonitor {
+
+ /**
+ * Creates a listener that will print to System.err
+ */
+ public LanguageDetectorEvaluationErrorListener() {
+ super(System.err);
+ }
+
+ /**
+ * Creates a listener that will print to a given {@link OutputStream}
+ */
+ public LanguageDetectorEvaluationErrorListener(OutputStream outputStream) {
+ super(outputStream);
+ }
+
+ /**
+ * Prints the reference sample and the prediction of a misclassified sample
+ * by handing both to the inherited {@code printError}.
+ *
+ * @param reference the expected sample
+ * @param prediction the sample produced by the detector
+ */
+ @Override
+ public void missclassified(LanguageSample reference, LanguageSample prediction) {
+ printError(reference, prediction);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
new file mode 100644
index 0000000..fb929bf
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractEvaluatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EvaluatorParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorEvaluator;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * Command line tool that evaluates a Language Detector model against
+ * reference data and prints the evaluator result.
+ * <p>
+ * Depending on the parsed parameters it also prints misclassified samples
+ * and writes a fine-grained report to a file.
+ */
+public final class LanguageDetectorEvaluatorTool extends
+ AbstractEvaluatorTool<LanguageSample, LanguageDetectorEvaluatorTool.EvalToolParams> {
+
+ interface EvalToolParams extends EvaluatorParams, FineGrainedEvaluatorParams {
+ }
+
+ public LanguageDetectorEvaluatorTool() {
+ super(LanguageSample.class, EvalToolParams.class);
+ }
+
+ @Override
+ public String getShortDescription() {
+ return "Measures the performance of the Language Detector model with the reference data";
+ }
+
+ /**
+ * Loads the model, evaluates it on the sample stream and prints the result
+ * to {@link System#out}.
+ * <p>
+ * The sample stream and the optional report file stream are released in a
+ * single {@code finally} block, so neither of them leaks when model loading,
+ * report file creation or the evaluation itself fails.
+ *
+ * @param format the data format name, forwarded to the base class
+ * @param args the command line arguments, forwarded to the base class
+ * @throws TerminateToolException if the report file cannot be created or
+ * the test data cannot be read
+ */
+ @Override
+ public void run(String format, String[] args) {
+ super.run(format, args);
+
+ OutputStream reportOutputStream = null;
+ try {
+ LanguageDetectorModel model = new LanguageDetectorModelLoader().load(params.getModel());
+
+ List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+ if (params.getMisclassified()) {
+ listeners.add(new LanguageDetectorEvaluationErrorListener());
+ }
+
+ LanguageDetectorFineGrainedReportListener reportListener = null;
+ File reportFile = params.getReportOutputFile();
+ if (reportFile != null) {
+ CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+ try {
+ reportOutputStream = new FileOutputStream(reportFile);
+ reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+ listeners.add(reportListener);
+ } catch (FileNotFoundException e) {
+ // keep the cause so the original stack trace is not lost
+ throw new TerminateToolException(-1,
+ "IO error while creating LanguageDetector fine-grained report file: "
+ + e.getMessage(), e);
+ }
+ }
+
+ LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+ new LanguageDetectorME(model),
+ listeners.toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]));
+
+ final PerformanceMonitor monitor = new PerformanceMonitor("doc");
+
+ ObjectStream<LanguageSample> measuredSampleStream = new ObjectStream<LanguageSample>() {
+
+ @Override
+ public LanguageSample read() throws IOException {
+ LanguageSample sample = sampleStream.read();
+ // count real samples only; the null that marks the end of the
+ // stream is not a document
+ if (sample != null) {
+ monitor.incrementCounter();
+ }
+ return sample;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ sampleStream.reset();
+ }
+
+ @Override
+ public void close() throws IOException {
+ sampleStream.close();
+ }
+ };
+
+ monitor.startAndPrintThroughput();
+
+ try {
+ evaluator.evaluate(measuredSampleStream);
+ } catch (IOException e) {
+ System.err.println("failed");
+ throw new TerminateToolException(-1, "IO error while reading test data: "
+ + e.getMessage(), e);
+ }
+
+ monitor.stopAndPrintFinalResult();
+
+ System.out.println();
+
+ System.out.println(evaluator);
+
+ if (reportListener != null) {
+ System.out.println("Writing fine-grained report to "
+ + reportFile.getAbsolutePath());
+ reportListener.writeReport();
+ }
+ } finally {
+ try {
+ sampleStream.close();
+ } catch (IOException e) {
+ // best effort: a failing close must not hide the actual outcome
+ }
+ closeQuietly(reportOutputStream);
+ }
+ }
+
+ /**
+ * Closes the report stream if one was opened, ignoring a failing close.
+ */
+ private static void closeQuietly(OutputStream stream) {
+ if (stream == null) {
+ return;
+ }
+ try {
+ stream.close();
+ } catch (IOException e) {
+ // best effort: nothing sensible can be done at this point
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
new file mode 100644
index 0000000..70bf3eb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.FineGrainedReportListener;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+
+/**
+ * Generates a detailed report for the Language Detector.
+ * <p>
+ * Every sample reported to this listener, whether it was classified
+ * correctly or not, is added to the statistics; {@link #writeReport()}
+ * prints them.
+ * <p>
+ * It is possible to use it from an API and access the statistics using the
+ * provided getters
+ */
+public class LanguageDetectorFineGrainedReportListener
+ extends FineGrainedReportListener implements LanguageDetectorEvaluationMonitor {
+
+ /**
+ * Creates a listener that will print to {@link System#err}
+ */
+ public LanguageDetectorFineGrainedReportListener() {
+ this(System.err);
+ }
+
+ /**
+ * Creates a listener that prints to a given {@link OutputStream}
+ */
+ public LanguageDetectorFineGrainedReportListener(OutputStream outputStream) {
+ super(outputStream);
+ }
+
+ // methods inherited from EvaluationMonitor
+
+ /**
+ * Adds a misclassified sample to the statistics.
+ */
+ public void missclassified(LanguageSample reference, LanguageSample prediction) {
+ statsAdd(reference, prediction);
+ }
+
+ /**
+ * Adds a correctly classified sample to the statistics.
+ */
+ public void correctlyClassified(LanguageSample reference, LanguageSample prediction) {
+ statsAdd(reference, prediction);
+ }
+
+ /**
+ * Records one observation: the context of the reference sample together
+ * with the reference language and the predicted language, both taken from
+ * {@code getLanguage().getLang()}.
+ */
+ private void statsAdd(LanguageSample reference, LanguageSample prediction) {
+ getStats().add(reference.getContext(),
+ reference.getLanguage().getLang(), prediction.getLanguage().getLang());
+ }
+
+ /**
+ * Prints the report: the general statistics, the tags error rank and the
+ * general confusion table, in that order.
+ */
+ public void writeReport() {
+ printGeneralStatistics();
+ printTagsErrorRank();
+ printGeneralConfusionTable();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
new file mode 100644
index 0000000..c8700fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.cmdline.ModelLoader;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+
+/**
+ * Loads a Language Detector Model for the command line tools.
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LanguageDetectorModelLoader extends ModelLoader<LanguageDetectorModel> {
+
+ /**
+ * Passes {@code "Language Detector"} to the base loader (presumably the
+ * model name shown in its messages).
+ */
+ public LanguageDetectorModelLoader() {
+ super("Language Detector");
+ }
+
+ /**
+ * Creates the model from the given stream.
+ *
+ * @param modelIn the stream handed to the {@link LanguageDetectorModel} constructor
+ * @return the newly created model
+ * @throws IOException propagated from the {@link LanguageDetectorModel} constructor
+ */
+ @Override
+ protected LanguageDetectorModel loadModel(InputStream modelIn) throws IOException {
+ return new LanguageDetectorModel(modelIn);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
new file mode 100644
index 0000000..6175fe3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.SystemInputStreamFactory;
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetector;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ParagraphStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Command line tool that reads documents from standard input, predicts the
+ * language of each one with a trained model and prints the result.
+ */
+public class LanguageDetectorTool extends BasicCmdLineTool {
+
+ @Override
+ public String getShortDescription() {
+ return "learned language detector";
+ }
+
+ @Override
+ public String getHelp() {
+ return "Usage: " + CLI.CMD + " " + getName() + " model < documents";
+ }
+
+ /**
+ * Prints the usage text when no argument is given. Otherwise loads the
+ * model named by the first argument and prints one labelled sample per
+ * paragraph read from standard input.
+ *
+ * @param args the command line arguments; the first one is the model file
+ */
+ @Override
+ public void run(String[] args) {
+
+ // nothing to do without a model argument, just show the usage
+ if (args.length == 0) {
+ System.out.println(getHelp());
+ return;
+ }
+
+ File modelFile = new File(args[0]);
+ LanguageDetectorModel model = new LanguageDetectorModelLoader().load(modelFile);
+ LanguageDetector detector = new LanguageDetectorME(model);
+
+ PerformanceMonitor monitor = new PerformanceMonitor(System.err, "doc");
+ monitor.start();
+
+ try {
+ // the stream is created inside the try block because construction
+ // may already throw an IOException
+ ObjectStream<String> paragraphs = new ParagraphStream(new PlainTextByLineStream(
+ new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
+
+ for (String text = paragraphs.read(); text != null; text = paragraphs.read()) {
+ Language predicted = detector.predictLanguage(text);
+ System.out.println(new LanguageSample(predicted, text).toString());
+ monitor.incrementCounter();
+ }
+ } catch (IOException e) {
+ CmdLineUtil.handleStdinIoError(e);
+ }
+
+ monitor.stopAndPrintFinalResult();
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
new file mode 100644
index 0000000..6735293
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.AbstractTrainerTool;
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.model.ModelUtil;
+
+public class LanguageDetectorTrainerTool
+ extends AbstractTrainerTool<LanguageSample, LanguageDetectorTrainerTool.TrainerToolParams> {
+
+ interface TrainerToolParams extends TrainingParams {
+ @ArgumentParser.ParameterDescription(valueName = "modelFile", description = "output model file.")
+ File getModel();
+
+ @ArgumentParser.ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+ @ArgumentParser.OptionalParameter()
+ String getParams();
+ }
+
+ public LanguageDetectorTrainerTool() {
+ super(LanguageSample.class, TrainerToolParams.class);
+ }
+
+ @Override
+ public String getShortDescription() {
+ return "trainer for the learnable language detector";
+ }
+
+ @Override
+ public void run(String format, String[] args) {
+ super.run(format, args);
+
+ mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+ if (mlParams == null) {
+ mlParams = ModelUtil.createDefaultTrainingParameters();
+ }
+
+ File modelOutFile = params.getModel();
+
+ CmdLineUtil.checkOutputFile("language detector model", modelOutFile);
+
+ LanguageDetectorModel model;
+ try {
+ LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory());
+ model = LanguageDetectorME.train(sampleStream, mlParams, factory);
+ } catch (IOException e) {
+ throw createTerminationIOException(e);
+ }
+ finally {
+ try {
+ sampleStream.close();
+ } catch (IOException e) {
+ // sorry that this can fail
+ }
+ }
+
+ CmdLineUtil.writeModel("language detector", modelOutFile, model);
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
new file mode 100644
index 0000000..2937c3d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+
+/**
+ * TrainingParams for Language Detector.
+ * <p>
+ * Declares the optional command line arguments shared by the language
+ * detector tools that train a model.
+ * <p>
+ * Note: Do not use this class, internal use only!
+ */
+interface TrainingParams {
+
+ /**
+ * Optional argument naming the training parameters file.
+ */
+ @ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+ @OptionalParameter()
+ String getParams();
+
+ /**
+ * Optional argument naming the LanguageDetectorFactory sub-class to use.
+ */
+ @ParameterDescription(valueName = "factoryName",
+ description = "A sub-class of LanguageDetectorFactory" +
+ " where to get implementation and resources.")
+ @OptionalParameter
+ String getFactory();
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
new file mode 100644
index 0000000..ef60063
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.langdetect.LanguageDetectorSampleStream;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Factory producing OpenNLP {@link LanguageDetectorSampleStream}s.
+ */
+public class LanguageDetectorSampleStreamFactory
+ extends AbstractSampleStreamFactory<LanguageSample> {
+
+ interface Parameters extends BasicFormatParams {
+ }
+
+ /**
+ * Registers an instance of this factory for {@link LanguageSample} under
+ * {@link StreamFactoryRegistry#DEFAULT_FORMAT}.
+ */
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(LanguageSample.class,
+ StreamFactoryRegistry.DEFAULT_FORMAT,
+ new LanguageDetectorSampleStreamFactory(Parameters.class));
+ }
+
+ /**
+ * Passes the parameters interface to the base factory.
+ */
+ protected <P> LanguageDetectorSampleStreamFactory(Class<P> params) {
+ super(params);
+ }
+
+ /**
+ * Parses the arguments, checks the data file and opens it as a stream of
+ * lines in the configured encoding, wrapped into a
+ * {@link LanguageDetectorSampleStream}.
+ *
+ * @param args the format arguments, parsed into {@link Parameters}
+ * @return a sample stream reading from the data file
+ */
+ public ObjectStream<LanguageSample> create(String[] args) {
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+ CmdLineUtil.checkInputFile("Data", params.getData());
+ InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
+ ObjectStream<String> lineStream = null;
+ try {
+ lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
+ } catch (IOException ex) {
+ // NOTE(review): lineStream stays null if this call returns normally;
+ // presumably it always throws - confirm
+ CmdLineUtil.handleCreateObjectStreamError(ex);
+ }
+
+ return new LanguageDetectorSampleStream(lineStream);
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 8ed0036..7059e21 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -39,8 +39,11 @@ import opennlp.tools.util.PlainTextByLineStream;
* <p>
* The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
* by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.ø
+ * exactly the same tokenization during testing and training.
+ *
+ * @deprecated will be removed, use the language detector instead
*/
+@Deprecated
public class LeipzigDoccatSampleStream extends
FilterObjectStream<String, DocumentSample> {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
index bd2453b..d6ff9ba 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
@@ -33,7 +33,10 @@ import opennlp.tools.util.ObjectStreamUtils;
/**
* <b>Note:</b> Do not use this class, internal use only!
+ *
+ * @deprecated will be removed, use the language detector instead
*/
+@Deprecated
public class LeipzigDocumentSampleStreamFactory
extends AbstractSampleStreamFactory<DocumentSample> {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
new file mode 100644
index 0000000..6c4d009
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> {
+
+ private class LeipzigSentencesStream implements ObjectStream<LanguageSample> {
+ private final String lang;
+ private int sentencesPerSample;
+ private int numberOfSamples;
+
+ private ObjectStream<String> lineStream;
+ private int sampleCount;
+
+ LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples)
+ throws IOException {
+ this.lang = sentencesFile.getName().substring(0, 3);
+ this.sentencesPerSample = sentencesPerSample;
+ this.numberOfSamples = numberOfSamples;
+
+ lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile),
+ StandardCharsets.UTF_8);
+ }
+
+ @Override
+ public LanguageSample read() throws IOException {
+
+ if (sampleCount < numberOfSamples) {
+ StringBuilder sampleString = new StringBuilder();
+
+ int count = 0;
+ String line;
+ while (count < sentencesPerSample && (line = lineStream.read()) != null) {
+
+ int textStart = line.indexOf('\t') + 1;
+
+ // TODO: Should this be changed to contain an array of sample strings?
+ sampleString.append(line.substring(textStart) + " ");
+
+ count++;
+ }
+
+ if (sampleString.length() > 0) {
+ sampleCount++;
+ return new LanguageSample(new Language(lang), sampleString);
+ }
+ }
+ return null;
+ }
+ }
+
+ private final int sentencesPerSample;
+
+ private Map<String, Integer> langSampleCounts;
+ private File[] sentencesFiles;
+
+ private Iterator<File> sentencesFilesIt;
+ private ObjectStream<LanguageSample> sampleStream;
+
+ public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
+ final int samplesPerLanguage) throws IOException {
+ this.sentencesPerSample = sentencesPerSample;
+ // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored
+ sentencesFiles = leipzigFolder.listFiles();
+ Arrays.sort(sentencesFiles);
+
+ Map<String, Integer> langCounts = Arrays.stream(sentencesFiles)
+ .map(file -> file.getName().substring(0, 3))
+ .collect(Collectors.groupingBy(String::toString, Collectors.summingInt(v -> 1)));
+
+ langSampleCounts = langCounts.entrySet().stream()
+ .collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue()));
+
+ reset();
+ }
+
+ public LanguageSample read() throws IOException {
+ LanguageSample sample;
+ if (sampleStream != null && (sample = sampleStream.read()) != null) {
+ return sample;
+ }
+ else {
+ if (sentencesFilesIt.hasNext()) {
+ File sentencesFile = sentencesFilesIt.next();
+ System.out.println(sentencesFile);
+ String lang = sentencesFile.getName().substring(0, 3);
+
+ sampleStream = new LeipzigSentencesStream(lang, sentencesFile,
+ sentencesPerSample, langSampleCounts.get(lang));
+
+ return read();
+ }
+ }
+ return null;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ sentencesFilesIt = Arrays.asList(sentencesFiles).iterator();
+ sampleStream = null;
+ }
+
+ public static void main(String[] args) throws Exception {
+ new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"),
+ 10, 100000);
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
new file mode 100644
index 0000000..59a7551
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EncodingParameter;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LeipzigLanguageSampleStreamFactory
+ extends AbstractSampleStreamFactory<LanguageSample> {
+
+ interface Parameters extends EncodingParameter {
+ @ParameterDescription(valueName = "sentencesDir",
+ description = "dir with Leipig sentences to be used")
+ File getSentencesDir();
+
+ @ParameterDescription(valueName = "sentencesPerSample",
+ description = "number of sentences per sample")
+ String getSentencesPerSample();
+
+ @ParameterDescription(valueName = "samplesPerLanguage",
+ description = "number of samples per language")
+ String getSamplesPerLanguage();
+ }
+
+ protected <P> LeipzigLanguageSampleStreamFactory(Class<P> params) {
+ super(params);
+ }
+
+ public static void registerFactory() {
+ StreamFactoryRegistry.registerFactory(LanguageSample.class,
+ "leipzig", new LeipzigLanguageSampleStreamFactory(Parameters.class));
+ }
+
+ public ObjectStream<LanguageSample> create(String[] args) {
+
+ Parameters params = ArgumentParser.parse(args, Parameters.class);
+ File sentencesFileDir = params.getSentencesDir();
+
+ try {
+ return new LeipzigLanguageSampleStream(sentencesFileDir,
+ Integer.parseInt(params.getSentencesPerSample()),
+ Integer.parseInt(params.getSamplesPerLanguage()));
+ } catch (IOException e) {
+ throw new TerminateToolException(-1, "IO error while opening sample data.", e);
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
new file mode 100644
index 0000000..f780759
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class for holding the document language and its confidence
+ */
+public class Language {
+ private final String lang;
+ private final double confidence;
+
+ public Language(String lang) {
+ this(lang, 0);
+ }
+
+ public Language(String lang, double confidence) {
+ Objects.requireNonNull(lang, "lang must not be null");
+ this.lang = lang;
+ this.confidence = confidence;
+ }
+
+ public String getLang() {
+ return lang;
+ }
+
+ public double getConfidence() {
+ return confidence;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append(getLang()).append(" (").append(this.confidence).append(")");
+ return sb.toString();
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(getLang(), getConfidence());
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+
+ if (obj instanceof Language) {
+ Language a = (Language) obj;
+
+ return getLang().equals(a.getLang());
+ }
+
+ return false;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
new file mode 100644
index 0000000..0004494
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+/**
+ * The interface for a LanguageDetector, which provides the {@link Language} according to the context.
+ */
+public interface LanguageDetector {
+
+ Language[] predictLanguages(CharSequence content);
+
+ Language predictLanguage(CharSequence content);
+
+ String[] getSupportedLanguages();
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
new file mode 100644
index 0000000..1ec42fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.ArrayList;
+import java.util.Collection;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+
+/**
+ * A context generator for language detector.
+ */
+class LanguageDetectorContextGenerator {
+
+ protected final int minLength;
+ protected final int maxLength;
+ protected final CharSequenceNormalizer normalizer;
+
+ /**
+ * Creates a customizable {@link LanguageDetectorContextGenerator} that computes ngrams from text
+ * @param minLength min ngrams chars
+ * @param maxLength max ngrams chars
+ * @param normalizers zero or more normalizers to
+ * be applied to the text before extracting ngrams
+ */
+ public LanguageDetectorContextGenerator(int minLength, int maxLength,
+ CharSequenceNormalizer... normalizers) {
+ this.minLength = minLength;
+ this.maxLength = maxLength;
+
+ this.normalizer = new AggregateCharSequenceNormalizer(normalizers);
+ }
+
+ /**
+ * Generates the context for a document using character ngrams.
+ * @param document document to extract context from
+ * @return the generated context
+ */
+ public String[] getContext(String document) {
+ Collection<String> context = new ArrayList<>();
+
+ NGramModel model = new NGramModel();
+ model.add(document, minLength, maxLength);
+
+ for (StringList tokenList : model) {
+ if (tokenList.size() > 0) {
+ context.add(tokenList.getToken(0));
+ }
+ }
+ return context.toArray(new String[context.size()]);
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
new file mode 100644
index 0000000..ce1823a
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.eval.CrossValidationPartitioner;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * Cross validator for language detector
+ */
+public class LanguageDetectorCrossValidator {
+
+ private final TrainingParameters params;
+
+ private Mean documentAccuracy = new Mean();
+
+ private LanguageDetectorEvaluationMonitor[] listeners;
+
+ private LanguageDetectorFactory factory;
+
+
+ /**
+ * Creates a {@link LanguageDetectorCrossValidator} with the given
+ * {@link FeatureGenerator}s.
+ */
+ public LanguageDetectorCrossValidator(TrainingParameters mlParams,
+ LanguageDetectorFactory factory,
+ LanguageDetectorEvaluationMonitor ... listeners) {
+ this.params = mlParams;
+ this.listeners = listeners;
+ this.factory = factory;
+ }
+
+ /**
+ * Starts the evaluation.
+ *
+ * @param samples
+ * the data to train and test
+ * @param nFolds
+ * number of folds
+ *
+ * @throws IOException
+ */
+ public void evaluate(ObjectStream<LanguageSample> samples, int nFolds)
+ throws IOException {
+
+ CrossValidationPartitioner<LanguageSample> partitioner =
+ new CrossValidationPartitioner<>(samples, nFolds);
+
+ while (partitioner.hasNext()) {
+
+ CrossValidationPartitioner.TrainingSampleStream<LanguageSample> trainingSampleStream =
+ partitioner.next();
+
+ LanguageDetectorModel model = LanguageDetectorME.train(
+ trainingSampleStream, params, factory);
+
+ LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+ new LanguageDetectorME(model), listeners);
+
+ evaluator.evaluate(trainingSampleStream.getTestSampleStream());
+
+ documentAccuracy.add(evaluator.getAccuracy(),
+ evaluator.getDocumentCount());
+
+ }
+ }
+
+ /**
+ * Retrieves the accuracy for all iterations.
+ *
+ * @return the document accuracy
+ */
+ public double getDocumentAccuracy() {
+ return documentAccuracy.mean();
+ }
+
+ /**
+ * Retrieves the number of documents which were validated over all iterations.
+ * The count is summed over the test partitions of all folds.
+ *
+ * @return the document count
+ */
+ public long getDocumentCount() {
+ return documentAccuracy.count();
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
new file mode 100644
index 0000000..30f3313
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * {@link EvaluationMonitor} for Language Detector.
+ */
+public interface LanguageDetectorEvaluationMonitor extends
+ EvaluationMonitor<LanguageSample> {
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
new file mode 100644
index 0000000..bbf73c3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.doccat.DocumentCategorizer;
+import opennlp.tools.util.eval.Evaluator;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * The {@link LanguageDetectorEvaluator} measures the performance of
+ * the given {@link LanguageDetector} with the provided reference
+ * {@link LanguageSample}s.
+ *
+ * @see LanguageDetector
+ * @see LanguageSample
+ */
+public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> {
+
+ private LanguageDetector languageDetector;
+
+ private Mean accuracy = new Mean();
+
+ /**
+ * Initializes the current instance.
+ *
+ * @param langDetect the language detector instance
+ */
+ public LanguageDetectorEvaluator(LanguageDetector langDetect,
+ LanguageDetectorEvaluationMonitor ... listeners) {
+ super(listeners);
+ this.languageDetector = langDetect;
+ }
+
+ /**
+ * Evaluates the given reference {@link LanguageSample} object.
+ *
+ * This is done by categorizing the document from the provided
+ * {@link LanguageSample}. The detected language is then used
+ * to calculate and update the score.
+ *
+ * @param sample the reference {@link LanguageSample}.
+ */
+ public LanguageSample processSample(LanguageSample sample) {
+
+ CharSequence document = sample.getContext();
+
+ Language predicted = languageDetector.predictLanguage(document);
+
+
+
+ if (sample.getLanguage().getLang().equals(predicted.getLang())) {
+ accuracy.add(1);
+ }
+ else {
+ accuracy.add(0);
+ }
+
+ return new LanguageSample(predicted, sample.getContext());
+ }
+
+ /**
+ * Retrieves the accuracy of provided {@link DocumentCategorizer}.
+ *
+ * accuracy = correctly categorized documents / total documents
+ *
+ * @return the accuracy
+ */
+ public double getAccuracy() {
+ return accuracy.mean();
+ }
+
+ public long getDocumentCount() {
+ return accuracy.count();
+ }
+
+ /**
+ * Represents this object as a human-readable {@link String}.
+ */
+ @Override
+ public String toString() {
+ return "Accuracy: " + accuracy.mean() + "\n" +
+ "Number of documents: " + accuracy.count();
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
new file mode 100644
index 0000000..19e6d46
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Iterator;
+
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.AbstractEventStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Iterator-like class for modeling language detector events.
+ */
+public class LanguageDetectorEventStream extends AbstractEventStream<LanguageSample> {
+
+ private LanguageDetectorContextGenerator mContextGenerator;
+
+ /**
+ * Initializes the current instance via samples and a context generator.
+ *
+ * @param data {@link ObjectStream} of {@link LanguageSample}s
+ */
+ public LanguageDetectorEventStream(ObjectStream<LanguageSample> data,
+ LanguageDetectorContextGenerator contextGenerator) {
+ super(data);
+
+ mContextGenerator = contextGenerator;
+ }
+
+ @Override
+ protected Iterator<Event> createEvents(final LanguageSample sample) {
+
+ return new Iterator<Event>() {
+
+ private boolean isVirgin = true;
+
+ public boolean hasNext() {
+ return isVirgin;
+ }
+
+ public Event next() {
+
+ isVirgin = false;
+
+ return new Event(sample.getLanguage().getLang(),
+ mContextGenerator.getContext(sample.getContext().toString()));
+ }
+
+ public void remove() {
+ throw new UnsupportedOperationException();
+ }
+ };
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
new file mode 100644
index 0000000..11357ec
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
+
+
+public class LanguageDetectorFactory extends BaseToolFactory {
+
+ public LanguageDetectorContextGenerator getContextGenerator() {
+ return new LanguageDetectorContextGenerator(1, 3,
+ EmojiCharSequenceNormalizer.getInstance(),
+ UrlCharSequenceNormalizer.getInstance(),
+ TwitterCharSequenceNormalizer.getInstance(),
+ NumberCharSequenceNormalizer.getInstance(),
+ ShrinkCharSequenceNormalizer.getInstance());
+ }
+
+ public static LanguageDetectorFactory create(String subclassName)
+ throws InvalidFormatException {
+ if (subclassName == null) {
+ // will create the default factory
+ return new LanguageDetectorFactory();
+ }
+ try {
+ LanguageDetectorFactory theFactory = ExtensionLoader.instantiateExtension(
+ LanguageDetectorFactory.class, subclassName);
+ theFactory.init();
+ return theFactory;
+ } catch (Exception e) {
+ String msg = "Could not instantiate the " + subclassName
+ + ". The initialization throw an exception.";
+ throw new InvalidFormatException(msg, e);
+ }
+ }
+
+ public void init() {
+ // nothing to do
+ }
+
+ @Override
+ public void validateArtifactMap() throws InvalidFormatException {
+ // nothing to validate
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
new file mode 100644
index 0000000..3af6afd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.ml.AbstractEventTrainer;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Implements learnable Language Detector
+ */
+public class LanguageDetectorME implements LanguageDetector {
+
+ private LanguageDetectorModel model;
+ private LanguageDetectorContextGenerator mContextGenerator;
+
+ /**
+ * Initializes the current instance with a language detector model. Default feature
+ * generation is used.
+ *
+ * @param model the language detector model
+ */
+ public LanguageDetectorME(LanguageDetectorModel model) {
+ this.model = model;
+ this.mContextGenerator = model.getFactory().getContextGenerator();
+ }
+
+ @Override
+ public Language[] predictLanguages(CharSequence content) {
+ double[] eval = model.getMaxentModel().eval(mContextGenerator.getContext(content.toString()));
+ Language[] arr = new Language[eval.length];
+ for (int i = 0; i < eval.length; i++) {
+ arr[i] = new Language(model.getMaxentModel().getOutcome(i), eval[i]);
+ }
+
+ Arrays.sort(arr, (o1, o2) -> Double.compare(o2.getConfidence(), o1.getConfidence()));
+ return arr;
+ }
+
+ @Override
+ public Language predictLanguage(CharSequence content) {
+ return predictLanguages(content)[0];
+ }
+
+ @Override
+ public String[] getSupportedLanguages() {
+ int numberLanguages = model.getMaxentModel().getNumOutcomes();
+ String[] languages = new String[numberLanguages];
+ for (int i = 0; i < numberLanguages; i++) {
+ languages[i] = model.getMaxentModel().getOutcome(i);
+ }
+ return languages;
+ }
+
+
+ public static LanguageDetectorModel train(ObjectStream<LanguageSample> samples,
+ TrainingParameters mlParams,
+ LanguageDetectorFactory factory)
+ throws IOException {
+
+ Map<String, String> manifestInfoEntries = new HashMap<>();
+
+ mlParams.putIfAbsent(AbstractEventTrainer.DATA_INDEXER_PARAM,
+ AbstractEventTrainer.DATA_INDEXER_ONE_PASS_VALUE);
+
+ EventTrainer trainer = TrainerFactory.getEventTrainer(
+ mlParams, manifestInfoEntries);
+
+ MaxentModel model = trainer.train(
+ new LanguageDetectorEventStream(samples, factory.getContextGenerator()));
+
+ return new LanguageDetectorModel(model, manifestInfoEntries, factory);
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
new file mode 100644
index 0000000..c0d9703
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * Language detector model; its {@link MaxentModel} is kept under the "langdetect.model" artifact.
+ */
+public class LanguageDetectorModel extends BaseModel {
+
+  private static final String COMPONENT_NAME = "LanguageDetectorME";
+  private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model";
+
+  public LanguageDetectorModel(MaxentModel langdetectModel,
+      Map<String, String> manifestInfoEntries, LanguageDetectorFactory factory) {
+    super(COMPONENT_NAME, "und", manifestInfoEntries, factory);
+    artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel);
+    checkArtifactMap();
+  }
+
+  public LanguageDetectorModel(InputStream in) throws IOException {
+    super(COMPONENT_NAME, in);
+  }
+
+  public LanguageDetectorModel(File modelFile) throws IOException {
+    super(COMPONENT_NAME, modelFile);
+  }
+
+  public LanguageDetectorModel(URL modelURL) throws IOException {
+    super(COMPONENT_NAME, modelURL);
+  }
+
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+    // A missing entry (null) and an entry of another type both fail the instanceof test.
+    Object classifier = artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+    if (classifier instanceof AbstractModel) {
+      return;
+    }
+    throw new InvalidFormatException("Language detector model is incomplete!");
+  }
+
+  public LanguageDetectorFactory getFactory() {
+    return (LanguageDetectorFactory) this.toolFactory;
+  }
+
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return LanguageDetectorFactory.class;
+  }
+
+  public MaxentModel getMaxentModel() {
+    return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
new file mode 100644
index 0000000..2a407f7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * This class reads in string encoded training samples, parses them and
+ * outputs {@link LanguageSample} objects.
+ * <p>
+ * Format:<br>
+ * Each line contains one sample document.<br>
+ * The language is the first string in the line followed by a tab and the document content.<br>
+ * Sample line: language-string tab-char document line-break-char(s)<br>
+ * Lines that contain no tab, or that start with a tab, are skipped.
+ */
+public class LanguageDetectorSampleStream extends FilterObjectStream<String, LanguageSample> {
+
+  public LanguageDetectorSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  /** Returns the next well-formed sample, or {@code null} once the input has no more lines. */
+  @Override
+  public LanguageSample read() throws IOException {
+    String line;
+    while ((line = samples.read()) != null) {
+      int tabIndex = line.indexOf('\t');
+      if (tabIndex > 0) {
+        // Only the first tab separates the fields; later tabs stay in the document text.
+        String lang = line.substring(0, tabIndex);
+        return new LanguageSample(new Language(lang), line.substring(tabIndex + 1));
+      }
+    }
+    return null;
+  }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/560c4843/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
new file mode 100644
index 0000000..f454864
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class which holds a classified document and its {@link Language}.
+ * Equality and hashing use the language and the text of the context.
+ */
+public class LanguageSample {
+
+  private final Language language;
+  private final CharSequence context;
+
+  public LanguageSample(Language language, CharSequence context) {
+    this.language = Objects.requireNonNull(language, "language must not be null");
+    this.context = Objects.requireNonNull(context, "context must not be null");
+  }
+
+  public Language getLanguage() {
+    return language;
+  }
+
+  public CharSequence getContext() {
+    return context;
+  }
+
+  @Override
+  public String toString() {
+    return language.getLang() + '\t' + context;
+  }
+
+  @Override
+  public int hashCode() {
+    // Hash the text, not the CharSequence object, to stay consistent with equals().
+    return Objects.hash(getContext().toString(), getLanguage());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+    if (!(obj instanceof LanguageSample)) {
+      return false;
+    }
+    LanguageSample other = (LanguageSample) obj;
+    // CharSequence leaves equals() undefined across implementations; compare the text.
+    return getLanguage().equals(other.getLanguage())
+        && getContext().toString().equals(other.getContext().toString());
+  }
+}