You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2017/03/14 02:42:00 UTC
opennlp git commit: OPENNLP-778: Add LanguageDetector infrastructure
classes [Forced Update!]
Repository: opennlp
Updated Branches:
refs/heads/OPENNLP-778 d9a9c7dbb -> 64949f179 (forced update)
OPENNLP-778: Add LanguageDetector infrastructure classes
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/64949f17
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/64949f17
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/64949f17
Branch: refs/heads/OPENNLP-778
Commit: 64949f1798ddf0adfceed772a5a49c1ad451167d
Parents: 11d7581
Author: William D C M SILVA <co...@apache.org>
Authored: Mon Mar 13 13:48:33 2017 -0300
Committer: William D C M SILVA <co...@apache.org>
Committed: Mon Mar 13 23:40:12 2017 -0300
----------------------------------------------------------------------
.../java/opennlp/tools/langdetect/Language.java | 34 +++++++
.../tools/langdetect/LanguageDetector.java | 10 +-
.../langdetect/LanguageDetectorFactory.java | 55 +++++++++++
.../tools/langdetect/LanguageDetectorME.java | 38 ++++++++
.../tools/langdetect/LanguageDetectorModel.java | 82 ++++++++++++++++
.../langdetect/LanguageDetectorSample.java | 75 +++++++++++++++
.../tools/langdetect/LanguageSampleTest.java | 88 ++++++++++++++++++
.../opennlp/tools/langdetect/LanguageTest.java | 98 ++++++++++++++++++++
8 files changed, 474 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
index 773201f..57655b4 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -17,6 +17,8 @@
package opennlp.tools.langdetect;
+import java.util.Objects;
+
/**
* Class for holding the document language and its confidence
*/
@@ -24,7 +26,12 @@ public class Language {
private final String lang;
private final double confidence;
+ public Language(String lang) {
+ this(lang, 0);
+ }
+
public Language(String lang, double confidence) {
+ Objects.requireNonNull(lang, "lang must not be null");
this.lang = lang;
this.confidence = confidence;
}
@@ -36,4 +43,31 @@ public class Language {
public double getConfidence() {
return confidence;
}
+
+ @Override
+ public String toString() {
+
+ return getLang();
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(getLang(), getConfidence());
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+
+ if (obj instanceof Language) {
+ Language a = (Language) obj;
+
+ return getLang().equals(a.getLang())
+ && getConfidence() == a.getConfidence();
+ }
+
+ return false;
+ }
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
index ca897fd..5e9833a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -17,17 +17,15 @@
package opennlp.tools.langdetect;
-import java.util.Set;
-
/**
- * The interface for name finders which provide name tags for a sequence of tokens.
+ * The interface for language detectors, which predict the {@link Language} of the given content.
*/
public interface LanguageDetector {
- Language[] detectLanguage(CharSequence content);
+ Language[] predictLanguages(CharSequence content);
- Set<String> getSupportedLanguages();
+ Language predictLanguage(CharSequence content);
- String getLanguageCoding();
+ Language[] getSupportedLanguages();
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
new file mode 100644
index 0000000..a0fb84e
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+
+public class LanguageDetectorFactory extends BaseToolFactory {
+
+ public static LanguageDetectorFactory create(String subclassName)
+ throws InvalidFormatException {
+ if (subclassName == null) {
+ // will create the default factory
+ return new LanguageDetectorFactory();
+ }
+ try {
+ LanguageDetectorFactory theFactory = ExtensionLoader.instantiateExtension(
+ LanguageDetectorFactory.class, subclassName);
+ theFactory.init();
+ return theFactory;
+ } catch (Exception e) {
+ String msg = "Could not instantiate the " + subclassName
+ + ". The initialization throw an exception.";
+ System.err.println(msg);
+ e.printStackTrace();
+ throw new InvalidFormatException(msg, e);
+ }
+ }
+
+ public void init() {
+ // nothing to do
+ }
+
+ @Override
+ public void validateArtifactMap() throws InvalidFormatException {
+ // nothing to validate
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
new file mode 100644
index 0000000..c88ec33
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+/**
+ * Implements a learnable language detector.
+ */
+public class LanguageDetectorME implements LanguageDetector {
+ @Override
+ public Language[] predictLanguages(CharSequence content) {
+ return new Language[0];
+ }
+
+ @Override
+ public Language predictLanguage(CharSequence content) {
+ return null;
+ }
+
+ @Override
+ public Language[] getSupportedLanguages() {
+ return new Language[0];
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
new file mode 100644
index 0000000..eb38847
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * A model for language detection
+ */
+public class LanguageDetectorModel extends BaseModel {
+
+ private static final String COMPONENT_NAME = "LanguageDetectorME";
+ private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model";
+
+ public LanguageDetectorModel(String languageCode, MaxentModel langdetectModel,
+ Map<String, String> manifestInfoEntries,
+ LanguageDetectorFactory factory) {
+ super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+
+ artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel);
+ checkArtifactMap();
+ }
+
+ public LanguageDetectorModel(InputStream in) throws IOException {
+ super(COMPONENT_NAME, in);
+ }
+
+ public LanguageDetectorModel(File modelFile) throws IOException {
+ super(COMPONENT_NAME, modelFile);
+ }
+
+ public LanguageDetectorModel(URL modelURL) throws IOException {
+ super(COMPONENT_NAME, modelURL);
+ }
+
+ @Override
+ protected void validateArtifactMap() throws InvalidFormatException {
+ super.validateArtifactMap();
+
+ if (!(artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
+ throw new InvalidFormatException("Language detector model is incomplete!");
+ }
+ }
+
+ public LanguageDetectorFactory getFactory() {
+ return (LanguageDetectorFactory) this.toolFactory;
+ }
+
+ @Override
+ protected Class<? extends BaseToolFactory> getDefaultFactory() {
+ return LanguageDetectorFactory.class;
+ }
+
+ public MaxentModel getMaxentModel() {
+ return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
new file mode 100644
index 0000000..2c30044
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class which holds a classified document and its {@link Language}.
+ */
+public class LanguageDetectorSample {
+
+ private final Language language;
+ private final CharSequence context;
+
+ public LanguageDetectorSample(Language language, CharSequence context) {
+ Objects.requireNonNull(context, "context must not be null");
+ Objects.requireNonNull(language, "language must not be null");
+ this.language = language;
+ this.context = context;
+ }
+
+ public Language getLanguage() {
+ return language;
+ }
+
+ public CharSequence getContext() {
+ return context;
+ }
+
+ @Override
+ public String toString() {
+
+ StringBuilder sampleString = new StringBuilder();
+
+ sampleString.append(language.getLang()).append('\t').append(context);
+
+ return sampleString.toString();
+ }
+
+ @Override
+ public int hashCode() {
+ return Objects.hash(getContext(), getLanguage());
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj) {
+ return true;
+ }
+
+ if (obj instanceof LanguageDetectorSample) {
+ LanguageDetectorSample a = (LanguageDetectorSample) obj;
+
+ return getLanguage().equals(a.getLanguage())
+ && getContext().equals(a.getContext());
+ }
+
+ return false;
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
new file mode 100644
index 0000000..31a5727
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageSampleTest {
+
+ @Test
+ public void testConstructor() {
+ Language lang = new Language("aLang");
+ CharSequence context = "aContext";
+
+ LanguageDetectorSample sample = new LanguageDetectorSample(lang, context);
+
+ Assert.assertEquals(lang, sample.getLanguage());
+ Assert.assertEquals(context, sample.getContext());
+ }
+
+ @Test(expected = NullPointerException.class)
+ public void testNullLang() throws Exception {
+ CharSequence context = "aContext";
+
+ new LanguageDetectorSample(null, context);
+ }
+
+ @Test(expected = NullPointerException.class)
+ public void testNullContext() {
+ Language lang = new Language("aLang");
+
+ new LanguageDetectorSample(lang, null);
+ }
+
+ @Test
+ public void testToString() {
+ Language lang = new Language("aLang");
+ CharSequence context = "aContext";
+
+ LanguageDetectorSample sample = new LanguageDetectorSample(lang, context);
+
+ Assert.assertEquals(lang.getLang() + "\t" + context, sample.toString());
+ }
+
+ @Test
+ public void testHash() {
+
+ int hashA = new LanguageDetectorSample(new Language("aLang"), "aContext").hashCode();
+ int hashB = new LanguageDetectorSample(new Language("bLang"), "aContext").hashCode();
+ int hashC = new LanguageDetectorSample(new Language("aLang"), "bContext").hashCode();
+
+ Assert.assertNotEquals(hashA, hashB);
+ Assert.assertNotEquals(hashA, hashC);
+ Assert.assertNotEquals(hashB, hashC);
+ }
+
+ @Test
+ public void testEquals() throws Exception {
+
+ LanguageDetectorSample sampleA = new LanguageDetectorSample(new Language("aLang"), "aContext");
+ LanguageDetectorSample sampleA1 = new LanguageDetectorSample(new Language("aLang"), "aContext");
+ LanguageDetectorSample sampleB = new LanguageDetectorSample(new Language("bLang"), "aContext");
+ LanguageDetectorSample sampleC = new LanguageDetectorSample(new Language("aLang"), "bContext");
+
+ Assert.assertEquals(sampleA, sampleA);
+ Assert.assertEquals(sampleA, sampleA1);
+ Assert.assertNotEquals(sampleA, sampleB);
+ Assert.assertNotEquals(sampleA, sampleC);
+ Assert.assertNotEquals(sampleB, sampleC);
+ Assert.assertFalse(sampleA.equals("something else"));
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
new file mode 100644
index 0000000..56c5b80
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageTest {
+
+
+ @Test
+ public void emptyConfidence() throws Exception {
+ String languageCode = "aLanguage";
+ Language lang = new Language(languageCode);
+
+ Assert.assertEquals(languageCode, lang.getLang());
+ Assert.assertEquals(0, lang.getConfidence(), 0);
+ }
+
+ @Test
+ public void nonEmptyConfidence() throws Exception {
+ String languageCode = "aLanguage";
+ double confidence = 0.05;
+ Language lang = new Language(languageCode, confidence);
+
+ Assert.assertEquals(languageCode, lang.getLang());
+ Assert.assertEquals(confidence, lang.getConfidence(), 0);
+ }
+
+ @Test(expected = NullPointerException.class)
+ public void emptyLanguage() throws Exception {
+ new Language(null);
+ }
+
+ @Test(expected = NullPointerException.class)
+ public void emptyLanguageConfidence() throws Exception {
+ new Language(null, 0.05);
+ }
+
+ @Test
+ public void testToString() {
+ Language lang = new Language("aLang");
+
+ Assert.assertEquals(lang.getLang(), lang.toString());
+ }
+
+
+
+ @Test
+ public void testHash() {
+ int hashA = new Language("aLang").hashCode();
+ int hashAA = new Language("aLang").hashCode();
+ int hashB = new Language("BLang").hashCode();
+ int hashA5 = new Language("aLang", 5.0).hashCode();
+ int hashA6 = new Language("BLang", 6.0).hashCode();
+
+ Assert.assertEquals(hashA, hashAA);
+
+ Assert.assertNotEquals(hashA, hashB);
+ Assert.assertNotEquals(hashA, hashA5);
+ Assert.assertNotEquals(hashB, hashA5);
+ Assert.assertNotEquals(hashA5, hashA6);
+ }
+
+ @Test
+ public void testEquals() {
+ Language langA = new Language("langA");
+ Language langB = new Language("langB");
+ Language langA5 = new Language("langA5", 5.0);
+ Language langA6 = new Language("langA5", 6.0);
+
+ Assert.assertEquals(langA, langA);
+ Assert.assertEquals(langA5, langA5);
+
+ Assert.assertNotEquals(langA, langA5);
+ Assert.assertNotEquals(langA, langB);
+
+ Assert.assertNotEquals(langA6, langA5);
+
+ Assert.assertNotEquals(langA, "something else");
+ }
+}