You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2017/03/14 02:42:00 UTC

opennlp git commit: OPENNLP-778: Add LanguageDetector infrastructure classes [Forced Update!]

Repository: opennlp
Updated Branches:
  refs/heads/OPENNLP-778 d9a9c7dbb -> 64949f179 (forced update)


OPENNLP-778: Add LanguageDetector infrastructure classes


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/64949f17
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/64949f17
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/64949f17

Branch: refs/heads/OPENNLP-778
Commit: 64949f1798ddf0adfceed772a5a49c1ad451167d
Parents: 11d7581
Author: William D C M SILVA <co...@apache.org>
Authored: Mon Mar 13 13:48:33 2017 -0300
Committer: William D C M SILVA <co...@apache.org>
Committed: Mon Mar 13 23:40:12 2017 -0300

----------------------------------------------------------------------
 .../java/opennlp/tools/langdetect/Language.java | 34 +++++++
 .../tools/langdetect/LanguageDetector.java      | 10 +-
 .../langdetect/LanguageDetectorFactory.java     | 55 +++++++++++
 .../tools/langdetect/LanguageDetectorME.java    | 38 ++++++++
 .../tools/langdetect/LanguageDetectorModel.java | 82 ++++++++++++++++
 .../langdetect/LanguageDetectorSample.java      | 75 +++++++++++++++
 .../tools/langdetect/LanguageSampleTest.java    | 88 ++++++++++++++++++
 .../opennlp/tools/langdetect/LanguageTest.java  | 98 ++++++++++++++++++++
 8 files changed, 474 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
index 773201f..57655b4 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -17,6 +17,8 @@
 
 package opennlp.tools.langdetect;
 
+import java.util.Objects;
+
 /**
  * Class for holding the document language and its confidence
  */
@@ -24,7 +26,12 @@ public class Language {
   private final String lang;
   private final double confidence;
 
+  public Language(String lang) {
+    this(lang, 0);
+  }
+
   public Language(String lang, double confidence) {
+    Objects.requireNonNull(lang, "lang must not be null");
     this.lang = lang;
     this.confidence = confidence;
   }
@@ -36,4 +43,31 @@ public class Language {
   public double getConfidence() {
     return confidence;
   }
+
+  @Override
+  public String toString() {
+
+    return getLang();
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getLang(), getConfidence());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof Language) {
+      Language a = (Language) obj;
+
+      return getLang().equals(a.getLang())
+          && getConfidence() == a.getConfidence();
+    }
+
+    return false;
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
index ca897fd..5e9833a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -17,17 +17,15 @@
 
 package opennlp.tools.langdetect;
 
-import java.util.Set;
-
 /**
- * The interface for name finders which provide name tags for a sequence of tokens.
+ * The interface for language detectors, which provide the {@link Language} according to the context.
  */
 public interface LanguageDetector {
 
-  Language[] detectLanguage(CharSequence content);
+  Language[] predictLanguages(CharSequence content);
 
-  Set<String> getSupportedLanguages();
+  Language predictLanguage(CharSequence content);
 
-  String getLanguageCoding();
+  Language[] getSupportedLanguages();
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
new file mode 100644
index 0000000..a0fb84e
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+
+public class LanguageDetectorFactory extends BaseToolFactory {
+
+  public static LanguageDetectorFactory create(String subclassName)
+      throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new LanguageDetectorFactory();
+    }
+    try {
+      LanguageDetectorFactory theFactory = ExtensionLoader.instantiateExtension(
+          LanguageDetectorFactory.class, subclassName);
+      theFactory.init();
+      return theFactory;
+    } catch (Exception e) {
+      String msg = "Could not instantiate the " + subclassName
+          + ". The initialization throw an exception.";
+      System.err.println(msg);
+      e.printStackTrace();
+      throw new InvalidFormatException(msg, e);
+    }
+  }
+
+  public void init() {
+    // nothing to do
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // nothing to validate
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
new file mode 100644
index 0000000..c88ec33
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+/**
+ * Implements a learnable language detector.
+ */
+public class LanguageDetectorME implements LanguageDetector {
+  @Override
+  public Language[] predictLanguages(CharSequence content) {
+    return new Language[0];
+  }
+
+  @Override
+  public Language predictLanguage(CharSequence content) {
+    return null;
+  }
+
+  @Override
+  public Language[] getSupportedLanguages() {
+    return new Language[0];
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
new file mode 100644
index 0000000..eb38847
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * A model for language detection
+ */
+public class LanguageDetectorModel extends BaseModel {
+
+  private static final String COMPONENT_NAME = "LanguageDetectorME";
+  private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model";
+
+  public LanguageDetectorModel(String languageCode, MaxentModel langdetectModel,
+                               Map<String, String> manifestInfoEntries,
+                               LanguageDetectorFactory factory) {
+    super(COMPONENT_NAME, languageCode, manifestInfoEntries, factory);
+
+    artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel);
+    checkArtifactMap();
+  }
+
+  public LanguageDetectorModel(InputStream in) throws IOException {
+    super(COMPONENT_NAME, in);
+  }
+
+  public LanguageDetectorModel(File modelFile) throws IOException {
+    super(COMPONENT_NAME, modelFile);
+  }
+
+  public LanguageDetectorModel(URL modelURL) throws IOException {
+    super(COMPONENT_NAME, modelURL);
+  }
+
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    if (!(artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
+      throw new InvalidFormatException("Language detector model is incomplete!");
+    }
+  }
+
+  public LanguageDetectorFactory getFactory() {
+    return (LanguageDetectorFactory) this.toolFactory;
+  }
+
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return LanguageDetectorFactory.class;
+  }
+
+  public MaxentModel getMaxentModel() {
+    return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
new file mode 100644
index 0000000..2c30044
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSample.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class which holds a classified document and its {@link Language}.
+ */
+public class LanguageDetectorSample {
+
+  private final Language language;
+  private final CharSequence context;
+
+  public LanguageDetectorSample(Language language, CharSequence context) {
+    Objects.requireNonNull(context, "context must not be null");
+    Objects.requireNonNull(language, "language must not be null");
+    this.language = language;
+    this.context = context;
+  }
+
+  public Language getLanguage() {
+    return language;
+  }
+
+  public CharSequence getContext() {
+    return context;
+  }
+
+  @Override
+  public String toString() {
+
+    StringBuilder sampleString = new StringBuilder();
+
+    sampleString.append(language.getLang()).append('\t').append(context);
+
+    return sampleString.toString();
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getContext(), getLanguage());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof LanguageDetectorSample) {
+      LanguageDetectorSample a = (LanguageDetectorSample) obj;
+
+      return getLanguage().equals(a.getLanguage())
+          && getContext().equals(a.getContext());
+    }
+
+    return false;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
new file mode 100644
index 0000000..31a5727
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageSampleTest {
+
+  @Test
+  public void testConstructor() {
+    Language lang = new Language("aLang");
+    CharSequence context = "aContext";
+
+    LanguageDetectorSample sample = new LanguageDetectorSample(lang, context);
+
+    Assert.assertEquals(lang, sample.getLanguage());
+    Assert.assertEquals(context, sample.getContext());
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void testNullLang() throws Exception {
+    CharSequence context = "aContext";
+
+    new LanguageDetectorSample(null, context);
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void testNullContext() {
+    Language lang = new Language("aLang");
+
+    new LanguageDetectorSample(lang, null);
+  }
+
+  @Test
+  public void testToString() {
+    Language lang = new Language("aLang");
+    CharSequence context = "aContext";
+
+    LanguageDetectorSample sample = new LanguageDetectorSample(lang, context);
+
+    Assert.assertEquals(lang.getLang() + "\t" + context, sample.toString());
+  }
+
+  @Test
+  public void testHash() {
+
+    int hashA = new LanguageDetectorSample(new Language("aLang"), "aContext").hashCode();
+    int hashB = new LanguageDetectorSample(new Language("bLang"), "aContext").hashCode();
+    int hashC = new LanguageDetectorSample(new Language("aLang"), "bContext").hashCode();
+
+    Assert.assertNotEquals(hashA, hashB);
+    Assert.assertNotEquals(hashA, hashC);
+    Assert.assertNotEquals(hashB, hashC);
+  }
+
+  @Test
+  public void testEquals() throws Exception {
+
+    LanguageDetectorSample sampleA  = new LanguageDetectorSample(new Language("aLang"), "aContext");
+    LanguageDetectorSample sampleA1 = new LanguageDetectorSample(new Language("aLang"), "aContext");
+    LanguageDetectorSample sampleB  = new LanguageDetectorSample(new Language("bLang"), "aContext");
+    LanguageDetectorSample sampleC  = new LanguageDetectorSample(new Language("aLang"), "bContext");
+
+    Assert.assertEquals(sampleA, sampleA);
+    Assert.assertEquals(sampleA, sampleA1);
+    Assert.assertNotEquals(sampleA, sampleB);
+    Assert.assertNotEquals(sampleA, sampleC);
+    Assert.assertNotEquals(sampleB, sampleC);
+    Assert.assertFalse(sampleA.equals("something else"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/64949f17/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
new file mode 100644
index 0000000..56c5b80
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
@@ -0,0 +1,98 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageTest {
+
+
+  @Test
+  public void emptyConfidence() throws Exception {
+    String languageCode = "aLanguage";
+    Language lang = new Language(languageCode);
+
+    Assert.assertEquals(languageCode, lang.getLang());
+    Assert.assertEquals(0, lang.getConfidence(), 0);
+  }
+
+  @Test
+  public void nonEmptyConfidence() throws Exception {
+    String languageCode = "aLanguage";
+    double confidence = 0.05;
+    Language lang = new Language(languageCode, confidence);
+
+    Assert.assertEquals(languageCode, lang.getLang());
+    Assert.assertEquals(confidence, lang.getConfidence(), 0);
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguage() throws Exception {
+    new Language(null);
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguageConfidence() throws Exception {
+    new Language(null, 0.05);
+  }
+
+  @Test
+  public void testToString() {
+    Language lang = new Language("aLang");
+
+    Assert.assertEquals(lang.getLang(), lang.toString());
+  }
+
+
+
+  @Test
+  public void testHash() {
+    int hashA = new Language("aLang").hashCode();
+    int hashAA = new Language("aLang").hashCode();
+    int hashB = new Language("BLang").hashCode();
+    int hashA5 = new Language("aLang", 5.0).hashCode();
+    int hashA6 = new Language("BLang", 6.0).hashCode();
+
+    Assert.assertEquals(hashA, hashAA);
+
+    Assert.assertNotEquals(hashA, hashB);
+    Assert.assertNotEquals(hashA, hashA5);
+    Assert.assertNotEquals(hashB, hashA5);
+    Assert.assertNotEquals(hashA5, hashA6);
+  }
+
+  @Test
+  public void testEquals() {
+    Language langA = new Language("langA");
+    Language langB = new Language("langB");
+    Language langA5 = new Language("langA5", 5.0);
+    Language langA6 = new Language("langA5", 6.0);
+
+    Assert.assertEquals(langA, langA);
+    Assert.assertEquals(langA5, langA5);
+
+    Assert.assertNotEquals(langA, langA5);
+    Assert.assertNotEquals(langA, langB);
+
+    Assert.assertNotEquals(langA6, langA5);
+
+    Assert.assertNotEquals(langA, "something else");
+  }
+}