You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2017/05/17 15:52:33 UTC

[07/50] opennlp git commit: OPENNLP-1033: Adds unit tests for opennlp.tools.ngram, closes apache/opennlp#172

OPENNLP-1033: Adds unit tests for opennlp.tools.ngram, closes apache/opennlp#172


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3ba27e9f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3ba27e9f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3ba27e9f

Branch: refs/heads/LangDetect
Commit: 3ba27e9f4a29be1922b3a19f9c6c5127f93027ab
Parents: d447459
Author: jzonthemtn <je...@mtnfog.com>
Authored: Wed Apr 19 15:53:32 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Wed Apr 19 15:53:32 2017 -0400

----------------------------------------------------------------------
 .../java/opennlp/tools/ngram/NGramModel.java    |  6 +-
 .../opennlp/tools/ngram/NGramGeneratorTest.java | 91 ++++++++++++++++++
 .../opennlp/tools/ngram/NGramModelTest.java     | 98 ++++++++++++++------
 .../tools/ngram/ngram-model-no-count.xml        | 27 ++++++
 .../tools/ngram/ngram-model-not-a-number.xml    | 27 ++++++
 5 files changed, 222 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
index 7005dc4..0e0e4dd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
@@ -216,6 +216,7 @@ public class NGramModel implements Iterable<StringList> {
    *
    * @return iterator over all grams
    */
+  @Override
   public Iterator<StringList> iterator() {
     return mNGrams.keySet().iterator();
   }
@@ -306,10 +307,12 @@ public class NGramModel implements Iterable<StringList> {
     {
       private Iterator<StringList> mDictionaryIterator = NGramModel.this.iterator();
 
+      @Override
       public boolean hasNext() {
         return mDictionaryIterator.hasNext();
       }
 
+      @Override
       public Entry next() {
 
         StringList tokens = mDictionaryIterator.next();
@@ -317,10 +320,11 @@ public class NGramModel implements Iterable<StringList> {
         Attributes attributes = new Attributes();
 
         attributes.setValue(COUNT, Integer.toString(getCount(tokens)));
-
+        
         return new Entry(tokens, attributes);
       }
 
+      @Override
       public void remove() {
         throw new UnsupportedOperationException();
       }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java
new file mode 100644
index 0000000..b1da5d6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ngram;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class NGramGeneratorTest {
+  
+  @Test
+  public void generateListTest() {
+    // A window of 2 over four tokens is expected to yield three separator-joined bigrams.
+    final List<String> input = Arrays.asList("This", "is", "a", "sentence");
+    final int window = 2;
+    final String separator = "-";
+    
+    final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+    
+    Assert.assertEquals(3,  ngrams.size());
+    Assert.assertTrue(ngrams.contains("This-is"));
+    Assert.assertTrue(ngrams.contains("is-a"));
+    Assert.assertTrue(ngrams.contains("a-sentence"));
+    
+  }
+  
+  @Test
+  public void generateCharTest() {
+    // A window of 4 over the 10 characters is expected to yield 7 n-grams; the space counts as a token.
+    final char[] input = "Test again".toCharArray();
+    final int window = 4;
+    final String separator = "-";
+    
+    final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+
+    Assert.assertEquals(7,  ngrams.size());
+    Assert.assertTrue(ngrams.contains("T-e-s-t"));
+    Assert.assertTrue(ngrams.contains("e-s-t- "));
+    Assert.assertTrue(ngrams.contains("s-t- -a"));
+    Assert.assertTrue(ngrams.contains("t- -a-g"));
+    Assert.assertTrue(ngrams.contains(" -a-g-a"));
+    Assert.assertTrue(ngrams.contains("a-g-a-i"));
+    Assert.assertTrue(ngrams.contains("g-a-i-n"));
+    
+  }
+  
+  @Test
+  public void generateLargerWindowThanListTest() {
+    // A window larger than the number of input tokens is expected to yield no n-grams.
+    final List<String> input = Arrays.asList("One", "two");
+    final int window = 3;
+    final String separator = "-";
+    
+    final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+    
+    Assert.assertTrue(ngrams.isEmpty());
+    
+  }
+  
+  @Test
+  public void emptyTest() {
+    // An empty input list is expected to yield no n-grams.
+    final List<String> input = new ArrayList<>();
+    final int window = 2;
+    final String separator = "-";
+    
+    final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+
+    Assert.assertTrue(ngrams.isEmpty());
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
index e4fb43d..47c228c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
@@ -17,16 +17,17 @@
 
 package opennlp.tools.ngram;
 
+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 
-import org.apache.commons.io.IOUtils;
 import org.junit.Assert;
-import org.junit.Ignore;
 import org.junit.Test;
 
 import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.StringList;
 
 /**
@@ -169,31 +170,76 @@ public class NGramModelTest {
     Assert.assertEquals(1, dictionary.getMinTokenCount());
     Assert.assertEquals(3, dictionary.getMaxTokenCount());
   }
-
-  @Ignore
+  
+  @Test(expected = InvalidFormatException.class)
+  public void testInvalidFormat() throws Exception {
+    // Input is not XML; try-with-resources closes the stream even when the expected exception is thrown.
+    try (InputStream stream = new ByteArrayInputStream("inputstring".getBytes(StandardCharsets.UTF_8))) {
+      new NGramModel(stream).toDictionary(true);
+    }
+  }
+  
+  @Test
+  public void testFromFile() throws Exception {
+    // Load the reference model; try-with-resources closes the stream even if loading fails.
+    try (InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml")) {
+      Dictionary dictionary = new NGramModel(stream).toDictionary(true);
+      Assert.assertNotNull(dictionary);
+      Assert.assertEquals(14, dictionary.size());
+      Assert.assertEquals(3, dictionary.getMaxTokenCount());
+      Assert.assertEquals(1, dictionary.getMinTokenCount());
+    }
+  }
+  
   @Test
   public void testSerialize() throws Exception {
-    NGramModel ngramModel = new NGramModel();
-    StringList tokens = new StringList("the", "brown", "fox", "jumped");
-    ngramModel.add(tokens, 1, 3);
-    tokens = new StringList("the", "brown", "Fox", "jumped");
-    ngramModel.add(tokens, 1, 3);
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    ngramModel.serialize(out);
-    Assert.assertNotNull(out);
-    InputStream nGramModelStream = getClass()
-        .getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml");
-    String modelString = IOUtils.toString(nGramModelStream);
-    // remove AL header
-    int start = modelString.indexOf("<!--");
-    int end = modelString.indexOf("-->");
-    String asfHeaderString = modelString.substring(start, end + 3);
-    modelString = modelString.replace(asfHeaderString, "");
-    String outputString = out.toString(Charset.forName("UTF-8").name());
-    Assert.assertEquals(
-        modelString.replaceAll("\n", "").replaceAll("\r", "")
-            .replaceAll("\t", "").replaceAll(" ", ""),
-        outputString.replaceAll("\n", "").replaceAll("\r", "")
-            .replaceAll("\t", "").replaceAll(" ", ""));
+    // Round-trip check: load the reference model, serialize it, and read the result back.
+    NGramModel ngramModel1;
+    try (InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml")) {
+      ngramModel1 = new NGramModel(stream);
+    }
+    
+    Dictionary dictionary = ngramModel1.toDictionary(true);
+    Assert.assertNotNull(dictionary);
+    Assert.assertEquals(14, dictionary.size());
+    Assert.assertEquals(3, dictionary.getMaxTokenCount());
+    Assert.assertEquals(1, dictionary.getMinTokenCount());
+    
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    ngramModel1.serialize(baos);
+    
+    // Decode and re-encode with one explicit charset; the platform default may differ from UTF-8.
+    final Charset utf8 = StandardCharsets.UTF_8;
+    final String serialized = new String(baos.toByteArray(), utf8);
+    NGramModel ngramModel2;
+    try (InputStream inputStream = new ByteArrayInputStream(serialized.getBytes(utf8))) {
+      ngramModel2 = new NGramModel(inputStream);
+    }
+    
+    // Compare the reloaded model against the original one, not against itself.
+    Assert.assertEquals(ngramModel1.numberOfGrams(), ngramModel2.numberOfGrams());
+    Assert.assertEquals(ngramModel1.size(), ngramModel2.size());
+    dictionary = ngramModel2.toDictionary(true);
+    Assert.assertNotNull(dictionary);
+    Assert.assertEquals(14, dictionary.size());
+    Assert.assertEquals(3, dictionary.getMaxTokenCount());
+    Assert.assertEquals(1, dictionary.getMinTokenCount());
   }
+  
+  @Test(expected = InvalidFormatException.class)
+  public void testFromInvalidFileMissingCount() throws Exception {
+    String path = "/opennlp/tools/ngram/ngram-model-no-count.xml"; // entry has no count attribute
+    try (InputStream in = getClass().getResourceAsStream(path)) { // closed even if the load throws
+      new NGramModel(in).toDictionary(true);
+    }
+  }
+  
+  @Test(expected = InvalidFormatException.class)
+  public void testFromInvalidFileNotANumber() throws Exception {
+    String path = "/opennlp/tools/ngram/ngram-model-not-a-number.xml"; // entry has count="asdf"
+    try (InputStream in = getClass().getResourceAsStream(path)) { // closed even if the load throws
+      new NGramModel(in).toDictionary(true);
+    }
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml
new file mode 100644
index 0000000..62a1d90
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<dictionary case_sensitive="false">
+    <entry>
+        <token>brown</token>
+        <token>fox</token>
+    </entry>
+</dictionary>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml
new file mode 100644
index 0000000..e132ea4
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<dictionary case_sensitive="false">
+    <entry count="asdf">
+        <token>brown</token>
+        <token>fox</token>
+    </entry>
+</dictionary>