You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2017/05/17 15:52:33 UTC
[07/50] opennlp git commit: OPENNLP-1033: Adds unit tests for
opennlp.tools.ngram, closes apache/opennlp#172
OPENNLP-1033: Adds unit tests for opennlp.tools.ngram, closes apache/opennlp#172
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3ba27e9f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3ba27e9f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3ba27e9f
Branch: refs/heads/LangDetect
Commit: 3ba27e9f4a29be1922b3a19f9c6c5127f93027ab
Parents: d447459
Author: jzonthemtn <je...@mtnfog.com>
Authored: Wed Apr 19 15:53:32 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Wed Apr 19 15:53:32 2017 -0400
----------------------------------------------------------------------
.../java/opennlp/tools/ngram/NGramModel.java | 6 +-
.../opennlp/tools/ngram/NGramGeneratorTest.java | 91 ++++++++++++++++++
.../opennlp/tools/ngram/NGramModelTest.java | 98 ++++++++++++++------
.../tools/ngram/ngram-model-no-count.xml | 27 ++++++
.../tools/ngram/ngram-model-not-a-number.xml | 27 ++++++
5 files changed, 222 insertions(+), 27 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
index 7005dc4..0e0e4dd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
@@ -216,6 +216,7 @@ public class NGramModel implements Iterable<StringList> {
*
* @return iterator over all grams
*/
+ @Override
public Iterator<StringList> iterator() {
return mNGrams.keySet().iterator();
}
@@ -306,10 +307,12 @@ public class NGramModel implements Iterable<StringList> {
{
private Iterator<StringList> mDictionaryIterator = NGramModel.this.iterator();
+ @Override
public boolean hasNext() {
return mDictionaryIterator.hasNext();
}
+ @Override
public Entry next() {
StringList tokens = mDictionaryIterator.next();
@@ -317,10 +320,11 @@ public class NGramModel implements Iterable<StringList> {
Attributes attributes = new Attributes();
attributes.setValue(COUNT, Integer.toString(getCount(tokens)));
-
+
return new Entry(tokens, attributes);
}
+ @Override
public void remove() {
throw new UnsupportedOperationException();
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java
new file mode 100644
index 0000000..b1da5d6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ngram;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class NGramGeneratorTest {
+ // Exercises NGramGenerator.generate with token-list and char-array inputs.
+ @Test
+ public void generateListTest() {
+ // Window 2 over 4 tokens: expects 3 grams joined by "-" (membership only, order not asserted).
+ final List<String> input = Arrays.asList("This", "is", "a", "sentence");
+ final int window = 2;
+ final String separator = "-";
+
+ final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+
+ Assert.assertEquals(3, ngrams.size());
+ Assert.assertTrue(ngrams.contains("This-is"));
+ Assert.assertTrue(ngrams.contains("is-a"));
+ Assert.assertTrue(ngrams.contains("a-sentence"));
+
+ }
+
+ @Test
+ public void generateCharTest() {
+ // Window 4 over the 10 chars of "Test again": expects 7 grams; the space counts as a char.
+ final char[] input = "Test again".toCharArray();
+ final int window = 4;
+ final String separator = "-";
+
+ final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+
+ Assert.assertEquals(7, ngrams.size());
+ Assert.assertTrue(ngrams.contains("T-e-s-t"));
+ Assert.assertTrue(ngrams.contains("e-s-t- "));
+ Assert.assertTrue(ngrams.contains("s-t- -a"));
+ Assert.assertTrue(ngrams.contains("t- -a-g"));
+ Assert.assertTrue(ngrams.contains(" -a-g-a"));
+ Assert.assertTrue(ngrams.contains("a-g-a-i"));
+ Assert.assertTrue(ngrams.contains("g-a-i-n"));
+
+ }
+
+ @Test
+ public void generateLargerWindowThanListTest() {
+ // A window (3) wider than the input (2 tokens) is expected to yield no grams.
+ final List<String> input = Arrays.asList("One", "two");
+ final int window = 3;
+ final String separator = "-";
+
+ final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+
+ Assert.assertTrue(ngrams.isEmpty());
+
+ }
+
+ @Test
+ public void emptyTest() {
+ // An empty token list is expected to yield no grams.
+ final List<String> input = new ArrayList<>();
+ final int window = 2;
+ final String separator = "-";
+
+ final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+
+ Assert.assertTrue(ngrams.isEmpty());
+
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
index e4fb43d..47c228c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
@@ -17,16 +17,17 @@
package opennlp.tools.ngram;
+import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.InputStream;
import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
-import org.apache.commons.io.IOUtils;
import org.junit.Assert;
-import org.junit.Ignore;
import org.junit.Test;
import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.InvalidFormatException;
import opennlp.tools.util.StringList;
/**
@@ -169,31 +170,76 @@ public class NGramModelTest {
Assert.assertEquals(1, dictionary.getMinTokenCount());
Assert.assertEquals(3, dictionary.getMaxTokenCount());
}
-
- @Ignore
+
+ @Test(expected = InvalidFormatException.class)
+ public void testInvalidFormat() throws Exception {
+ // The bytes of "inputstring" are fed in as a model; the test passes only if that is rejected.
+ try (InputStream in = new ByteArrayInputStream("inputstring".getBytes(StandardCharsets.UTF_8))) {
+ new NGramModel(in).toDictionary(true);
+ }
+ }
+
+ @Test
+ public void testFromFile() throws Exception {
+ // try-with-resources closes the fixture stream even when loading or an assertion fails.
+ try (InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml")) {
+ Dictionary dictionary = new NGramModel(stream).toDictionary(true);
+ Assert.assertNotNull(dictionary);
+ Assert.assertEquals(14, dictionary.size());
+ Assert.assertEquals(3, dictionary.getMaxTokenCount());
+ Assert.assertEquals(1, dictionary.getMinTokenCount());
+ }
+ }
+
@Test
public void testSerialize() throws Exception {
- NGramModel ngramModel = new NGramModel();
- StringList tokens = new StringList("the", "brown", "fox", "jumped");
- ngramModel.add(tokens, 1, 3);
- tokens = new StringList("the", "brown", "Fox", "jumped");
- ngramModel.add(tokens, 1, 3);
- ByteArrayOutputStream out = new ByteArrayOutputStream();
- ngramModel.serialize(out);
- Assert.assertNotNull(out);
- InputStream nGramModelStream = getClass()
- .getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml");
- String modelString = IOUtils.toString(nGramModelStream);
- // remove AL header
- int start = modelString.indexOf("<!--");
- int end = modelString.indexOf("-->");
- String asfHeaderString = modelString.substring(start, end + 3);
- modelString = modelString.replace(asfHeaderString, "");
- String outputString = out.toString(Charset.forName("UTF-8").name());
- Assert.assertEquals(
- modelString.replaceAll("\n", "").replaceAll("\r", "")
- .replaceAll("\t", "").replaceAll(" ", ""),
- outputString.replaceAll("\n", "").replaceAll("\r", "")
- .replaceAll("\t", "").replaceAll(" ", ""));
+ // Verifies that a model loaded from XML survives a serialize/deserialize round trip.
+ InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml");
+
+ NGramModel ngramModel1 = new NGramModel(stream);
+ stream.close();
+
+ Dictionary dictionary = ngramModel1.toDictionary(true);
+ Assert.assertNotNull(dictionary);
+ Assert.assertEquals(14, dictionary.size());
+ Assert.assertEquals(3, dictionary.getMaxTokenCount());
+ Assert.assertEquals(1, dictionary.getMinTokenCount());
+ // Decode and re-encode with one charset; mixing the platform default with UTF-8 can alter bytes.
+ ByteArrayOutputStream baos = new ByteArrayOutputStream();
+ ngramModel1.serialize(baos);
+ final Charset encoding = StandardCharsets.UTF_8;
+ final String serialized = new String(baos.toByteArray(), encoding);
+ InputStream inputStream = new ByteArrayInputStream(serialized.getBytes(encoding));
+
+ NGramModel ngramModel2 = new NGramModel(inputStream);
+ inputStream.close();
+ // Compare the reloaded model against the original one, not against itself.
+ Assert.assertEquals(ngramModel1.numberOfGrams(), ngramModel2.numberOfGrams());
+ Assert.assertEquals(ngramModel1.size(), ngramModel2.size());
+
+ dictionary = ngramModel2.toDictionary(true);
+
+ Assert.assertNotNull(dictionary);
+ Assert.assertEquals(14, dictionary.size());
+ Assert.assertEquals(3, dictionary.getMaxTokenCount());
+ Assert.assertEquals(1, dictionary.getMinTokenCount());
+
}
+
+ @Test(expected = InvalidFormatException.class)
+ public void testFromInvalidFileMissingCount() throws Exception {
+ try (InputStream stream = getClass() // closed even when the expected exception is thrown
+ .getResourceAsStream("/opennlp/tools/ngram/ngram-model-no-count.xml")) {
+ new NGramModel(stream).toDictionary(true);
+ }
+ }
+
+ @Test(expected = InvalidFormatException.class)
+ public void testFromInvalidFileNotANumber() throws Exception {
+ try (InputStream stream = getClass() // closed even when the expected exception is thrown
+ .getResourceAsStream("/opennlp/tools/ngram/ngram-model-not-a-number.xml")) {
+ new NGramModel(stream).toDictionary(true);
+ }
+ }
+
}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml
new file mode 100644
index 0000000..62a1d90
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<dictionary case_sensitive="false">
+ <entry>
+ <token>brown</token>
+ <token>fox</token>
+ </entry>
+</dictionary>
http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml
new file mode 100644
index 0000000..e132ea4
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied. See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<dictionary case_sensitive="false">
+ <entry count="asdf">
+ <token>brown</token>
+ <token>fox</token>
+ </entry>
+</dictionary>