You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2017/05/17 15:52:30 UTC
[04/50] opennlp git commit: OPENNLP-1028: Add tests for FeatureGenerators in doccat. This closes apache/opennlp#166
OPENNLP-1028: Add tests for FeatureGenerators in doccat. This closes apache/opennlp#166
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/580e0d1e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/580e0d1e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/580e0d1e
Branch: refs/heads/LangDetect
Commit: 580e0d1e26ab2a9275f01506f3af56fe8fc32988
Parents: e220a72
Author: koji <ko...@apache.org>
Authored: Wed Apr 19 10:14:47 2017 +0900
Committer: koji <ko...@apache.org>
Committed: Wed Apr 19 10:14:47 2017 +0900
----------------------------------------------------------------------
.../doccat/BagOfWordsFeatureGenerator.java | 6 +-
.../tools/doccat/NGramFeatureGenerator.java | 11 +-
.../doccat/BagOfWordsFeatureGeneratorTest.java | 62 +++++++++
.../tools/doccat/NGramFeatureGeneratorTest.java | 129 +++++++++++++++++++
4 files changed, 201 insertions(+), 7 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
index ac39afc..51a3277 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
@@ -21,6 +21,7 @@ package opennlp.tools.doccat;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
+import java.util.Objects;
import opennlp.tools.util.featuregen.StringPattern;
@@ -29,9 +30,10 @@ import opennlp.tools.util.featuregen.StringPattern;
*/
public class BagOfWordsFeatureGenerator implements FeatureGenerator {
- private boolean useOnlyAllLetterTokens = false;
+ private final boolean useOnlyAllLetterTokens;
public BagOfWordsFeatureGenerator() {
+ this(false);
}
BagOfWordsFeatureGenerator(boolean useOnlyAllLetterTokens) {
@@ -40,7 +42,7 @@ public class BagOfWordsFeatureGenerator implements FeatureGenerator {
@Override
public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInformation) {
-
+ Objects.requireNonNull(text, "text must not be null");
Collection<String> bagOfWords = new ArrayList<>(text.length);
for (String word : text) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
index 967b105..6e1786f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
@@ -21,6 +21,7 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Map;
+import java.util.Objects;
import opennlp.tools.util.InvalidFormatException;
@@ -30,9 +31,8 @@ import opennlp.tools.util.InvalidFormatException;
*/
public class NGramFeatureGenerator implements FeatureGenerator {
- //default values for bigrams
- private int minGram = 2;
- private int maxGram = 2;
+ private final int minGram;
+ private final int maxGram;
/**
* Constructor for ngrams.
@@ -59,7 +59,8 @@ public class NGramFeatureGenerator implements FeatureGenerator {
/**
* Default constructor for Bi grams
*/
- public NGramFeatureGenerator() {
+ public NGramFeatureGenerator() throws InvalidFormatException {
+ this(2, 2);
}
/**
@@ -70,7 +71,7 @@ public class NGramFeatureGenerator implements FeatureGenerator {
* @return a collection of n gram features
*/
public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInfo) {
-
+ Objects.requireNonNull(text, "text must not be null");
List<String> features = new ArrayList<>();
for (int i = 0; i <= text.length - minGram; i++) {
http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java
new file mode 100644
index 0000000..2b128d9
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.doccat;
+
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class BagOfWordsFeatureGeneratorTest {
+
+ @Test
+ public void testNull() {
+ BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
+ try {
+ generator.extractFeatures(null, Collections.emptyMap());
+ Assert.fail("NullPointerException must be thrown");
+ }
+ catch (NullPointerException expected) {
+ }
+ }
+
+ @Test
+ public void testEmpty() {
+ BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
+
+ Assert.assertEquals(0, generator.extractFeatures(new String[]{}, Collections.emptyMap()).size());
+ }
+
+ @Test
+ public void testUseAllTokens() {
+ BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
+
+ Assert.assertArrayEquals(new String[]{"bow=it", "bow=is", "bow=12.345", "bow=feet", "bow=long"},
+ generator.extractFeatures(new String[]{"it", "is", "12.345", "feet", "long"},
+ Collections.emptyMap()).toArray());
+ }
+
+ @Test
+ public void testOnlyLetterTokens() {
+ BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator(true);
+
+ Assert.assertArrayEquals(new String[]{"bow=it", "bow=is", "bow=feet", "bow=long"},
+ generator.extractFeatures(new String[]{"it", "is", "12.345", "feet", "long"},
+ Collections.emptyMap()).toArray());
+ }
+}
http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java
new file mode 100644
index 0000000..0aef3ea
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.doccat;
+
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.InvalidFormatException;
+
+public class NGramFeatureGeneratorTest {
+
+ static final String[] TOKENS = new String[]{"a", "b", "c", "d", "e", "f", "g"};
+
+ @Test
+ public void testNull() throws Exception {
+ NGramFeatureGenerator generator = new NGramFeatureGenerator();
+ try {
+ generator.extractFeatures(null, Collections.emptyMap());
+ Assert.fail("NullPointerException must be thrown");
+ }
+ catch (NullPointerException expected) {
+ }
+ }
+
+ @Test
+ public void testEmpty() throws Exception {
+ NGramFeatureGenerator generator = new NGramFeatureGenerator();
+
+ Assert.assertEquals(0, generator.extractFeatures(new String[]{}, Collections.emptyMap()).size());
+ }
+
+ @Test
+ public void testInvalidGramSize1() {
+ try {
+ new NGramFeatureGenerator(0, 1);
+ Assert.fail("InvalidFormatException must be thrown");
+ }
+ catch (InvalidFormatException expected) {
+ }
+ }
+
+ @Test
+ public void testInvalidGramSize2() {
+ try {
+ new NGramFeatureGenerator(2, 1);
+ Assert.fail("InvalidFormatException must be thrown");
+ }
+ catch (InvalidFormatException expected) {
+ }
+ }
+
+ @Test
+ public void testUnigram() throws Exception {
+ NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 1);
+
+ Assert.assertArrayEquals(
+ new String[]{"ng=:a", "ng=:b", "ng=:c", "ng=:d", "ng=:e", "ng=:f", "ng=:g"},
+ generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+ }
+
+ @Test
+ public void testBigram() throws Exception {
+ NGramFeatureGenerator generator = new NGramFeatureGenerator(2, 2);
+
+ Assert.assertArrayEquals(
+ new String[]{"ng=:a:b", "ng=:b:c", "ng=:c:d", "ng=:d:e", "ng=:e:f", "ng=:f:g"},
+ generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+ }
+
+ @Test
+ public void testTrigram() throws Exception {
+ NGramFeatureGenerator generator = new NGramFeatureGenerator(3, 3);
+
+ Assert.assertArrayEquals(
+ new String[]{"ng=:a:b:c", "ng=:b:c:d", "ng=:c:d:e", "ng=:d:e:f", "ng=:e:f:g"},
+ generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+ }
+
+ @Test
+ public void test12gram() throws Exception {
+ NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 2);
+
+ Assert.assertArrayEquals(
+ new String[]{
+ "ng=:a", "ng=:a:b",
+ "ng=:b", "ng=:b:c",
+ "ng=:c", "ng=:c:d",
+ "ng=:d", "ng=:d:e",
+ "ng=:e", "ng=:e:f",
+ "ng=:f", "ng=:f:g",
+ "ng=:g"
+ },
+ generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+ }
+
+ @Test
+ public void test13gram() throws Exception {
+ NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 3);
+
+ Assert.assertArrayEquals(
+ new String[]{
+ "ng=:a", "ng=:a:b", "ng=:a:b:c",
+ "ng=:b", "ng=:b:c", "ng=:b:c:d",
+ "ng=:c", "ng=:c:d", "ng=:c:d:e",
+ "ng=:d", "ng=:d:e", "ng=:d:e:f",
+ "ng=:e", "ng=:e:f", "ng=:e:f:g",
+ "ng=:f", "ng=:f:g",
+ "ng=:g"
+ },
+ generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+ }
+}