You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2017/05/17 15:52:30 UTC

[04/50] opennlp git commit: OPENNLP-1028: Add tests for FeatureGenerators in doccat. This closes apache/opennlp#166

OPENNLP-1028: Add tests for FeatureGenerators in doccat. This closes apache/opennlp#166


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/580e0d1e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/580e0d1e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/580e0d1e

Branch: refs/heads/LangDetect
Commit: 580e0d1e26ab2a9275f01506f3af56fe8fc32988
Parents: e220a72
Author: koji <ko...@apache.org>
Authored: Wed Apr 19 10:14:47 2017 +0900
Committer: koji <ko...@apache.org>
Committed: Wed Apr 19 10:14:47 2017 +0900

----------------------------------------------------------------------
 .../doccat/BagOfWordsFeatureGenerator.java      |   6 +-
 .../tools/doccat/NGramFeatureGenerator.java     |  11 +-
 .../doccat/BagOfWordsFeatureGeneratorTest.java  |  62 +++++++++
 .../tools/doccat/NGramFeatureGeneratorTest.java | 129 +++++++++++++++++++
 4 files changed, 201 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
index ac39afc..51a3277 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
@@ -21,6 +21,7 @@ package opennlp.tools.doccat;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Map;
+import java.util.Objects;
 
 import opennlp.tools.util.featuregen.StringPattern;
 
@@ -29,9 +30,10 @@ import opennlp.tools.util.featuregen.StringPattern;
  */
 public class BagOfWordsFeatureGenerator implements FeatureGenerator {
 
-  private boolean useOnlyAllLetterTokens = false;
+  private final boolean useOnlyAllLetterTokens;
 
   public BagOfWordsFeatureGenerator() {
+    this(false);
   }
 
   BagOfWordsFeatureGenerator(boolean useOnlyAllLetterTokens) {
@@ -40,7 +42,7 @@ public class BagOfWordsFeatureGenerator implements FeatureGenerator {
 
   @Override
   public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInformation) {
-
+    Objects.requireNonNull(text, "text must not be null");
     Collection<String> bagOfWords = new ArrayList<>(text.length);
 
     for (String word : text) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
index 967b105..6e1786f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
@@ -21,6 +21,7 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 
 import opennlp.tools.util.InvalidFormatException;
 
@@ -30,9 +31,8 @@ import opennlp.tools.util.InvalidFormatException;
  */
 public class NGramFeatureGenerator implements FeatureGenerator {
 
-  //default values for bigrams
-  private int minGram = 2;
-  private int maxGram = 2;
+  private final int minGram;
+  private final int maxGram;
 
   /**
    * Constructor for ngrams.
@@ -59,7 +59,8 @@ public class NGramFeatureGenerator implements FeatureGenerator {
   /**
    * Default constructor for Bi grams
    */
-  public NGramFeatureGenerator() {
+  public NGramFeatureGenerator() throws InvalidFormatException {
+    this(2, 2);
   }
 
   /**
@@ -70,7 +71,7 @@ public class NGramFeatureGenerator implements FeatureGenerator {
    * @return a collection of n gram features
    */
   public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInfo) {
-
+    Objects.requireNonNull(text, "text must not be null");
     List<String> features = new ArrayList<>();
 
     for (int i = 0; i <= text.length - minGram; i++) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java
new file mode 100644
index 0000000..2b128d9
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.doccat;
+
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class BagOfWordsFeatureGeneratorTest { // exercises BagOfWordsFeatureGenerator.extractFeatures
+
+  @Test
+  public void testNull() { // a null token array must raise NullPointerException
+    BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
+    try {
+      generator.extractFeatures(null, Collections.emptyMap());
+      Assert.fail("NullPointerException must be thrown"); // reached only if nothing was thrown
+    }
+    catch (NullPointerException expected) { // expected outcome; nothing further to check
+    }
+  }
+
+  @Test
+  public void testEmpty() { // an empty token array yields an empty feature collection
+    BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
+
+    Assert.assertEquals(0, generator.extractFeatures(new String[]{}, Collections.emptyMap()).size());
+  }
+
+  @Test
+  public void testUseAllTokens() { // default constructor: every token, "12.345" included, gets a "bow=" feature
+    BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
+
+    Assert.assertArrayEquals(new String[]{"bow=it", "bow=is", "bow=12.345", "bow=feet", "bow=long"},
+        generator.extractFeatures(new String[]{"it", "is", "12.345", "feet", "long"},
+            Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void testOnlyLetterTokens() { // flag set to true: "12.345" is absent from the expected features
+    BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator(true);
+
+    Assert.assertArrayEquals(new String[]{"bow=it", "bow=is", "bow=feet", "bow=long"},
+            generator.extractFeatures(new String[]{"it", "is", "12.345", "feet", "long"},
+                    Collections.emptyMap()).toArray());
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java
new file mode 100644
index 0000000..0aef3ea
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.doccat;
+
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.InvalidFormatException;
+
+public class NGramFeatureGeneratorTest { // exercises NGramFeatureGenerator construction and extractFeatures
+
+  static final String[] TOKENS = new String[]{"a", "b", "c", "d", "e", "f", "g"}; // shared seven-token input
+
+  @Test
+  public void testNull() throws Exception { // a null token array must raise NullPointerException
+    NGramFeatureGenerator generator = new NGramFeatureGenerator();
+    try {
+      generator.extractFeatures(null, Collections.emptyMap());
+      Assert.fail("NullPointerException must be thrown"); // reached only if nothing was thrown
+    }
+    catch (NullPointerException expected) { // expected outcome; nothing further to check
+    }
+  }
+
+  @Test
+  public void testEmpty() throws Exception { // an empty token array yields an empty feature collection
+    NGramFeatureGenerator generator = new NGramFeatureGenerator();
+
+    Assert.assertEquals(0, generator.extractFeatures(new String[]{}, Collections.emptyMap()).size());
+  }
+
+  @Test
+  public void testInvalidGramSize1() { // arguments (0, 1) must be rejected with InvalidFormatException
+    try {
+      new NGramFeatureGenerator(0, 1);
+      Assert.fail("InvalidFormatException must be thrown"); // reached only if construction succeeded
+    }
+    catch (InvalidFormatException expected) { // expected outcome; nothing further to check
+    }
+  }
+
+  @Test
+  public void testInvalidGramSize2() { // arguments (2, 1) must be rejected with InvalidFormatException
+    try {
+      new NGramFeatureGenerator(2, 1);
+      Assert.fail("InvalidFormatException must be thrown"); // reached only if construction succeeded
+    }
+    catch (InvalidFormatException expected) { // expected outcome; nothing further to check
+    }
+  }
+
+  @Test
+  public void testUnigram() throws Exception { // (1, 1): one "ng=:<token>" feature per token
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 1);
+
+    Assert.assertArrayEquals(
+            new String[]{"ng=:a", "ng=:b", "ng=:c", "ng=:d", "ng=:e", "ng=:f", "ng=:g"},
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void testBigram() throws Exception { // (2, 2): one feature per pair of adjacent tokens
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(2, 2);
+
+    Assert.assertArrayEquals(
+            new String[]{"ng=:a:b", "ng=:b:c", "ng=:c:d", "ng=:d:e", "ng=:e:f", "ng=:f:g"},
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void testTrigram() throws Exception { // (3, 3): one feature per run of three adjacent tokens
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(3, 3);
+
+    Assert.assertArrayEquals(
+            new String[]{"ng=:a:b:c", "ng=:b:c:d", "ng=:c:d:e", "ng=:d:e:f", "ng=:e:f:g"},
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void test12gram() throws Exception { // (1, 2): per start token, the unigram then the bigram
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 2);
+
+    Assert.assertArrayEquals(
+            new String[]{
+                "ng=:a", "ng=:a:b",
+                "ng=:b", "ng=:b:c",
+                "ng=:c", "ng=:c:d",
+                "ng=:d", "ng=:d:e",
+                "ng=:e", "ng=:e:f",
+                "ng=:f", "ng=:f:g",
+                "ng=:g"
+            },
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void test13gram() throws Exception { // (1, 3): per start token, lengths 1, 2, 3 while tokens remain
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 3);
+
+    Assert.assertArrayEquals(
+            new String[]{
+                "ng=:a", "ng=:a:b", "ng=:a:b:c",
+                "ng=:b", "ng=:b:c", "ng=:b:c:d",
+                "ng=:c", "ng=:c:d", "ng=:c:d:e",
+                "ng=:d", "ng=:d:e", "ng=:d:e:f",
+                "ng=:e", "ng=:e:f", "ng=:e:f:g",
+                "ng=:f", "ng=:f:g",
+                "ng=:g"
+            },
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+}