You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by co...@apache.org on 2017/05/17 15:52:27 UTC

[01/50] opennlp git commit: OPENNLP-1027: Add tests for Event. The previous commit doesn't have EventTest [Forced Update!]

Repository: opennlp
Updated Branches:
  refs/heads/LangDetect 661e5a104 -> 5a234de70 (forced update)


OPENNLP-1027: Add tests for Event. The previous commit doesn't have EventTest


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d8cdd5ee
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d8cdd5ee
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d8cdd5ee

Branch: refs/heads/LangDetect
Commit: d8cdd5eeffe9b63949129d09f6b0c7ad2d517e90
Parents: 0733d7c
Author: koji <ko...@apache.org>
Authored: Wed Apr 19 07:07:15 2017 +0900
Committer: koji <ko...@apache.org>
Committed: Wed Apr 19 07:07:15 2017 +0900

----------------------------------------------------------------------
 .../java/opennlp/tools/ml/model/EventTest.java  | 67 ++++++++++++++++++++
 1 file changed, 67 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/d8cdd5ee/opennlp-tools/src/test/java/opennlp/tools/ml/model/EventTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/model/EventTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/model/EventTest.java
new file mode 100644
index 0000000..7400e9e
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/model/EventTest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ml.model;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class EventTest {
+
+  @Test
+  public void testNullOutcome() {
+    try {
+      new Event(null, new String[]{"aa", "bb", "cc"});
+      Assert.fail("NPE must be thrown");
+    }
+    catch (NullPointerException expected) {
+    }
+  }
+
+  @Test
+  public void testNullContext() {
+    try {
+      new Event("o1", null);
+      Assert.fail("NPE must be thrown");
+    }
+    catch (NullPointerException expected) {
+    }
+  }
+
+  @Test
+  public void testWithValues() {
+    Event event = new Event("o1",
+            new String[]{"aa", "bb", "cc"});
+
+    Assert.assertEquals("o1", event.getOutcome());
+    Assert.assertArrayEquals(new String[]{"aa", "bb", "cc"}, event.getContext());
+    Assert.assertNull(event.getValues());
+    Assert.assertEquals("o1 [aa bb cc]", event.toString());
+  }
+
+  @Test
+  public void testWithoutValues() {
+    Event event = new Event("o1",
+            new String[]{"aa", "bb", "cc"},
+            new float[]{0.2F, 0.4F, 0.4F});
+
+    Assert.assertEquals("o1", event.getOutcome());
+    Assert.assertArrayEquals(new String[]{"aa", "bb", "cc"}, event.getContext());
+    Assert.assertArrayEquals(new float[]{0.2F, 0.4F, 0.4F}, event.getValues(), 0.001F);
+    Assert.assertEquals("o1 [aa=0.2 bb=0.4 cc=0.4]", event.toString());
+  }
+}


[33/50] opennlp git commit: [maven-release-plugin] prepare release opennlp-1.8.0

Posted by co...@apache.org.
[maven-release-plugin] prepare release opennlp-1.8.0


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/286e45b5
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/286e45b5
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/286e45b5

Branch: refs/heads/LangDetect
Commit: 286e45b549dcfd1e95e5d8cbb9d2549d73583d74
Parents: db9c511
Author: Jörn Kottmann <jo...@apache.org>
Authored: Tue May 9 18:26:19 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Tue May 9 18:26:19 2017 +0200

----------------------------------------------------------------------
 opennlp-brat-annotator/pom.xml   | 2 +-
 opennlp-distr/pom.xml            | 2 +-
 opennlp-docs/pom.xml             | 2 +-
 opennlp-morfologik-addon/pom.xml | 2 +-
 opennlp-tools/pom.xml            | 2 +-
 opennlp-uima/pom.xml             | 2 +-
 pom.xml                          | 4 ++--
 7 files changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/286e45b5/opennlp-brat-annotator/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 6c7be0d..008fd65 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -17,7 +17,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0-SNAPSHOT</version>
+		<version>1.8.0</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/286e45b5/opennlp-distr/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-distr/pom.xml b/opennlp-distr/pom.xml
index 3f838cd..317c37f 100644
--- a/opennlp-distr/pom.xml
+++ b/opennlp-distr/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0-SNAPSHOT</version>
+		<version>1.8.0</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/286e45b5/opennlp-docs/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/pom.xml b/opennlp-docs/pom.xml
index fbf0b5c..6b407b8 100644
--- a/opennlp-docs/pom.xml
+++ b/opennlp-docs/pom.xml
@@ -24,7 +24,7 @@
   <parent>
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0-SNAPSHOT</version>
+	<version>1.8.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
   

http://git-wip-us.apache.org/repos/asf/opennlp/blob/286e45b5/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index c46f101..bfae09f 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0-SNAPSHOT</version>
+		<version>1.8.0</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/286e45b5/opennlp-tools/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index a499375..573861b 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -25,7 +25,7 @@
   <parent>
     <groupId>org.apache.opennlp</groupId>
     <artifactId>opennlp</artifactId>
-    <version>1.8.0-SNAPSHOT</version>
+    <version>1.8.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/286e45b5/opennlp-uima/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-uima/pom.xml b/opennlp-uima/pom.xml
index 7cfdb72..1db9c38 100644
--- a/opennlp-uima/pom.xml
+++ b/opennlp-uima/pom.xml
@@ -25,7 +25,7 @@
 	<parent>
 	    <groupId>org.apache.opennlp</groupId>
 	    <artifactId>opennlp</artifactId>
-	    <version>1.8.0-SNAPSHOT</version>
+	    <version>1.8.0</version>
 	    <relativePath>../pom.xml</relativePath>
     </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/286e45b5/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2190a26..29a0699 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
 
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0-SNAPSHOT</version>
+	<version>1.8.0</version>
 	<packaging>pom</packaging>
 
 	<name>Apache OpenNLP Reactor</name>
@@ -40,7 +40,7 @@
 		<connection>scm:git:git@github.com:apache/opennlp.git</connection>
 		<developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/opennlp.git</developerConnection>
 		<url>https://git-wip-us.apache.org/repos/asf?p=opennlp.git</url>
-		<tag>HEAD</tag>
+		<tag>opennlp-1.8.0</tag>
 	</scm>
 
 	<mailingLists>


[49/50] opennlp git commit: OPENNLP-1066: Add MorfologikLemmatizer constructor taking a Dictionary

Posted by co...@apache.org.
OPENNLP-1066: Add MorfologikLemmatizer constructor taking a Dictionary

Closes #208


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/911d59f4
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/911d59f4
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/911d59f4

Branch: refs/heads/LangDetect
Commit: 911d59f46d0dce2e07a93dc0ff705154f568b7ba
Parents: 08b2c42
Author: William D C M SILVA <co...@apache.org>
Authored: Wed May 17 12:07:28 2017 -0300
Committer: William D C M SILVA <co...@apache.org>
Committed: Wed May 17 12:07:28 2017 -0300

----------------------------------------------------------------------
 .../opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java   | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/911d59f4/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
index 4c9de2c..22f4282 100644
--- a/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
+++ b/opennlp-morfologik-addon/src/main/java/opennlp/morfologik/lemmatizer/MorfologikLemmatizer.java
@@ -39,7 +39,12 @@ public class MorfologikLemmatizer implements Lemmatizer {
 
   public MorfologikLemmatizer(Path dictionaryPath) throws IllegalArgumentException,
       IOException {
-    dictLookup = new DictionaryLookup(Dictionary.read(dictionaryPath));
+    this(Dictionary.read(dictionaryPath));
+  }
+
+  public MorfologikLemmatizer(Dictionary dictionary) throws IllegalArgumentException,
+      IOException {
+    dictLookup = new DictionaryLookup(dictionary);
   }
 
   private List<String> lemmatize(String word, String postag) {


[18/50] opennlp git commit: OPENNLP-1035: Add unit tests and javadocs for BrownBigramFeatureGenerator, closes apache/opennlp#174

Posted by co...@apache.org.
OPENNLP-1035: Add unit tests and javadocs for BrownBigramFeatureGenerator, closes apache/opennlp#174


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/60595251
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/60595251
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/60595251

Branch: refs/heads/LangDetect
Commit: 60595251eec5979e14540c6d00043e24905a7404
Parents: 4060217
Author: jzonthemtn <je...@mtnfog.com>
Authored: Tue Apr 25 08:05:49 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Tue Apr 25 08:05:49 2017 -0400

----------------------------------------------------------------------
 .../featuregen/BrownBigramFeatureGenerator.java |  20 +-
 .../BrownBigramFeatureGeneratorTest.java        |  87 +++
 .../opennlp/tools/formats/brown-cluster.txt     | 665 +++++++++++++++++++
 3 files changed, 764 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
index 4f0a24a..f16ba97 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/BrownBigramFeatureGenerator.java
@@ -24,25 +24,30 @@ import java.util.List;
  */
 public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator {
 
-  private BrownCluster brownLexicon;
-
-  public BrownBigramFeatureGenerator(BrownCluster dict) {
-    this.brownLexicon = dict;
+  private BrownCluster brownCluster;
+
+  /**
+   * Creates a new Brown Cluster bigram feature generator.
+   * @param brownCluster A {@link BrownCluster}.
+   */
+  public BrownBigramFeatureGenerator(BrownCluster brownCluster) {
+    this.brownCluster = brownCluster;
   }
 
+  @Override
   public void createFeatures(List<String> features, String[] tokens, int index,
       String[] previousOutcomes) {
 
-    List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], brownLexicon);
+    List<String> wordClasses = BrownTokenClasses.getWordClasses(tokens[index], brownCluster);
     if (index > 0) {
-      List<String> prevWordClasses = BrownTokenClasses.getWordClasses(tokens[index - 1], brownLexicon);
+      List<String> prevWordClasses = BrownTokenClasses.getWordClasses(tokens[index - 1], brownCluster);
       for (int i = 0; i < wordClasses.size() && i < prevWordClasses.size(); i++)
       features.add("p" + "browncluster" + "," + "browncluster" + "="
           + prevWordClasses.get(i) + "," + wordClasses.get(i));
     }
 
     if (index + 1 < tokens.length) {
-      List<String> nextWordClasses = BrownTokenClasses.getWordClasses(tokens[index + 1], brownLexicon);
+      List<String> nextWordClasses = BrownTokenClasses.getWordClasses(tokens[index + 1], brownCluster);
       for (int i = 0; i < wordClasses.size() && i < nextWordClasses.size(); i++) {
         features.add("browncluster" + "," + "n" + "browncluster" + "="
             + wordClasses.get(i) + "," + nextWordClasses.get(i));
@@ -51,4 +56,3 @@ public class BrownBigramFeatureGenerator implements AdaptiveFeatureGenerator {
   }
 
 }
-

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
new file mode 100644
index 0000000..03810e8
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/BrownBigramFeatureGeneratorTest.java
@@ -0,0 +1,87 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+
+public class BrownBigramFeatureGeneratorTest {
+
+  private AdaptiveFeatureGenerator generator;
+  
+  @Before
+  public void setup() throws IOException {
+
+    ResourceAsStreamFactory stream = new ResourceAsStreamFactory(
+        getClass(), "/opennlp/tools/formats/brown-cluster.txt");
+
+    BrownCluster brownCluster = new BrownCluster(stream.createInputStream()); 
+    
+    generator = new BrownBigramFeatureGenerator(brownCluster);
+
+  }
+
+  @Test
+  public void createFeaturesTest() throws IOException {
+
+    String[] tokens = new String[] {"he", "went", "with", "you"};
+
+    List<String> features = new ArrayList<>();
+    generator.createFeatures(features, tokens, 3, null);
+
+    Assert.assertEquals(2, features.size());
+    Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010"));
+    Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101"));
+    
+  }
+  
+  @Test
+  public void createFeaturesSuccessiveTokensTest() throws IOException {
+
+    final String[] testSentence = new String[] {"he", "went", "with", "you", "in", "town"};
+
+    List<String> features = new ArrayList<>();
+    generator.createFeatures(features, testSentence, 3, null);
+
+    Assert.assertEquals(3, features.size());
+    Assert.assertTrue(features.contains("pbrowncluster,browncluster=0101,0010"));
+    Assert.assertTrue(features.contains("pbrowncluster,browncluster=01010,00101"));
+    Assert.assertTrue(features.contains("browncluster,nbrowncluster=0010,0000"));
+    
+  }
+  
+  @Test
+  public void noFeaturesTest() throws IOException {
+
+    final String[] testSentence = new String[] {"he", "went", "with", "you"};
+
+    List<String> features = new ArrayList<>();
+    generator.createFeatures(features, testSentence, 0, null);
+
+    Assert.assertEquals(0, features.size());
+    
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60595251/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
new file mode 100644
index 0000000..df31bc7
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/brown-cluster.txt
@@ -0,0 +1,665 @@
+0000	18,	1
+0000	wedding	1
+0000	A	1
+0000	No,	1
+0000	prefered	1
+0000	hurry	1
+0000	address?	1
+0000	sounds	1
+0000	any	1
+0000	soon,	1
+0000	in	56
+0000	Worcesterstreet	1
+00010	summer.	1
+00010	56473	1
+00010	different	1
+00010	20193	1
+00010	Ulm	1
+00010	17818	1
+00010	beautiful	1
+00010	23213	1
+00010	12424	1
+00010	Rue-de-Grandes-Illusions	1
+00010	good.	1
+00010	Barmerstr.	1
+00010	81737	1
+00010	order	1
+00010	1912	1
+00010	63737	1
+00010	Chesterstr.	1
+00010	80333	1
+00010	81234	1
+00010	that's	1
+00010	78181	1
+00010	30291	1
+00010	84630	1
+00010	25334	1
+00010	30303	2
+00010	Leipzig.	2
+00010	your	3
+00010	her	10
+000110	5.	1
+000110	Hamburg,	1
+000110	contact	1
+000110	faked.	1
+000110	streetname	1
+000110	34.	1
+000110	83939	1
+000110	25.	1
+000110	2.	1
+000110	part-time	1
+000110	help-wanted	1
+000110	11	1
+000110	some	1
+000110	Gauting.	1
+000110	address.	1
+000110	parent's	1
+000110	reply.	1
+000110	touch	1
+000110	Berlin.	5
+000110	Munich.	5
+000111	there,	1
+000111	Schulz	1
+000111	Paris	1
+000111	Edinburgh,	1
+000111	day	1
+000111	1	1
+000111	you?	1
+000111	saw	1
+000111	see	1
+000111	house	1
+000111	recently	1
+000111	Don't	1
+000111	back	1
+000111	apartment	1
+000111	12,	1
+000111	Are	2
+000111	Could	2
+000111	did	2
+000111	job	2
+000111	still	3
+000111	Thank	3
+000111	up	3
+00100	30202.	1
+00100	Yesterday,	1
+00100	ad	1
+00100	homesick,	1
+00100	Now,	1
+00100	man	1
+00100	help.	1
+00100	area.	1
+00100	"Westbad".	1
+00100	or	2
+00100	It's	2
+00100	It	2
+00100	The	7
+00100	As	3
+00101	Arent't	1
+00101	offer.	1
+00101	celebrated	1
+00101	available.	1
+00101	spontaneously.	1
+00101	sounding	1
+00101	party	2
+00101	you	12
+001100	last	1
+001100	called,	1
+001100	That	1
+001100	life	1
+001100	pointed	1
+001100	building	1
+001100	restaurant	1
+001100	5,	1
+001100	one	1
+001100	interested	1
+001100	located	1
+001100	Please	1
+001100	answered	1
+001100	Hospital	1
+001100	112,	2
+001100	arrived	3
+001100	lived	4
+001100	lives	4
+001101	Unter-den-Linden	1
+001101	this	1
+001101	moment.	1
+001101	tip	1
+001101	10th	1
+001101	reckon.	1
+001101	factory	1
+001101	line	1
+001101	Paracelsus	1
+001101	Alan	1
+001101	it's	2
+001101	company	2
+001101	who	4
+001110	didn't	1
+001110	postcode	1
+001110	police	1
+001110	building.	1
+001110	concierge	1
+001110	flaring	1
+001110	finally	3
+001110	she	7
+001110	Last	4
+001110	She	5
+0011110	Erding,	1
+0011110	Spain,	1
+0011110	resident,	1
+0011110	lady,	1
+0011110	later	1
+0011110	business	1
+0011110	idea	1
+0011110	Berlin	1
+0011110	England,	1
+0011110	Sure,	1
+0011110	,	10
+0011110	longer	1
+0011111	is.	1
+0011111	15	1
+0011111	Schneider	1
+0011111	Hinterhofer	1
+0011111	me.	1
+0011111	Our	1
+0011111	Seile	1
+0011111	Meier	1
+0011111	Bauer	1
+0011111	Sander	1
+0011111	Clara	1
+0011111	Schmidt	2
+0011111	minutes	2
+0011111	Miller	5
+0100	school	1
+0100	They	1
+0100	8	1
+0100	9	1
+0100	Europe.	1
+0100	those	1
+0100	Baumann,	1
+0100	a	38
+0100	high	1
+01010	About	1
+01010	has	1
+01010	us,	1
+01010	13,	1
+01010	university.	1
+01010	tell	1
+01010	On	2
+01010	than	2
+01010	An	2
+01010	Alisa	2
+01010	on	3
+01010	with	7
+01010	called	5
+01010	got	5
+01011	through	1
+01011	shoes?	1
+01011	city.	1
+01011	quickly	1
+01011	trauma,	1
+01011	situate	1
+01011	much!	1
+01011	then,	1
+01011	friday!	1
+01011	about	1
+01011	knew	2
+01011	of	17
+01011	him	3
+011000	drove	1
+011000	Yes,	1
+011000	away.	1
+011000	parents'	1
+011000	life-threatening,	1
+011000	Weilheim,	1
+011000	15.	1
+011000	33,	1
+011000	86th	1
+011000	1995.	1
+011000	apartment,	1
+011000	took	2
+011000	where	3
+011000	if	5
+011000	But	7
+011001	the	54
+011001	Blumenweg	1
+011010	problem	1
+011010	country	1
+011010	Her	1
+011010	rumour	1
+011010	middle-aged	1
+011010	police.	1
+011010	exhibition.	1
+011010	empty	1
+011010	hours	1
+011010	father	1
+011010	area	1
+011010	staff	1
+011010	Reichstag.	1
+011010	"Tapasbar"	1
+011010	to.	1
+011010	Lenbachhaus	1
+011010	complete	1
+011010	owner	1
+011010	1.	1
+011010	11,	1
+011010	15,	2
+011010	street	2
+011010	accident	2
+011010	Ostbahnhof	2
+011010	address	3
+0110110	help	1
+0110110	grateful	1
+0110110	singer	1
+0110110	new	1
+0110110	moment	1
+0110110	costumers	1
+0110110	ancestors.	1
+0110110	Schubert	1
+0110110	ups	1
+0110110	pedestrians.	1
+0110110	hint	1
+0110110	semester,	1
+0110110	aunt	1
+0110110	face-to-face,	1
+0110110	guests	1
+0110110	happy	1
+0110110	number	2
+0110110	6,	2
+0110110	name	8
+01101110	French	1
+01101110	Luise	1
+01101110	knowledge	1
+01101110	pictures	1
+01101110	them	2
+01101110	away	2
+01101110	out	4
+01101110	years	2
+01101111	pain,	1
+01101111	Is	1
+01101111	sign	1
+01101111	home,	1
+01101111	14,	1
+01101111	appreciated	1
+01101111	happened	1
+01101111	by	1
+01101111	point:	1
+01101111	opened	2
+01101111	near	4
+01101111	instantly	3
+01110	taxi	1
+01110	p.m.!	1
+01110	13	1
+01110	barbecue.	1
+01110	speed	1
+01110	tree.	1
+01110	tenant	1
+01110	metropolis	1
+01110	delivery	1
+01110	family	1
+01110	list	1
+01110	week.	1
+01110	student,	1
+01110	delicious	1
+01110	good	1
+01110	well-payed	1
+01110	student	1
+01110	person!	1
+01110	smaller	1
+01110	small	2
+01110	more	2
+01110	look	2
+01110	quite	2
+01110	bigger	2
+01110	young	2
+01110	tourist	2
+01110	great	3
+01110	letter	3
+01110	friend	4
+0111100	Elenor	1
+0111100	definitely	1
+0111100	Gina	1
+0111100	currently	1
+0111100	Marie	1
+0111100	McKennedy	1
+0111100	ten	1
+0111100	sometimes.	1
+0111100	Michael	1
+0111100	Michel	1
+0111100	competent	1
+0111100	Gerhard	1
+0111100	Stefanie	2
+0111100	five	2
+0111100	Mike	2
+0111100	Stefan	3
+0111101	particulary	1
+0111101	broken.	1
+0111101	10	1
+0111101	leather?	1
+0111101	grandaunt.	1
+0111101	90	1
+0111101	Julie	1
+0111101	badly	1
+0111101	you:	1
+0111101	July	1
+0111101	painfully	1
+0111101	founded	1
+0111101	Fernandes	1
+0111101	old	2
+0111101	elderly	2
+0111101	March	2
+0111101	him.	2
+0111101	2	2
+0111101	an	5
+0111110	6th	1
+0111110	Peter	1
+0111110	turbulent	1
+0111110	German	1
+0111110	informatics,	1
+0111110	phone	1
+0111110	October	1
+0111110	directly	1
+0111110	His	2
+0111110	My	4
+0111110	his	5
+0111110	our	5
+01111110	Oh	1
+01111110	mortal	1
+01111110	Natalie	1
+01111110	83454	1
+01111110	programming	1
+01111110	she's	2
+01111110	Hi	2
+01111110	that	9
+01111111	attention.	1
+01111111	central	1
+01111111	town.	1
+01111111	town	1
+01111111	Spanish	1
+01111111	lodge	1
+01111111	right	1
+01111111	married	2
+01111111	later,	2
+01111111	from	9
+01111111	local	2
+1000	information.	1
+1000	capital.	1
+1000	officer.	1
+1000	retired	1
+1000	most.	1
+1000	reception	1
+1000	wounds	1
+1000	12	1
+1000	personal	1
+1000	colour.	1
+1000	shoes	1
+1000	030/827234.	1
+1000	inquiries?	1
+1000	Brandenburger	1
+1000	computer...	1
+1000	underground	1
+1000	smalltown	1
+1000	city	2
+1000	only	2
+1000	first	4
+1000	home	3
+1000	woman	3
+1000	famous	4
+1001	multiple	1
+1001	France	1
+1001	care	1
+1001	burnt	1
+1001	birthday	1
+1001	there	2
+1001	they	3
+1001	it	8
+1001	He	4
+1001	which	4
+1010	Now	1
+1010	off	1
+1010	yes,	1
+1010	too.	1
+1010	and	30
+1010	56,	1
+10110	Euro,	1
+10110	Heidelberg.	1
+10110	countries,	1
+10110	injured.	1
+10110	widow.	1
+10110	danger.	1
+10110	fact	1
+10110	magazine.	1
+10110	12.	1
+10110	anniversary.	1
+10110	traditional	1
+10110	up,	1
+10110	that?	1
+10110	Fritsch.	1
+10110	amazing,	1
+10110	"Twentytwo".	1
+10110	am	1
+10110	Ottobrunn.	1
+10110	years.	1
+10110	her.	1
+10110	whom	2
+10110	Hamburg.	4
+10110	.	4
+10110	So	6
+10111	photo	1
+10111	place.	1
+10111	p.m..	1
+10111	Heidelberg's	1
+10111	September,	1
+10111	21,	1
+10111	jacket,	1
+10111	anyway,	1
+10111	Therefore,	1
+10111	couple,	1
+10111	so	2
+10111	When	2
+10111	year,	3
+10111	husband	2
+1100	place,	1
+1100	Convulsed	1
+1100	Driving	1
+1100	notable	1
+1100	album	1
+1100	meal.	1
+1100	I've	2
+1100	Hi,	2
+1100	We	2
+1100	I	37
+110100	takes	1
+110100	reported	1
+110100	is	15
+110100	wasn't	3
+110101	Bye!	1
+110101	He's	1
+110101	bike	1
+110101	can	1
+110101	agency	1
+110101	Highfly-Hotel	1
+110101	shop	1
+110101	"Daily's"	1
+110101	was	15
+110101	depended	1
+110110	Afterwards,	1
+110110	maps.	1
+110110	Lenbachhaus.	1
+110110	flair	1
+110110	immediately	1
+110110	weren't	1
+110110	addresses	1
+110110	desk	1
+110110	station	1
+110110	I'll	1
+110110	Tor	1
+110110	hospital	1
+110110	because	2
+110110	own	2
+110110	into	6
+110110	as	4
+1101110	frequented	1
+1101110	yet	1
+1101110	Since	1
+1101110	made	1
+1101110	what	1
+1101110	he	9
+1101110	information	2
+1101111	Italian.	1
+1101111	entertainer	1
+1101111	foreign	1
+1101111	delighted.	1
+1101111	George	3
+1101111	we	7
+111000	wrote	1
+111000	hadnt't	1
+111000	looking	1
+111000	just	1
+111000	realized	1
+111000	their	1
+111000	never	1
+111000	love	1
+111000	brought	2
+111000	really	2
+111000	heard	2
+111000	Although	2
+111000	like	7
+1110010	live	1
+1110010	don't	1
+1110010	injured	1
+1110010	first,	1
+1110010	hope	1
+1110010	want	1
+1110010	didn`t	1
+1110010	knows	1
+1110010	merely	1
+1110010	two	1
+1110010	worked	2
+1110010	tried	2
+1110010	no	2
+1110010	moved	4
+1110010	best	2
+1110011	need	1
+1110011	always	1
+1110011	alone	1
+1110011	liked	1
+1110011	forward	1
+1110011	proposed	1
+1110011	came	1
+1110011	talking	1
+1110011	pick	1
+1110011	told	2
+1110011	went	2
+1110011	decided	3
+1110011	wanted	3
+1110011	how	3
+1110011	have	4
+1110100	gave	1
+1110100	downs	1
+1110100	appartment	1
+1110100	hospital.	1
+1110100	last-minute.	1
+1110100	languages,	1
+1110100	sights,	1
+1110100	enjoyed	1
+1110100	I'm	6
+1110100	I'd	4
+1110101	felt	1
+1110101	flames	1
+1110101	enjoy	1
+1110101	deem	1
+1110101	called?	1
+1110101	hardly	1
+1110101	spent	1
+1110101	asked	2
+1110101	had	7
+1110101	found	3
+1110110	Munich,	1
+1110110	Scotland,	1
+1110110	day,	1
+1110110	study	1
+1110110	friend.	1
+1110110	after	1
+1110110	apartments	1
+1110110	show	1
+1110110	there.	1
+1110110	read	2
+1110110	get	3
+1110110	know	6
+1110111	right?	1
+1110111	soon	1
+1110111	uni.	1
+1110111	ambulance.	1
+1110111	Sunday	1
+1110111	before.	1
+1110111	possible.	1
+1110111	my	9
+1110111	he'd	2
+111100	you'll	1
+111100	?	1
+111100	not	2
+111100	to	42
+111101	it.	1
+111101	call	1
+111101	One	1
+111101	Bruno	1
+111101	once	1
+111101	around	1
+111101	for	7
+111101	at	13
+1111100	Hauptbahnhof?	1
+1111100	hesitant	1
+1111100	visit	1
+1111100	completely	1
+1111100	start	1
+1111100	managed	1
+1111100	money	1
+1111100	go	1
+1111100	offered	1
+1111100	possible	1
+1111100	afford	1
+1111100	driver	2
+1111100	write	3
+1111100	easy	2
+1111101	relaxed	1
+1111101	simply	1
+1111101	sure.	1
+1111101	starts	1
+1111101	friendly	1
+1111101	give	1
+1111101	sitting	1
+1111101	going	1
+1111101	urgent	1
+1111101	please	2
+1111101	next	3
+1111101	very	6
+1111110	who's	1
+1111110	much,	1
+1111110	friday?	1
+1111110	explained	1
+1111110	met	1
+1111110	Where	1
+1111110	How	2
+1111110	much	2
+1111110	are	2
+1111110	could	2
+1111110	me	6
+1111110	enough	3
+1111111	seen	1
+1111111	papers	1
+1111111	"Mondnacht"	1
+1111111	both.	1
+1111111	crashed	1
+1111111	studies	1
+1111111	bring	1
+1111111	pull	1
+1111111	teacher	1
+1111111	boy	1
+1111111	far	1
+1111111	move	1
+1111111	travelling	1
+1111111	Yeah	2
+1111111	ring	2
+1111111	meet	2
+1111111	find	5
+1111111	be	3
\ No newline at end of file


[24/50] opennlp git commit: OPENNLP-1037: OpenNLP build fails if only the eval tests are run

Posted by co...@apache.org.
OPENNLP-1037: OpenNLP build fails if only the eval tests are run


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d44fe15b
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d44fe15b
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d44fe15b

Branch: refs/heads/LangDetect
Commit: d44fe15b1bc46639d7ba6b3eb58d8441985ae345
Parents: 9803662
Author: Peter Thygesen <th...@apache.org>
Authored: Wed May 3 20:55:07 2017 +0200
Committer: Peter Thygesen <th...@apache.org>
Committed: Wed May 3 20:55:07 2017 +0200

----------------------------------------------------------------------
 opennlp-tools/pom.xml | 11 +----------
 pom.xml               | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 10 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/d44fe15b/opennlp-tools/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index 663e903..a499375 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -73,16 +73,7 @@
       </resource>
     </resources>
     <plugins>
-      <plugin>
-        <groupId>org.apache.maven.plugins</groupId>
-        <artifactId>maven-surefire-plugin</artifactId>
-        <configuration>
-          <excludes>
-            <exclude>/opennlp/tools/eval/**/*</exclude>
-          </excludes>
-        </configuration>
-      </plugin>
-			
+
       <plugin>
         <groupId>org.apache.maven.plugins</groupId>
         <artifactId>maven-jar-plugin</artifactId>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d44fe15b/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 12c9ee6..2190a26 100644
--- a/pom.xml
+++ b/pom.xml
@@ -228,6 +228,7 @@
 					<version>${maven.surefire.plugin}</version>
 					<configuration>
 						<forkCount>${opennlp.forkCount}</forkCount>
+						<failIfNoSpecifiedTests>false</failIfNoSpecifiedTests>
 						<excludes>
 							<exclude>**/stemmer/*</exclude>
 							<exclude>**/stemmer/snowball/*</exclude>
@@ -416,6 +417,30 @@
 				</plugins>
 			</build>
 		</profile>
+
+		<profile>
+			<id>eval</id>
+			<activation>
+				<property>
+					<name>OPENNLP_DATA_DIR</name>
+				</property>
+			</activation>
+			<build>
+				<plugins>
+					<plugin>
+						<groupId>org.apache.maven.plugins</groupId>
+						<artifactId>maven-surefire-plugin</artifactId>
+						<version>${maven.surefire.plugin}</version>
+						<configuration>
+							<includes>
+								<include>**/*Test.java</include>
+								<include>**/SourceForgeModelEval.java</include>
+							</includes>
+						</configuration>
+					</plugin>
+				</plugins>
+			</build>
+		</profile>
 	</profiles>
 
 	<modules>


[20/50] opennlp git commit: NoJira: Trivial fixes, use try-with-resources, closes apache/opennlp#186

Posted by co...@apache.org.
NoJira: Trivial fixes, use try-with-resources, closes apache/opennlp#186


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/98036628
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/98036628
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/98036628

Branch: refs/heads/LangDetect
Commit: 980366284db98e0515dc5e827aaf4f750699bc80
Parents: bbbb431
Author: smarthi <sm...@apache.org>
Authored: Thu Apr 27 18:40:23 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Thu Apr 27 18:40:23 2017 -0400

----------------------------------------------------------------------
 .../tools/cmdline/TokenNameFinderToolTest.java  | 20 +++++--------------
 .../convert/FileToStringSampleStreamTest.java   | 21 ++++++++++----------
 2 files changed, 15 insertions(+), 26 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/98036628/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
index ba02e50..a163b0c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
@@ -95,13 +95,11 @@ public class TokenNameFinderToolTest {
   }
   
   private File trainModel() throws IOException {
-    
-    String encoding = "ISO-8859-1";
 
     ObjectStream<String> lineStream =
         new PlainTextByLineStream(new MockInputStreamFactory(
-            new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")), encoding);
-    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
+            new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")),
+            StandardCharsets.ISO_8859_1);
 
     TrainingParameters params = new TrainingParameters();
     params.put(TrainingParameters.ITERATIONS_PARAM, 70);
@@ -111,24 +109,16 @@ public class TokenNameFinderToolTest {
 
     TokenNameFinderFactory nameFinderFactory = new TokenNameFinderFactory();
 
-    try {
+    try (ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream)) {
       model = NameFinderME.train("en", null, sampleStream, params,
           nameFinderFactory);
     }
-    finally {
-      sampleStream.close();
-    }
-
-    BufferedOutputStream modelOut = null;
     
     File modelFile = File.createTempFile("model", ".bin");
     
-    try {
-      modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+    try (BufferedOutputStream modelOut =
+             new BufferedOutputStream(new FileOutputStream(modelFile))) {
       model.serialize(modelOut);
-    } finally {
-      if (modelOut != null) 
-       modelOut.close();    
     }
     
     return modelFile;

http://git-wip-us.apache.org/repos/asf/opennlp/blob/98036628/opennlp-tools/src/test/java/opennlp/tools/convert/FileToStringSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/convert/FileToStringSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/convert/FileToStringSampleStreamTest.java
index 6f6f7dc..e9f3892 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/convert/FileToStringSampleStreamTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/convert/FileToStringSampleStreamTest.java
@@ -45,7 +45,8 @@ public class FileToStringSampleStreamTest {
   
     List<String> sentences = Arrays.asList(sentence1, sentence2);
     
-    DirectorySampleStream directorySampleStream = new DirectorySampleStream(directory.getRoot(), null, false);
+    DirectorySampleStream directorySampleStream =
+        new DirectorySampleStream(directory.getRoot(), null, false);
       
     File tempFile1 = directory.newFile();
     FileUtils.writeStringToFile(tempFile1, sentence1);
@@ -53,17 +54,15 @@ public class FileToStringSampleStreamTest {
     File tempFile2 = directory.newFile();
     FileUtils.writeStringToFile(tempFile2, sentence2);
     
-    FileToStringSampleStream stream = 
-        new FileToStringSampleStream(directorySampleStream, Charset.defaultCharset());
+    try (FileToStringSampleStream stream =
+        new FileToStringSampleStream(directorySampleStream, Charset.defaultCharset())) {
 
-    String read = stream.read();    
-    Assert.assertTrue(sentences.contains(read));
-    
-    read = stream.read();    
-    Assert.assertTrue(sentences.contains(read));
-    
-    stream.close();
-    
+      String read = stream.read();
+      Assert.assertTrue(sentences.contains(read));
+
+      read = stream.read();
+      Assert.assertTrue(sentences.contains(read));
+    }
   }
 
 }


[07/50] opennlp git commit: OPENNLP-1033: Adds unit tests for opennlp.tools.ngram, closes apache/opennlp#172

Posted by co...@apache.org.
OPENNLP-1033: Adds unit tests for opennlp.tools.ngram, closes apache/opennlp#172


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3ba27e9f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3ba27e9f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3ba27e9f

Branch: refs/heads/LangDetect
Commit: 3ba27e9f4a29be1922b3a19f9c6c5127f93027ab
Parents: d447459
Author: jzonthemtn <je...@mtnfog.com>
Authored: Wed Apr 19 15:53:32 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Wed Apr 19 15:53:32 2017 -0400

----------------------------------------------------------------------
 .../java/opennlp/tools/ngram/NGramModel.java    |  6 +-
 .../opennlp/tools/ngram/NGramGeneratorTest.java | 91 ++++++++++++++++++
 .../opennlp/tools/ngram/NGramModelTest.java     | 98 ++++++++++++++------
 .../tools/ngram/ngram-model-no-count.xml        | 27 ++++++
 .../tools/ngram/ngram-model-not-a-number.xml    | 27 ++++++
 5 files changed, 222 insertions(+), 27 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
index 7005dc4..0e0e4dd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ngram/NGramModel.java
@@ -216,6 +216,7 @@ public class NGramModel implements Iterable<StringList> {
    *
    * @return iterator over all grams
    */
+  @Override
   public Iterator<StringList> iterator() {
     return mNGrams.keySet().iterator();
   }
@@ -306,10 +307,12 @@ public class NGramModel implements Iterable<StringList> {
     {
       private Iterator<StringList> mDictionaryIterator = NGramModel.this.iterator();
 
+      @Override
       public boolean hasNext() {
         return mDictionaryIterator.hasNext();
       }
 
+      @Override
       public Entry next() {
 
         StringList tokens = mDictionaryIterator.next();
@@ -317,10 +320,11 @@ public class NGramModel implements Iterable<StringList> {
         Attributes attributes = new Attributes();
 
         attributes.setValue(COUNT, Integer.toString(getCount(tokens)));
-
+        
         return new Entry(tokens, attributes);
       }
 
+      @Override
       public void remove() {
         throw new UnsupportedOperationException();
       }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java
new file mode 100644
index 0000000..b1da5d6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramGeneratorTest.java
@@ -0,0 +1,91 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.ngram;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class NGramGeneratorTest {
+  
+  @Test
+  public void generateListTest() {
+    
+    final List<String> input = Arrays.asList("This", "is", "a", "sentence");
+    final int window = 2;
+    final String separator = "-";
+    
+    final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+    
+    Assert.assertEquals(3,  ngrams.size());
+    Assert.assertTrue(ngrams.contains("This-is"));
+    Assert.assertTrue(ngrams.contains("is-a"));
+    Assert.assertTrue(ngrams.contains("a-sentence"));
+    
+  }
+  
+  @Test
+  public void generateCharTest() {
+    
+    final char[] input = "Test again".toCharArray();
+    final int window = 4;
+    final String separator = "-";
+    
+    final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+
+    Assert.assertEquals(7,  ngrams.size());
+    Assert.assertTrue(ngrams.contains("T-e-s-t"));
+    Assert.assertTrue(ngrams.contains("e-s-t- "));
+    Assert.assertTrue(ngrams.contains("s-t- -a"));
+    Assert.assertTrue(ngrams.contains("t- -a-g"));
+    Assert.assertTrue(ngrams.contains(" -a-g-a"));
+    Assert.assertTrue(ngrams.contains("a-g-a-i"));
+    Assert.assertTrue(ngrams.contains("g-a-i-n"));
+    
+  }
+  
+  @Test
+  public void generateLargerWindowThanListTest() {
+    
+    final List<String> input = Arrays.asList("One", "two");
+    final int window = 3;
+    final String separator = "-";
+    
+    final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+    
+    Assert.assertTrue(ngrams.isEmpty());
+    
+  }
+  
+  @Test
+  public void emptyTest() {
+    
+    final List<String> input = new ArrayList<>();
+    final int window = 2;
+    final String separator = "-";
+    
+    final List<String> ngrams = NGramGenerator.generate(input, window, separator);
+
+    Assert.assertTrue(ngrams.isEmpty());
+    
+  }
+  
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
index e4fb43d..47c228c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ngram/NGramModelTest.java
@@ -17,16 +17,17 @@
 
 package opennlp.tools.ngram;
 
+import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.InputStream;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 
-import org.apache.commons.io.IOUtils;
 import org.junit.Assert;
-import org.junit.Ignore;
 import org.junit.Test;
 
 import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.StringList;
 
 /**
@@ -169,31 +170,76 @@ public class NGramModelTest {
     Assert.assertEquals(1, dictionary.getMinTokenCount());
     Assert.assertEquals(3, dictionary.getMaxTokenCount());
   }
-
-  @Ignore
+  
+  @Test(expected = InvalidFormatException.class)
+  public void testInvalidFormat() throws Exception {
+    InputStream stream = new ByteArrayInputStream("inputstring".getBytes(StandardCharsets.UTF_8));
+    NGramModel ngramModel = new NGramModel(stream);
+    stream.close();
+    ngramModel.toDictionary(true);
+  }
+  
+  @Test
+  public void testFromFile() throws Exception {
+    InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml");
+    NGramModel ngramModel = new NGramModel(stream);
+    stream.close();
+    Dictionary dictionary = ngramModel.toDictionary(true);
+    Assert.assertNotNull(dictionary);
+    Assert.assertEquals(14, dictionary.size());
+    Assert.assertEquals(3, dictionary.getMaxTokenCount());
+    Assert.assertEquals(1, dictionary.getMinTokenCount());
+  }
+  
   @Test
   public void testSerialize() throws Exception {
-    NGramModel ngramModel = new NGramModel();
-    StringList tokens = new StringList("the", "brown", "fox", "jumped");
-    ngramModel.add(tokens, 1, 3);
-    tokens = new StringList("the", "brown", "Fox", "jumped");
-    ngramModel.add(tokens, 1, 3);
-    ByteArrayOutputStream out = new ByteArrayOutputStream();
-    ngramModel.serialize(out);
-    Assert.assertNotNull(out);
-    InputStream nGramModelStream = getClass()
-        .getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml");
-    String modelString = IOUtils.toString(nGramModelStream);
-    // remove AL header
-    int start = modelString.indexOf("<!--");
-    int end = modelString.indexOf("-->");
-    String asfHeaderString = modelString.substring(start, end + 3);
-    modelString = modelString.replace(asfHeaderString, "");
-    String outputString = out.toString(Charset.forName("UTF-8").name());
-    Assert.assertEquals(
-        modelString.replaceAll("\n", "").replaceAll("\r", "")
-            .replaceAll("\t", "").replaceAll(" ", ""),
-        outputString.replaceAll("\n", "").replaceAll("\r", "")
-            .replaceAll("\t", "").replaceAll(" ", ""));
+   
+    InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model.xml");
+    
+    NGramModel ngramModel1 = new NGramModel(stream);
+    stream.close();
+    
+    Dictionary dictionary = ngramModel1.toDictionary(true);
+    Assert.assertNotNull(dictionary);
+    Assert.assertEquals(14, dictionary.size());
+    Assert.assertEquals(3, dictionary.getMaxTokenCount());
+    Assert.assertEquals(1, dictionary.getMinTokenCount());
+    
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    ngramModel1.serialize(baos);
+    
+    final String serialized = new String(baos.toByteArray(), Charset.defaultCharset());
+    InputStream inputStream = new ByteArrayInputStream(serialized.getBytes(StandardCharsets.UTF_8));
+        
+    NGramModel ngramModel2 = new NGramModel(inputStream);
+    stream.close();
+        
+    Assert.assertEquals(ngramModel2.numberOfGrams(), ngramModel2.numberOfGrams());
+    Assert.assertEquals(ngramModel2.size(), ngramModel2.size());
+    
+    dictionary = ngramModel2.toDictionary(true);
+    
+    Assert.assertNotNull(dictionary);
+    Assert.assertEquals(14, dictionary.size());
+    Assert.assertEquals(3, dictionary.getMaxTokenCount());
+    Assert.assertEquals(1, dictionary.getMinTokenCount());
+    
   }
+  
+  @Test(expected = InvalidFormatException.class)
+  public void testFromInvalidFileMissingCount() throws Exception {
+    InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model-no-count.xml");
+    NGramModel ngramModel = new NGramModel(stream);
+    stream.close();
+    ngramModel.toDictionary(true);
+  }
+  
+  @Test(expected = InvalidFormatException.class)
+  public void testFromInvalidFileNotANumber() throws Exception {
+    InputStream stream = getClass().getResourceAsStream("/opennlp/tools/ngram/ngram-model-not-a-number.xml");
+    NGramModel ngramModel = new NGramModel(stream);
+    stream.close();
+    ngramModel.toDictionary(true);
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml
new file mode 100644
index 0000000..62a1d90
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-no-count.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<dictionary case_sensitive="false">
+    <entry>
+        <token>brown</token>
+        <token>fox</token>
+    </entry>
+</dictionary>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ba27e9f/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml
new file mode 100644
index 0000000..e132ea4
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/ngram/ngram-model-not-a-number.xml
@@ -0,0 +1,27 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.
+-->
+
+<dictionary case_sensitive="false">
+    <entry count="asdf">
+        <token>brown</token>
+        <token>fox</token>
+    </entry>
+</dictionary>


[17/50] opennlp git commit: OPENNLP-1040: Add OntoNotes4 training data verification

Posted by co...@apache.org.
OPENNLP-1040: Add OntoNotes4 training data verification


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/40602173
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/40602173
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/40602173

Branch: refs/heads/LangDetect
Commit: 406021733baf6cdd339d7b14a413b2ffeeaae42d
Parents: 32afb6a
Author: Jörn Kottmann <jo...@apache.org>
Authored: Fri Apr 21 12:57:19 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon Apr 24 12:49:20 2017 +0200

----------------------------------------------------------------------
 .../tools/eval/OntoNotes4NameFinderEval.java    | 56 +++++++++++++++-----
 .../tools/eval/OntoNotes4ParserEval.java        | 45 ++++++++++++----
 .../tools/eval/OntoNotes4PosTaggerEval.java     | 45 ++++++++++++----
 3 files changed, 116 insertions(+), 30 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index e0e3912..ef018cd 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -19,9 +19,13 @@ package opennlp.tools.eval;
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.charset.Charset;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import opennlp.tools.formats.DirectorySampleStream;
@@ -37,9 +41,7 @@ import opennlp.tools.util.model.ModelUtil;
 
 public class OntoNotes4NameFinderEval {
 
-  private static void crossEval(TrainingParameters params, String type, double expectedScore)
-      throws IOException {
-
+  private static ObjectStream<NameSample> createNameSampleStream() throws IOException {
     ObjectStream<File> documentStream = new DirectorySampleStream(new File(
         EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
         file -> {
@@ -50,19 +52,49 @@ public class OntoNotes4NameFinderEval {
           return file.isDirectory();
         }, true);
 
-    ObjectStream<NameSample> samples = new OntoNotesNameSampleStream(new FileToStringSampleStream(
-        documentStream, Charset.forName("UTF-8")));
+    return new OntoNotesNameSampleStream(new FileToStringSampleStream(
+        documentStream, StandardCharsets.UTF_8));
+  }
+
+  private static void crossEval(TrainingParameters params, String type, double expectedScore)
+      throws IOException {
+    try (ObjectStream<NameSample> samples = createNameSampleStream()) {
 
-    TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", null,
-        params, new TokenNameFinderFactory());
+      TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", null,
+          params, new TokenNameFinderFactory());
 
-    if (type != null) {
-      samples = new NameSampleTypeFilter(new String[] {type}, samples);
+      ObjectStream<NameSample> filteredSamples;
+      if (type != null) {
+        filteredSamples = new NameSampleTypeFilter(new String[] {type}, samples);
+      }
+      else {
+        filteredSamples = samples;
+      }
+
+      cv.evaluate(filteredSamples, 10);
+
+      Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+    }
+  }
+
+  @BeforeClass
+  public static void verifyTrainingData() throws IOException {
+    MessageDigest digest;
+    try {
+      digest = MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException e) {
+      throw new IllegalStateException(e);
     }
 
-    cv.evaluate(samples, 10);
+    try (ObjectStream<NameSample> samples = createNameSampleStream()) {
+      NameSample sample;
+      while ((sample = samples.read()) != null) {
+        digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+      }
 
-    Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+      Assert.assertEquals(new BigInteger("168206908604555450993491898907821588182"),
+          new BigInteger(1, digest.digest()));
+    }
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
index 2182957..3a5b30d 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
@@ -21,9 +21,13 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
-import java.nio.charset.Charset;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import opennlp.tools.formats.DirectorySampleStream;
@@ -31,6 +35,7 @@ import opennlp.tools.formats.convert.FileToStringSampleStream;
 import opennlp.tools.formats.ontonotes.DocumentToLineStream;
 import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStream;
 import opennlp.tools.parser.HeadRules;
+import opennlp.tools.parser.Parse;
 import opennlp.tools.parser.ParserCrossValidator;
 import opennlp.tools.parser.ParserType;
 import opennlp.tools.parser.lang.en.HeadRulesTest;
@@ -40,9 +45,7 @@ import opennlp.tools.util.model.ModelUtil;
 
 public class OntoNotes4ParserEval {
 
-  private static void crossEval(TrainingParameters params, HeadRules rules, double expectedScore)
-      throws IOException {
-
+  private static ObjectStream<Parse> createParseSampleStream() throws IOException {
     ObjectStream<File> documentStream = new DirectorySampleStream(new File(
         EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
         file -> {
@@ -53,15 +56,39 @@ public class OntoNotes4ParserEval {
           return file.isDirectory();
         }, true);
 
-    OntoNotesParseSampleStream samples = new OntoNotesParseSampleStream(
+    return new OntoNotesParseSampleStream(
         new DocumentToLineStream(new FileToStringSampleStream(
-            documentStream, Charset.forName("UTF-8"))));
+            documentStream, StandardCharsets.UTF_8)));
+  }
+
+  private static void crossEval(TrainingParameters params, HeadRules rules, double expectedScore)
+      throws IOException {
+    try (ObjectStream<Parse> samples = createParseSampleStream()) {
+      ParserCrossValidator cv = new ParserCrossValidator("en", params, rules, ParserType.CHUNKING);
+      cv.evaluate(samples, 10);
+
+      Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+    }
+  }
 
-    ParserCrossValidator cv = new ParserCrossValidator("en", params, rules, ParserType.CHUNKING);
+  @BeforeClass
+  public static void verifyTrainingData() throws IOException {
+    MessageDigest digest;
+    try {
+      digest = MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException e) {
+      throw new IllegalStateException(e);
+    }
 
-    cv.evaluate(samples, 10);
+    try (ObjectStream<Parse> samples = createParseSampleStream()) {
+      Parse sample;
+      while ((sample = samples.read()) != null) {
+        digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+      }
 
-    Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+      Assert.assertEquals(new BigInteger("83833369887442127665956850482411800415"),
+          new BigInteger(1, digest.digest()));
+    }
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/opennlp/blob/40602173/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
index ab33568..b171978 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
@@ -19,9 +19,13 @@ package opennlp.tools.eval;
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.charset.Charset;
+import java.math.BigInteger;
+import java.nio.charset.StandardCharsets;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import opennlp.tools.formats.DirectorySampleStream;
@@ -29,6 +33,7 @@ import opennlp.tools.formats.convert.FileToStringSampleStream;
 import opennlp.tools.formats.convert.ParseToPOSSampleStream;
 import opennlp.tools.formats.ontonotes.DocumentToLineStream;
 import opennlp.tools.formats.ontonotes.OntoNotesParseSampleStream;
+import opennlp.tools.postag.POSSample;
 import opennlp.tools.postag.POSTaggerCrossValidator;
 import opennlp.tools.postag.POSTaggerFactory;
 import opennlp.tools.util.ObjectStream;
@@ -37,9 +42,7 @@ import opennlp.tools.util.model.ModelUtil;
 
 public class OntoNotes4PosTaggerEval {
 
-  private static void crossEval(TrainingParameters params, double expectedScore)
-      throws IOException {
-
+  private static ObjectStream<POSSample> createPOSSampleStream() throws IOException {
     ObjectStream<File> documentStream = new DirectorySampleStream(new File(
         EvalUtil.getOpennlpDataDir(), "ontonotes4/data/files/data/english"),
         file -> {
@@ -50,16 +53,40 @@ public class OntoNotes4PosTaggerEval {
           return file.isDirectory();
         }, true);
 
-    ParseToPOSSampleStream samples = new ParseToPOSSampleStream(new OntoNotesParseSampleStream(
+    return new ParseToPOSSampleStream(new OntoNotesParseSampleStream(
         new DocumentToLineStream(
-            new FileToStringSampleStream(documentStream, Charset.forName("UTF-8")))));
+            new FileToStringSampleStream(documentStream, StandardCharsets.UTF_8))));
+  }
 
-    POSTaggerCrossValidator cv = new POSTaggerCrossValidator("en", params, new POSTaggerFactory());
-    cv.evaluate(samples, 10);
+  private static void crossEval(TrainingParameters params, double expectedScore)
+      throws IOException {
+    try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
+      POSTaggerCrossValidator cv = new POSTaggerCrossValidator("en", params, new POSTaggerFactory());
+      cv.evaluate(samples, 10);
 
-    Assert.assertEquals(expectedScore, cv.getWordAccuracy(), 0.0001d);
+      Assert.assertEquals(expectedScore, cv.getWordAccuracy(), 0.0001d);
+    }
   }
 
+  @BeforeClass
+  public static void verifyTrainingData() throws IOException {
+    MessageDigest digest;
+    try {
+      digest = MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException e) {
+      throw new IllegalStateException(e);
+    }
+
+    try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
+      POSSample sample;
+      while ((sample = samples.read()) != null) {
+        digest.update(sample.toString().getBytes(StandardCharsets.UTF_8));
+      }
+
+      Assert.assertEquals(new BigInteger("300430765214895870888056958221353356972"),
+          new BigInteger(1, digest.digest()));
+    }
+  }
   @Test
   public void evalEnglishMaxentTagger() throws IOException {
     crossEval(ModelUtil.createDefaultTrainingParameters(), 0.9698145168879707d);


[36/50] opennlp git commit: OPENNLP-1056: Fix NullPointerException in DictionaryLemmatizer

Posted by co...@apache.org.
OPENNLP-1056: Fix NullPointerException in DictionaryLemmatizer


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/cb6ee2cb
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/cb6ee2cb
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/cb6ee2cb

Branch: refs/heads/LangDetect
Commit: cb6ee2cbdeadad5d277a6e7293d88bb915090c4f
Parents: 60792b8
Author: Daniel Russ <dr...@mail.nih.gov>
Authored: Thu May 11 10:56:58 2017 -0400
Committer: Daniel Russ <dr...@mail.nih.gov>
Committed: Thu May 11 11:06:49 2017 -0400

----------------------------------------------------------------------
 .../tools/lemmatizer/DictionaryLemmatizer.java  |  2 +-
 .../lemmatizer/DictionaryLemmatizerTest.java    | 49 ++++++++++++++++++++
 .../tools/lemmatizer/smalldictionary.dict       |  5 ++
 3 files changed, 55 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/cb6ee2cb/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
index 9f0b0b0..37d488c 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
@@ -114,7 +114,7 @@ public class DictionaryLemmatizer implements Lemmatizer {
     final List<String> keys = this.getDictKeys(word, postag);
     // lookup lemma as value of the map
     final List<String> keyValues = this.dictMap.get(keys);
-    if (!keyValues.isEmpty()) {
+    if ( keyValues != null && !keyValues.isEmpty()) {
       lemma = keyValues.get(0);
     } else {
       lemma = "O";

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cb6ee2cb/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerTest.java
new file mode 100644
index 0000000..6cf72cf
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.lemmatizer;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class DictionaryLemmatizerTest {
+
+  private static DictionaryLemmatizer dictionaryLemmatizer;
+
+  @BeforeClass
+  public static void loadDictionary() throws Exception {
+    dictionaryLemmatizer = new DictionaryLemmatizer(
+        DictionaryLemmatizerTest.class.getResourceAsStream("/opennlp/tools/lemmatizer/smalldictionary.dict") 
+    );
+  }
+  
+  @Test
+  public void testForNullPointerException() {
+    String[] sentence = new String[]{"The","dogs","were","running","and","barking","down","the","street"};
+    String[] sentencePOS = new String[]{"DT","NNS","VBD","VBG","CC","VBG","RP","DT","NN"};
+    String[] expectedLemma = new String[]{"the","dog","is","run","and","bark","down","the","street"};
+    
+    String[] actualLemma = dictionaryLemmatizer.lemmatize(sentence, sentencePOS);
+    
+    for (int i = 0;i < sentence.length;i++) {
+      // don't compare cases where the word is not in the dictionary...
+      if (!actualLemma[i].equals("O")) Assert.assertEquals(expectedLemma[i], actualLemma[i]);
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cb6ee2cb/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionary.dict
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionary.dict b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionary.dict
new file mode 100644
index 0000000..edeb7a0
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionary.dict
@@ -0,0 +1,5 @@
+barking	VBG	bark
+dogs	NNS	dog
+running	VBG	run
+down	RP	down
+street	NN	street
\ No newline at end of file


[02/50] opennlp git commit: OPENNLP-1030: Add unit test for TokenNameFinderTool

Posted by co...@apache.org.
OPENNLP-1030: Add unit test for TokenNameFinderTool


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/e2cf4811
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/e2cf4811
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/e2cf4811

Branch: refs/heads/LangDetect
Commit: e2cf4811ba485b0bb29d7d80bc853666bfbfa958
Parents: d8cdd5e
Author: jzonthemtn <je...@mtnfog.com>
Authored: Tue Apr 18 20:39:07 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Tue Apr 18 20:39:07 2017 -0400

----------------------------------------------------------------------
 .../cmdline/namefind/TokenNameFinderTool.java   |   1 +
 .../tools/cmdline/TokenNameFinderToolTest.java  | 137 +++++++++++++++++++
 2 files changed, 138 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/e2cf4811/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
index 59b2f3a..a5c9bd6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTool.java
@@ -47,6 +47,7 @@ public final class TokenNameFinderTool extends BasicCmdLineTool {
     return "Usage: " + CLI.CMD + " " + getName() + " model1 model2 ... modelN < sentences";
   }
 
+  @Override
   public void run(String[] args) {
 
     if (args.length == 0) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/e2cf4811/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
new file mode 100644
index 0000000..3ade0d5
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline;
+
+import java.io.BufferedOutputStream;
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.PrintStream;
+import java.nio.charset.StandardCharsets;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.cmdline.namefind.TokenNameFinderTool;
+import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.namefind.NameSampleDataStream;
+import opennlp.tools.namefind.TokenNameFinderFactory;
+import opennlp.tools.namefind.TokenNameFinderModel;
+import opennlp.tools.util.MockInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+public class TokenNameFinderToolTest {
+
+  @Test
+  public void run() throws IOException {
+
+    File model1 = trainModel();
+
+    String[] args = new String[]{model1.getAbsolutePath()};
+    
+    final String in = "It is Stefanie Schmidt.\n\nNothing in this sentence.";
+    InputStream stream = new ByteArrayInputStream(in.getBytes(StandardCharsets.UTF_8));
+    
+    System.setIn(stream);
+    
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    PrintStream ps = new PrintStream(baos);
+    System.setOut(ps);
+
+    TokenNameFinderTool tool = new TokenNameFinderTool();
+    tool.run(args);
+    
+    final String content = new String(baos.toByteArray(), StandardCharsets.UTF_8);
+    Assert.assertTrue(content.contains("It is <START:person> Stefanie Schmidt. <END>"));
+    
+  }
+  
+  @Test(expected = TerminateToolException.class)
+  public void invalidModel() {
+
+    String[] args = new String[]{"invalidmodel.bin"};
+
+    TokenNameFinderTool tool = new TokenNameFinderTool();
+    tool.run(args);
+
+  }
+  
+  @Test()
+  public void usage() {
+
+    String[] args = new String[]{};
+    
+    ByteArrayOutputStream baos = new ByteArrayOutputStream();
+    PrintStream ps = new PrintStream(baos);
+    System.setOut(ps);
+
+    TokenNameFinderTool tool = new TokenNameFinderTool();
+    tool.run(args);
+
+    final String content = new String(baos.toByteArray(), StandardCharsets.UTF_8);
+    Assert.assertEquals(tool.getHelp(), content.trim());
+    
+  }
+  
+  private File trainModel() throws IOException {
+    
+    String encoding = "ISO-8859-1";
+
+    ObjectStream<String> lineStream =
+        new PlainTextByLineStream(new MockInputStreamFactory(
+            new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")), encoding);
+    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
+    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    
+    TokenNameFinderModel model;
+
+    TokenNameFinderFactory nameFinderFactory = new TokenNameFinderFactory();
+
+    try {
+      model = NameFinderME.train("en", null, sampleStream, params,
+          nameFinderFactory);
+    }
+    finally {
+      sampleStream.close();
+    }
+
+    BufferedOutputStream modelOut = null;
+    
+    File modelFile = File.createTempFile("model", ".bin");
+    
+    try {
+      modelOut = new BufferedOutputStream(new FileOutputStream(modelFile));
+      model.serialize(modelOut);
+    } finally {
+      if (modelOut != null) 
+       modelOut.close();    
+    }
+    
+    return modelFile;
+  }
+  
+}


[23/50] opennlp git commit: OPENNLP-1048: Add stemmer for Irish

Posted by co...@apache.org.
OPENNLP-1048: Add stemmer for Irish

Closes #189


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6c2dbf28
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6c2dbf28
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6c2dbf28

Branch: refs/heads/LangDetect
Commit: 6c2dbf2885fb4602b8e42bd208ebef66df23329b
Parents: caeaaee
Author: Jim O'Regan <ja...@tcd.ie>
Authored: Sat Apr 29 00:15:29 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 3 12:15:27 2017 +0200

----------------------------------------------------------------------
 .../tools/stemmer/snowball/SnowballStemmer.java |   4 +
 .../tools/stemmer/snowball/irishStemmer.java    | 616 +++++++++++++++++++
 .../tools/stemmer/SnowballStemmerTest.java      |   9 +
 3 files changed, 629 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/6c2dbf28/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
index dd75754..86ebe84 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/SnowballStemmer.java
@@ -29,6 +29,7 @@ public class SnowballStemmer implements Stemmer {
     FRENCH,
     GERMAN,
     HUNGARIAN,
+    IRISH,
     ITALIAN,
     NORWEGIAN,
     PORTER,
@@ -67,6 +68,9 @@ public class SnowballStemmer implements Stemmer {
     else if (ALGORITHM.HUNGARIAN.equals(algorithm)) {
       stemmer = new hungarianStemmer();
     }
+    else if (ALGORITHM.IRISH.equals(algorithm)) {
+      stemmer = new irishStemmer();
+    }
     else if (ALGORITHM.ITALIAN.equals(algorithm)) {
       stemmer = new italianStemmer();
     }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6c2dbf28/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java
new file mode 100644
index 0000000..316288f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball/irishStemmer.java
@@ -0,0 +1,616 @@
+// CHECKSTYLE:OFF
+/*
+
+Copyright (c) 2001, Dr Martin Porter
+Copyright (c) 2002, Richard Boulton
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+    * Redistributions of source code must retain the above copyright notice,
+    * this list of conditions and the following disclaimer.
+    * Redistributions in binary form must reproduce the above copyright
+    * notice, this list of conditions and the following disclaimer in the
+    * documentation and/or other materials provided with the distribution.
+    * Neither the name of the copyright holders nor the names of its contributors
+    * may be used to endorse or promote products derived from this software
+    * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+// This file was generated automatically by the Snowball to Java compiler
+
+package opennlp.tools.stemmer.snowball;
+
+ /**
+  * This class was automatically generated by a Snowball to Java compiler
+  * It implements the stemming algorithm defined by a snowball script.
+  */
+
+public class irishStemmer extends opennlp.tools.stemmer.snowball.AbstractSnowballStemmer {
+
+private static final long serialVersionUID = 1L;
+
+        private final static irishStemmer methodObject = new irishStemmer ();
+
+                private final static Among a_0[] = {
+                    new Among ( "b'", -1, 4, "", methodObject ),
+                    new Among ( "bh", -1, 14, "", methodObject ),
+                    new Among ( "bhf", 1, 9, "", methodObject ),
+                    new Among ( "bp", -1, 11, "", methodObject ),
+                    new Among ( "ch", -1, 15, "", methodObject ),
+                    new Among ( "d'", -1, 2, "", methodObject ),
+                    new Among ( "d'fh", 5, 3, "", methodObject ),
+                    new Among ( "dh", -1, 16, "", methodObject ),
+                    new Among ( "dt", -1, 13, "", methodObject ),
+                    new Among ( "fh", -1, 17, "", methodObject ),
+                    new Among ( "gc", -1, 7, "", methodObject ),
+                    new Among ( "gh", -1, 18, "", methodObject ),
+                    new Among ( "h-", -1, 1, "", methodObject ),
+                    new Among ( "m'", -1, 4, "", methodObject ),
+                    new Among ( "mb", -1, 6, "", methodObject ),
+                    new Among ( "mh", -1, 19, "", methodObject ),
+                    new Among ( "n-", -1, 1, "", methodObject ),
+                    new Among ( "nd", -1, 8, "", methodObject ),
+                    new Among ( "ng", -1, 10, "", methodObject ),
+                    new Among ( "ph", -1, 20, "", methodObject ),
+                    new Among ( "sh", -1, 5, "", methodObject ),
+                    new Among ( "t-", -1, 1, "", methodObject ),
+                    new Among ( "th", -1, 21, "", methodObject ),
+                    new Among ( "ts", -1, 12, "", methodObject )
+                };
+
+                private final static Among a_1[] = {
+                    new Among ( "\u00EDochta", -1, 1, "", methodObject ),
+                    new Among ( "a\u00EDochta", 0, 1, "", methodObject ),
+                    new Among ( "ire", -1, 2, "", methodObject ),
+                    new Among ( "aire", 2, 2, "", methodObject ),
+                    new Among ( "abh", -1, 1, "", methodObject ),
+                    new Among ( "eabh", 4, 1, "", methodObject ),
+                    new Among ( "ibh", -1, 1, "", methodObject ),
+                    new Among ( "aibh", 6, 1, "", methodObject ),
+                    new Among ( "amh", -1, 1, "", methodObject ),
+                    new Among ( "eamh", 8, 1, "", methodObject ),
+                    new Among ( "imh", -1, 1, "", methodObject ),
+                    new Among ( "aimh", 10, 1, "", methodObject ),
+                    new Among ( "\u00EDocht", -1, 1, "", methodObject ),
+                    new Among ( "a\u00EDocht", 12, 1, "", methodObject ),
+                    new Among ( "ir\u00ED", -1, 2, "", methodObject ),
+                    new Among ( "air\u00ED", 14, 2, "", methodObject )
+                };
+
+                private final static Among a_2[] = {
+                    new Among ( "\u00F3ideacha", -1, 6, "", methodObject ),
+                    new Among ( "patacha", -1, 5, "", methodObject ),
+                    new Among ( "achta", -1, 1, "", methodObject ),
+                    new Among ( "arcachta", 2, 2, "", methodObject ),
+                    new Among ( "eachta", 2, 1, "", methodObject ),
+                    new Among ( "grafa\u00EDochta", -1, 4, "", methodObject ),
+                    new Among ( "paite", -1, 5, "", methodObject ),
+                    new Among ( "ach", -1, 1, "", methodObject ),
+                    new Among ( "each", 7, 1, "", methodObject ),
+                    new Among ( "\u00F3ideach", 8, 6, "", methodObject ),
+                    new Among ( "gineach", 8, 3, "", methodObject ),
+                    new Among ( "patach", 7, 5, "", methodObject ),
+                    new Among ( "grafa\u00EDoch", -1, 4, "", methodObject ),
+                    new Among ( "pataigh", -1, 5, "", methodObject ),
+                    new Among ( "\u00F3idigh", -1, 6, "", methodObject ),
+                    new Among ( "acht\u00FAil", -1, 1, "", methodObject ),
+                    new Among ( "eacht\u00FAil", 15, 1, "", methodObject ),
+                    new Among ( "gineas", -1, 3, "", methodObject ),
+                    new Among ( "ginis", -1, 3, "", methodObject ),
+                    new Among ( "acht", -1, 1, "", methodObject ),
+                    new Among ( "arcacht", 19, 2, "", methodObject ),
+                    new Among ( "eacht", 19, 1, "", methodObject ),
+                    new Among ( "grafa\u00EDocht", -1, 4, "", methodObject ),
+                    new Among ( "arcachta\u00ED", -1, 2, "", methodObject ),
+                    new Among ( "grafa\u00EDochta\u00ED", -1, 4, "", methodObject )
+                };
+
+                private final static Among a_3[] = {
+                    new Among ( "imid", -1, 1, "", methodObject ),
+                    new Among ( "aimid", 0, 1, "", methodObject ),
+                    new Among ( "\u00EDmid", -1, 1, "", methodObject ),
+                    new Among ( "a\u00EDmid", 2, 1, "", methodObject ),
+                    new Among ( "adh", -1, 2, "", methodObject ),
+                    new Among ( "eadh", 4, 2, "", methodObject ),
+                    new Among ( "faidh", -1, 1, "", methodObject ),
+                    new Among ( "fidh", -1, 1, "", methodObject ),
+                    new Among ( "\u00E1il", -1, 2, "", methodObject ),
+                    new Among ( "ain", -1, 2, "", methodObject ),
+                    new Among ( "tear", -1, 2, "", methodObject ),
+                    new Among ( "tar", -1, 2, "", methodObject )
+                };
+
+                private static final char g_v[] = {17, 65, 16, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 17, 4, 2 };
+
+        private int I_p2;
+        private int I_p1;
+        private int I_pV;
+
+                private void copy_from(irishStemmer other) {
+                    I_p2 = other.I_p2;
+                    I_p1 = other.I_p1;
+                    I_pV = other.I_pV;
+                    super.copy_from(other);
+                }
+
+                private boolean r_mark_regions() {
+            int v_1;
+            int v_3;
+                    // (, line 28
+                    I_pV = limit;
+                    I_p1 = limit;
+                    I_p2 = limit;
+                    // do, line 34
+                    v_1 = cursor;
+                    lab0: do {
+                        // (, line 34
+                        // gopast, line 35
+                        golab1: while(true)
+                        {
+                            lab2: do {
+                                if (!(in_grouping(g_v, 97, 250)))
+                                {
+                                    break lab2;
+                                }
+                                break golab1;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab0;
+                            }
+                            cursor++;
+                        }
+                        // setmark pV, line 35
+                        I_pV = cursor;
+                    } while (false);
+                    cursor = v_1;
+                    // do, line 37
+                    v_3 = cursor;
+                    lab3: do {
+                        // (, line 37
+                        // gopast, line 38
+                        golab4: while(true)
+                        {
+                            lab5: do {
+                                if (!(in_grouping(g_v, 97, 250)))
+                                {
+                                    break lab5;
+                                }
+                                break golab4;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab3;
+                            }
+                            cursor++;
+                        }
+                        // gopast, line 38
+                        golab6: while(true)
+                        {
+                            lab7: do {
+                                if (!(out_grouping(g_v, 97, 250)))
+                                {
+                                    break lab7;
+                                }
+                                break golab6;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab3;
+                            }
+                            cursor++;
+                        }
+                        // setmark p1, line 38
+                        I_p1 = cursor;
+                        // gopast, line 39
+                        golab8: while(true)
+                        {
+                            lab9: do {
+                                if (!(in_grouping(g_v, 97, 250)))
+                                {
+                                    break lab9;
+                                }
+                                break golab8;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab3;
+                            }
+                            cursor++;
+                        }
+                        // gopast, line 39
+                        golab10: while(true)
+                        {
+                            lab11: do {
+                                if (!(out_grouping(g_v, 97, 250)))
+                                {
+                                    break lab11;
+                                }
+                                break golab10;
+                            } while (false);
+                            if (cursor >= limit)
+                            {
+                                break lab3;
+                            }
+                            cursor++;
+                        }
+                        // setmark p2, line 39
+                        I_p2 = cursor;
+                    } while (false);
+                    cursor = v_3;
+                    return true;
+                }
+
+                private boolean r_initial_morph() {
+            int among_var;
+                    // (, line 43
+                    // [, line 44
+                    bra = cursor;
+                    // substring, line 44
+                    among_var = find_among(a_0, 24);
+                    if (among_var == 0)
+                    {
+                        return false;
+                    }
+                    // ], line 44
+                    ket = cursor;
+                    switch (among_var) {
+                        case 0:
+                            return false;
+                        case 1:
+                            // (, line 46
+                            // delete, line 46
+                            slice_del();
+                            break;
+                        case 2:
+                            // (, line 50
+                            // delete, line 50
+                            slice_del();
+                            break;
+                        case 3:
+                            // (, line 52
+                            // <-, line 52
+                            slice_from("f");
+                            break;
+                        case 4:
+                            // (, line 55
+                            // delete, line 55
+                            slice_del();
+                            break;
+                        case 5:
+                            // (, line 58
+                            // <-, line 58
+                            slice_from("s");
+                            break;
+                        case 6:
+                            // (, line 61
+                            // <-, line 61
+                            slice_from("b");
+                            break;
+                        case 7:
+                            // (, line 63
+                            // <-, line 63
+                            slice_from("c");
+                            break;
+                        case 8:
+                            // (, line 65
+                            // <-, line 65
+                            slice_from("d");
+                            break;
+                        case 9:
+                            // (, line 67
+                            // <-, line 67
+                            slice_from("f");
+                            break;
+                        case 10:
+                            // (, line 69
+                            // <-, line 69
+                            slice_from("g");
+                            break;
+                        case 11:
+                            // (, line 71
+                            // <-, line 71
+                            slice_from("p");
+                            break;
+                        case 12:
+                            // (, line 73
+                            // <-, line 73
+                            slice_from("s");
+                            break;
+                        case 13:
+                            // (, line 75
+                            // <-, line 75
+                            slice_from("t");
+                            break;
+                        case 14:
+                            // (, line 79
+                            // <-, line 79
+                            slice_from("b");
+                            break;
+                        case 15:
+                            // (, line 81
+                            // <-, line 81
+                            slice_from("c");
+                            break;
+                        case 16:
+                            // (, line 83
+                            // <-, line 83
+                            slice_from("d");
+                            break;
+                        case 17:
+                            // (, line 85
+                            // <-, line 85
+                            slice_from("f");
+                            break;
+                        case 18:
+                            // (, line 87
+                            // <-, line 87
+                            slice_from("g");
+                            break;
+                        case 19:
+                            // (, line 89
+                            // <-, line 89
+                            slice_from("m");
+                            break;
+                        case 20:
+                            // (, line 91
+                            // <-, line 91
+                            slice_from("p");
+                            break;
+                        case 21:
+                            // (, line 93
+                            // <-, line 93
+                            slice_from("t");
+                            break;
+                    }
+                    return true;
+                }
+
+                /** @return true when the cursor is at or past the start of region RV (I_pV). */
+                private boolean r_RV() {
+                    return I_pV <= cursor;
+                }
+
+                /** @return true when the cursor is at or past the start of region R1 (I_p1). */
+                private boolean r_R1() {
+                    return I_p1 <= cursor;
+                }
+
+                /** @return true when the cursor is at or past the start of region R2 (I_p2). */
+                private boolean r_R2() {
+                    return I_p2 <= cursor;
+                }
+
+                /**
+                 * Snowball rule noun_sfx (grammar lines 103-111): matches the longest
+                 * suffix in table a_1 searching backwards from the cursor, then deletes
+                 * it — but only when the match is confirmed to lie inside region R1
+                 * (case 1) or R2 (case 2). Uses the inherited ket/bra marks to delimit
+                 * the slice that slice_del() removes.
+                 *
+                 * @return true if a suffix was matched and its rule applied, false otherwise
+                 */
+                private boolean r_noun_sfx() {
+            int among_var;
+                    // (, line 103
+                    // [, line 104
+                    ket = cursor;
+                    // substring, line 104
+                    among_var = find_among_b(a_1, 16);
+                    if (among_var == 0)
+                    {
+                        // no suffix from a_1 ends at the cursor
+                        return false;
+                    }
+                    // ], line 104
+                    bra = cursor;
+                    switch (among_var) {
+                        case 0:
+                            return false;
+                        case 1:
+                            // (, line 108
+                            // call R1, line 108
+                            if (!r_R1())
+                            {
+                                return false;
+                            }
+                            // delete, line 108
+                            slice_del();
+                            break;
+                        case 2:
+                            // (, line 110
+                            // call R2, line 110
+                            if (!r_R2())
+                            {
+                                return false;
+                            }
+                            // delete, line 110
+                            slice_del();
+                            break;
+                    }
+                    return true;
+                }
+
+                /**
+                 * Snowball rule deriv (grammar lines 113-127): matches a derivational
+                 * suffix from table a_2 searching backwards from the cursor. Case 1
+                 * deletes the suffix (only when it lies inside R2); cases 2-6 replace
+                 * it with a shorter stem form via slice_from ("\u00F3id" below is the
+                 * Unicode escape for "óid").
+                 *
+                 * @return true if a suffix was matched and its rule applied, false otherwise
+                 */
+                private boolean r_deriv() {
+            int among_var;
+                    // (, line 113
+                    // [, line 114
+                    ket = cursor;
+                    // substring, line 114
+                    among_var = find_among_b(a_2, 25);
+                    if (among_var == 0)
+                    {
+                        // no suffix from a_2 ends at the cursor
+                        return false;
+                    }
+                    // ], line 114
+                    bra = cursor;
+                    switch (among_var) {
+                        case 0:
+                            return false;
+                        case 1:
+                            // (, line 116
+                            // call R2, line 116
+                            if (!r_R2())
+                            {
+                                return false;
+                            }
+                            // delete, line 116
+                            slice_del();
+                            break;
+                        case 2:
+                            // (, line 118
+                            // <-, line 118
+                            slice_from("arc");
+                            break;
+                        case 3:
+                            // (, line 120
+                            // <-, line 120
+                            slice_from("gin");
+                            break;
+                        case 4:
+                            // (, line 122
+                            // <-, line 122
+                            slice_from("graf");
+                            break;
+                        case 5:
+                            // (, line 124
+                            // <-, line 124
+                            slice_from("paite");
+                            break;
+                        case 6:
+                            // (, line 126
+                            // <-, line 126
+                            slice_from("\u00F3id");
+                            break;
+                    }
+                    return true;
+                }
+
+                /**
+                 * Snowball rule verb_sfx (grammar lines 129-141): matches a verbal
+                 * suffix from table a_3 searching backwards from the cursor and
+                 * deletes it when the match lies inside region RV (case 1) or R1
+                 * (case 2).
+                 *
+                 * @return true if a suffix was matched and its rule applied, false otherwise
+                 */
+                private boolean r_verb_sfx() {
+            int among_var;
+                    // (, line 129
+                    // [, line 130
+                    ket = cursor;
+                    // substring, line 130
+                    among_var = find_among_b(a_3, 12);
+                    if (among_var == 0)
+                    {
+                        // no suffix from a_3 ends at the cursor
+                        return false;
+                    }
+                    // ], line 130
+                    bra = cursor;
+                    switch (among_var) {
+                        case 0:
+                            return false;
+                        case 1:
+                            // (, line 133
+                            // call RV, line 133
+                            if (!r_RV())
+                            {
+                                return false;
+                            }
+                            // delete, line 133
+                            slice_del();
+                            break;
+                        case 2:
+                            // (, line 138
+                            // call R1, line 138
+                            if (!r_R1())
+                            {
+                                return false;
+                            }
+                            // delete, line 138
+                            slice_del();
+                            break;
+                    }
+                    return true;
+                }
+
+                /**
+                 * Runs the full Irish stemming pipeline on the current word:
+                 * first initial_morph and mark_regions (forwards), then — switching
+                 * to backwards mode from the end of the word — noun_sfx, deriv and
+                 * verb_sfx in sequence. Each {@code labN: do {...} while(false)}
+                 * block emulates Snowball's "do": the step is attempted and the
+                 * cursor is restored afterwards whether or not it matched, so this
+                 * method always returns true.
+                 *
+                 * @return always true
+                 */
+                public boolean stem() {
+            int v_1;
+            int v_2;
+            int v_3;
+            int v_4;
+            int v_5;
+                    // (, line 143
+                    // do, line 144
+                    v_1 = cursor;
+                    lab0: do {
+                        // call initial_morph, line 144
+                        if (!r_initial_morph())
+                        {
+                            break lab0;
+                        }
+                    } while (false);
+                    cursor = v_1;
+                    // do, line 145
+                    v_2 = cursor;
+                    lab1: do {
+                        // call mark_regions, line 145
+                        if (!r_mark_regions())
+                        {
+                            break lab1;
+                        }
+                    } while (false);
+                    cursor = v_2;
+                    // backwards, line 146: suffix rules below match from the end of the word
+                    limit_backward = cursor; cursor = limit;
+                    // (, line 146
+                    // do, line 147
+                    v_3 = limit - cursor;
+                    lab2: do {
+                        // call noun_sfx, line 147
+                        if (!r_noun_sfx())
+                        {
+                            break lab2;
+                        }
+                    } while (false);
+                    cursor = limit - v_3;
+                    // do, line 148
+                    v_4 = limit - cursor;
+                    lab3: do {
+                        // call deriv, line 148
+                        if (!r_deriv())
+                        {
+                            break lab3;
+                        }
+                    } while (false);
+                    cursor = limit - v_4;
+                    // do, line 149
+                    v_5 = limit - cursor;
+                    lab4: do {
+                        // call verb_sfx, line 149
+                        if (!r_verb_sfx())
+                        {
+                            break lab4;
+                        }
+                    } while (false);
+                    cursor = limit - v_5;
+                    // leave backwards mode before returning
+                    cursor = limit_backward;                    return true;
+                }
+
+        /**
+         * Equality is by type, following the Snowball-generated convention:
+         * any two {@code irishStemmer} instances are considered equal.
+         */
+        @Override
+        public boolean equals( Object o ) {
+            return o instanceof irishStemmer;
+        }
+
+        /**
+         * Constant hash derived from the class name; consistent with
+         * {@link #equals(Object)}, which treats all instances as equal.
+         */
+        @Override
+        public int hashCode() {
+            return irishStemmer.class.getName().hashCode();
+        }
+
+
+
+}
+

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6c2dbf28/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java b/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
index dad1fa0..6396b2f 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/stemmer/SnowballStemmerTest.java
@@ -89,6 +89,15 @@ public class SnowballStemmerTest {
   }
 
   @Test
+  public void testIrish() {
+    SnowballStemmer stemmer = new SnowballStemmer(ALGORITHM.IRISH);
+    Assert.assertEquals("feidhm", stemmer.stem("bhfeidhm"));
+    Assert.assertEquals("feirmeoir", stemmer.stem("feirmeoireacht"));
+    Assert.assertEquals("monarc", stemmer.stem("monarcacht"));
+
+  }
+
+  @Test
   public void testItalian() {
     SnowballStemmer stemmer = new SnowballStemmer(ALGORITHM.ITALIAN);
     Assert.assertEquals(stemmer.stem("abbattimento"), "abbatt");


[40/50] opennlp git commit: OPENNLP-1059 Set model version before creating the POS Model

Posted by co...@apache.org.
OPENNLP-1059 Set model version before creating the POS Model


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/108fa9a9
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/108fa9a9
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/108fa9a9

Branch: refs/heads/LangDetect
Commit: 108fa9a93c2cd126a138f8813390e197d0a3584e
Parents: ab44624
Author: Jörn Kottmann <jo...@apache.org>
Authored: Mon May 15 16:04:58 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon May 15 16:04:58 2017 +0200

----------------------------------------------------------------------
 .../java/opennlp/tools/util/model/POSModelSerializer.java | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/108fa9a9/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java
index a82319c..5fe365a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java
@@ -20,6 +20,8 @@ package opennlp.tools.util.model;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;
+import java.util.HashMap;
+import java.util.Map;
 
 import opennlp.tools.ml.BeamSearch;
 import opennlp.tools.postag.POSModel;
@@ -36,8 +38,14 @@ public class POSModelSerializer implements ArtifactSerializer<POSModel> {
     Version version = posModel.getVersion();
     if (version.getMajor() == 1 && version.getMinor() == 5) {
       if (posModel.getManifestProperty(BeamSearch.BEAM_SIZE_PARAMETER) == null) {
+        Map<String, String> manifestInfoEntries = new HashMap<>();
+
+        // The version in the model must be correct or otherwise version
+        // dependent code branches in other places fail
+        manifestInfoEntries.put("OpenNLP-Version", "1.5.0");
+
         posModel = new POSModel(posModel.getLanguage(), posModel.getPosModel(), 10,
-            null, posModel.getFactory());
+            manifestInfoEntries, posModel.getFactory());
       }
     }
 


[39/50] opennlp git commit: Rollback Release 1.8.0 RC

Posted by co...@apache.org.
Rollback Release 1.8.0 RC


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/ab446247
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/ab446247
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/ab446247

Branch: refs/heads/LangDetect
Commit: ab44624770a0d36a924af5326514fa61d7d1ed5f
Parents: 652edde
Author: Jörn Kottmann <jo...@apache.org>
Authored: Mon May 15 10:14:10 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon May 15 10:15:43 2017 +0200

----------------------------------------------------------------------
 opennlp-brat-annotator/pom.xml   | 2 +-
 opennlp-distr/pom.xml            | 2 +-
 opennlp-docs/pom.xml             | 2 +-
 opennlp-morfologik-addon/pom.xml | 2 +-
 opennlp-tools/pom.xml            | 2 +-
 opennlp-uima/pom.xml             | 2 +-
 pom.xml                          | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/ab446247/opennlp-brat-annotator/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 0791e6b..6c7be0d 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -17,7 +17,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.1-SNAPSHOT</version>
+		<version>1.8.0-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ab446247/opennlp-distr/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-distr/pom.xml b/opennlp-distr/pom.xml
index 4428240..3f838cd 100644
--- a/opennlp-distr/pom.xml
+++ b/opennlp-distr/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.1-SNAPSHOT</version>
+		<version>1.8.0-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ab446247/opennlp-docs/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/pom.xml b/opennlp-docs/pom.xml
index 312f6b8..fbf0b5c 100644
--- a/opennlp-docs/pom.xml
+++ b/opennlp-docs/pom.xml
@@ -24,7 +24,7 @@
   <parent>
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.1-SNAPSHOT</version>
+	<version>1.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
   

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ab446247/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index 8c5b9f4..c46f101 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.1-SNAPSHOT</version>
+		<version>1.8.0-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ab446247/opennlp-tools/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index a2cf596..a499375 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -25,7 +25,7 @@
   <parent>
     <groupId>org.apache.opennlp</groupId>
     <artifactId>opennlp</artifactId>
-    <version>1.8.1-SNAPSHOT</version>
+    <version>1.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ab446247/opennlp-uima/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-uima/pom.xml b/opennlp-uima/pom.xml
index d8f5246..7cfdb72 100644
--- a/opennlp-uima/pom.xml
+++ b/opennlp-uima/pom.xml
@@ -25,7 +25,7 @@
 	<parent>
 	    <groupId>org.apache.opennlp</groupId>
 	    <artifactId>opennlp</artifactId>
-	    <version>1.8.1-SNAPSHOT</version>
+	    <version>1.8.0-SNAPSHOT</version>
 	    <relativePath>../pom.xml</relativePath>
     </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ab446247/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 03811a8..2190a26 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
 
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.1-SNAPSHOT</version>
+	<version>1.8.0-SNAPSHOT</version>
 	<packaging>pom</packaging>
 
 	<name>Apache OpenNLP Reactor</name>


[44/50] opennlp git commit: OPENNLP-1062: Add lemmatizer eval tests

Posted by co...@apache.org.
OPENNLP-1062: Add lemmatizer eval tests


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d372ad1d
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d372ad1d
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d372ad1d

Branch: refs/heads/LangDetect
Commit: d372ad1de8212901641ba4bf896188c58be1b017
Parents: c0880fb
Author: Jörn Kottmann <jo...@apache.org>
Authored: Mon May 15 11:57:45 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Tue May 16 14:28:35 2017 +0200

----------------------------------------------------------------------
 .../formats/conllu/ConlluLemmaSampleStream.java |  2 +-
 .../tools/formats/conllu/ConlluTagset.java      |  2 +-
 .../opennlp/tools/eval/Conll00ChunkerEval.java  |  5 +-
 .../opennlp/tools/eval/ConllXPosTaggerEval.java |  6 +-
 .../test/java/opennlp/tools/eval/EvalUtil.java  | 33 ++++++++
 .../tools/eval/OntoNotes4NameFinderEval.java    |  8 +-
 .../tools/eval/OntoNotes4ParserEval.java        |  8 +-
 .../tools/eval/OntoNotes4PosTaggerEval.java     |  8 +-
 .../tools/eval/SourceForgeModelEval.java        | 39 ++++-----
 .../tools/eval/UniversalDependency20Eval.java   | 84 ++++++++++++++++++++
 10 files changed, 143 insertions(+), 52 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
index 0782120..98ee48d 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStream.java
@@ -29,7 +29,7 @@ public class ConlluLemmaSampleStream extends FilterObjectStream<ConlluSentence,
 
   private final ConlluTagset tagset;
 
-  ConlluLemmaSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) {
+  public ConlluLemmaSampleStream(ObjectStream<ConlluSentence> samples, ConlluTagset tagset) {
     super(samples);
     this.tagset = tagset;
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
index f49f3fd..3f6ee76 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTagset.java
@@ -17,7 +17,7 @@
 
 package opennlp.tools.formats.conllu;
 
-enum ConlluTagset {
+public enum ConlluTagset {
   U,
   X
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
index 8ac90d7..62d4a46 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
@@ -19,6 +19,7 @@ package opennlp.tools.eval;
 
 import java.io.File;
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 
 import org.junit.Assert;
 import org.junit.Test;
@@ -49,7 +50,7 @@ public class Conll00ChunkerEval {
 
     ObjectStream<ChunkSample> samples = new ChunkSampleStream(
         new PlainTextByLineStream(
-            new MarkableFileInputStreamFactory(trainFile), "UTF-8"));
+            new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8));
 
     return ChunkerME.train("en", samples, params, new ChunkerFactory());
   }
@@ -58,7 +59,7 @@ public class Conll00ChunkerEval {
                            double expectedFMeasure) throws IOException {
 
     ObjectStream<ChunkSample> samples = new ChunkSampleStream(
-        new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData), "UTF-8"));
+        new PlainTextByLineStream(new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8));
 
     ChunkerEvaluator evaluator = new ChunkerEvaluator(new ChunkerME(model));
     evaluator.evaluate(samples);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
index 600e599..af53878 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
@@ -19,7 +19,7 @@ package opennlp.tools.eval;
 
 import java.io.File;
 import java.io.IOException;
-import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 
 import org.junit.Assert;
 import org.junit.Test;
@@ -59,7 +59,7 @@ public class ConllXPosTaggerEval {
                                 TrainingParameters params) throws IOException {
 
     ObjectStream<POSSample> samples =
-        new ConllXPOSSampleStream(new MarkableFileInputStreamFactory(trainFile), Charset.forName("UTF-8"));
+        new ConllXPOSSampleStream(new MarkableFileInputStreamFactory(trainFile), StandardCharsets.UTF_8);
 
     return POSTaggerME.train(lang, samples, params, new POSTaggerFactory());
   }
@@ -68,7 +68,7 @@ public class ConllXPosTaggerEval {
                            double expectedAccuracy) throws IOException {
 
     ObjectStream<POSSample> samples = new ConllXPOSSampleStream(
-        new MarkableFileInputStreamFactory(testData), Charset.forName("UTF-8"));
+        new MarkableFileInputStreamFactory(testData), StandardCharsets.UTF_8);
 
     POSEvaluator evaluator = new POSEvaluator(new POSTaggerME(model));
     evaluator.evaluate(samples);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java b/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
index 45f2471..2b04afb 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
@@ -18,6 +18,15 @@
 package opennlp.tools.eval;
 
 import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.math.BigInteger;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.security.MessageDigest;
+import java.security.NoSuchAlgorithmException;
+
+import org.junit.Assert;
 
 import opennlp.tools.ml.maxent.quasinewton.QNTrainer;
 import opennlp.tools.ml.naivebayes.NaiveBayesTrainer;
@@ -27,6 +36,8 @@ import opennlp.tools.util.model.ModelUtil;
 
 public class EvalUtil {
 
+  static final double ACCURACY_DELTA = 0.0001d;
+
   static TrainingParameters createPerceptronParams() {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM,
@@ -54,4 +65,26 @@ public class EvalUtil {
   public static File getOpennlpDataDir() {
     return new File(System.getProperty("OPENNLP_DATA_DIR"));
   }
+
+  static MessageDigest createDigest() {
+    try {
+      return MessageDigest.getInstance("MD5");
+    } catch (NoSuchAlgorithmException e) {
+      throw new IllegalStateException(e);
+    }
+  }
+
+  static void verifyFileChecksum(Path file, BigInteger checksum) throws IOException {
+    MessageDigest digest = createDigest();
+
+    try (InputStream in = Files.newInputStream(file)) {
+      byte[] buf = new byte[65536];
+      int len;
+      while ((len = in.read(buf)) > 0) {
+        digest.update(buf, 0, len);
+      }
+    }
+
+    Assert.assertEquals(checksum, new BigInteger(1, digest.digest()));
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index a696787..d9f5ecd 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -29,7 +29,6 @@ import java.nio.file.Path;
 import java.nio.file.Paths;
 import java.nio.file.StandardCopyOption;
 import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
 import java.util.Map;
 
 import org.junit.Assert;
@@ -89,12 +88,7 @@ public class OntoNotes4NameFinderEval {
 
   @BeforeClass
   public static void verifyTrainingData() throws IOException {
-    MessageDigest digest;
-    try {
-      digest = MessageDigest.getInstance("MD5");
-    } catch (NoSuchAlgorithmException e) {
-      throw new IllegalStateException(e);
-    }
+    MessageDigest digest = EvalUtil.createDigest();
 
     try (ObjectStream<NameSample> samples = createNameSampleStream()) {
       NameSample sample;

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
index f7e1046..5606b82 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
@@ -24,7 +24,6 @@ import java.io.InputStreamReader;
 import java.math.BigInteger;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
 import org.junit.BeforeClass;
@@ -73,12 +72,7 @@ public class OntoNotes4ParserEval {
 
   @BeforeClass
   public static void verifyTrainingData() throws IOException {
-    MessageDigest digest;
-    try {
-      digest = MessageDigest.getInstance("MD5");
-    } catch (NoSuchAlgorithmException e) {
-      throw new IllegalStateException(e);
-    }
+    MessageDigest digest = EvalUtil.createDigest();
 
     try (ObjectStream<Parse> samples = createParseSampleStream()) {
       Parse sample;

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
index 6236507..3ea7abe 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
@@ -22,7 +22,6 @@ import java.io.IOException;
 import java.math.BigInteger;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
 import org.junit.BeforeClass;
@@ -70,12 +69,7 @@ public class OntoNotes4PosTaggerEval {
 
   @BeforeClass
   public static void verifyTrainingData() throws IOException {
-    MessageDigest digest;
-    try {
-      digest = MessageDigest.getInstance("MD5");
-    } catch (NoSuchAlgorithmException e) {
-      throw new IllegalStateException(e);
-    }
+    MessageDigest digest = EvalUtil.createDigest();
 
     try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
       POSSample sample;

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
index 25b6f54..24cdcd0 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
@@ -20,10 +20,8 @@ package opennlp.tools.eval;
 import java.io.File;
 import java.io.IOException;
 import java.math.BigInteger;
-import java.nio.charset.Charset;
 import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
-import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
 import org.junit.BeforeClass;
@@ -83,25 +81,17 @@ import opennlp.tools.util.Span;
  */
 public class SourceForgeModelEval {
 
-  private static MessageDigest createDigest() {
-    try {
-      return MessageDigest.getInstance("MD5");
-    } catch (NoSuchAlgorithmException e) {
-      throw new IllegalStateException(e);
-    }
-  }
-
   @BeforeClass
   public static void ensureTestDataIsCorrect() throws IOException {
-    MessageDigest digest = createDigest();
+    MessageDigest digest = EvalUtil.createDigest();
 
     try (ObjectStream<String> lines = new PlainTextByLineStream(
         new MarkableFileInputStreamFactory(new File(EvalUtil.getOpennlpDataDir(),
-            "leipzig/eng_news_2010_300K-sentences.txt")), Charset.forName("UTF-8"))) {
+            "leipzig/eng_news_2010_300K-sentences.txt")), StandardCharsets.UTF_8)) {
 
       String line;
       while ((line = lines.read()) != null) {
-        digest.update(line.getBytes("UTF-8"));
+        digest.update(line.getBytes(StandardCharsets.UTF_8));
       }
 
       Assert.assertEquals(new BigInteger("248567841356936801447294643695012852392"),
@@ -115,7 +105,7 @@ public class SourceForgeModelEval {
     SentenceModel model = new SentenceModel(
         new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-sent.bin"));
 
-    MessageDigest digest = createDigest();
+    MessageDigest digest = EvalUtil.createDigest();
 
     SentenceDetector sentenceDetector = new SentenceDetectorME(model);
 
@@ -134,7 +124,7 @@ public class SourceForgeModelEval {
     String[] sentences = sentenceDetector.sentDetect(text.toString());
 
     for (String sentence : sentences) {
-      digest.update(sentence.getBytes("UTF-8"));
+      digest.update(sentence.getBytes(StandardCharsets.UTF_8));
     }
 
     Assert.assertEquals(new BigInteger("228544068397077998410949364710969159291"),
@@ -151,7 +141,7 @@ public class SourceForgeModelEval {
     TokenizerModel model = new TokenizerModel(
         new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-token.bin"));
 
-    MessageDigest digest = createDigest();
+    MessageDigest digest = EvalUtil.createDigest();
 
     Tokenizer tokenizer = new TokenizerME(model);
 
@@ -164,7 +154,7 @@ public class SourceForgeModelEval {
       while ((line = lines.read()) != null) {
         String[] tokens = tokenizer.tokenize(String.join(" ", line.getText()));
         for (String token : tokens) {
-          digest.update(token.getBytes("UTF-8"));
+          digest.update(token.getBytes(StandardCharsets.UTF_8));
         }
       }
     }
@@ -183,7 +173,7 @@ public class SourceForgeModelEval {
   private void evalNameFinder(TokenNameFinderModel model, BigInteger expectedHash)
       throws IOException {
 
-    MessageDigest digest = createDigest();
+    MessageDigest digest = EvalUtil.createDigest();
 
     TokenNameFinder nameFinder = new NameFinderME(model);
 
@@ -193,7 +183,8 @@ public class SourceForgeModelEval {
       while ((line = lines.read()) != null) {
         Span[] names = nameFinder.find(line.getText());
         for (Span name : names) {
-          digest.update((name.getType() + name.getStart() + name.getEnd()).getBytes("UTF-8"));
+          digest.update((name.getType() + name.getStart()
+              + name.getEnd()).getBytes(StandardCharsets.UTF_8));
         }
       }
     }
@@ -260,7 +251,7 @@ public class SourceForgeModelEval {
   @Test
   public void evalChunkerModel() throws IOException {
 
-    MessageDigest digest = createDigest();
+    MessageDigest digest = EvalUtil.createDigest();
 
     POSTagger tagger = new POSTaggerME(new POSModel(
         new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin")));
@@ -276,7 +267,7 @@ public class SourceForgeModelEval {
 
         String[] chunks = chunker.chunk(sentence.getSentence(), sentence.getTags());
         for (String chunk : chunks) {
-          digest.update(chunk.getBytes("UTF-8"));
+          digest.update(chunk.getBytes(StandardCharsets.UTF_8));
         }
       }
     }
@@ -290,7 +281,7 @@ public class SourceForgeModelEval {
     // break the input stream into sentences
     // The input stream is tokenized and can be processed here directly
 
-    MessageDigest digest = createDigest();
+    MessageDigest digest = EvalUtil.createDigest();
 
     POSTagger tagger = new POSTaggerME(model);
 
@@ -300,7 +291,7 @@ public class SourceForgeModelEval {
       while ((line = lines.read()) != null) {
         String[] tags = tagger.tag(line.getText());
         for (String tag : tags) {
-          digest.update(tag.getBytes("UTF-8"));
+          digest.update(tag.getBytes(StandardCharsets.UTF_8));
         }
       }
     }
@@ -330,7 +321,7 @@ public class SourceForgeModelEval {
     ParserModel model = new ParserModel(
         new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-parser-chunking.bin"));
 
-    MessageDigest digest = createDigest();
+    MessageDigest digest = EvalUtil.createDigest();
 
     Parser parser = ParserFactory.create(model);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d372ad1d/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
new file mode 100644
index 0000000..70fc8b0
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/UniversalDependency20Eval.java
@@ -0,0 +1,84 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.eval;
+
+import java.io.File;
+import java.io.IOException;
+import java.math.BigInteger;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import opennlp.tools.formats.conllu.ConlluLemmaSampleStream;
+import opennlp.tools.formats.conllu.ConlluStream;
+import opennlp.tools.formats.conllu.ConlluTagset;
+import opennlp.tools.lemmatizer.LemmaSample;
+import opennlp.tools.lemmatizer.LemmatizerEvaluator;
+import opennlp.tools.lemmatizer.LemmatizerFactory;
+import opennlp.tools.lemmatizer.LemmatizerME;
+import opennlp.tools.lemmatizer.LemmatizerModel;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.model.ModelUtil;
+
+public class UniversalDependency20Eval {
+
+  private static File SPA_ANCORA_TRAIN =
+      new File(EvalUtil.getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-train.conllu");
+  private static File SPA_ANCORA_DEV =
+      new File(EvalUtil.getOpennlpDataDir(),"ud20/UD_Spanish-AnCora/es_ancora-ud-dev.conllu");
+
+  @BeforeClass
+  public static void ensureTestDataIsCorrect() throws IOException {
+    SourceForgeModelEval.ensureTestDataIsCorrect();
+
+    EvalUtil.verifyFileChecksum(SPA_ANCORA_TRAIN.toPath(),
+        new BigInteger("224942804200733453179524127037951530195"));
+    EvalUtil.verifyFileChecksum(SPA_ANCORA_DEV.toPath(),
+        new BigInteger("280996187464384493180190898172297941708"));
+  }
+
+  private static double trainAndEval(String lang, File trainFile, TrainingParameters params,
+                                     File evalFile) throws IOException {
+    ConlluTagset tagset = ConlluTagset.X;
+
+    ObjectStream<LemmaSample> trainSamples = new ConlluLemmaSampleStream(new ConlluStream(
+        new MarkableFileInputStreamFactory(trainFile)), tagset);
+
+    LemmatizerModel model = LemmatizerME.train(lang, trainSamples, params, new LemmatizerFactory());
+    LemmatizerEvaluator evaluator = new LemmatizerEvaluator(new LemmatizerME(model));
+
+    evaluator.evaluate(new ConlluLemmaSampleStream(new ConlluStream(
+        new MarkableFileInputStreamFactory(evalFile)), tagset));
+
+    return evaluator.getWordAccuracy();
+  }
+
+  @Test
+  public void trainAndEvalSpanishAncora() throws IOException {
+    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+    params.put("Threads", "4");
+
+    double wordAccuracy = trainAndEval("spa", SPA_ANCORA_TRAIN,
+        params, SPA_ANCORA_DEV);
+
+    Assert.assertEquals(0.9046675934566091d, wordAccuracy, EvalUtil.ACCURACY_DELTA);
+  }
+}


[14/50] opennlp git commit: OPENNLP-1039: PerceptronTrainer should call super.isValid() in its isValid(). This closes apache/opennlp#177

Posted by co...@apache.org.
OPENNLP-1039: PerceptronTrainer should call super.isValid() in its isValid(). This closes apache/opennlp#177


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/735b1b59
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/735b1b59
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/735b1b59

Branch: refs/heads/LangDetect
Commit: 735b1b59862dc1b5f7e60549aa7c33dbcc4095b2
Parents: 3a22156
Author: koji <ko...@apache.org>
Authored: Mon Apr 24 09:56:08 2017 +0800
Committer: koji <ko...@apache.org>
Committed: Mon Apr 24 09:56:08 2017 +0800

----------------------------------------------------------------------
 .../opennlp/tools/ml/perceptron/PerceptronTrainer.java   | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/735b1b59/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java
index dec6274..129c576 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java
@@ -86,8 +86,17 @@ public class PerceptronTrainer extends AbstractEventTrainer {
   }
   
   public boolean isValid() {
+    if (!super.isValid()) {
+      return false;
+    }
+
     String algorithmName = getAlgorithm();
-    return !(algorithmName != null && !(PERCEPTRON_VALUE.equals(algorithmName)));
+    if (algorithmName != null) {
+      return PERCEPTRON_VALUE.equals(algorithmName);
+    }
+    else {
+      return true;
+    }
   }
 
   public boolean isSortAndMerge() {


[12/50] opennlp git commit: NoJira: Fix README syntax, and minor text changes

Posted by co...@apache.org.
NoJira: Fix README syntax, and minor text changes


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/95e43b30
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/95e43b30
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/95e43b30

Branch: refs/heads/LangDetect
Commit: 95e43b305ce373d238bd4bd5d8eb26f54d664e46
Parents: 99cbf0d
Author: Bruno P. Kinoshita <br...@yahoo.com.br>
Authored: Sun Apr 23 20:58:23 2017 +1200
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Sun Apr 23 20:58:23 2017 +1200

----------------------------------------------------------------------
 README.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/95e43b30/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 02b146a..4b27762 100644
--- a/README.md
+++ b/README.md
@@ -29,19 +29,21 @@ The Apache OpenNLP library is a machine learning based toolkit for the processin
 It supports the most common NLP tasks, such as tokenization, sentence segmentation,
 part-of-speech tagging, named entity extraction, chunking, parsing, and coreference resolution.
 These tasks are usually required to build more advanced text processing services.
-OpenNLP also included maximum entropy and perceptron based machine learning.  
+OpenNLP also includes maximum entropy and perceptron based machine learning.
       
-The goal of the OpenNLP project will be to create a mature toolkit for the abovementioned tasks.
+The goal of the OpenNLP project is to create a mature toolkit for the above mentioned tasks.
 An additional goal is to provide a large number of pre-built models for a variety of languages, as
-well as the annotated text resources that those models are derived from.      
+well as the annotated text resources that those models are derived from.
 
 For additional information about OpenNLP, visit the [OpenNLP Home Page](http://opennlp.apache.org/)
 
 Documentation for OpenNLP, including JavaDocs, code usage and command line interface are available [here](http://opennlp.apache.org/documentation.html)
 
-####Using OpenNLP as a Library
+#### Using OpenNLP as a Library
+
 Running any application that uses OpenNLP will require installing a binary or source version and setting the environment.
 To compile from source:
+
 * `mvn -DskipTests clean install`
 * To run tests do `mvn test`
 
@@ -54,3 +56,4 @@ To use maven, add the appropriate setting to your pom.xml or build.sbt following
     <version>${opennlp.version}</version>
 </dependency>
 ```
+


[46/50] opennlp git commit: OPENNLP-1061 Add functionality to DictionaryLemmatizer to output several lemmas for a given word postag pair

Posted by co...@apache.org.
OPENNLP-1061 Add functionality to DictionaryLemmatizer to output several lemmas for a given word postag pair

closes #202


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/a00624cf
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/a00624cf
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/a00624cf

Branch: refs/heads/LangDetect
Commit: a00624cf27791193be74a610723a9a0b0980d23f
Parents: c434b3a
Author: Rodrigo Agerri <ra...@apache.org>
Authored: Tue May 16 12:35:22 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Tue May 16 15:32:37 2017 +0200

----------------------------------------------------------------------
 .../tools/lemmatizer/DictionaryLemmatizer.java  | 11 ++--
 .../DictionaryLemmatizerMultiTest.java          | 64 ++++++++++++++++++++
 .../tools/lemmatizer/smalldictionarymulti.dict  |  5 ++
 3 files changed, 76 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/a00624cf/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
index 37d488c..97d6854 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
@@ -29,7 +29,7 @@ import java.util.Map;
 
 /**
  * Lemmatize by simple dictionary lookup into a hashmap built from a file
- * containing, for each line, word\tablemma\tabpostag.
+ * containing, for each line, word\tabpostag\tablemma.
  * @version 2014-07-08
  */
 public class DictionaryLemmatizer implements Lemmatizer {
@@ -42,7 +42,9 @@ public class DictionaryLemmatizer implements Lemmatizer {
   /**
    * Construct a hashmap from the input tab separated dictionary.
    *
-   * The input file should have, for each line, word\tablemma\tabpostag
+   * The input file should have, for each line, word\tabpostag\tablemma.
+   * Alternatively, if multiple lemmas are possible for each word,postag pair,
+   * then the format should be word\tab\postag\tablemma01#lemma02#lemma03
    *
    * @param dictionary
    *          the input dictionary via inputstream
@@ -54,7 +56,8 @@ public class DictionaryLemmatizer implements Lemmatizer {
     String line;
     while ((line = breader.readLine()) != null) {
       final String[] elems = line.split("\t");
-      this.dictMap.put(Arrays.asList(elems[0], elems[1]), Arrays.asList(elems[2]));
+      final String[] lemmas = elems[2].split("#");
+      this.dictMap.put(Arrays.asList(elems[0], elems[1]), Arrays.asList(lemmas));
     }
   }
 
@@ -137,7 +140,7 @@ public class DictionaryLemmatizer implements Lemmatizer {
     final List<String> keys = this.getDictKeys(word, postag);
     // lookup lemma as value of the map
     final List<String> keyValues = this.dictMap.get(keys);
-    if (!keyValues.isEmpty()) {
+    if (keyValues != null && !keyValues.isEmpty()) {
       lemmasList.addAll(keyValues);
     } else {
       lemmasList.add("O");

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a00624cf/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerMultiTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerMultiTest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerMultiTest.java
new file mode 100644
index 0000000..d29830b
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/DictionaryLemmatizerMultiTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.lemmatizer;
+
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class DictionaryLemmatizerMultiTest {
+
+  private static DictionaryLemmatizer dictionaryLemmatizer;
+
+  @BeforeClass
+  public static void loadDictionary() throws Exception {
+    dictionaryLemmatizer = new DictionaryLemmatizer(
+        DictionaryLemmatizerTest.class.getResourceAsStream(
+          "/opennlp/tools/lemmatizer/smalldictionarymulti.dict")
+    );
+  }
+  
+  @Test
+  public void testForNullPointerException() {
+    List<String> sentence = Arrays.asList("The","dogs","were","running","and","barking",
+        "down","the","street");
+    List<String> sentencePOS = Arrays.asList("DT","NNS","VBD","VBG","CC","VBG","RP","DT","NN");
+    List<List<String>> expectedLemmas = new ArrayList<>();
+    expectedLemmas.add(Arrays.asList("the"));
+    expectedLemmas.add(Arrays.asList("dog"));
+    expectedLemmas.add(Arrays.asList("is"));
+    expectedLemmas.add(Arrays.asList("run,run"));
+    expectedLemmas.add(Arrays.asList("and"));
+    expectedLemmas.add(Arrays.asList("bark,bark"));
+    expectedLemmas.add(Arrays.asList("down"));
+    expectedLemmas.add(Arrays.asList("the"));
+    expectedLemmas.add(Arrays.asList("street"));
+    
+    List<List<String>> actualLemmas = dictionaryLemmatizer.lemmatize(sentence, sentencePOS);
+    
+    for (int i = 0; i < sentence.size(); i++) {
+      // don't compare cases where the word is not in the dictionary...
+      if (!actualLemmas.get(0).get(0).equals("O")) 
+        Assert.assertEquals(expectedLemmas.get(i), actualLemmas.get(i));
+    }
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a00624cf/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionarymulti.dict
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionarymulti.dict b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionarymulti.dict
new file mode 100644
index 0000000..b650a0b
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/smalldictionarymulti.dict
@@ -0,0 +1,5 @@
+barking	VBG	bark#bark
+dogs	NNS	dog
+running	VBG	run#run
+down	RP	down
+street	NN	street
\ No newline at end of file


[25/50] opennlp git commit: OPENNLP-1042: Correctly tokenize reference parse text

Posted by co...@apache.org.
OPENNLP-1042: Correctly tokenize reference parse text

Closes #180


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/b4eb2910
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/b4eb2910
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/b4eb2910

Branch: refs/heads/LangDetect
Commit: b4eb29107afb3cad68d3acadd6dccaa63ceef9a9
Parents: 6c2dbf2
Author: Jörn Kottmann <jo...@apache.org>
Authored: Fri Apr 21 15:45:53 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu May 4 09:29:40 2017 +0200

----------------------------------------------------------------------
 .../src/main/java/opennlp/tools/parser/Parse.java   | 16 ++++++++++++++++
 .../java/opennlp/tools/parser/ParserEvaluator.java  | 11 +++++------
 2 files changed, 21 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/b4eb2910/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
index 5ee4f0a..d4265cf 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/Parse.java
@@ -972,6 +972,22 @@ public class Parse implements Cloneable, Comparable<Parse> {
     return tags.toArray(new Parse[tags.size()]);
   }
 
+  public Parse[] getTokenNodes() {
+    List<Parse> tokens = new LinkedList<>();
+    List<Parse> nodes = new LinkedList<>();
+    nodes.addAll(this.parts);
+    while (nodes.size() != 0) {
+      Parse p = nodes.remove(0);
+      if (p.getType().equals(AbstractBottomUpParser.TOK_NODE)) {
+        tokens.add(p);
+      }
+      else {
+        nodes.addAll(0, p.parts);
+      }
+    }
+    return tokens.toArray(new Parse[tokens.size()]);
+  }
+
   /**
    * Returns the deepest shared parent of this node and the specified node.
    * If the nodes are identical then their parent is returned.

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b4eb2910/opennlp-tools/src/main/java/opennlp/tools/parser/ParserEvaluator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserEvaluator.java
index 64b2b42..013a6c1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserEvaluator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserEvaluator.java
@@ -89,15 +89,14 @@ public class ParserEvaluator extends Evaluator<Parse> {
     return consts.toArray(new Span[consts.size()]);
   }
 
-  /* (non-Javadoc)
-   * @see opennlp.tools.util.eval.Evaluator#processSample(java.lang.Object)
-   */
   @Override
   protected final Parse processSample(final Parse reference) {
+    List<String> tokens = new ArrayList<>();
+    for (Parse token : reference.getTokenNodes()) {
+      tokens.add(token.getSpan().getCoveredText(reference.getText()).toString());
+    }
 
-    String sentenceText = reference.getText();
-
-    Parse[] predictions = ParserTool.parseLine(sentenceText, parser, 1);
+    Parse[] predictions = ParserTool.parseLine(String.join(" ", tokens), parser, 1);
 
     Parse prediction = null;
     if (predictions.length > 0) {


[10/50] opennlp git commit: OPENNLP-1034: Move serializers to resource mapping to GeneratorFactory

Posted by co...@apache.org.
OPENNLP-1034: Move serializers to resource mapping to GeneratorFactory

Closes #173


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f74a86f4
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f74a86f4
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f74a86f4

Branch: refs/heads/LangDetect
Commit: f74a86f4b6a6f93d3a1e10f2a4852c5898feefb3
Parents: 041507d
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed Apr 19 18:34:15 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu Apr 20 18:05:40 2017 +0200

----------------------------------------------------------------------
 .../TokenNameFinderCrossValidatorTool.java      | 10 ++-
 .../namefind/TokenNameFinderTrainerTool.java    | 77 ++++----------------
 .../postag/POSTaggerCrossValidatorTool.java     |  9 ++-
 .../cmdline/postag/POSTaggerTrainerTool.java    | 11 ++-
 .../tools/util/featuregen/GeneratorFactory.java | 41 ++++++++++-
 .../util/featuregen/GeneratorFactoryTest.java   |  2 +-
 6 files changed, 78 insertions(+), 72 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
index 0ee3738..6e62577 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderCrossValidatorTool.java
@@ -71,8 +71,14 @@ public final class TokenNameFinderCrossValidatorTool
     byte[] featureGeneratorBytes =
         TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen());
 
-    Map<String, Object> resources =
-        TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen());
+    Map<String, Object> resources;
+
+    try {
+      resources = TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen());
+    }
+    catch (IOException e) {
+      throw new TerminateToolException(-1,"IO error while loading resources", e);
+    }
 
     if (params.getNameTypes() != null) {
       String[] nameTypes = params.getNameTypes().split(",");

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
index 4fb8cb9..f3cef48 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/namefind/TokenNameFinderTrainerTool.java
@@ -20,13 +20,9 @@ package opennlp.tools.cmdline.namefind;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
-import java.util.ArrayList;
 import java.util.HashMap;
-import java.util.List;
 import java.util.Map;
 
-import org.w3c.dom.Element;
-
 import opennlp.tools.cmdline.AbstractTrainerTool;
 import opennlp.tools.cmdline.CmdLineUtil;
 import opennlp.tools.cmdline.TerminateToolException;
@@ -89,79 +85,31 @@ public final class TokenNameFinderTrainerTool
    * @param featureGenDescriptor the feature xml descriptor
    * @return a map consisting of the file name of the resource and its corresponding Object
    */
-  public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor) {
+  public static Map<String, Object> loadResources(File resourcePath, File featureGenDescriptor)
+      throws IOException {
     Map<String, Object> resources = new HashMap<>();
 
     if (resourcePath != null) {
+      Map<String, ArtifactSerializer> artifactSerializers = new HashMap<>();
 
-      Map<String, ArtifactSerializer> artifactSerializers = TokenNameFinderModel
-          .createArtifactSerializers();
-      List<Element> elements = new ArrayList<>();
-      ArtifactSerializer serializer = null;
-
-
-      // TODO: If there is descriptor file, it should be consulted too
       if (featureGenDescriptor != null) {
 
         try (InputStream xmlDescriptorIn = CmdLineUtil.openInFile(featureGenDescriptor)) {
           artifactSerializers.putAll(
-              GeneratorFactory.extractCustomArtifactSerializerMappings(xmlDescriptorIn));
-        } catch (IOException e) {
-          // TODO: Improve error handling!
-          e.printStackTrace();
-        }
-
-        try (InputStream inputStreamXML = CmdLineUtil.openInFile(featureGenDescriptor)) {
-          elements = GeneratorFactory.getDescriptorElements(inputStreamXML);
-        } catch (IOException e) {
-          e.printStackTrace();
+              GeneratorFactory.extractArtifactSerializerMappings(xmlDescriptorIn));
         }
       }
 
-      File[] resourceFiles = resourcePath.listFiles();
-
-      for (File resourceFile : resourceFiles) {
-        String resourceName = resourceFile.getName();
-        //gettting the serializer key from the element tag name
-        //if the element contains a dict attribute
-        for (Element xmlElement : elements) {
-          String dictName = xmlElement.getAttribute("dict");
-          if (dictName != null && dictName.equals(resourceName)) {
-            serializer = artifactSerializers.get(xmlElement.getTagName());
-          }
-        }
-        // TODO: Do different? For now just ignore ....
-        if (serializer == null)
-          continue;
-
-        try (InputStream resourceIn = CmdLineUtil.openInFile(resourceFile)) {
-          resources.put(resourceName, serializer.create(resourceIn));
-        } catch (IOException e) {
-          // TODO: Fix exception handling
-          e.printStackTrace();
+      for (Map.Entry<String, ArtifactSerializer> serializerMapping : artifactSerializers.entrySet()) {
+        String resourceName = serializerMapping.getKey();
+        try (InputStream resourceIn = CmdLineUtil.openInFile(new File(resourcePath, resourceName))) {
+          resources.put(resourceName, serializerMapping.getValue().create(resourceIn));
         }
       }
     }
     return resources;
   }
 
-  /**
-   * Calls a loadResources method above to load any external resource required for training.
-   * @param resourceDirectory the directory where the resources are to be found
-   * @param featureGeneratorDescriptor the xml feature generator
-   * @return a map containing the file name of the resource and its mapped Object
-   */
-  static Map<String, Object> loadResources(String resourceDirectory, File featureGeneratorDescriptor) {
-
-    if (resourceDirectory != null) {
-      File resourcePath = new File(resourceDirectory);
-
-      return loadResources(resourcePath, featureGeneratorDescriptor);
-    }
-
-    return new HashMap<>();
-  }
-
   public void run(String format, String[] args) {
     super.run(format, args);
 
@@ -174,12 +122,17 @@ public final class TokenNameFinderTrainerTool
 
     byte[] featureGeneratorBytes = openFeatureGeneratorBytes(params.getFeaturegen());
 
-
     // TODO: Support Custom resources:
     //       Must be loaded into memory, or written to tmp file until descriptor
     //       is loaded which defines parses when model is loaded
 
-    Map<String, Object> resources = loadResources(params.getResources(), params.getFeaturegen());
+    Map<String, Object> resources;
+    try {
+      resources = loadResources(params.getResources(), params.getFeaturegen());
+    }
+    catch (IOException e) {
+      throw new TerminateToolException(-1, e.getMessage(), e);
+    }
 
     CmdLineUtil.checkOutputFile("name finder model", modelOutFile);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
index 67ad2b9..c6a37a8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerCrossValidatorTool.java
@@ -77,8 +77,13 @@ public final class POSTaggerCrossValidatorTool
       }
     }
 
-    Map<String, Object> resources = TokenNameFinderTrainerTool.loadResources(
-        params.getResources(), params.getFeaturegen());
+    Map<String, Object> resources;
+    try {
+      resources = TokenNameFinderTrainerTool.loadResources(params.getResources(), params.getFeaturegen());
+    }
+    catch (IOException e) {
+      throw new TerminateToolException(-1,"IO error while loading resources", e);
+    }
 
     byte[] featureGeneratorBytes =
         TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
index b922176..ca614f9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/postag/POSTaggerTrainerTool.java
@@ -67,8 +67,15 @@ public final class POSTaggerTrainerTool
     File modelOutFile = params.getModel();
     CmdLineUtil.checkOutputFile("pos tagger model", modelOutFile);
 
-    Map<String, Object> resources = TokenNameFinderTrainerTool.loadResources(
-        params.getResources(), params.getFeaturegen());
+    Map<String, Object> resources;
+
+    try {
+      resources = TokenNameFinderTrainerTool.loadResources(
+          params.getResources(), params.getFeaturegen());
+    }
+    catch (IOException e) {
+      throw new TerminateToolException(-1,"IO error while loading resources", e);
+    }
 
     byte[] featureGeneratorBytes =
         TokenNameFinderTrainerTool.openFeatureGeneratorBytes(params.getFeaturegen());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
index a1ac72b..5060961 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
@@ -755,7 +755,7 @@ public class GeneratorFactory {
     return createGenerator(generatorElement, resourceManager);
   }
 
-  public static Map<String, ArtifactSerializer<?>> extractCustomArtifactSerializerMappings(
+  public static Map<String, ArtifactSerializer<?>> extractArtifactSerializerMappings(
       InputStream xmlDescriptorIn) throws IOException {
 
     Map<String, ArtifactSerializer<?>> mapping = new HashMap<>();
@@ -764,7 +764,6 @@ public class GeneratorFactory {
 
     XPath xPath = XPathFactory.newInstance().newXPath();
 
-
     NodeList customElements;
     try {
       XPathExpression exp = xPath.compile("//custom");
@@ -774,7 +773,6 @@ public class GeneratorFactory {
     }
 
     for (int i = 0; i < customElements.getLength(); i++) {
-
       if (customElements.item(i) instanceof Element) {
         Element customElement = (Element) customElements.item(i);
 
@@ -788,6 +786,43 @@ public class GeneratorFactory {
         }
       }
     }
+
+    NodeList allElements;
+    try {
+      XPathExpression exp = xPath.compile("//*");
+      allElements = (NodeList) exp.evaluate(xmlDescriptorDOM.getDocumentElement(), XPathConstants.NODESET);
+    } catch (XPathExpressionException e) {
+      throw new IllegalStateException("The hard coded XPath expression should always be valid!");
+    }
+
+    for (int i = 0; i < allElements.getLength(); i++) {
+      if (allElements.item(i) instanceof Element) {
+        Element xmlElement = (Element) allElements.item(i);
+
+        String dictName = xmlElement.getAttribute("dict");
+        if (dictName != null) {
+
+          switch (xmlElement.getTagName()) {
+            case "wordcluster":
+              mapping.put(dictName, new WordClusterDictionary.WordClusterDictionarySerializer());
+              break;
+
+            case "brownclustertoken":
+              mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
+              break;
+
+            case "brownclustertokenclass"://, ;
+              mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
+              break;
+
+            case "brownclusterbigram": //, ;
+              mapping.put(dictName, new BrownCluster.BrownClusterSerializer());
+              break;
+          }
+        }
+      }
+    }
+
     return mapping;
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/f74a86f4/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
index 8a48575..dd569b0 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/GeneratorFactoryTest.java
@@ -120,7 +120,7 @@ public class GeneratorFactoryTest {
         "/opennlp/tools/util/featuregen/CustomClassLoadingWithSerializers.xml");
 
     Map<String, ArtifactSerializer<?>> mapping =
-        GeneratorFactory.extractCustomArtifactSerializerMappings(descIn);
+        GeneratorFactory.extractArtifactSerializerMappings(descIn);
 
     Assert.assertTrue(mapping.get("test.resource") instanceof WordClusterDictionarySerializer);
   }


[06/50] opennlp git commit: OPENNLP-1029: Add tests for InsufficientTrainingDataException, closes apache/opennlp#167

Posted by co...@apache.org.
OPENNLP-1029: Add tests for InsufficientTrainingDataException, closes apache/opennlp#167


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d447459a
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d447459a
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d447459a

Branch: refs/heads/LangDetect
Commit: d447459a682cdc7e06b3980a59100ea94a6a180a
Parents: 45ea3f7
Author: jzonthemtn <je...@mtnfog.com>
Authored: Wed Apr 19 14:11:20 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Wed Apr 19 14:11:20 2017 -0400

----------------------------------------------------------------------
 .../opennlp/tools/chunker/ChunkerMETest.java    | 18 +++++++++++
 .../tools/doccat/DocumentCategorizerMETest.java | 17 ++++++++++
 .../tools/lemmatizer/LemmatizerMETest.java      | 21 +++++++++++--
 .../TokenNameFinderCrossValidatorTest.java      | 33 +++++++++++++++++---
 .../opennlp/tools/postag/POSTaggerMETest.java   | 24 ++++++++++++--
 .../sentdetect/SentenceDetectorMETest.java      | 28 +++++++++++++++--
 .../opennlp/tools/tokenize/TokenizerMETest.java | 26 +++++++++++++++
 .../opennlp/tools/chunker/test-insufficient.txt |  1 +
 .../tools/lemmatizer/trial.old-insufficient.tsv |  1 +
 .../namefind/AnnotatedSentencesInsufficient.txt |  5 +++
 .../postag/AnnotatedSentencesInsufficient.txt   |  1 +
 .../tools/sentdetect/SentencesInsufficient.txt  |  1 +
 .../tools/tokenize/token-insufficient.train     |  1 +
 13 files changed, 166 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
index 51112df..facb408 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
@@ -27,6 +27,7 @@ import org.junit.Test;
 
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.namefind.NameFinderME;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Sequence;
@@ -128,5 +129,22 @@ public class ChunkerMETest {
     Assert.assertEquals(Arrays.asList(expect1), preds[0].getOutcomes());
     Assert.assertNotSame(Arrays.asList(expect1), preds[1].getOutcomes());
   }
+  
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    ResourceAsStreamFactory in = new ResourceAsStreamFactory(getClass(),
+        "/opennlp/tools/chunker/test-insufficient.txt");
+
+    ObjectStream<ChunkSample> sampleStream = new ChunkSampleStream(
+        new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    params.put(TrainingParameters.CUTOFF_PARAM, "1");
+
+    ChunkerME.train("en", sampleStream, params, new ChunkerFactory());
+
+  }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
index 220df87..391125e 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
@@ -24,6 +24,7 @@ import java.util.SortedMap;
 import org.junit.Assert;
 import org.junit.Test;
 
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.ObjectStreamUtils;
 import opennlp.tools.util.TrainingParameters;
@@ -61,4 +62,20 @@ public class DocumentCategorizerMETest {
     Set<String> cat = sortedScoreMap.get(sortedScoreMap.lastKey());
     Assert.assertEquals(1, cat.size());
   }
+  
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void insufficientTestData() throws IOException {
+
+    ObjectStream<DocumentSample> samples = ObjectStreamUtils.createObjectStream(
+        new DocumentSample("1", new String[]{"a", "b", "c"}));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    DocumentCategorizerME.train("x-unspecified", samples,
+        params, new DoccatFactory());
+
+  }
+  
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
index 4631763..f00f2b4 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
@@ -24,6 +24,7 @@ import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Test;
 
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.MockInputStreamFactory;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
@@ -68,8 +69,8 @@ public class LemmatizerMETest {
           new File("opennlp/tools/lemmatizer/trial.old.tsv")), "UTF-8"));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5));
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
 
     LemmatizerModel lemmatizerModel = LemmatizerME.train("en", sampleStream,
         params, new LemmatizerFactory());
@@ -84,5 +85,21 @@ public class LemmatizerMETest {
 
     Assert.assertArrayEquals(expect, lemmas);
   }
+  
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+ 
+    ObjectStream<LemmaSample> sampleStream = new LemmaSampleStream(
+        new PlainTextByLineStream(new MockInputStreamFactory(
+            new File("opennlp/tools/lemmatizer/trial.old-insufficient.tsv")),
+                "UTF-8"));
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+
+    LemmatizerME.train("en", sampleStream, params, new LemmatizerFactory());
+
+  }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
index 679726d..9e31987 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
@@ -28,6 +28,7 @@ import org.junit.Test;
 import opennlp.tools.cmdline.namefind.NameEvaluationErrorListener;
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
@@ -50,8 +51,8 @@ public class TokenNameFinderCrossValidatorTest {
         new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
 
     mlParams.put(TrainingParameters.ALGORITHM_PARAM,
         ModelType.MAXENT.toString());
@@ -77,8 +78,8 @@ public class TokenNameFinderCrossValidatorTest {
         new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
 
     mlParams.put(TrainingParameters.ALGORITHM_PARAM,
         ModelType.MAXENT.toString());
@@ -95,4 +96,28 @@ public class TokenNameFinderCrossValidatorTest {
     Assert.assertTrue(out.size() > 0);
     Assert.assertNotNull(cv.getFMeasure());
   }
+  
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testWithInsufficientData() throws Exception {
+
+    InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
+        "/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt");
+
+    ObjectStream<NameSample> sampleStream = new NameSampleDataStream(
+        new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
+
+    mlParams.put(TrainingParameters.ALGORITHM_PARAM,
+        ModelType.MAXENT.toString());
+
+    TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en",
+        TYPE, mlParams, null, (TokenNameFinderEvaluationMonitor)null);
+
+    cv.evaluate(sampleStream, 2);
+
+  }
+  
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
index 51cae2c..e2bca48 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
@@ -25,6 +25,7 @@ import org.junit.Test;
 
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.TrainingParameters;
@@ -50,8 +51,8 @@ public class POSTaggerMETest {
   static POSModel trainPOSModel(ModelType type) throws IOException {
     TrainingParameters params = new TrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM, type.toString());
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5));
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
 
     return POSTaggerME.train("en", createSampleStream(), params,
         new POSTaggerFactory());
@@ -85,4 +86,23 @@ public class POSTaggerMETest {
     ObjectStream<POSSample> samples = createSampleStream();
     POSTaggerME.buildNGramDictionary(samples, 0);
   }
+  
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void insufficientTestData() throws IOException {
+
+    InputStreamFactory in = new ResourceAsStreamFactory(POSTaggerMETest.class,
+        "/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt");
+
+    ObjectStream<POSSample> stream = new WordTagSampleStream(
+        new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+ 
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.name());
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+
+    POSTaggerME.train("en", stream, params, new POSTaggerFactory());
+
+  }
+  
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
index 43d5829..220650d 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
@@ -26,6 +26,7 @@ import org.junit.Test;
 
 import opennlp.tools.formats.ResourceAsStreamFactory;
 import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
 import opennlp.tools.util.PlainTextByLineStream;
 import opennlp.tools.util.Span;
 import opennlp.tools.util.TrainingParameters;
@@ -42,12 +43,14 @@ public class SentenceDetectorMETest {
         "/opennlp/tools/sentdetect/Sentences.txt");
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
 
     SentenceModel sentdetectModel = SentenceDetectorME.train(
         "en", new SentenceSampleStream(new PlainTextByLineStream(in,
-            StandardCharsets.UTF_8)), true, null, mlParams);
+            StandardCharsets.UTF_8)), factory, mlParams);
 
     Assert.assertEquals("en", sentdetectModel.getLanguage());
 
@@ -132,4 +135,23 @@ public class SentenceDetectorMETest {
     Assert.assertEquals(new Span(16, 56), pos[1]);
 
   }
+  
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    InputStreamFactory in = new ResourceAsStreamFactory(getClass(),
+        "/opennlp/tools/sentdetect/SentencesInsufficient.txt");
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
+    
+    SentenceDetectorME.train("en", 
+        new SentenceSampleStream(
+            new PlainTextByLineStream(in, StandardCharsets.UTF_8)), factory, mlParams);
+    
+  }
+  
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
index 5a7a811..14b9185 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
@@ -18,10 +18,18 @@
 package opennlp.tools.tokenize;
 
 import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 
 import org.junit.Assert;
 import org.junit.Test;
 
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.InsufficientTrainingDataException;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
 /**
  * Tests for the {@link TokenizerME} class.
  *
@@ -65,4 +73,22 @@ public class TokenizerMETest {
     Assert.assertEquals("through", tokens[7]);
     Assert.assertEquals("!", tokens[8]);
   }
+  
+  @Test(expected = InsufficientTrainingDataException.class)
+  public void testInsufficientData() throws IOException {
+
+    InputStreamFactory trainDataIn = new ResourceAsStreamFactory(
+        TokenizerModel.class, "/opennlp/tools/tokenize/token-insufficient.train");
+
+    ObjectStream<TokenSample> samples = new TokenSampleStream(
+        new PlainTextByLineStream(trainDataIn, StandardCharsets.UTF_8));
+
+    TrainingParameters mlParams = new TrainingParameters();
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, "5");
+
+    TokenizerME.train(samples, TokenizerFactory.create(null, "en", null, true, null), mlParams);
+
+  }
+  
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt
new file mode 100644
index 0000000..a578590
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/chunker/test-insufficient.txt
@@ -0,0 +1 @@
+Rockwell NNP B-NP
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv
new file mode 100644
index 0000000..89c2aee
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/lemmatizer/trial.old-insufficient.tsv
@@ -0,0 +1 @@
+The	DT	the
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt
new file mode 100644
index 0000000..c70ec6d
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/namefind/AnnotatedSentencesInsufficient.txt
@@ -0,0 +1,5 @@
+Last September, I tried to find out the address of an old school friend whom I hadnt't seen for 15 years.
+I just knew his name , <START> Alan McKennedy <END> , and I'd heard the rumour that he'd moved to Scotland, the country of his ancestors.
+So I called <START> Julie <END> , a friend who's still in contact with him.
+She told me that he lived in 23213 Edinburgh, Worcesterstreet 12.
+I wrote him a letter right away and he answered soon, sounding very happy and delighted.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt
new file mode 100644
index 0000000..786f182
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/postag/AnnotatedSentencesInsufficient.txt
@@ -0,0 +1 @@
+Find_VB out_RP.
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt
new file mode 100644
index 0000000..0465ce2
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/sentdetect/SentencesInsufficient.txt
@@ -0,0 +1 @@
+Last September, I tried to find out the address of an old school friend whom I hadnt't seen for 15 years.

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d447459a/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train b/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train
new file mode 100644
index 0000000..db4a49d
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/tokenize/token-insufficient.train
@@ -0,0 +1 @@
+I tried to find out the address of an old school .
\ No newline at end of file


[41/50] opennlp git commit: OPENNLP-1059: Update README.md, this closes apache/opennlp#198

Posted by co...@apache.org.
OPENNLP-1059: Update README.md, this closes apache/opennlp#198


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/068b1f3c
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/068b1f3c
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/068b1f3c

Branch: refs/heads/LangDetect
Commit: 068b1f3ce5b46c32df08e3558b923e4d9ba362b0
Parents: 108fa9a
Author: beylerian <>
Authored: Mon May 15 14:10:47 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Mon May 15 14:12:50 2017 -0400

----------------------------------------------------------------------
 README.md | 79 ++++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 66 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/068b1f3c/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 4b27762..824afe0 100644
--- a/README.md
+++ b/README.md
@@ -26,29 +26,54 @@ Welcome to Apache OpenNLP!
 [![Twitter Follow](https://img.shields.io/twitter/follow/ApacheOpennlp.svg?style=social)](https://twitter.com/ApacheOpenNLP)
 
 The Apache OpenNLP library is a machine learning based toolkit for the processing of natural language text.
-It supports the most common NLP tasks, such as tokenization, sentence segmentation,
-part-of-speech tagging, named entity extraction, chunking, parsing, and coreference resolution.
+
+This toolkit is written completely in Java and provides support for common NLP tasks, such as tokenization, sentence segmentation, part-of-speech tagging, named entity extraction, chunking, parsing, coreference resolution and more!
+
 These tasks are usually required to build more advanced text processing services.
-OpenNLP also includes maximum entropy and perceptron based machine learning.
-      
-The goal of the OpenNLP project is to create a mature toolkit for the above mentioned tasks.
+
+The goal of the OpenNLP project is to be a mature toolkit for the above-mentioned tasks.
+
 An additional goal is to provide a large number of pre-built models for a variety of languages, as
 well as the annotated text resources that those models are derived from.
 
-For additional information about OpenNLP, visit the [OpenNLP Home Page](http://opennlp.apache.org/)
+Currently, OpenNLP also includes common classifiers such as Maximum Entropy, Perceptron and Naive Bayes.
+
+OpenNLP can be used both programmatically through its Java API or from a terminal through its CLI.
+
+## Useful Links
+       
+For additional information, visit the [OpenNLP Home Page](http://opennlp.apache.org/)
+
+You can use OpenNLP with any language, demo models are provided [here](http://opennlp.sourceforge.net/models-1.5/).
+
+The models are fully compatible with the latest release, they can be used for testing or getting started. 
+
+Please train your own models for all other use cases.
+
+Documentation, including JavaDocs, code usage and command-line interface examples are available [here](http://opennlp.apache.org/docs/)
+
+You can also follow our [mailing lists](http://opennlp.apache.org/mailing-lists.html) for news and updates.
+
+## Overview
+
+Currently the library has different packages:
+
+`opennlp-tools` : The core toolkit.
+
+`opennlp-uima` : A set of [Apache UIMA](https://uima.apache.org) annotators.
+
+`opennlp-brat-annotator` : A set of annotators for [BRAT](http://brat.nlplab.org/)
 
-Documentation for OpenNLP, including JavaDocs, code usage and command line interface are available [here](http://opennlp.apache.org/documentation.html)
+`opennlp-morfologik-addon` : An addon for Morfologik
 
-#### Using OpenNLP as a Library
+`opennlp-sandbox`: Other projects in progress are found in the [sandbox](https://github.com/apache/opennlp-sandbox)
 
-Running any application that uses OpenNLP will require installing a binary or source version and setting the environment.
-To compile from source:
 
-* `mvn -DskipTests clean install`
-* To run tests do `mvn test`
+## Getting Started
 
-To use maven, add the appropriate setting to your pom.xml or build.sbt following the template below.
+You can import the core toolkit directly from Maven, SBT or Gradle:
 
+#### Maven
 ```
 <dependency>
     <groupId>org.apache.opennlp</groupId>
@@ -57,3 +82,31 @@ To use maven, add the appropriate setting to your pom.xml or build.sbt following
 </dependency>
 ```
 
+#### SBT
+```
+libraryDependencies += "org.apache.opennlp" % "opennlp-tools" % "${opennlp.version}"
+```
+
+#### Gradle
+```
+compile group: "org.apache.opennlp", name: "opennlp-tools", version: "$opennlp.version"
+```
+
+
+For more details please check our [documentation](http://opennlp.apache.org/docs/)
+
+## Building OpenNLP
+
+At least JDK 8 and Maven 3.3.9 are required to build the library.
+
+After cloning the repository go into the destination directory and run:
+
+```
+mvn install
+```
+
+## Contributing
+
+The Apache OpenNLP project is developed by volunteers and is always looking for new contributors to work on all parts of the project. Every contribution is welcome and needed to make it better. A contribution can be anything from a small documentation typo fix to a new component.
+
+If you would like to get involved please follow the instructions [here](https://github.com/apache/opennlp/blob/master/.github/CONTRIBUTING.md)
\ No newline at end of file


[38/50] opennlp git commit: [maven-release-plugin] prepare for next development iteration

Posted by co...@apache.org.
[maven-release-plugin] prepare for next development iteration


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/652edde7
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/652edde7
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/652edde7

Branch: refs/heads/LangDetect
Commit: 652edde7038a241469bb2c425be79c12c886ff8a
Parents: 38d7f2c
Author: Jörn Kottmann <jo...@apache.org>
Authored: Thu May 11 18:20:59 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu May 11 18:20:59 2017 +0200

----------------------------------------------------------------------
 opennlp-brat-annotator/pom.xml   | 2 +-
 opennlp-distr/pom.xml            | 2 +-
 opennlp-docs/pom.xml             | 2 +-
 opennlp-morfologik-addon/pom.xml | 2 +-
 opennlp-tools/pom.xml            | 2 +-
 opennlp-uima/pom.xml             | 2 +-
 pom.xml                          | 4 ++--
 7 files changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/652edde7/opennlp-brat-annotator/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 008fd65..0791e6b 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -17,7 +17,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0</version>
+		<version>1.8.1-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/652edde7/opennlp-distr/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-distr/pom.xml b/opennlp-distr/pom.xml
index 317c37f..4428240 100644
--- a/opennlp-distr/pom.xml
+++ b/opennlp-distr/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0</version>
+		<version>1.8.1-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/652edde7/opennlp-docs/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/pom.xml b/opennlp-docs/pom.xml
index 6b407b8..312f6b8 100644
--- a/opennlp-docs/pom.xml
+++ b/opennlp-docs/pom.xml
@@ -24,7 +24,7 @@
   <parent>
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0</version>
+	<version>1.8.1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
   

http://git-wip-us.apache.org/repos/asf/opennlp/blob/652edde7/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index bfae09f..8c5b9f4 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0</version>
+		<version>1.8.1-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/652edde7/opennlp-tools/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index 573861b..a2cf596 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -25,7 +25,7 @@
   <parent>
     <groupId>org.apache.opennlp</groupId>
     <artifactId>opennlp</artifactId>
-    <version>1.8.0</version>
+    <version>1.8.1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/652edde7/opennlp-uima/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-uima/pom.xml b/opennlp-uima/pom.xml
index 1db9c38..d8f5246 100644
--- a/opennlp-uima/pom.xml
+++ b/opennlp-uima/pom.xml
@@ -25,7 +25,7 @@
 	<parent>
 	    <groupId>org.apache.opennlp</groupId>
 	    <artifactId>opennlp</artifactId>
-	    <version>1.8.0</version>
+	    <version>1.8.1-SNAPSHOT</version>
 	    <relativePath>../pom.xml</relativePath>
     </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/652edde7/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 29a0699..03811a8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
 
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0</version>
+	<version>1.8.1-SNAPSHOT</version>
 	<packaging>pom</packaging>
 
 	<name>Apache OpenNLP Reactor</name>
@@ -40,7 +40,7 @@
 		<connection>scm:git:git@github.com:apache/opennlp.git</connection>
 		<developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/opennlp.git</developerConnection>
 		<url>https://git-wip-us.apache.org/repos/asf?p=opennlp.git</url>
-		<tag>opennlp-1.8.0</tag>
+		<tag>HEAD</tag>
 	</scm>
 
 	<mailingLists>


[37/50] opennlp git commit: [maven-release-plugin] prepare release opennlp-1.8.0

Posted by co...@apache.org.
[maven-release-plugin] prepare release opennlp-1.8.0


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/38d7f2cc
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/38d7f2cc
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/38d7f2cc

Branch: refs/heads/LangDetect
Commit: 38d7f2ccc3231a0193ecd2ca05399025ba762e2c
Parents: cb6ee2c
Author: Jörn Kottmann <jo...@apache.org>
Authored: Thu May 11 18:17:56 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu May 11 18:17:56 2017 +0200

----------------------------------------------------------------------
 opennlp-brat-annotator/pom.xml   | 2 +-
 opennlp-distr/pom.xml            | 2 +-
 opennlp-docs/pom.xml             | 2 +-
 opennlp-morfologik-addon/pom.xml | 2 +-
 opennlp-tools/pom.xml            | 2 +-
 opennlp-uima/pom.xml             | 2 +-
 pom.xml                          | 4 ++--
 7 files changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/38d7f2cc/opennlp-brat-annotator/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 6c7be0d..008fd65 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -17,7 +17,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0-SNAPSHOT</version>
+		<version>1.8.0</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/38d7f2cc/opennlp-distr/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-distr/pom.xml b/opennlp-distr/pom.xml
index 3f838cd..317c37f 100644
--- a/opennlp-distr/pom.xml
+++ b/opennlp-distr/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0-SNAPSHOT</version>
+		<version>1.8.0</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/38d7f2cc/opennlp-docs/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/pom.xml b/opennlp-docs/pom.xml
index fbf0b5c..6b407b8 100644
--- a/opennlp-docs/pom.xml
+++ b/opennlp-docs/pom.xml
@@ -24,7 +24,7 @@
   <parent>
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0-SNAPSHOT</version>
+	<version>1.8.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
   

http://git-wip-us.apache.org/repos/asf/opennlp/blob/38d7f2cc/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index c46f101..bfae09f 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0-SNAPSHOT</version>
+		<version>1.8.0</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/38d7f2cc/opennlp-tools/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index a499375..573861b 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -25,7 +25,7 @@
   <parent>
     <groupId>org.apache.opennlp</groupId>
     <artifactId>opennlp</artifactId>
-    <version>1.8.0-SNAPSHOT</version>
+    <version>1.8.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/38d7f2cc/opennlp-uima/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-uima/pom.xml b/opennlp-uima/pom.xml
index 7cfdb72..1db9c38 100644
--- a/opennlp-uima/pom.xml
+++ b/opennlp-uima/pom.xml
@@ -25,7 +25,7 @@
 	<parent>
 	    <groupId>org.apache.opennlp</groupId>
 	    <artifactId>opennlp</artifactId>
-	    <version>1.8.0-SNAPSHOT</version>
+	    <version>1.8.0</version>
 	    <relativePath>../pom.xml</relativePath>
     </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/38d7f2cc/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2190a26..29a0699 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
 
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0-SNAPSHOT</version>
+	<version>1.8.0</version>
 	<packaging>pom</packaging>
 
 	<name>Apache OpenNLP Reactor</name>
@@ -40,7 +40,7 @@
 		<connection>scm:git:git@github.com:apache/opennlp.git</connection>
 		<developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/opennlp.git</developerConnection>
 		<url>https://git-wip-us.apache.org/repos/asf?p=opennlp.git</url>
-		<tag>HEAD</tag>
+		<tag>opennlp-1.8.0</tag>
 	</scm>
 
 	<mailingLists>


[26/50] opennlp git commit: OPENNLP-1037: OpenNLP build fails if only the eval tests are run

Posted by co...@apache.org.
OPENNLP-1037: OpenNLP build fails if only the eval tests are run


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/7d5bd017
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/7d5bd017
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/7d5bd017

Branch: refs/heads/LangDetect
Commit: 7d5bd01744a8bd6e53ee25fa96441545bc1c9811
Parents: b4eb291 d44fe15
Author: Peter Thygesen <th...@apache.org>
Authored: Fri May 5 20:00:38 2017 +0200
Committer: Peter Thygesen <th...@apache.org>
Committed: Fri May 5 20:00:38 2017 +0200

----------------------------------------------------------------------
 opennlp-tools/pom.xml | 11 +----------
 pom.xml               | 25 +++++++++++++++++++++++++
 2 files changed, 26 insertions(+), 10 deletions(-)
----------------------------------------------------------------------



[29/50] opennlp git commit: OPENNLP-1044: Add validate() which checks validity of parameters in the process of the framework. This closes apache/opennlp#192

Posted by co...@apache.org.
OPENNLP-1044: Add validate() which checks validity of parameters in the process of the framework. This closes apache/opennlp#192


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/ca9a1d94
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/ca9a1d94
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/ca9a1d94

Branch: refs/heads/LangDetect
Commit: ca9a1d943d4cde23fe36d0c557ddb4110bad0260
Parents: 5f96aa3
Author: koji <ko...@apache.org>
Authored: Mon May 8 11:00:18 2017 +0900
Committer: koji <ko...@apache.org>
Committed: Mon May 8 11:00:18 2017 +0900

----------------------------------------------------------------------
 .../ml/AbstractEventModelSequenceTrainer.java   |  5 +--
 .../opennlp/tools/ml/AbstractEventTrainer.java  | 17 +++++-----
 .../tools/ml/AbstractSequenceTrainer.java       |  5 +--
 .../java/opennlp/tools/ml/AbstractTrainer.java  | 26 ++++++++++++---
 .../tools/ml/maxent/quasinewton/QNTrainer.java  | 34 +++++++++++++-------
 .../tools/ml/naivebayes/NaiveBayesTrainer.java  |  4 ---
 .../tools/ml/perceptron/PerceptronTrainer.java  | 20 +++++++++---
 .../SimplePerceptronSequenceTrainer.java        | 26 +++++++++++----
 8 files changed, 88 insertions(+), 49 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/ca9a1d94/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventModelSequenceTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventModelSequenceTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventModelSequenceTrainer.java
index fdcb4b6..362a0d6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventModelSequenceTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventModelSequenceTrainer.java
@@ -32,10 +32,7 @@ public abstract class AbstractEventModelSequenceTrainer extends AbstractTrainer
       throws IOException;
 
   public final MaxentModel train(SequenceStream events) throws IOException {
-
-    if (!isValid()) {
-      throw new IllegalArgumentException("trainParams are not valid!");
-    }
+    validate();
 
     MaxentModel model = doTrain(events);
     addToReport(AbstractTrainer.TRAINER_TYPE_PARAM,

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ca9a1d94/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java
index 330307a..dc75ffe 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java
@@ -42,7 +42,13 @@ public abstract class AbstractEventTrainer extends AbstractTrainer implements Ev
   public AbstractEventTrainer(TrainingParameters parameters) {
     super(parameters);
   }
-  
+
+  @Override
+  public void validate() {
+    super.validate();
+  }
+
+  @Deprecated
   @Override
   public boolean isValid() {
     return super.isValid();
@@ -66,9 +72,7 @@ public abstract class AbstractEventTrainer extends AbstractTrainer implements Ev
   public abstract MaxentModel doTrain(DataIndexer indexer) throws IOException;
 
   public final MaxentModel train(DataIndexer indexer) throws IOException {
-    if (!isValid()) {
-      throw new IllegalArgumentException("trainParams are not valid!");
-    }
+    validate();
 
     if (indexer.getOutcomeLabels().length <= 1) {
       throw new InsufficientTrainingDataException("Training data must contain more than one outcome");
@@ -80,10 +84,7 @@ public abstract class AbstractEventTrainer extends AbstractTrainer implements Ev
   }
 
   public final MaxentModel train(ObjectStream<Event> events) throws IOException {
-
-    if (!isValid()) {
-      throw new IllegalArgumentException("trainParams are not valid!");
-    }
+    validate();
 
     HashSumEventStream hses = new HashSumEventStream(events);
     DataIndexer indexer = getDataIndexer(hses);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ca9a1d94/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractSequenceTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractSequenceTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractSequenceTrainer.java
index 2d48624..19ecc4b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractSequenceTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractSequenceTrainer.java
@@ -32,10 +32,7 @@ public abstract class AbstractSequenceTrainer extends AbstractTrainer implements
       throws IOException;
 
   public final SequenceClassificationModel<String> train(SequenceStream events) throws IOException {
-
-    if (!isValid()) {
-      throw new IllegalArgumentException("trainParams are not valid!");
-    }
+    validate();
 
     SequenceClassificationModel<String> model = doTrain(events);
     addToReport(AbstractTrainer.TRAINER_TYPE_PARAM, SequenceTrainer.SEQUENCE_VALUE);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ca9a1d94/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractTrainer.java
index 070b96c..32c5df6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractTrainer.java
@@ -74,20 +74,36 @@ public abstract class AbstractTrainer {
     return trainingParameters.getIntParameter(ITERATIONS_PARAM, ITERATIONS_DEFAULT);
   }
 
-  public boolean isValid() {
-
+  /**
+   * Check parameters. If subclass overrides this, it should call super.validate();
+   *
+   * @throws java.lang.IllegalArgumentException
+   */
+  public void validate() {
     // TODO: Need to validate all parameters correctly ... error prone?!
-
     // should validate if algorithm is set? What about the Parser?
 
     try {
       trainingParameters.getIntParameter(CUTOFF_PARAM, CUTOFF_DEFAULT);
       trainingParameters.getIntParameter(ITERATIONS_PARAM, ITERATIONS_DEFAULT);
     } catch (NumberFormatException e) {
+      throw new IllegalArgumentException(e);
+    }
+  }
+
+  /**
+   * @deprecated Use {@link #validate()} instead.
+   * @return
+   */
+  @Deprecated
+  public boolean isValid() {
+    try {
+      validate();
+      return true;
+    }
+    catch (IllegalArgumentException e) {
       return false;
     }
-    
-    return true;
   }
 
 /**

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ca9a1d94/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNTrainer.java
index 7a1a74f..daa90a4 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNTrainer.java
@@ -115,42 +115,52 @@ public class QNTrainer extends AbstractEventTrainer {
     init(new TrainingParameters(trainParams),reportMap);
   }
 
-  public boolean isValid() {
-
-    if (!super.isValid()) {
-      return false;
-    }
+  @Override
+  public void validate() {
+    super.validate();
 
     String algorithmName = getAlgorithm();
     if (algorithmName != null && !(MAXENT_QN_VALUE.equals(algorithmName))) {
-      return false;
+      throw new IllegalArgumentException("algorithmName must be MAXENT_QN");
     }
 
     // Number of Hessian updates to remember
     if (m < 0) {
-      return false;
+      throw new IllegalArgumentException(
+          "Number of Hessian updates to remember must be >= 0");
     }
 
     // Maximum number of function evaluations
     if (maxFctEval < 0) {
-      return false;
+      throw new IllegalArgumentException(
+          "Maximum number of function evaluations must be >= 0");
     }
 
     // Number of threads must be >= 1
     if (threads < 1) {
-      return false;
+      throw new IllegalArgumentException("Number of threads must be >= 1");
     }
 
     // Regularization costs must be >= 0
     if (l1Cost < 0) {
-      return false;
+      throw new IllegalArgumentException("Regularization costs must be >= 0");
     }
 
     if (l2Cost < 0) {
-      return false;
+      throw new IllegalArgumentException("Regularization costs must be >= 0");
     }
+  }
 
-    return true;
+  @Deprecated
+  @Override
+  public boolean isValid() {
+    try {
+      validate();
+      return true;
+    }
+    catch (IllegalArgumentException e) {
+      return false;
+    }
   }
 
   public boolean isSortAndMerge() {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ca9a1d94/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesTrainer.java
index 629c222..69ef44e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesTrainer.java
@@ -102,10 +102,6 @@ public class NaiveBayesTrainer extends AbstractEventTrainer {
   }
 
   public AbstractModel doTrain(DataIndexer indexer) throws IOException {
-    if (!isValid()) {
-      throw new IllegalArgumentException("trainParams are not valid!");
-    }
-
     return this.trainModel(indexer);
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ca9a1d94/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java
index 129c576..b73eaca 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronTrainer.java
@@ -84,7 +84,21 @@ public class PerceptronTrainer extends AbstractEventTrainer {
   public PerceptronTrainer(TrainingParameters parameters) {
     super(parameters);
   }
-  
+
+  @Override
+  public void validate() {
+    super.validate();
+
+    String algorithmName = getAlgorithm();
+    if (algorithmName != null) {
+      if (!PERCEPTRON_VALUE.equals(algorithmName)) {
+        throw new IllegalArgumentException("algorithmName must be PERCEPTRON");
+      }
+    }
+  }
+
+  @Deprecated
+  @Override
   public boolean isValid() {
     if (!super.isValid()) {
       return false;
@@ -104,10 +118,6 @@ public class PerceptronTrainer extends AbstractEventTrainer {
   }
 
   public AbstractModel doTrain(DataIndexer indexer) throws IOException {
-    if (!isValid()) {
-      throw new IllegalArgumentException("trainParams are not valid!");
-    }
-
     int iterations = getIterations();
     int cutoff = getCutoff();
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/ca9a1d94/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java
index 5fc4bbe..a9ac516 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java
@@ -83,16 +83,28 @@ public class SimplePerceptronSequenceTrainer extends AbstractEventModelSequenceT
   public SimplePerceptronSequenceTrainer() {
   }
 
-  public boolean isValid() {
-
-    if (!super.isValid()) {
-      return false;
-    }
+  @Override
+  public void validate() {
+    super.validate();
 
     String algorithmName = getAlgorithm();
+    if (algorithmName != null) {
+      if (!PERCEPTRON_SEQUENCE_VALUE.equals(algorithmName)) {
+        throw new IllegalArgumentException("algorithmName must be PERCEPTRON_SEQUENCE");
+      }
+    }
+  }
 
-    return !(algorithmName != null
-        && !(PERCEPTRON_SEQUENCE_VALUE.equals(algorithmName)));
+  @Deprecated
+  @Override
+  public boolean isValid() {
+    try {
+      validate();
+      return true;
+    }
+    catch (IllegalArgumentException e) {
+      return false;
+    }
   }
 
   public AbstractModel doTrain(SequenceStream events) throws IOException {


[47/50] opennlp git commit: OPENNLP-1059: Adjust evalAllTypesWithPOSNameFinder for larger beam size

Posted by co...@apache.org.
OPENNLP-1059: Adjust evalAllTypesWithPOSNameFinder for larger beam size

The 1.5.x pos models are loaded with a larger beam size to work with the parser


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/1713b449
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/1713b449
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/1713b449

Branch: refs/heads/LangDetect
Commit: 1713b4497a8a2aee1c1272e3a95703914f96b716
Parents: a00624c
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed May 17 14:45:34 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 17 14:45:34 2017 +0200

----------------------------------------------------------------------
 .../tools/eval/OntoNotes4NameFinderEval.java      | 18 +++---------------
 1 file changed, 3 insertions(+), 15 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/1713b449/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index d9f5ecd..a001ce9 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -35,7 +35,6 @@ import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
-import opennlp.tools.cmdline.TerminateToolException;
 import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool;
 import opennlp.tools.formats.DirectorySampleStream;
 import opennlp.tools.formats.convert.FileToStringSampleStream;
@@ -123,7 +122,7 @@ public class OntoNotes4NameFinderEval {
   }
 
   @Test
-  public void evalAllTypesWithPOSNameFinder() throws IOException {
+  public void evalAllTypesWithPOSNameFinder() throws IOException, URISyntaxException {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
     params.put("Threads", "4");
 
@@ -137,9 +136,6 @@ public class OntoNotes4NameFinderEval {
         bytes.write(buf, 0, len);
       }
     }
-    catch (IOException e) {
-      throw new IllegalStateException("Failed reading from ner-default-features.xml file on classpath!");
-    }
 
     byte[] featureGen = bytes.toByteArray();
 
@@ -149,16 +145,8 @@ public class OntoNotes4NameFinderEval {
         new File(resourcesPath.toFile(), "en-pos-perceptron.bin").toPath(),
         StandardCopyOption.REPLACE_EXISTING);
 
-    Map<String, Object> resources;
-
-    try {
-      resources = TokenNameFinderTrainerTool.loadResources(resourcesPath.toFile(),
+    Map<String, Object> resources = TokenNameFinderTrainerTool.loadResources(resourcesPath.toFile(),
           Paths.get(this.getClass().getResource("ner-en_pos-features.xml").toURI()).toFile());
-    }
-    catch (IOException | URISyntaxException e) {
-      throw new TerminateToolException(-1,"IO error while loading resources", e);
-    }
-
 
     try (ObjectStream<NameSample> samples = createNameSampleStream()) {
 
@@ -171,7 +159,7 @@ public class OntoNotes4NameFinderEval {
 
       cv.evaluate(filteredSamples, 5);
 
-      Assert.assertEquals(0.8044097625338349d, cv.getFMeasure().getFMeasure(), 0.001d);
+      Assert.assertEquals(0.8070226153653437d, cv.getFMeasure().getFMeasure(), 0.001d);
     }
   }
 }


[19/50] opennlp git commit: OPENNLP-1046: Correctly join tokens to text string

Posted by co...@apache.org.
OPENNLP-1046: Correctly join tokens to text string

The text was one space too long, which results in a different
parse tree if the method is used to reproduce an existing
parse tree, as is done by the parser evaluation tool.


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/bbbb4313
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/bbbb4313
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/bbbb4313

Branch: refs/heads/LangDetect
Commit: bbbb4313846efaba9546e86052940f0a79b8948f
Parents: 6059525
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed Apr 26 10:46:48 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed Apr 26 15:35:54 2017 +0200

----------------------------------------------------------------------
 .../main/java/opennlp/tools/cmdline/parser/ParserTool.java    | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/bbbb4313/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
index 499fa58..d8d3902 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/parser/ParserTool.java
@@ -72,11 +72,8 @@ public final class ParserTool extends BasicCmdLineTool {
 
     // tokenize
     List<String> tokens = Arrays.asList( tokenizer.tokenize(line));
-    StringBuilder sb = new StringBuilder();
-    for (String tok : tokens) {
-      sb.append(tok).append(" ");
-    }
-    String text = sb.substring(0, sb.length());
+    String text = String.join(" ", tokens);
+
     Parse p = new Parse(text, new Span(0, text.length()), AbstractBottomUpParser.INC_NODE, 0, 0);
     int start = 0;
     int i = 0;


[03/50] opennlp git commit: closes apache/opennlp#168 *Merged*

Posted by co...@apache.org.
closes apache/opennlp#168 *Merged*


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/e220a729
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/e220a729
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/e220a729

Branch: refs/heads/LangDetect
Commit: e220a72940e87defc363542fbc36c3c719bbc490
Parents: e2cf481
Author: smarthi <sm...@apache.org>
Authored: Tue Apr 18 20:41:27 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Tue Apr 18 20:41:27 2017 -0400

----------------------------------------------------------------------

----------------------------------------------------------------------



[05/50] opennlp git commit: OPENNLP-1031: Use getIntParameter when getting beam size. This closes apache/opennlp#169

Posted by co...@apache.org.
OPENNLP-1031: Use getIntParameter when getting beam size. This closes apache/opennlp#169


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/45ea3f77
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/45ea3f77
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/45ea3f77

Branch: refs/heads/LangDetect
Commit: 45ea3f77397ee833bf6370769fce38f339f7ca2f
Parents: 580e0d1
Author: koji <ko...@apache.org>
Authored: Wed Apr 19 17:54:58 2017 +0900
Committer: koji <ko...@apache.org>
Committed: Wed Apr 19 17:54:58 2017 +0900

----------------------------------------------------------------------
 .../src/main/java/opennlp/tools/chunker/ChunkerME.java       | 7 +------
 .../src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java | 8 ++------
 .../src/main/java/opennlp/tools/namefind/NameFinderME.java   | 8 ++------
 .../src/main/java/opennlp/tools/postag/POSTaggerME.java      | 7 +------
 4 files changed, 6 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/45ea3f77/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
index 71917fb..8be0cca 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
@@ -162,12 +162,7 @@ public class ChunkerME implements Chunker {
   public static ChunkerModel train(String lang, ObjectStream<ChunkSample> in,
       TrainingParameters mlParams, ChunkerFactory factory) throws IOException {
 
-    String beamSizeString = mlParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
-
-    int beamSize = ChunkerME.DEFAULT_BEAM_SIZE;
-    if (beamSizeString != null) {
-      beamSize = Integer.parseInt(beamSizeString);
-    }
+    int beamSize = mlParams.getIntParameter(BeamSearch.BEAM_SIZE_PARAMETER, ChunkerME.DEFAULT_BEAM_SIZE);
 
     Map<String, String> manifestInfoEntries = new HashMap<>();
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/45ea3f77/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
index 2b8122f..83db79a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerME.java
@@ -202,12 +202,8 @@ public class LemmatizerME implements Lemmatizer {
       ObjectStream<LemmaSample> samples, TrainingParameters trainParams,
       LemmatizerFactory posFactory) throws IOException {
 
-    String beamSizeString = trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
-
-    int beamSize = LemmatizerME.DEFAULT_BEAM_SIZE;
-    if (beamSizeString != null) {
-      beamSize = Integer.parseInt(beamSizeString);
-    }
+    int beamSize = trainParams.getIntParameter(BeamSearch.BEAM_SIZE_PARAMETER,
+            LemmatizerME.DEFAULT_BEAM_SIZE);
 
     LemmatizerContextGenerator contextGenerator = posFactory.getContextGenerator();
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/45ea3f77/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
index 1d52473..4cd8ebc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
@@ -225,12 +225,8 @@ public class NameFinderME implements TokenNameFinder {
     trainParams.putIfAbsent(TrainingParameters.CUTOFF_PARAM, "0");
     trainParams.putIfAbsent(TrainingParameters.ITERATIONS_PARAM, "300");
 
-    String beamSizeString = trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
-
-    int beamSize = NameFinderME.DEFAULT_BEAM_SIZE;
-    if (beamSizeString != null) {
-      beamSize = Integer.parseInt(beamSizeString);
-    }
+    int beamSize = trainParams.getIntParameter(BeamSearch.BEAM_SIZE_PARAMETER,
+            NameFinderME.DEFAULT_BEAM_SIZE);
 
     Map<String, String> manifestInfoEntries = new HashMap<>();
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/45ea3f77/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
index 5415ba7..1edcf4b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSTaggerME.java
@@ -225,12 +225,7 @@ public class POSTaggerME implements POSTagger {
       ObjectStream<POSSample> samples, TrainingParameters trainParams,
       POSTaggerFactory posFactory) throws IOException {
 
-    String beamSizeString = trainParams.getSettings().get(BeamSearch.BEAM_SIZE_PARAMETER);
-
-    int beamSize = POSTaggerME.DEFAULT_BEAM_SIZE;
-    if (beamSizeString != null) {
-      beamSize = Integer.parseInt(beamSizeString);
-    }
+    int beamSize = trainParams.getIntParameter(BeamSearch.BEAM_SIZE_PARAMETER, POSTaggerME.DEFAULT_BEAM_SIZE);
 
     POSContextGenerator contextGenerator = posFactory.getPOSContextGenerator();
 


[27/50] opennlp git commit: closes apache/opennlp#193

Posted by co...@apache.org.
closes apache/opennlp#193


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6d2fa048
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6d2fa048
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6d2fa048

Branch: refs/heads/LangDetect
Commit: 6d2fa048136cb50aaf270ea5f5a9c47632777fa2
Parents: 7d5bd01
Author: Peter Thygesen <th...@apache.org>
Authored: Fri May 5 20:02:59 2017 +0200
Committer: Peter Thygesen <th...@apache.org>
Committed: Fri May 5 20:02:59 2017 +0200

----------------------------------------------------------------------

----------------------------------------------------------------------



[34/50] opennlp git commit: [maven-release-plugin] prepare for next development iteration

Posted by co...@apache.org.
[maven-release-plugin] prepare for next development iteration


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/b17f7356
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/b17f7356
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/b17f7356

Branch: refs/heads/LangDetect
Commit: b17f7356c53c136587a04857fdecc041d5a04e59
Parents: 286e45b
Author: Jörn Kottmann <jo...@apache.org>
Authored: Tue May 9 18:27:13 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Tue May 9 18:27:13 2017 +0200

----------------------------------------------------------------------
 opennlp-brat-annotator/pom.xml   | 2 +-
 opennlp-distr/pom.xml            | 2 +-
 opennlp-docs/pom.xml             | 2 +-
 opennlp-morfologik-addon/pom.xml | 2 +-
 opennlp-tools/pom.xml            | 2 +-
 opennlp-uima/pom.xml             | 2 +-
 pom.xml                          | 4 ++--
 7 files changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/b17f7356/opennlp-brat-annotator/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 008fd65..0791e6b 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -17,7 +17,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0</version>
+		<version>1.8.1-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b17f7356/opennlp-distr/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-distr/pom.xml b/opennlp-distr/pom.xml
index 317c37f..4428240 100644
--- a/opennlp-distr/pom.xml
+++ b/opennlp-distr/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0</version>
+		<version>1.8.1-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b17f7356/opennlp-docs/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/pom.xml b/opennlp-docs/pom.xml
index 6b407b8..312f6b8 100644
--- a/opennlp-docs/pom.xml
+++ b/opennlp-docs/pom.xml
@@ -24,7 +24,7 @@
   <parent>
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0</version>
+	<version>1.8.1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
   

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b17f7356/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index bfae09f..8c5b9f4 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0</version>
+		<version>1.8.1-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b17f7356/opennlp-tools/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index 573861b..a2cf596 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -25,7 +25,7 @@
   <parent>
     <groupId>org.apache.opennlp</groupId>
     <artifactId>opennlp</artifactId>
-    <version>1.8.0</version>
+    <version>1.8.1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b17f7356/opennlp-uima/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-uima/pom.xml b/opennlp-uima/pom.xml
index 1db9c38..d8f5246 100644
--- a/opennlp-uima/pom.xml
+++ b/opennlp-uima/pom.xml
@@ -25,7 +25,7 @@
 	<parent>
 	    <groupId>org.apache.opennlp</groupId>
 	    <artifactId>opennlp</artifactId>
-	    <version>1.8.0</version>
+	    <version>1.8.1-SNAPSHOT</version>
 	    <relativePath>../pom.xml</relativePath>
     </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/b17f7356/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 29a0699..03811a8 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
 
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0</version>
+	<version>1.8.1-SNAPSHOT</version>
 	<packaging>pom</packaging>
 
 	<name>Apache OpenNLP Reactor</name>
@@ -40,7 +40,7 @@
 		<connection>scm:git:git@github.com:apache/opennlp.git</connection>
 		<developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/opennlp.git</developerConnection>
 		<url>https://git-wip-us.apache.org/repos/asf?p=opennlp.git</url>
-		<tag>opennlp-1.8.0</tag>
+		<tag>HEAD</tag>
 	</scm>
 
 	<mailingLists>


[28/50] opennlp git commit: OPENNLP-1001: Changes signature of Chunker Sequence Validator

Posted by co...@apache.org.
OPENNLP-1001: Changes signature of Chunker Sequence Validator

Chunker SequenceValidator should have access to both token and POS tag

Closes #137


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/5f96aa32
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/5f96aa32
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/5f96aa32

Branch: refs/heads/LangDetect
Commit: 5f96aa323dce95467365b41077b609852ddb08aa
Parents: 6d2fa04
Author: William D C M SILVA <co...@apache.org>
Authored: Thu Feb 23 15:04:37 2017 -0300
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Sun May 7 21:05:48 2017 +0200

----------------------------------------------------------------------
 .../tools/chunker/ChunkerContextGenerator.java  |   3 +-
 .../opennlp/tools/chunker/ChunkerFactory.java   |   3 +-
 .../java/opennlp/tools/chunker/ChunkerME.java   |  19 ++--
 .../opennlp/tools/chunker/ChunkerModel.java     |  14 ++-
 .../chunker/DefaultChunkerContextGenerator.java |  14 ++-
 .../DefaultChunkerSequenceValidator.java        |   5 +-
 .../tools/parser/ChunkContextGenerator.java     |  12 +++
 .../tools/parser/ParserChunkerFactory.java      |   3 +-
 .../parser/ParserChunkerSequenceValidator.java  |   9 +-
 .../main/java/opennlp/tools/util/TokenTag.java  |  99 +++++++++++++++++++
 .../opennlp/tools/chunker/ChunkerMETest.java    |   2 +-
 .../opennlp/tools/chunker/ChunkerModelTest.java |  58 +++++++++++
 .../tools/chunker/DummyChunkerFactory.java      |   5 +-
 .../opennlp/tools/chunker/chunker170custom.bin  | Bin 0 -> 21675 bytes
 .../opennlp/tools/chunker/chunker170default.bin | Bin 0 -> 21671 bytes
 .../opennlp/tools/chunker/chunker180custom.bin  | Bin 0 -> 21675 bytes
 16 files changed, 227 insertions(+), 19 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerContextGenerator.java
index 590bc85..b666ad3 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerContextGenerator.java
@@ -18,11 +18,12 @@
 package opennlp.tools.chunker;
 
 import opennlp.tools.util.BeamSearchContextGenerator;
+import opennlp.tools.util.TokenTag;
 
 /**
  * Interface for the context generator used in syntactic chunking.
  */
-public interface ChunkerContextGenerator extends BeamSearchContextGenerator<String> {
+public interface ChunkerContextGenerator extends BeamSearchContextGenerator<TokenTag> {
 
   /**
    * Returns the contexts for chunking of the specified index.

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerFactory.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerFactory.java
index 1cb772f..961a738 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerFactory.java
@@ -20,6 +20,7 @@ package opennlp.tools.chunker;
 import opennlp.tools.util.BaseToolFactory;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.TokenTag;
 import opennlp.tools.util.ext.ExtensionLoader;
 
 public class ChunkerFactory extends BaseToolFactory {
@@ -53,7 +54,7 @@ public class ChunkerFactory extends BaseToolFactory {
     // no additional artifacts
   }
 
-  public SequenceValidator<String> getSequenceValidator() {
+  public SequenceValidator<TokenTag> getSequenceValidator() {
     return new DefaultChunkerSequenceValidator();
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
index 8be0cca..4346df3 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerME.java
@@ -34,6 +34,7 @@ import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.SequenceValidator;
 import opennlp.tools.util.Span;
+import opennlp.tools.util.TokenTag;
 import opennlp.tools.util.TrainingParameters;
 
 /**
@@ -49,10 +50,10 @@ public class ChunkerME implements Chunker {
   /**
    * The model used to assign chunk tags to a sequence of tokens.
    */
-  protected SequenceClassificationModel<String> model;
+  protected SequenceClassificationModel<TokenTag> model;
 
   private ChunkerContextGenerator contextGenerator;
-  private SequenceValidator<String> sequenceValidator;
+  private SequenceValidator<TokenTag> sequenceValidator;
 
   /**
    * Initializes the current instance with the specified model and
@@ -67,7 +68,7 @@ public class ChunkerME implements Chunker {
    *     to configure the {@link SequenceValidator} and {@link ChunkerContextGenerator}.
    */
   @Deprecated
-  private ChunkerME(ChunkerModel model, int beamSize, SequenceValidator<String> sequenceValidator,
+  private ChunkerME(ChunkerModel model, int beamSize, SequenceValidator<TokenTag> sequenceValidator,
       ChunkerContextGenerator contextGenerator) {
 
     this.sequenceValidator = sequenceValidator;
@@ -117,7 +118,8 @@ public class ChunkerME implements Chunker {
   }
 
   public String[] chunk(String[] toks, String[] tags) {
-    bestSequence = model.bestSequence(toks, new Object[] {tags}, contextGenerator, sequenceValidator);
+    TokenTag[] tuples = TokenTag.create(toks, tags);
+    bestSequence = model.bestSequence(tuples, new Object[] {}, contextGenerator, sequenceValidator);
     List<String> c = bestSequence.getOutcomes();
     return c.toArray(new String[c.size()]);
   }
@@ -128,12 +130,15 @@ public class ChunkerME implements Chunker {
   }
 
   public Sequence[] topKSequences(String[] sentence, String[] tags) {
-    return model.bestSequences(DEFAULT_BEAM_SIZE, sentence,
-        new Object[] { tags }, contextGenerator, sequenceValidator);
+    TokenTag[] tuples = TokenTag.create(sentence, tags);
+
+    return model.bestSequences(DEFAULT_BEAM_SIZE, tuples,
+        new Object[] { }, contextGenerator, sequenceValidator);
   }
 
   public Sequence[] topKSequences(String[] sentence, String[] tags, double minSequenceScore) {
-    return model.bestSequences(DEFAULT_BEAM_SIZE, sentence, new Object[] { tags }, minSequenceScore,
+    TokenTag[] tuples = TokenTag.create(sentence, tags);
+    return model.bestSequences(DEFAULT_BEAM_SIZE, tuples, new Object[] { }, minSequenceScore,
         contextGenerator, sequenceValidator);
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java
index ed13f65..12c8bbe 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java
@@ -31,6 +31,7 @@ import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.ml.model.SequenceClassificationModel;
 import opennlp.tools.util.BaseToolFactory;
 import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.TokenTag;
 import opennlp.tools.util.model.BaseModel;
 
 /**
@@ -90,6 +91,17 @@ public class ChunkerModel extends BaseModel {
     if (!(artifactMap.get(CHUNKER_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
       throw new InvalidFormatException("Chunker model is incomplete!");
     }
+
+    // Since 1.8.0 we changed the ChunkerFactory signature. This will check the if the model
+    // declares a not default factory, and if yes, check if it was created before 1.8
+    if ( (getManifestProperty(FACTORY_NAME) != null
+            && !getManifestProperty(FACTORY_NAME).equals("opennlp.tools.chunker.ChunkerFactory") )
+        && this.getVersion().getMajor() <= 1
+        && this.getVersion().getMinor() < 8) {
+      throw new InvalidFormatException("The Chunker factory '" + getManifestProperty(FACTORY_NAME) +
+      "' is no longer compatible. Please update it to match the latest ChunkerFactory.");
+    }
+
   }
 
   /**
@@ -105,7 +117,7 @@ public class ChunkerModel extends BaseModel {
     }
   }
 
-  public SequenceClassificationModel<String> getChunkerSequenceModel() {
+  public SequenceClassificationModel<TokenTag> getChunkerSequenceModel() {
 
     Properties manifest = (Properties) artifactMap.get(MANIFEST_ENTRY);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerContextGenerator.java
index 76616d4..b140c3f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerContextGenerator.java
@@ -18,6 +18,8 @@
 
 package opennlp.tools.chunker;
 
+import opennlp.tools.util.TokenTag;
+
 /** Features based on chunking model described in Fei Sha and Fernando Pereira. Shallow
  *  parsing with conditional random fields. In Proceedings of HLT-NAACL 2003. Association
  *  for Computational Linguistics, 2003.
@@ -30,9 +32,9 @@ public class DefaultChunkerContextGenerator implements ChunkerContextGenerator {
   public DefaultChunkerContextGenerator() {
   }
 
-  public String[] getContext(int index, String[] sequence,
+  public String[] getContext(int index, String[] tokens, String[] postags,
       String[] priorDecisions, Object[] additionalContext) {
-    return getContext(index,sequence,(String[]) additionalContext[0],priorDecisions);
+    return getContext(index, tokens, postags, priorDecisions);
   }
 
   public String[] getContext(int i, String[] toks, String[] tags, String[] preds) {
@@ -143,4 +145,12 @@ public class DefaultChunkerContextGenerator implements ChunkerContextGenerator {
 
     return features;
   }
+
+  @Override
+  public String[] getContext(int index, TokenTag[] sequence, String[] priorDecisions,
+                             Object[] additionalContext) {
+    String[] token = TokenTag.extractTokens(sequence);
+    String[] tags = TokenTag.extractTags(sequence);
+    return getContext(index, token, tags, priorDecisions, additionalContext);
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerSequenceValidator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerSequenceValidator.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerSequenceValidator.java
index ce395eb..e1e09fa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerSequenceValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/DefaultChunkerSequenceValidator.java
@@ -18,8 +18,9 @@
 package opennlp.tools.chunker;
 
 import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.TokenTag;
 
-public class DefaultChunkerSequenceValidator implements SequenceValidator<String> {
+public class DefaultChunkerSequenceValidator implements SequenceValidator<TokenTag> {
 
   private boolean validOutcome(String outcome, String prevOutcome) {
     if (outcome.startsWith("I-")) {
@@ -46,7 +47,7 @@ public class DefaultChunkerSequenceValidator implements SequenceValidator<String
     return validOutcome(outcome,prevOutcome);
   }
 
-  public boolean validSequence(int i, String[] sequence, String[] s, String outcome) {
+  public boolean validSequence(int i, TokenTag[] sequence, String[] s, String outcome) {
     return validOutcome(outcome, s);
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/parser/ChunkContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/ChunkContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/parser/ChunkContextGenerator.java
index 7d37fcb..e32cead 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/ChunkContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/ChunkContextGenerator.java
@@ -22,6 +22,7 @@ import java.util.List;
 
 import opennlp.tools.chunker.ChunkerContextGenerator;
 import opennlp.tools.util.Cache;
+import opennlp.tools.util.TokenTag;
 
 /**
  * Creates predictive context for the pre-chunking phases of parsing.
@@ -44,11 +45,13 @@ public class ChunkContextGenerator implements ChunkerContextGenerator {
     }
   }
 
+  @Deprecated
   public String[] getContext(Object o) {
     Object[] data = (Object[]) o;
     return getContext((Integer) data[0], (String[]) data[1], (String[]) data[2], (String[]) data[3]);
   }
 
+  @Deprecated
   public String[] getContext(int i, String[] words, String[] prevDecisions, Object[] ac) {
     return getContext(i,words,(String[]) ac[0],prevDecisions);
   }
@@ -184,4 +187,13 @@ public class ChunkContextGenerator implements ChunkerContextGenerator {
     }
     return feat.toString();
   }
+
+  @Override
+  public String[] getContext(int index, TokenTag[] sequence, String[] priorDecisions,
+                             Object[] additionalContext) {
+    String[] token = TokenTag.extractTokens(sequence);
+    String[] tags = TokenTag.extractTags(sequence);
+
+    return getContext(index, token, tags, priorDecisions);
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerFactory.java b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerFactory.java
index 7d3c8f7..ca792c9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerFactory.java
@@ -22,6 +22,7 @@ import opennlp.tools.chunker.ChunkerFactory;
 import opennlp.tools.chunker.ChunkerME;
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.TokenTag;
 
 public class ParserChunkerFactory extends ChunkerFactory {
 
@@ -31,7 +32,7 @@ public class ParserChunkerFactory extends ChunkerFactory {
   }
 
   @Override
-  public SequenceValidator<String> getSequenceValidator() {
+  public SequenceValidator<TokenTag> getSequenceValidator() {
 
     MaxentModel model = artifactProvider.getArtifact("chunker.model");
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerSequenceValidator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerSequenceValidator.java b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerSequenceValidator.java
index ef15bf5..3536841 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerSequenceValidator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserChunkerSequenceValidator.java
@@ -22,8 +22,9 @@ import java.util.Map;
 
 import opennlp.tools.parser.chunking.Parser;
 import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.TokenTag;
 
-public class ParserChunkerSequenceValidator implements SequenceValidator<String> {
+public class ParserChunkerSequenceValidator implements SequenceValidator<TokenTag> {
 
   private Map<String, String> continueStartMap;
 
@@ -66,4 +67,10 @@ public class ParserChunkerSequenceValidator implements SequenceValidator<String>
     }
     return true;
   }
+
+  @Override
+  public boolean validSequence(int i, TokenTag[] inputTuples, String[] outcomesSequence, String outcome) {
+    String[] inputSequence = TokenTag.extractTokens(inputTuples);
+    return validSequence(i, inputSequence, outcomesSequence, outcome);
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/main/java/opennlp/tools/util/TokenTag.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/TokenTag.java b/opennlp-tools/src/main/java/opennlp/tools/util/TokenTag.java
new file mode 100644
index 0000000..2a4377b
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/TokenTag.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util;
+
+import java.util.Arrays;
+import java.util.Objects;
+
+public class TokenTag {
+
+  private final String token;
+  private final String tag;
+  private final String[] addtionalData;
+
+  public TokenTag(String token, String tag, String[] addtionalData) {
+    this.token = token;
+    this.tag = tag;
+    if (addtionalData != null) {
+      this.addtionalData = Arrays.copyOf(addtionalData, addtionalData.length);
+    } else {
+      this.addtionalData = null;
+    }
+  }
+
+  public String getToken() {
+    return token;
+  }
+
+  public String getTag() {
+    return tag;
+  }
+
+  public String[] getAddtionalData() {
+    return addtionalData;
+  }
+
+  public static String[] extractTokens(TokenTag[] tuples) {
+    String[] tokens = new String[tuples.length];
+    for (int i = 0; i < tuples.length; i++) {
+      tokens[i] = tuples[i].getToken();
+    }
+
+    return tokens;
+  }
+
+  public static String[] extractTags(TokenTag[] tuples) {
+    String[] tags = new String[tuples.length];
+    for (int i = 0; i < tuples.length; i++) {
+      tags[i] = tuples[i].getTag();
+    }
+
+    return tags;
+  }
+
+  public static TokenTag[] create(String[] toks, String[] tags) {
+    TokenTag[] tuples = new TokenTag[toks.length];
+    for (int i = 0; i < toks.length; i++) {
+      tuples[i] = new TokenTag(toks[i], tags[i], null);
+    }
+    return tuples;
+  }
+
+  @Override
+  public boolean equals(Object o) {
+    if (this == o) {
+      return true;
+    } else if (o instanceof TokenTag) {
+      return Objects.equals(this.token, ((TokenTag) o).token)
+          && Objects.equals(this.tag, ((TokenTag) o).tag)
+          && Arrays.equals(this.addtionalData, ((TokenTag) o).addtionalData);
+    }
+    return false;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(token, tag, Arrays.hashCode(addtionalData));
+  }
+
+  @Override
+  public String toString() {
+    return token + "_" + tag;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
index 4922ce9..3c04894 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
@@ -124,7 +124,7 @@ public class ChunkerMETest {
   public void testTokenProbMinScore() throws Exception {
     Sequence[] preds = chunker.topKSequences(toks1, tags1, -5.55);
 
-    Assert.assertTrue(preds.length == 4);
+    Assert.assertEquals(4, preds.length);
     Assert.assertEquals(expect1.length, preds[0].getProbs().length);
     Assert.assertEquals(Arrays.asList(expect1), preds[0].getOutcomes());
     Assert.assertNotSame(Arrays.asList(expect1), preds[1].getOutcomes());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerModelTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerModelTest.java
new file mode 100644
index 0000000..85afc53
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerModelTest.java
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.chunker;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * This is the test class for {@link ChunkerModel}.
+ */
+public class ChunkerModelTest {
+
+  @Test
+  public void testInvalidFactorySignature() throws Exception {
+
+    ChunkerModel model = null;
+    try {
+      model = new ChunkerModel(this.getClass().getResourceAsStream("chunker170custom.bin"));
+    } catch (IllegalArgumentException e) {
+      Assert.assertTrue("Exception must state ChunkerFactory",
+          e.getMessage().contains("ChunkerFactory"));
+      Assert.assertTrue("Exception must mention DummyChunkerFactory",
+          e.getMessage().contains("opennlp.tools.chunker.DummyChunkerFactory"));
+    }
+    Assert.assertNull(model);
+  }
+
+  @Test
+  public void test170DefaultFactory() throws Exception {
+
+    Assert.assertNotNull(
+        new ChunkerModel(this.getClass().getResourceAsStream("chunker170default.bin")));
+
+  }
+
+  @Test
+  public void test180CustomFactory() throws Exception {
+
+    Assert.assertNotNull(
+        new ChunkerModel(this.getClass().getResourceAsStream("chunker180custom.bin")));
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/test/java/opennlp/tools/chunker/DummyChunkerFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/DummyChunkerFactory.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/DummyChunkerFactory.java
index 0ae8b6d..9b4eae7 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/chunker/DummyChunkerFactory.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/DummyChunkerFactory.java
@@ -18,6 +18,7 @@
 package opennlp.tools.chunker;
 
 import opennlp.tools.util.SequenceValidator;
+import opennlp.tools.util.TokenTag;
 
 public class DummyChunkerFactory extends ChunkerFactory {
 
@@ -30,7 +31,7 @@ public class DummyChunkerFactory extends ChunkerFactory {
   }
 
   @Override
-  public SequenceValidator<String> getSequenceValidator() {
+  public SequenceValidator<TokenTag> getSequenceValidator() {
     return new DummySequenceValidator();
   }
 
@@ -46,7 +47,7 @@ public class DummyChunkerFactory extends ChunkerFactory {
   static class DummySequenceValidator extends DefaultChunkerSequenceValidator {
 
     @Override
-    public boolean validSequence(int i, String[] sequence, String[] s,
+    public boolean validSequence(int i, TokenTag[] sequence, String[] s,
         String outcome) {
       return super.validSequence(i, sequence, s, outcome);
     }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker170custom.bin
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker170custom.bin b/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker170custom.bin
new file mode 100644
index 0000000..f971261
Binary files /dev/null and b/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker170custom.bin differ

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker170default.bin
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker170default.bin b/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker170default.bin
new file mode 100644
index 0000000..47b1daf
Binary files /dev/null and b/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker170default.bin differ

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5f96aa32/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker180custom.bin
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker180custom.bin b/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker180custom.bin
new file mode 100644
index 0000000..9e96e95
Binary files /dev/null and b/opennlp-tools/src/test/resources/opennlp/tools/chunker/chunker180custom.bin differ


[11/50] opennlp git commit: OPENNLP-1036: Use Object values in TrainingParameters instead of String. This closes apache/opennlp#176

Posted by co...@apache.org.
OPENNLP-1036: Use Object values in TrainingParameters instead of String. This closes apache/opennlp#176


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/99cbf0da
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/99cbf0da
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/99cbf0da

Branch: refs/heads/LangDetect
Commit: 99cbf0da308d532eb96094855e74e718b6ae497a
Parents: f74a86f
Author: koji <ko...@apache.org>
Authored: Sat Apr 22 00:40:44 2017 +0900
Committer: koji <ko...@apache.org>
Committed: Sat Apr 22 00:40:44 2017 +0900

----------------------------------------------------------------------
 .../opennlp/tools/ml/AbstractEventTrainer.java  |   4 +-
 .../opennlp/tools/ml/maxent/GISTrainer.java     |   4 +-
 .../SimplePerceptronSequenceTrainer.java        |   4 +-
 .../opennlp/tools/namefind/NameFinderME.java    |   4 +-
 .../tools/parser/AbstractBottomUpParser.java    |  12 +-
 .../opennlp/tools/parser/chunking/Parser.java   |   5 +-
 .../opennlp/tools/parser/treeinsert/Parser.java |  20 +-
 .../opennlp/tools/util/TrainingParameters.java  | 345 ++++++++++++++++---
 .../opennlp/tools/util/model/ModelUtil.java     |   4 +-
 .../opennlp/tools/chunker/ChunkerMETest.java    |   8 +-
 .../tools/cmdline/TokenNameFinderToolTest.java  |   4 +-
 .../tools/doccat/DocumentCategorizerMETest.java |   8 +-
 .../tools/doccat/DocumentCategorizerNBTest.java |   4 +-
 .../opennlp/tools/eval/ArvoresDeitadasEval.java |   6 +-
 .../opennlp/tools/eval/Conll00ChunkerEval.java  |   2 +-
 .../test/java/opennlp/tools/eval/EvalUtil.java  |   6 +-
 .../tools/eval/OntoNotes4ParserEval.java        |   8 +-
 .../tools/lemmatizer/LemmatizerMETest.java      |   8 +-
 .../opennlp/tools/ml/TrainerFactoryTest.java    |   4 +-
 .../tools/ml/maxent/GISIndexingTest.java        |  32 +-
 .../tools/ml/maxent/MaxentPrepAttachTest.java   |   6 +-
 .../tools/ml/maxent/RealValueModelTest.java     |   2 +-
 .../tools/ml/maxent/ScaleDoesntMatterTest.java  |   2 +-
 .../maxent/io/RealValueFileEventStreamTest.java |   2 +-
 .../quasinewton/NegLogLikelihoodTest.java       |   2 +-
 .../ml/maxent/quasinewton/QNPrepAttachTest.java |  24 +-
 .../ml/maxent/quasinewton/QNTrainerTest.java    |   2 +-
 .../naivebayes/NaiveBayesCorrectnessTest.java   |   4 +-
 .../NaiveBayesModelReadWriteTest.java           |   4 +-
 .../ml/naivebayes/NaiveBayesPrepAttachTest.java |   8 +-
 .../NaiveBayesSerializedCorrectnessTest.java    |   4 +-
 .../ml/perceptron/PerceptronPrepAttachTest.java |  34 +-
 .../tools/namefind/NameFinderMETest.java        |  28 +-
 .../TokenNameFinderCrossValidatorTest.java      |  12 +-
 .../opennlp/tools/postag/POSTaggerMETest.java   |   8 +-
 .../sentdetect/SentenceDetectorMETest.java      |   8 +-
 .../opennlp/tools/tokenize/TokenizerMETest.java |   4 +-
 .../tools/tokenize/TokenizerTestUtil.java       |   8 +-
 .../tools/util/TrainingParametersTest.java      |   8 +-
 39 files changed, 442 insertions(+), 220 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java
index bb11aaa..330307a 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/AbstractEventTrainer.java
@@ -52,10 +52,10 @@ public abstract class AbstractEventTrainer extends AbstractTrainer implements Ev
 
   public DataIndexer getDataIndexer(ObjectStream<Event> events) throws IOException {
 
-    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, Boolean.toString(isSortAndMerge()));
+    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, isSortAndMerge());
     // If the cutoff was set, don't overwrite the value.
     if (trainingParameters.getIntParameter(CUTOFF_PARAM, -1) == -1) {
-      trainingParameters.put(CUTOFF_PARAM, "5");
+      trainingParameters.put(CUTOFF_PARAM, 5);
     }
     
     DataIndexer indexer = DataIndexerFactory.getDataIndexer(trainingParameters, reportMap);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISTrainer.java
index 61ddb47..33a6c16 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISTrainer.java
@@ -246,8 +246,8 @@ public class GISTrainer extends AbstractEventTrainer {
                              int cutoff) throws IOException {
     DataIndexer indexer = new OnePassDataIndexer();
     TrainingParameters indexingParameters = new TrainingParameters();
-    indexingParameters.put(GISTrainer.CUTOFF_PARAM, Integer.toString(cutoff));
-    indexingParameters.put(GISTrainer.ITERATIONS_PARAM, Integer.toString(iterations));
+    indexingParameters.put(GISTrainer.CUTOFF_PARAM, cutoff);
+    indexingParameters.put(GISTrainer.ITERATIONS_PARAM, iterations);
     Map<String, String> reportMap = new HashMap<>();
     indexer.init(indexingParameters, reportMap);
     indexer.index(eventStream);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java
index 7a50055..5fc4bbe 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/SimplePerceptronSequenceTrainer.java
@@ -111,8 +111,8 @@ public class SimplePerceptronSequenceTrainer extends AbstractEventModelSequenceT
     this.iterations = iterations;
     this.sequenceStream = sequenceStream;
 
-    trainingParameters.put(AbstractDataIndexer.CUTOFF_PARAM, Integer.toString(cutoff));
-    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, Boolean.toString(false));
+    trainingParameters.put(AbstractDataIndexer.CUTOFF_PARAM, cutoff);
+    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, false);
     DataIndexer di = new OnePassDataIndexer();
     di.init(trainingParameters,reportMap);
     di.index(new SequenceStreamEventStream(sequenceStream));

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
index 4cd8ebc..12ce701 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/NameFinderME.java
@@ -222,8 +222,8 @@ public class NameFinderME implements TokenNameFinder {
           TokenNameFinderFactory factory) throws IOException {
 
     trainParams.putIfAbsent(TrainingParameters.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
-    trainParams.putIfAbsent(TrainingParameters.CUTOFF_PARAM, "0");
-    trainParams.putIfAbsent(TrainingParameters.ITERATIONS_PARAM, "300");
+    trainParams.putIfAbsent(TrainingParameters.CUTOFF_PARAM, 0);
+    trainParams.putIfAbsent(TrainingParameters.ITERATIONS_PARAM, 300);
 
     int beamSize = trainParams.getIntParameter(BeamSearch.BEAM_SIZE_PARAMETER,
             NameFinderME.DEFAULT_BEAM_SIZE);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java b/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
index a553328..7d7c1b0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
@@ -528,15 +528,7 @@ public abstract class AbstractBottomUpParser implements Parser {
   public static Dictionary buildDictionary(ObjectStream<Parse> data, HeadRules rules,
       TrainingParameters params) throws IOException {
 
-    int cutoff = 5;
-
-    String cutoffString = params.getSettings("dict").
-        get(TrainingParameters.CUTOFF_PARAM);
-
-    if (cutoffString != null) {
-      // TODO: Maybe throw illegal argument exception if not parse able
-      cutoff = Integer.parseInt(cutoffString);
-    }
+    int cutoff = params.getIntParameter("dict", TrainingParameters.CUTOFF_PARAM, 5);
 
     NGramModel mdict = new NGramModel();
     Parse p;
@@ -621,7 +613,7 @@ public abstract class AbstractBottomUpParser implements Parser {
       throws IOException {
 
     TrainingParameters params = new TrainingParameters();
-    params.put("dict", TrainingParameters.CUTOFF_PARAM, Integer.toString(cutoff));
+    params.put("dict", TrainingParameters.CUTOFF_PARAM, cutoff);
 
     return buildDictionary(data, rules, params);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java b/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
index 53a8cba..f2079e9 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/chunking/Parser.java
@@ -293,9 +293,8 @@ public class Parser extends AbstractBottomUpParser {
     // tag
     TrainingParameters posTaggerParams = mlParams.getParameters("tagger");
 
-    if (!posTaggerParams.getSettings().containsKey(BeamSearch.BEAM_SIZE_PARAMETER)) {
-      mlParams.put("tagger", BeamSearch.BEAM_SIZE_PARAMETER,
-          Integer.toString(10));
+    if (!posTaggerParams.getObjectSettings().containsKey(BeamSearch.BEAM_SIZE_PARAMETER)) {
+      mlParams.put("tagger", BeamSearch.BEAM_SIZE_PARAMETER, 10);
     }
 
     POSModel posModel = POSTaggerME.train(languageCode, new PosSampleStream(parseSamples),

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java b/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
index 527bdb6..f18fbf7 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/treeinsert/Parser.java
@@ -514,16 +514,16 @@ public class Parser extends AbstractBottomUpParser {
       throws IOException {
 
     TrainingParameters params = new TrainingParameters();
-    params.put("dict", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
-
-    params.put("tagger", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
-    params.put("tagger", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
-    params.put("chunker", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
-    params.put("chunker", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
-    params.put("check", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
-    params.put("check", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
-    params.put("build", TrainingParameters.CUTOFF_PARAM, Integer.toString(cut));
-    params.put("build", TrainingParameters.ITERATIONS_PARAM, Integer.toString(iterations));
+    params.put("dict", TrainingParameters.CUTOFF_PARAM, cut);
+
+    params.put("tagger", TrainingParameters.CUTOFF_PARAM, cut);
+    params.put("tagger", TrainingParameters.ITERATIONS_PARAM, iterations);
+    params.put("chunker", TrainingParameters.CUTOFF_PARAM, cut);
+    params.put("chunker", TrainingParameters.ITERATIONS_PARAM, iterations);
+    params.put("check", TrainingParameters.CUTOFF_PARAM, cut);
+    params.put("check", TrainingParameters.ITERATIONS_PARAM, iterations);
+    params.put("build", TrainingParameters.CUTOFF_PARAM, cut);
+    params.put("build", TrainingParameters.ITERATIONS_PARAM, iterations);
 
     return train(languageCode, parseSamples, rules, params);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java b/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
index 3f21623..08a1373 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/TrainingParameters.java
@@ -37,7 +37,7 @@ public class TrainingParameters {
   public static final String CUTOFF_PARAM = "Cutoff";
   public static final String THREADS_PARAM = "Threads";
 
-  private Map<String, String> parameters = new HashMap<>();
+  private Map<String, Object> parameters = new HashMap<>();
 
   public TrainingParameters() {
   }
@@ -45,18 +45,52 @@ public class TrainingParameters {
   public TrainingParameters(TrainingParameters trainingParameters) {
     this.parameters.putAll(trainingParameters.parameters);
   }
-  
+
+  /**
+   *
+   * @deprecated
+   */
   public TrainingParameters(Map<String,String> map) {
+    //parameters.putAll(map);
+    // try to respect their original type...
+    for (String key: map.keySet()) {
+      String value = map.get(key);
+      try {
+        int intValue = Integer.parseInt(value);
+        parameters.put(key, intValue);
+      }
+      catch (NumberFormatException ei) {
+        try {
+          double doubleValue = Double.parseDouble(value);
+          parameters.put(key, doubleValue);
+        }
+        catch (NumberFormatException ed) {
+          // Because Boolean.parseBoolean() doesn't throw NFE, it just checks the value is either
+          // true or yes. So let's see their letters here.
+          if (value.toLowerCase().equals("true") || value.toLowerCase().equals("false")) {
+            parameters.put(key, Boolean.parseBoolean(value));
+          }
+          else {
+            parameters.put(key, value);
+          }
+        }
+      }
+    }
+  }
+
+  /* TODO: Once we throw Map<String,String> away, have this constructor to be uncommented
+  public TrainingParameters(Map<String,Object> map) {
     parameters.putAll(map);
   }
-  
+  */
+
   public TrainingParameters(InputStream in) throws IOException {
 
     Properties properties = new Properties();
     properties.load(in);
 
     for (Map.Entry<Object, Object> entry : properties.entrySet()) {
-      parameters.put((String) entry.getKey(), (String) entry.getValue());
+      parameters.put((String) entry.getKey(), entry.getValue());
     }
   }
 
@@ -66,7 +100,7 @@ public class TrainingParameters {
    * @return the name or null if not set.
    */
   public String algorithm(String namespace) {
-    return parameters.get(namespace + "." + ALGORITHM_PARAM);
+    return (String)parameters.get(getKey(namespace, ALGORITHM_PARAM));
   }
 
   /**
@@ -75,7 +109,7 @@ public class TrainingParameters {
    * @return the name or null if not set.
    */
   public String algorithm() {
-    return parameters.get(ALGORITHM_PARAM);
+    return (String)parameters.get(ALGORITHM_PARAM);
   }
 
   /**
@@ -84,17 +118,74 @@ public class TrainingParameters {
    * @param namespace
    *
    * @return a parameter map which can be passed to the train and validate methods.
+   *
+   * @deprecated use {@link #getObjectSettings(String)} instead
    */
   public Map<String, String> getSettings(String namespace) {
 
     Map<String, String> trainingParams = new HashMap<>();
+    String prefix = namespace + ".";
 
-    for (Map.Entry<String, String> entry : parameters.entrySet()) {
+    for (Map.Entry<String, Object> entry : parameters.entrySet()) {
       String key = entry.getKey();
 
       if (namespace != null) {
-        String prefix = namespace + ".";
+        if (key.startsWith(prefix))  {
+          trainingParams.put(key.substring(prefix.length()), getStringValue(entry.getValue()));
+        }
+      }
+      else {
+        if (!key.contains(".")) {
+          trainingParams.put(key, getStringValue(entry.getValue()));
+        }
+      }
+    }
+
+    return Collections.unmodifiableMap(trainingParams);
+  }
+
+  private static String getStringValue(Object value) {
+    if (value instanceof Integer) {
+      return Integer.toString((Integer)value);
+    }
+    else if (value instanceof Double) {
+      return Double.toString((Double)value);
+    }
+    else if (value instanceof Boolean) {
+      return Boolean.toString((Boolean)value);
+    }
+    else {
+      return (String)value;
+    }
+  }
+
+  /**
+   * Retrieves all parameters without a name space.
+   *
+   * @return the settings map
+   *
+   * @deprecated use {@link #getObjectSettings()} instead
+   */
+  public Map<String, String> getSettings() {
+    return getSettings(null);
+  }
+
+  /**
+   * Retrieves a map with the training parameters which have the passed name space.
+   *
+   * @param namespace
+   *
+   * @return a parameter map which can be passed to the train and validate methods.
+   */
+  public Map<String, Object> getObjectSettings(String namespace) {
+
+    Map<String, Object> trainingParams = new HashMap<>();
+    String prefix = namespace + ".";
 
+    for (Map.Entry<String, Object> entry : parameters.entrySet()) {
+      String key = entry.getKey();
+
+      if (namespace != null) {
         if (key.startsWith(prefix))  {
           trainingParams.put(key.substring(prefix.length()), entry.getValue());
         }
@@ -114,116 +205,256 @@ public class TrainingParameters {
    *
    * @return the settings map
    */
-  public Map<String, String> getSettings() {
-    return getSettings(null);
+  public Map<String, Object> getObjectSettings() {
+    return getObjectSettings(null);
   }
 
   // reduces the params to contain only the params in the name space
   public TrainingParameters getParameters(String namespace) {
 
     TrainingParameters params = new TrainingParameters();
+    Map<String, Object> settings = getObjectSettings(namespace);
 
-    for (Map.Entry<String, String> entry : getSettings(namespace).entrySet()) {
-      params.put(entry.getKey(), entry.getValue());
+    for (String key: settings.keySet()) {
+      Object value = settings.get(key);
+      if (value instanceof Integer) {
+        params.put(key, (Integer)value);
+      }
+      else if (value instanceof Double) {
+        params.put(key, (Double)value);
+      }
+      else if (value instanceof Boolean) {
+        params.put(key, (Boolean)value);
+      }
+      else {
+        params.put(key, (String)value);
+      }
     }
 
     return params;
   }
 
   public void putIfAbsent(String namespace, String key, String value) {
-    if (namespace == null) {
-      parameters.putIfAbsent(key, value);
-    }
-    else {
-      parameters.putIfAbsent(namespace + "." + key, value);
-    }
+    parameters.putIfAbsent(getKey(namespace, key), value);
   }
 
   public void putIfAbsent(String key, String value) {
     putIfAbsent(null, key, value);
   }
 
-  public void put(String namespace, String key, String value) {
+  public void putIfAbsent(String namespace, String key, int value) {
+    parameters.putIfAbsent(getKey(namespace, key), value);
+  }
 
-    if (namespace == null) {
-      parameters.put(key, value);
-    }
-    else {
-      parameters.put(namespace + "." + key, value);
-    }
+  public void putIfAbsent(String key, int value) {
+    putIfAbsent(null, key, value);
+  }
+
+  public void putIfAbsent(String namespace, String key, double value) {
+    parameters.putIfAbsent(getKey(namespace, key), value);
+  }
+
+  public void putIfAbsent(String key, double value) {
+    putIfAbsent(null, key, value);
+  }
+
+  public void putIfAbsent(String namespace, String key, boolean value) {
+    parameters.putIfAbsent(getKey(namespace, key), value);
+  }
+
+  public void putIfAbsent(String key, boolean value) {
+    putIfAbsent(null, key, value);
+  }
+
+  public void put(String namespace, String key, String value) {
+    parameters.put(getKey(namespace, key), value);
   }
 
   public void put(String key, String value) {
     put(null, key, value);
   }
 
+  public void put(String namespace, String key, int value) {
+    parameters.put(getKey(namespace, key), value);
+  }
+
+  public void put(String key, int value) {
+    put(null, key, value);
+  }
+
+  public void put(String namespace, String key, double value) {
+    parameters.put(getKey(namespace, key), value);
+  }
+
+  public void put(String key, double value) {
+    put(null, key, value);
+  }
+
+  public void put(String namespace, String key, boolean value) {
+    parameters.put(getKey(namespace, key), value);
+  }
+
+  public void put(String key, boolean value) {
+    put(null, key, value);
+  }
+
   public void serialize(OutputStream out) throws IOException {
     Properties properties = new Properties();
 
-    for (Map.Entry<String, String> entry : parameters.entrySet()) {
+    for (Map.Entry<String, Object> entry: parameters.entrySet()) {
       properties.put(entry.getKey(), entry.getValue());
     }
 
     properties.store(out, null);
   }
 
+  /**
+   * get a String parameter
+   * @param key
+   * @param defaultValue
+   * @return
+   * @throws {@link java.lang.ClassCastException} can be thrown if the value is not {@link String}
+   */
   public String getStringParameter(String key, String defaultValue) {
-    return parameters.getOrDefault(key, defaultValue);
+    return getStringParameter(null, key, defaultValue);
   }
-  
+
+  /**
+   * get a String parameter in the specified namespace
+   * @param namespace
+   * @param key
+   * @param defaultValue
+   * @return
+   * @throws {@link java.lang.ClassCastException} can be thrown if the value is not {@link String}
+   */
   public String getStringParameter(String namespace, String key, String defaultValue) {
-    if (namespace == null) {
-      return getStringParameter(key, defaultValue);
+    Object value = parameters.get(getKey(namespace, key));
+    if (value == null) {
+      return defaultValue;
+    }
+    else {
+      return (String)value;
     }
-    return parameters.getOrDefault(namespace + "." + key, defaultValue);
   }
-  
+
+  /**
+   * get an Integer parameter
+   * @param key
+   * @param defaultValue
+   * @return
+   */
   public int getIntParameter(String key, int defaultValue) {
-    String value = parameters.getOrDefault(key, Integer.toString(defaultValue));
-    return Integer.parseInt(value);
+    return getIntParameter(null, key, defaultValue);
   }
-  
+
+  /**
+   * get an Integer parameter in the specified namespace
+   * @param namespace
+   * @param key
+   * @param defaultValue
+   * @return
+   */
   public int getIntParameter(String namespace, String key, int defaultValue) {
-    if (namespace == null) {
-      return getIntParameter(key, defaultValue);
+    Object value = parameters.get(getKey(namespace, key));
+    if (value == null) {
+      return defaultValue;
+    }
+    else {
+      // TODO: We have this try-catch for back-compat reason. After removing deprecated flag,
+      // we can remove try-catch block and just return (Integer)value;
+      try {
+        return (Integer) value;
+      }
+      catch (ClassCastException e) {
+        return Integer.parseInt((String)value);
+      }
     }
-    String value = parameters.getOrDefault(namespace + "." + key, Integer.toString(defaultValue));
-    return Integer.parseInt(value);
   }
-  
+
+  /**
+   * get a Double parameter
+   * @param key
+   * @param defaultValue
+   * @return
+   */
   public double getDoubleParameter(String key, double defaultValue) {
-    String value = parameters.getOrDefault(key, Double.toString(defaultValue));
-    return Double.parseDouble(value);
+    return getDoubleParameter(null, key, defaultValue);
   }
-  
+
+  /**
+   * get a Double parameter in the specified namespace
+   * @param namespace
+   * @param key
+   * @param defaultValue
+   * @return
+   */
   public double getDoubleParameter(String namespace, String key, double defaultValue) {
-    if (namespace == null) {
-      return getDoubleParameter(key, defaultValue);
+    Object value = parameters.get(getKey(namespace, key));
+    if (value == null) {
+      return defaultValue;
+    }
+    else {
+      // TODO: We have this try-catch for back-compat reason. After removing deprecated flag,
+      // we can remove try-catch block and just return (Double)value;
+      try {
+        return (Double) value;
+      }
+      catch (ClassCastException e) {
+        return Double.parseDouble((String)value);
+      }
     }
-    String value = parameters.getOrDefault(namespace + "." + key, Double.toString(defaultValue));
-    return Double.parseDouble(value);
   }
-  
+
+  /**
+   * get a Boolean parameter
+   * @param key
+   * @param defaultValue
+   * @return
+   */
   public boolean getBooleanParameter(String key, boolean defaultValue) {
-    String value = parameters.getOrDefault(key, Boolean.toString(defaultValue));
-    return Boolean.parseBoolean(value);
+    return getBooleanParameter(null, key, defaultValue);
   }
-  
+
+  /**
+   * get a Boolean parameter in the specified namespace
+   * @param namespace
+   * @param key
+   * @param defaultValue
+   * @return
+   */
   public boolean getBooleanParameter(String namespace, String key, boolean defaultValue) {
-    if (namespace == null) {
-      return getBooleanParameter(key, defaultValue);
+    Object value = parameters.get(getKey(namespace, key));
+    if (value == null) {
+      return defaultValue;
+    }
+    else {
+      // TODO: We have this try-catch for back-compat reason. After removing deprecated flag,
+      // we can remove try-catch block and just return (Boolean)value;
+      try {
+        return (Boolean) value;
+      }
+      catch (ClassCastException e) {
+        return Boolean.parseBoolean((String)value);
+      }
     }
-    String value = parameters.getOrDefault(namespace + "." + key, Boolean.toString(defaultValue));
-    return Boolean.parseBoolean(value);
   }
   
   public static TrainingParameters defaultParams() {
     TrainingParameters mlParams = new TrainingParameters();
     mlParams.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
     mlParams.put(TrainingParameters.TRAINER_TYPE_PARAM, EventTrainer.EVENT_VALUE);
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 5);
 
     return mlParams;
   }
+
+  static String getKey(String namespace, String key) {
+    if (namespace == null) {
+      return key;
+    }
+    else {
+      return namespace + "." + key;
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/main/java/opennlp/tools/util/model/ModelUtil.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/ModelUtil.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/ModelUtil.java
index bcba6ea..fb84201 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/model/ModelUtil.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/ModelUtil.java
@@ -142,8 +142,8 @@ public final class ModelUtil {
   public static TrainingParameters createDefaultTrainingParameters() {
     TrainingParameters mlParams = new TrainingParameters();
     mlParams.put(TrainingParameters.ALGORITHM_PARAM, GISTrainer.MAXENT_VALUE);
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 5);
 
     return mlParams;
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
index facb408..4922ce9 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/chunker/ChunkerMETest.java
@@ -75,8 +75,8 @@ public class ChunkerMETest {
         new PlainTextByLineStream(in, StandardCharsets.UTF_8));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     ChunkerModel chunkerModel = ChunkerME.train("en", sampleStream, params, new ChunkerFactory());
 
@@ -140,8 +140,8 @@ public class ChunkerMETest {
         new PlainTextByLineStream(in, StandardCharsets.UTF_8));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, "70");
-    params.put(TrainingParameters.CUTOFF_PARAM, "1");
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     ChunkerME.train("en", sampleStream, params, new ChunkerFactory());
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
index 3ade0d5..ba02e50 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/cmdline/TokenNameFinderToolTest.java
@@ -104,8 +104,8 @@ public class TokenNameFinderToolTest {
     ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
     
     TokenNameFinderModel model;
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
index 391125e..5e8ddaf 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerMETest.java
@@ -43,8 +43,8 @@ public class DocumentCategorizerMETest {
         new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"}));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    params.put(TrainingParameters.CUTOFF_PARAM, 0);
 
     DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,
             params, new DoccatFactory());
@@ -70,8 +70,8 @@ public class DocumentCategorizerMETest {
         new DocumentSample("1", new String[]{"a", "b", "c"}));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    params.put(TrainingParameters.CUTOFF_PARAM, 0);
 
     DocumentCategorizerME.train("x-unspecified", samples,
         params, new DoccatFactory());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
index 0847690..1c96a36 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/DocumentCategorizerNBTest.java
@@ -44,8 +44,8 @@ public class DocumentCategorizerNBTest {
         new DocumentSample("0", new String[]{"x", "y", "z", "7", "8"}));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    params.put(TrainingParameters.CUTOFF_PARAM, 0);
     params.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE);
 
     DoccatModel model = DocumentCategorizerME.train("x-unspecified", samples,

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
index ca62c5b..7e55165 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/ArvoresDeitadasEval.java
@@ -173,7 +173,7 @@ public class ArvoresDeitadasEval {
   @Test
   public void evalPortugueseTokenizerMaxentQnMultipleThreads() throws IOException {
     TrainingParameters params = EvalUtil.createMaxentQnParams();
-    params.put("Threads", "4");
+    params.put("Threads", 4);
     tokenizerCrossEval(params, 0.9996017148748251d);
   }
 
@@ -192,7 +192,7 @@ public class ArvoresDeitadasEval {
   @Test
   public void evalPortugueseChunkerGisMultipleThreads() throws IOException {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
-    params.put("Threads", "4");
+    params.put("Threads", 4);
     chunkerCrossEval(params, 0.9573860781121228d);
   }
 
@@ -205,7 +205,7 @@ public class ArvoresDeitadasEval {
   @Test
   public void evalPortugueseChunkerQnMultipleThreads() throws IOException {
     TrainingParameters params = EvalUtil.createMaxentQnParams();
-    params.put("Threads", "4");
+    params.put("Threads", 4);
 
     // NOTE: Should be the same as without multiple threads!!!
     chunkerCrossEval(params, 0.9647304571382662);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
index 7e6102e..8ac90d7 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/Conll00ChunkerEval.java
@@ -90,7 +90,7 @@ public class Conll00ChunkerEval {
   @Test
   public void evalEnglishMaxentQn() throws IOException {
     TrainingParameters params = EvalUtil.createMaxentQnParams();
-    params.put("Threads", "4");
+    params.put("Threads", 4);
     ChunkerModel maxentModel = train(new File(EvalUtil.getOpennlpDataDir(),
         "conll00/train.txt"), params);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java b/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
index 608e474..45f2471 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/EvalUtil.java
@@ -31,7 +31,7 @@ public class EvalUtil {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM,
         PerceptronTrainer.PERCEPTRON_VALUE);
-    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+    params.put(TrainingParameters.CUTOFF_PARAM, 0);
     return params;
   }
 
@@ -39,7 +39,7 @@ public class EvalUtil {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM,
         QNTrainer.MAXENT_QN_VALUE);
-    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+    params.put(TrainingParameters.CUTOFF_PARAM, 0);
     return params;
   }
 
@@ -47,7 +47,7 @@ public class EvalUtil {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM,
         NaiveBayesTrainer.NAIVE_BAYES_VALUE);
-    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+    params.put(TrainingParameters.CUTOFF_PARAM, 5);
     return params;
   }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
index 68d49fc..2182957 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
@@ -75,10 +75,10 @@ public class OntoNotes4ParserEval {
     }
 
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
-    params.put("build.Threads", "4");
-    params.put("tagger.Threads", "4");
-    params.put("chunker.Threads", "4");
-    params.put("check.Threads", "4");
+    params.put("build.Threads", 4);
+    params.put("tagger.Threads", 4);
+    params.put("chunker.Threads", 4);
+    params.put("check.Threads", 4);
 
 
     crossEval(params, headRules, 0.937987617163142d);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
index f00f2b4..285af4a 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/lemmatizer/LemmatizerMETest.java
@@ -69,8 +69,8 @@ public class LemmatizerMETest {
           new File("opennlp/tools/lemmatizer/trial.old.tsv")), "UTF-8"));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    params.put(TrainingParameters.CUTOFF_PARAM, 5);
 
     LemmatizerModel lemmatizerModel = LemmatizerME.train("en", sampleStream,
         params, new LemmatizerFactory());
@@ -95,8 +95,8 @@ public class LemmatizerMETest {
                 "UTF-8"));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    params.put(TrainingParameters.CUTOFF_PARAM, 5);
 
     LemmatizerME.train("en", sampleStream, params, new LemmatizerFactory());
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/TrainerFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/TrainerFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/TrainerFactoryTest.java
index f7ac117..b08d28a 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/TrainerFactoryTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/TrainerFactoryTest.java
@@ -34,8 +34,8 @@ public class TrainerFactoryTest {
   public void setup() {
     mlParams = new TrainingParameters();
     mlParams.put(TrainingParameters.ALGORITHM_PARAM, GISTrainer.MAXENT_VALUE);
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(10));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(5));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 10);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 5);
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/GISIndexingTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/GISIndexingTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/GISIndexingTest.java
index c8bc27f..03539a1 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/GISIndexingTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/GISIndexingTest.java
@@ -64,7 +64,7 @@ public class GISIndexingTest {
   public void testGISTrainSignature1() throws IOException {
     try (ObjectStream<Event> eventStream = createEventStream()) {
       TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
-      params.put(AbstractTrainer.CUTOFF_PARAM, "1");
+      params.put(AbstractTrainer.CUTOFF_PARAM, 1);
 
       EventTrainer trainer = TrainerFactory.getEventTrainer(params,  null);
 
@@ -79,8 +79,8 @@ public class GISIndexingTest {
   public void testGISTrainSignature2() throws IOException {
     try (ObjectStream<Event> eventStream = createEventStream()) {
       TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
-      params.put(AbstractTrainer.CUTOFF_PARAM, "1");
-      params.put("smoothing", "true");
+      params.put(AbstractTrainer.CUTOFF_PARAM, 1);
+      params.put("smoothing", true);
       EventTrainer trainer = TrainerFactory.getEventTrainer(params, null);
 
       Assert.assertNotNull(trainer.train(eventStream));
@@ -95,8 +95,8 @@ public class GISIndexingTest {
     try (ObjectStream<Event> eventStream = createEventStream()) {
       TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
 
-      params.put(AbstractTrainer.ITERATIONS_PARAM, "10");
-      params.put(AbstractTrainer.CUTOFF_PARAM, "1");
+      params.put(AbstractTrainer.ITERATIONS_PARAM, 10);
+      params.put(AbstractTrainer.CUTOFF_PARAM, 1);
 
       EventTrainer trainer = TrainerFactory.getEventTrainer(params, null);
 
@@ -111,8 +111,8 @@ public class GISIndexingTest {
   public void testGISTrainSignature4() throws IOException {
     try (ObjectStream<Event> eventStream = createEventStream()) {
       TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
-      params.put(AbstractTrainer.ITERATIONS_PARAM, "10");
-      params.put(AbstractTrainer.CUTOFF_PARAM, "1");
+      params.put(AbstractTrainer.ITERATIONS_PARAM, 10);
+      params.put(AbstractTrainer.CUTOFF_PARAM, 1);
       GISTrainer trainer = (GISTrainer) TrainerFactory.getEventTrainer(params, null);
       trainer.setGaussianSigma(0.01);
 
@@ -129,10 +129,10 @@ public class GISIndexingTest {
     try (ObjectStream<Event> eventStream = createEventStream()) {
       TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
 
-      params.put(AbstractTrainer.ITERATIONS_PARAM, "10");
-      params.put(AbstractTrainer.CUTOFF_PARAM, "1");
-      params.put("smoothing", "false");
-      params.put(AbstractTrainer.VERBOSE_PARAM, "false");
+      params.put(AbstractTrainer.ITERATIONS_PARAM, 10);
+      params.put(AbstractTrainer.CUTOFF_PARAM, 1);
+      params.put("smoothing", false);
+      params.put(AbstractTrainer.VERBOSE_PARAM, false);
 
       EventTrainer trainer = TrainerFactory.getEventTrainer(params, null);
       Assert.assertNotNull(trainer.train(eventStream));
@@ -145,11 +145,11 @@ public class GISIndexingTest {
     
     TrainingParameters parameters = TrainingParameters.defaultParams();
     // by default we are using GIS/EventTrainer/Cutoff of 5/100 iterations
-    parameters.put(TrainingParameters.ITERATIONS_PARAM, "10");
+    parameters.put(TrainingParameters.ITERATIONS_PARAM, 10);
     parameters.put(AbstractEventTrainer.DATA_INDEXER_PARAM, AbstractEventTrainer.DATA_INDEXER_ONE_PASS_VALUE);
-    parameters.put(AbstractEventTrainer.CUTOFF_PARAM, "1");
+    parameters.put(AbstractEventTrainer.CUTOFF_PARAM, 1);
     // note: setting the SORT_PARAM to true is the default, so it is not really needed
-    parameters.put(AbstractDataIndexer.SORT_PARAM, "true");
+    parameters.put(AbstractDataIndexer.SORT_PARAM, true);
 
     // guarantee that you have a GIS trainer...
     EventTrainer trainer =
@@ -169,7 +169,7 @@ public class GISIndexingTest {
  
     parameters.put(TrainingParameters.ALGORITHM_PARAM, QNTrainer.MAXENT_QN_VALUE);
     parameters.put(AbstractEventTrainer.DATA_INDEXER_PARAM, AbstractEventTrainer.DATA_INDEXER_TWO_PASS_VALUE);
-    parameters.put(AbstractEventTrainer.CUTOFF_PARAM, "2");
+    parameters.put(AbstractEventTrainer.CUTOFF_PARAM, 2);
     
     trainer = TrainerFactory.getEventTrainer(parameters, new HashMap<>());
     Assert.assertEquals("opennlp.tools.ml.maxent.quasinewton.QNTrainer", trainer.getClass().getName());
@@ -187,7 +187,7 @@ public class GISIndexingTest {
 
     // set the cutoff to 1 for this test.
     TrainingParameters parameters = new TrainingParameters();
-    parameters.put(AbstractDataIndexer.CUTOFF_PARAM, "1");
+    parameters.put(AbstractDataIndexer.CUTOFF_PARAM, 1);
     
     // test with a 1 pass data indexer...
     parameters.put(AbstractEventTrainer.DATA_INDEXER_PARAM, AbstractEventTrainer.DATA_INDEXER_ONE_PASS_VALUE);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/MaxentPrepAttachTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/MaxentPrepAttachTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/MaxentPrepAttachTest.java
index 36e8926..09a40e5 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/MaxentPrepAttachTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/MaxentPrepAttachTest.java
@@ -42,8 +42,8 @@ public class MaxentPrepAttachTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
-    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, "false");
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, false);
     testDataIndexer = new TwoPassDataIndexer();
     testDataIndexer.init(trainingParameters, new HashMap<>());
   }
@@ -78,7 +78,7 @@ public class MaxentPrepAttachTest {
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, GISTrainer.MAXENT_VALUE);
     trainParams.put(AbstractEventTrainer.DATA_INDEXER_PARAM,
         AbstractEventTrainer.DATA_INDEXER_TWO_PASS_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
 
     EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, null);
     MaxentModel model = trainer.train(PrepAttachDataUtil.createTrainingStream());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/RealValueModelTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/RealValueModelTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/RealValueModelTest.java
index 850d9bc..fbff618 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/RealValueModelTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/RealValueModelTest.java
@@ -38,7 +38,7 @@ public class RealValueModelTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
     testDataIndexer = new OnePassRealValueDataIndexer();
     testDataIndexer.init(trainingParameters, new HashMap<>());
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/ScaleDoesntMatterTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/ScaleDoesntMatterTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/ScaleDoesntMatterTest.java
index 1e5c8a3..ed7b2a1 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/ScaleDoesntMatterTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/ScaleDoesntMatterTest.java
@@ -45,7 +45,7 @@ public class ScaleDoesntMatterTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "0");
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 0);
     testDataIndexer = new OnePassRealValueDataIndexer();
     testDataIndexer.init(trainingParameters, new HashMap<>());
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/io/RealValueFileEventStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/io/RealValueFileEventStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/io/RealValueFileEventStreamTest.java
index d084977..b5425ac 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/io/RealValueFileEventStreamTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/io/RealValueFileEventStreamTest.java
@@ -37,7 +37,7 @@ public class RealValueFileEventStreamTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
     indexer = new OnePassRealValueDataIndexer();
     indexer.init(trainingParameters, new HashMap<>());
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/NegLogLikelihoodTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/NegLogLikelihoodTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/NegLogLikelihoodTest.java
index dcba896..d51852d 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/NegLogLikelihoodTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/NegLogLikelihoodTest.java
@@ -41,7 +41,7 @@ public class NegLogLikelihoodTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
     testDataIndexer = new OnePassRealValueDataIndexer();
     testDataIndexer.init(trainingParameters, new HashMap<>());
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNPrepAttachTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNPrepAttachTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNPrepAttachTest.java
index c01aa76..574e871 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNPrepAttachTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNPrepAttachTest.java
@@ -39,8 +39,8 @@ public class QNPrepAttachTest {
   public void testQNOnPrepAttachData() throws IOException {
     DataIndexer indexer = new TwoPassDataIndexer();
     TrainingParameters indexingParameters = new TrainingParameters();
-    indexingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
-    indexingParameters.put(AbstractDataIndexer.SORT_PARAM, "false");
+    indexingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    indexingParameters.put(AbstractDataIndexer.SORT_PARAM, false);
     indexer.init(indexingParameters, new HashMap<>());
     indexer.index(PrepAttachDataUtil.createTrainingStream());
 
@@ -68,9 +68,9 @@ public class QNPrepAttachTest {
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, QNTrainer.MAXENT_QN_VALUE);
     trainParams.put(AbstractEventTrainer.DATA_INDEXER_PARAM,
         AbstractEventTrainer.DATA_INDEXER_TWO_PASS_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
-    trainParams.put(QNTrainer.L1COST_PARAM, Double.toString(0.25));
-    trainParams.put(QNTrainer.L2COST_PARAM, Double.toString(1.0));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainParams.put(QNTrainer.L1COST_PARAM, 0.25);
+    trainParams.put(QNTrainer.L2COST_PARAM, 1.0D);
 
     MaxentModel model = TrainerFactory.getEventTrainer(trainParams, null)
                                       .train(PrepAttachDataUtil.createTrainingStream());
@@ -85,9 +85,9 @@ public class QNPrepAttachTest {
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, QNTrainer.MAXENT_QN_VALUE);
     trainParams.put(AbstractEventTrainer.DATA_INDEXER_PARAM,
         AbstractEventTrainer.DATA_INDEXER_TWO_PASS_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
-    trainParams.put(QNTrainer.L1COST_PARAM, Double.toString(1.0));
-    trainParams.put(QNTrainer.L2COST_PARAM, Double.toString(0));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainParams.put(QNTrainer.L1COST_PARAM, 1.0D);
+    trainParams.put(QNTrainer.L2COST_PARAM, 0D);
 
     MaxentModel model = TrainerFactory.getEventTrainer(trainParams, null)
                                       .train(PrepAttachDataUtil.createTrainingStream());
@@ -102,9 +102,9 @@ public class QNPrepAttachTest {
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, QNTrainer.MAXENT_QN_VALUE);
     trainParams.put(AbstractEventTrainer.DATA_INDEXER_PARAM,
         AbstractEventTrainer.DATA_INDEXER_TWO_PASS_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
-    trainParams.put(QNTrainer.L1COST_PARAM, Double.toString(0));
-    trainParams.put(QNTrainer.L2COST_PARAM, Double.toString(1.0));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainParams.put(QNTrainer.L1COST_PARAM, 0D);
+    trainParams.put(QNTrainer.L2COST_PARAM, 1.0D);
 
     MaxentModel model = TrainerFactory.getEventTrainer(trainParams, null)
                                       .train(PrepAttachDataUtil.createTrainingStream());
@@ -117,7 +117,7 @@ public class QNPrepAttachTest {
 
     TrainingParameters trainParams = new TrainingParameters();
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, QNTrainer.MAXENT_QN_VALUE);
-    trainParams.put(QNTrainer.THREADS_PARAM, Integer.toString(2));
+    trainParams.put(QNTrainer.THREADS_PARAM, 2);
 
     MaxentModel model = TrainerFactory.getEventTrainer(trainParams, null)
                                       .train(PrepAttachDataUtil.createTrainingStream());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNTrainerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNTrainerTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNTrainerTest.java
index fbe9ecc..555b9dc 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNTrainerTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/maxent/quasinewton/QNTrainerTest.java
@@ -46,7 +46,7 @@ public class QNTrainerTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
     testDataIndexer = new OnePassRealValueDataIndexer();
     testDataIndexer.init(trainingParameters, new HashMap<>());
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesCorrectnessTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesCorrectnessTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesCorrectnessTest.java
index 11a961b..9a322d4 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesCorrectnessTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesCorrectnessTest.java
@@ -46,8 +46,8 @@ public class NaiveBayesCorrectnessTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
-    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, "false");;
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, false);
     testDataIndexer = new TwoPassDataIndexer();
     testDataIndexer.init(trainingParameters, new HashMap<>());
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesModelReadWriteTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesModelReadWriteTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesModelReadWriteTest.java
index a76b428..7a0fb22 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesModelReadWriteTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesModelReadWriteTest.java
@@ -43,8 +43,8 @@ public class NaiveBayesModelReadWriteTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
-    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, "false");;
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, false);
     testDataIndexer = new TwoPassDataIndexer();
     testDataIndexer.init(trainingParameters, new HashMap<>());
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesPrepAttachTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesPrepAttachTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesPrepAttachTest.java
index e994ba1..c6c5ace 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesPrepAttachTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesPrepAttachTest.java
@@ -44,8 +44,8 @@ public class NaiveBayesPrepAttachTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
-    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, "false");
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, false);
     testDataIndexer = new TwoPassDataIndexer();
     testDataIndexer.init(trainingParameters, new HashMap<>());
   }
@@ -62,7 +62,7 @@ public class NaiveBayesPrepAttachTest {
   public void testNaiveBayesOnPrepAttachDataUsingTrainUtil() throws IOException {
     TrainingParameters trainParams = new TrainingParameters();
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
 
     EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, null);
     MaxentModel model = trainer.train(PrepAttachDataUtil.createTrainingStream());
@@ -74,7 +74,7 @@ public class NaiveBayesPrepAttachTest {
   public void testNaiveBayesOnPrepAttachDataUsingTrainUtilWithCutoff5() throws IOException {
     TrainingParameters trainParams = new TrainingParameters();
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, NaiveBayesTrainer.NAIVE_BAYES_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(5));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 5);
 
     EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, null);
     MaxentModel model = trainer.train(PrepAttachDataUtil.createTrainingStream());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesSerializedCorrectnessTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesSerializedCorrectnessTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesSerializedCorrectnessTest.java
index 0146885..f684974 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesSerializedCorrectnessTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/naivebayes/NaiveBayesSerializedCorrectnessTest.java
@@ -48,8 +48,8 @@ public class NaiveBayesSerializedCorrectnessTest {
   @Before
   public void initIndexer() {
     TrainingParameters trainingParameters = new TrainingParameters();
-    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
-    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, "false");;
+    trainingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainingParameters.put(AbstractDataIndexer.SORT_PARAM, false);
     testDataIndexer = new TwoPassDataIndexer();
     testDataIndexer.init(trainingParameters, new HashMap<>());
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/ml/perceptron/PerceptronPrepAttachTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/ml/perceptron/PerceptronPrepAttachTest.java b/opennlp-tools/src/test/java/opennlp/tools/ml/perceptron/PerceptronPrepAttachTest.java
index eda49f8..94985cc 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/ml/perceptron/PerceptronPrepAttachTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/ml/perceptron/PerceptronPrepAttachTest.java
@@ -47,8 +47,8 @@ public class PerceptronPrepAttachTest {
   public void testPerceptronOnPrepAttachData() throws IOException {
     TwoPassDataIndexer indexer = new TwoPassDataIndexer();
     TrainingParameters indexingParameters = new TrainingParameters();
-    indexingParameters.put(AbstractTrainer.CUTOFF_PARAM, "1");
-    indexingParameters.put(AbstractDataIndexer.SORT_PARAM, "false");
+    indexingParameters.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    indexingParameters.put(AbstractDataIndexer.SORT_PARAM, false);
     indexer.init(indexingParameters, new HashMap<>());
     indexer.index(PrepAttachDataUtil.createTrainingStream());
     MaxentModel model = new PerceptronTrainer().trainModel(400, indexer, 1);
@@ -60,8 +60,8 @@ public class PerceptronPrepAttachTest {
 
     TrainingParameters trainParams = new TrainingParameters();
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
-    trainParams.put("UseSkippedAveraging", Boolean.toString(true));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainParams.put("UseSkippedAveraging", true);
 
     EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, null);
     MaxentModel model = trainer.train(PrepAttachDataUtil.createTrainingStream());
@@ -73,9 +73,9 @@ public class PerceptronPrepAttachTest {
 
     TrainingParameters trainParams = new TrainingParameters();
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
-    trainParams.put(AbstractTrainer.ITERATIONS_PARAM, Integer.toString(500));
-    trainParams.put("Tolerance", Double.toString(0.0001d));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainParams.put(AbstractTrainer.ITERATIONS_PARAM, 500);
+    trainParams.put("Tolerance", 0.0001d);
 
     EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, null);
     MaxentModel model = trainer.train(PrepAttachDataUtil.createTrainingStream());
@@ -87,9 +87,9 @@ public class PerceptronPrepAttachTest {
 
     TrainingParameters trainParams = new TrainingParameters();
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
-    trainParams.put(AbstractTrainer.ITERATIONS_PARAM, Integer.toString(500));
-    trainParams.put("StepSizeDecrease", Double.toString(0.06d));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainParams.put(AbstractTrainer.ITERATIONS_PARAM, 500);
+    trainParams.put("StepSizeDecrease", 0.06d);
 
     EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, null);
     MaxentModel model = trainer.train(PrepAttachDataUtil.createTrainingStream());
@@ -101,8 +101,8 @@ public class PerceptronPrepAttachTest {
 
     TrainingParameters trainParams = new TrainingParameters();
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
-    trainParams.put("UseSkippedAveraging", Boolean.toString(true));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainParams.put("UseSkippedAveraging", true);
 
     EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, null);
     AbstractModel model = (AbstractModel) trainer.train(PrepAttachDataUtil.createTrainingStream());
@@ -125,8 +125,8 @@ public class PerceptronPrepAttachTest {
   public void testModelEquals() throws IOException {
     TrainingParameters trainParams = new TrainingParameters();
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
-    trainParams.put("UseSkippedAveraging", Boolean.toString(true));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
+    trainParams.put("UseSkippedAveraging", true);
 
     EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, null);
     AbstractModel modelA = (AbstractModel) trainer.train(PrepAttachDataUtil.createTrainingStream());
@@ -140,10 +140,10 @@ public class PerceptronPrepAttachTest {
   public void verifyReportMap() throws IOException {
     TrainingParameters trainParams = new TrainingParameters();
     trainParams.put(AbstractTrainer.ALGORITHM_PARAM, PerceptronTrainer.PERCEPTRON_VALUE);
-    trainParams.put(AbstractTrainer.CUTOFF_PARAM, Integer.toString(1));
+    trainParams.put(AbstractTrainer.CUTOFF_PARAM, 1);
     // Since we are verifying the report map, we don't need to have more than 1 iteration
-    trainParams.put(AbstractTrainer.ITERATIONS_PARAM, Integer.toString(1));
-    trainParams.put("UseSkippedAveraging", Boolean.toString(true));
+    trainParams.put(AbstractTrainer.ITERATIONS_PARAM, 1);
+    trainParams.put("UseSkippedAveraging", true);
     
     Map<String,String> reportMap = new HashMap<>();
     EventTrainer trainer = TrainerFactory.getEventTrainer(trainParams, reportMap);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/namefind/NameFinderMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/namefind/NameFinderMETest.java b/opennlp-tools/src/test/java/opennlp/tools/namefind/NameFinderMETest.java
index 876df5b..94fbb36 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/namefind/NameFinderMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/namefind/NameFinderMETest.java
@@ -65,8 +65,8 @@ public class NameFinderMETest {
               new File("opennlp/tools/namefind/AnnotatedSentences.txt")), encoding));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
         params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
@@ -123,8 +123,8 @@ public class NameFinderMETest {
           new File("opennlp/tools/namefind/AnnotatedSentencesWithTypes.txt")), encoding));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
         params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
@@ -167,8 +167,8 @@ public class NameFinderMETest {
               new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8"));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
             params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
@@ -197,8 +197,8 @@ public class NameFinderMETest {
           new File("opennlp/tools/namefind/OnlyWithNames.train")), "UTF-8"));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     TokenNameFinderModel nameFinderModel = NameFinderME.train("en", TYPE_OVERRIDE, sampleStream,
         params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
@@ -232,8 +232,8 @@ public class NameFinderMETest {
           new File("opennlp/tools/namefind/OnlyWithNamesWithTypes.train")), "UTF-8"));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
         params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
@@ -268,8 +268,8 @@ public class NameFinderMETest {
 
     TrainingParameters params = new TrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM, "MAXENT");
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
         params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));
@@ -319,8 +319,8 @@ public class NameFinderMETest {
           new File("opennlp/tools/namefind/voa1.train")), "UTF-8"));
 
     TrainingParameters params = new TrainingParameters();
-    params.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(70));
-    params.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(1));
+    params.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    params.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     TokenNameFinderModel nameFinderModel = NameFinderME.train("en", null, sampleStream,
         params, TokenNameFinderFactory.create(null, null, Collections.emptyMap(), new BioCodec()));

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
index 9e31987..0326fb2 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/namefind/TokenNameFinderCrossValidatorTest.java
@@ -51,8 +51,8 @@ public class TokenNameFinderCrossValidatorTest {
         new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     mlParams.put(TrainingParameters.ALGORITHM_PARAM,
         ModelType.MAXENT.toString());
@@ -78,8 +78,8 @@ public class TokenNameFinderCrossValidatorTest {
         new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     mlParams.put(TrainingParameters.ALGORITHM_PARAM,
         ModelType.MAXENT.toString());
@@ -107,8 +107,8 @@ public class TokenNameFinderCrossValidatorTest {
         new PlainTextByLineStream(in, StandardCharsets.ISO_8859_1));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "70");
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, "1");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 70);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 1);
 
     mlParams.put(TrainingParameters.ALGORITHM_PARAM,
         ModelType.MAXENT.toString());

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
index e2bca48..6d0785b 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
@@ -51,8 +51,8 @@ public class POSTaggerMETest {
   static POSModel trainPOSModel(ModelType type) throws IOException {
     TrainingParameters params = new TrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM, type.toString());
-    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    params.put(TrainingParameters.CUTOFF_PARAM, 5);
 
     return POSTaggerME.train("en", createSampleStream(), params,
         new POSTaggerFactory());
@@ -98,8 +98,8 @@ public class POSTaggerMETest {
  
     TrainingParameters params = new TrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM, ModelType.MAXENT.name());
-    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    params.put(TrainingParameters.CUTOFF_PARAM, "5");
+    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    params.put(TrainingParameters.CUTOFF_PARAM, 5);
 
     POSTaggerME.train("en", stream, params, new POSTaggerFactory());
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
index 220650d..5fba0fd 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceDetectorMETest.java
@@ -43,8 +43,8 @@ public class SentenceDetectorMETest {
         "/opennlp/tools/sentdetect/Sentences.txt");
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, "0");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
 
     SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
 
@@ -143,8 +143,8 @@ public class SentenceDetectorMETest {
         "/opennlp/tools/sentdetect/SentencesInsufficient.txt");
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, "0");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
 
     SentenceDetectorFactory factory = new SentenceDetectorFactory("en", true, null, null);
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
index 14b9185..3dd92a0 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerMETest.java
@@ -84,8 +84,8 @@ public class TokenizerMETest {
         new PlainTextByLineStream(trainDataIn, StandardCharsets.UTF_8));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, "100");
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, "5");
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 5);
 
     TokenizerME.train(samples, TokenizerFactory.create(null, "en", null, true, null), mlParams);
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerTestUtil.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerTestUtil.java b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerTestUtil.java
index 65fed21..4d49c58 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerTestUtil.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/tokenize/TokenizerTestUtil.java
@@ -54,8 +54,8 @@ public class TokenizerTestUtil {
         new Span(3, 4)}));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
 
     return TokenizerME.train(new CollectionObjectStream<>(samples),
       TokenizerFactory.create(null, "en", null, true, null), mlParams);
@@ -70,8 +70,8 @@ public class TokenizerTestUtil {
         new PlainTextByLineStream(trainDataIn, StandardCharsets.UTF_8));
 
     TrainingParameters mlParams = new TrainingParameters();
-    mlParams.put(TrainingParameters.ITERATIONS_PARAM, Integer.toString(100));
-    mlParams.put(TrainingParameters.CUTOFF_PARAM, Integer.toString(0));
+    mlParams.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    mlParams.put(TrainingParameters.CUTOFF_PARAM, 0);
 
     return TokenizerME.train(samples, TokenizerFactory.create(null, "en", null, true, null), mlParams);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/99cbf0da/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
index 294dff8..7c8e41e 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
@@ -99,20 +99,20 @@ public class TrainingParametersTest {
     Assert.assertEquals("def", tp.getStringParameter("str", "k4", "def"));
 
     Assert.assertEquals(-100, tp.getIntParameter("k11", -100));
-    tp.put("k11", "234");
+    tp.put("k11", 234);
     Assert.assertEquals(234, tp.getIntParameter("k11", -100));
     Assert.assertEquals(123, tp.getIntParameter("int", "k2", -100));
     Assert.assertEquals(-100, tp.getIntParameter("int", "k4", -100));
 
     Assert.assertEquals(234.5, tp.getDoubleParameter("k21", -100), 0.001);
-    tp.put("k21", "345.6");
+    tp.put("k21", 345.6);
     Assert.assertEquals(345.6, tp.getDoubleParameter("k21", -100), 0.001); // should be changed
-    tp.putIfAbsent("k21", "456.7");
+    tp.putIfAbsent("k21", 456.7);
     Assert.assertEquals(345.6, tp.getDoubleParameter("k21", -100), 0.001); // should be unchanged
     Assert.assertEquals(123.45, tp.getDoubleParameter("double", "k5", -100), 0.001);
 
     Assert.assertEquals(true, tp.getBooleanParameter("k31", true));
-    tp.put("k31", "false");
+    tp.put("k31", false);
     Assert.assertEquals(false, tp.getBooleanParameter("k31", true));
     Assert.assertEquals(false, tp.getBooleanParameter("boolean", "k4", true));
   }


[21/50] opennlp git commit: OPENNLP-1026: Replace references and usages of o.t.u.Heap with SortedSet

Posted by co...@apache.org.
OPENNLP-1026: Replace references and usages of o.t.u.Heap with SortedSet

Closes #187


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3df659b9
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3df659b9
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3df659b9

Branch: refs/heads/LangDetect
Commit: 3df659b9bfb02084e782f1e8b6ec716f56e0611c
Parents: 9803662
Author: smarthi <sm...@apache.org>
Authored: Mon May 1 16:28:42 2017 -0400
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 3 11:17:38 2017 +0200

----------------------------------------------------------------------
 .../tools/parser/AbstractBottomUpParser.java    | 23 ++++----
 .../src/main/java/opennlp/tools/util/Heap.java  |  3 ++
 .../main/java/opennlp/tools/util/ListHeap.java  |  3 ++
 .../tools/eval/SourceForgeModelEval.java        |  2 +-
 .../java/opennlp/tools/util/ListHeapTest.java   | 55 --------------------
 5 files changed, 19 insertions(+), 67 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/3df659b9/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java b/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
index 7d7c1b0..1a414f1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/AbstractBottomUpParser.java
@@ -22,14 +22,14 @@ import java.util.ArrayList;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
 
 import opennlp.tools.chunker.Chunker;
 import opennlp.tools.dictionary.Dictionary;
 import opennlp.tools.ngram.NGramModel;
 import opennlp.tools.parser.chunking.ParserEventStream;
 import opennlp.tools.postag.POSTagger;
-import opennlp.tools.util.Heap;
-import opennlp.tools.util.ListHeap;
 import opennlp.tools.util.ObjectStream;
 import opennlp.tools.util.Sequence;
 import opennlp.tools.util.Span;
@@ -77,17 +77,17 @@ public abstract class AbstractBottomUpParser implements Parser {
   /**
    * Completed parses.
    */
-  protected Heap<Parse> completeParses;
+  private SortedSet<Parse> completeParses;
 
   /**
    * Incomplete parses which will be advanced.
    */
-  protected Heap<Parse> odh;
+  private SortedSet<Parse> odh;
 
   /**
    * Incomplete parses which have been advanced.
    */
-  protected Heap<Parse> ndh;
+  private SortedSet<Parse> ndh;
 
   /**
    * The head rules for the parser.
@@ -182,9 +182,9 @@ public abstract class AbstractBottomUpParser implements Parser {
     reportFailedParse = true;
     this.headRules = headRules;
     this.punctSet = headRules.getPunctuationTags();
-    odh = new ListHeap<>(K);
-    ndh = new ListHeap<>(K);
-    completeParses = new ListHeap<>(K);
+    odh = new TreeSet<>();
+    ndh = new TreeSet<>();
+    completeParses = new TreeSet<>();
   }
 
   /**
@@ -279,11 +279,11 @@ public abstract class AbstractBottomUpParser implements Parser {
     double bestComplete = -100000; //approximating -infinity/0 in ln domain
     while (odh.size() > 0 && (completeParses.size() < M || (odh.first()).getProb() < minComplete)
         && derivationStage < maxDerivationLength) {
-      ndh = new ListHeap<>(K);
+      ndh = new TreeSet<>();
 
       int derivationRank = 0;
       for (Iterator<Parse> pi = odh.iterator(); pi.hasNext()
-          && derivationRank < K; derivationRank++) { // forearch derivation
+          && derivationRank < K; derivationRank++) { // foreach derivation
         Parse tp = pi.next();
         //TODO: Need to look at this for K-best parsing cases
         /*
@@ -359,7 +359,8 @@ public abstract class AbstractBottomUpParser implements Parser {
     else {
       List<Parse> topParses = new ArrayList<>(numParses);
       while (!completeParses.isEmpty() && topParses.size() < numParses) {
-        Parse tp = completeParses.extract();
+        Parse tp = completeParses.last();
+        completeParses.remove(tp);
         topParses.add(tp);
         //parses.remove(tp);
       }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3df659b9/opennlp-tools/src/main/java/opennlp/tools/util/Heap.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/Heap.java b/opennlp-tools/src/main/java/opennlp/tools/util/Heap.java
index 00d79ba..83f3315 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/Heap.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/Heap.java
@@ -24,7 +24,10 @@ import java.util.Iterator;
  * their natural ordering or the comparator provided to an implementation.
  * While this is a typical of a heap it allows this objects natural ordering to
  * match that of other sorted collections.
+ *
+ * This is now deprecated and will be removed in Release 1.8.1
  * */
+@Deprecated
 public interface Heap<E>  {
 
   /**

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3df659b9/opennlp-tools/src/main/java/opennlp/tools/util/ListHeap.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/ListHeap.java b/opennlp-tools/src/main/java/opennlp/tools/util/ListHeap.java
index 303b729..92744e0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/ListHeap.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/ListHeap.java
@@ -28,7 +28,10 @@ import java.util.List;
  * return the top K values which have been added where K is specified by the size passed to
  * the constructor. K+1 values are not gaurenteed to be kept in the heap or returned in a
  * particular order.
+ *
+ * This is now deprecated and will be removed in Release 1.8.1
  */
+@Deprecated
 public class ListHeap<E extends Comparable<E>> implements Heap<E> {
   private List<E> list;
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3df659b9/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
index c28fd7c..d3ea980 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
@@ -346,7 +346,7 @@ public class SourceForgeModelEval {
       }
     }
 
-    Assert.assertEquals(new BigInteger("155722144104513046994135548456420803172"),
+    Assert.assertEquals(new BigInteger("13162568910062822351942983467905626940"),
         new BigInteger(1, digest.digest()));
   }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3df659b9/opennlp-tools/src/test/java/opennlp/tools/util/ListHeapTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/ListHeapTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/ListHeapTest.java
deleted file mode 100644
index 09afca2..0000000
--- a/opennlp-tools/src/test/java/opennlp/tools/util/ListHeapTest.java
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.util;
-
-import org.junit.Assert;
-import org.junit.Test;
-
-public class ListHeapTest {
-
-  @Test
-  public void testSimple() {
-    int size = 5;
-    Heap<Integer> heap = new ListHeap<>(size);
-
-    for (int ai = 0; ai < 10; ai++) {
-      if (ai < size) {
-        Assert.assertEquals(ai, heap.size());
-      } else {
-        Assert.assertEquals(size, heap.size());
-      }
-      heap.add(ai);
-    }
-
-    Assert.assertEquals(Integer.valueOf(0), heap.extract());
-    Assert.assertEquals(4, heap.size());
-
-    Assert.assertEquals(Integer.valueOf(1), heap.extract());
-    Assert.assertEquals(3, heap.size());
-
-    Assert.assertEquals(Integer.valueOf(2), heap.extract());
-    Assert.assertEquals(2, heap.size());
-
-    Assert.assertEquals(Integer.valueOf(3), heap.extract());
-    Assert.assertEquals(1, heap.size());
-
-    Assert.assertEquals(Integer.valueOf(4), heap.extract());
-    Assert.assertEquals(0, heap.size());
-
-  }
-}


[35/50] opennlp git commit: Rollback Release 1.8.0 RC

Posted by co...@apache.org.
Rollback Release 1.8.0 RC


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/60792b8f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/60792b8f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/60792b8f

Branch: refs/heads/LangDetect
Commit: 60792b8f369b73316fce6310330c28a7a0d45246
Parents: b17f735
Author: Jörn Kottmann <jo...@apache.org>
Authored: Thu May 11 10:21:02 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu May 11 10:24:12 2017 +0200

----------------------------------------------------------------------
 opennlp-brat-annotator/pom.xml   | 2 +-
 opennlp-distr/pom.xml            | 2 +-
 opennlp-docs/pom.xml             | 2 +-
 opennlp-morfologik-addon/pom.xml | 2 +-
 opennlp-tools/pom.xml            | 2 +-
 opennlp-uima/pom.xml             | 2 +-
 pom.xml                          | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/60792b8f/opennlp-brat-annotator/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 0791e6b..6c7be0d 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -17,7 +17,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.1-SNAPSHOT</version>
+		<version>1.8.0-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60792b8f/opennlp-distr/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-distr/pom.xml b/opennlp-distr/pom.xml
index 4428240..3f838cd 100644
--- a/opennlp-distr/pom.xml
+++ b/opennlp-distr/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.1-SNAPSHOT</version>
+		<version>1.8.0-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60792b8f/opennlp-docs/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/pom.xml b/opennlp-docs/pom.xml
index 312f6b8..fbf0b5c 100644
--- a/opennlp-docs/pom.xml
+++ b/opennlp-docs/pom.xml
@@ -24,7 +24,7 @@
   <parent>
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.1-SNAPSHOT</version>
+	<version>1.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
   

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60792b8f/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index 8c5b9f4..c46f101 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.1-SNAPSHOT</version>
+		<version>1.8.0-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60792b8f/opennlp-tools/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index a2cf596..a499375 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -25,7 +25,7 @@
   <parent>
     <groupId>org.apache.opennlp</groupId>
     <artifactId>opennlp</artifactId>
-    <version>1.8.1-SNAPSHOT</version>
+    <version>1.8.0-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60792b8f/opennlp-uima/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-uima/pom.xml b/opennlp-uima/pom.xml
index d8f5246..7cfdb72 100644
--- a/opennlp-uima/pom.xml
+++ b/opennlp-uima/pom.xml
@@ -25,7 +25,7 @@
 	<parent>
 	    <groupId>org.apache.opennlp</groupId>
 	    <artifactId>opennlp</artifactId>
-	    <version>1.8.1-SNAPSHOT</version>
+	    <version>1.8.0-SNAPSHOT</version>
 	    <relativePath>../pom.xml</relativePath>
     </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/60792b8f/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 03811a8..2190a26 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
 
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.1-SNAPSHOT</version>
+	<version>1.8.0-SNAPSHOT</version>
 	<packaging>pom</packaging>
 
 	<name>Apache OpenNLP Reactor</name>


[50/50] opennlp git commit: OPENNLP-788: Add initial LanguageDetector interface and Language class

Posted by co...@apache.org.
OPENNLP-788: Add initial LanguageDetector interface and Language class


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/5a234de7
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/5a234de7
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/5a234de7

Branch: refs/heads/LangDetect
Commit: 5a234de70fb18752ab0e65cf01f25aa4895bc051
Parents: 911d59f
Author: William D C M SILVA <co...@apache.org>
Authored: Wed Feb 15 10:03:28 2017 -0200
Committer: William D C M SILVA <co...@apache.org>
Committed: Wed May 17 12:51:42 2017 -0300

----------------------------------------------------------------------
 .../java/opennlp/tools/langdetect/Language.java | 39 ++++++++++++++++++++
 .../tools/langdetect/LanguageDetector.java      | 33 +++++++++++++++++
 2 files changed, 72 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/5a234de7/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
new file mode 100644
index 0000000..773201f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+/**
+ * Class for holding the document language and its confidence
+ */
+public class Language {
+  private final String lang;
+  private final double confidence;
+
+  public Language(String lang, double confidence) {
+    this.lang = lang;
+    this.confidence = confidence;
+  }
+
+  public String getLang() {
+    return lang;
+  }
+
+  public double getConfidence() {
+    return confidence;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5a234de7/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
new file mode 100644
index 0000000..ca897fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Set;
+
+/**
+ * The interface for language detectors, which identify the language of a text
+ * and report a confidence for each detected language.
+ */
+public interface LanguageDetector {
+
+  Language[] detectLanguage(CharSequence content);
+
+  Set<String> getSupportedLanguages();
+
+  String getLanguageCoding();
+
+}


[16/50] opennlp git commit: OPENNLP-1041: Verify test data before running tests

Posted by co...@apache.org.
OPENNLP-1041: Verify test data before running tests


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/32afb6a8
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/32afb6a8
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/32afb6a8

Branch: refs/heads/LangDetect
Commit: 32afb6a8bff7fb5c630ecf0c610ca8f5bc2f62b3
Parents: 065a629
Author: Jörn Kottmann <jo...@apache.org>
Authored: Fri Apr 21 14:35:50 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon Apr 24 12:15:03 2017 +0200

----------------------------------------------------------------------
 .../src/test/java/opennlp/tools/eval/SourceForgeModelEval.java  | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/32afb6a8/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
index 7211cd3..c28fd7c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
@@ -25,6 +25,7 @@ import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 
 import org.junit.Assert;
+import org.junit.BeforeClass;
 import org.junit.Test;
 
 import opennlp.tools.chunker.Chunker;
@@ -89,8 +90,8 @@ public class SourceForgeModelEval {
     }
   }
 
-  @Test
-  public void ensureTestDataIsCorrect() throws IOException {
+  @BeforeClass
+  public static void ensureTestDataIsCorrect() throws IOException {
     MessageDigest digest = createDigest();
 
     try (ObjectStream<String> lines = new PlainTextByLineStream(


[04/50] opennlp git commit: OPENNLP-1028: Add tests for FeatureGenerators in doccat. This closes apache/opennlp#166

Posted by co...@apache.org.
OPENNLP-1028: Add tests for FeatureGenerators in doccat. This closes apache/opennlp#166


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/580e0d1e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/580e0d1e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/580e0d1e

Branch: refs/heads/LangDetect
Commit: 580e0d1e26ab2a9275f01506f3af56fe8fc32988
Parents: e220a72
Author: koji <ko...@apache.org>
Authored: Wed Apr 19 10:14:47 2017 +0900
Committer: koji <ko...@apache.org>
Committed: Wed Apr 19 10:14:47 2017 +0900

----------------------------------------------------------------------
 .../doccat/BagOfWordsFeatureGenerator.java      |   6 +-
 .../tools/doccat/NGramFeatureGenerator.java     |  11 +-
 .../doccat/BagOfWordsFeatureGeneratorTest.java  |  62 +++++++++
 .../tools/doccat/NGramFeatureGeneratorTest.java | 129 +++++++++++++++++++
 4 files changed, 201 insertions(+), 7 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
index ac39afc..51a3277 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/BagOfWordsFeatureGenerator.java
@@ -21,6 +21,7 @@ package opennlp.tools.doccat;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Map;
+import java.util.Objects;
 
 import opennlp.tools.util.featuregen.StringPattern;
 
@@ -29,9 +30,10 @@ import opennlp.tools.util.featuregen.StringPattern;
  */
 public class BagOfWordsFeatureGenerator implements FeatureGenerator {
 
-  private boolean useOnlyAllLetterTokens = false;
+  private final boolean useOnlyAllLetterTokens;
 
   public BagOfWordsFeatureGenerator() {
+    this(false);
   }
 
   BagOfWordsFeatureGenerator(boolean useOnlyAllLetterTokens) {
@@ -40,7 +42,7 @@ public class BagOfWordsFeatureGenerator implements FeatureGenerator {
 
   @Override
   public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInformation) {
-
+    Objects.requireNonNull(text, "text must not be null");
     Collection<String> bagOfWords = new ArrayList<>(text.length);
 
     for (String word : text) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
index 967b105..6e1786f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/NGramFeatureGenerator.java
@@ -21,6 +21,7 @@ import java.util.ArrayList;
 import java.util.Collection;
 import java.util.List;
 import java.util.Map;
+import java.util.Objects;
 
 import opennlp.tools.util.InvalidFormatException;
 
@@ -30,9 +31,8 @@ import opennlp.tools.util.InvalidFormatException;
  */
 public class NGramFeatureGenerator implements FeatureGenerator {
 
-  //default values for bigrams
-  private int minGram = 2;
-  private int maxGram = 2;
+  private final int minGram;
+  private final int maxGram;
 
   /**
    * Constructor for ngrams.
@@ -59,7 +59,8 @@ public class NGramFeatureGenerator implements FeatureGenerator {
   /**
    * Default constructor for Bi grams
    */
-  public NGramFeatureGenerator() {
+  public NGramFeatureGenerator() throws InvalidFormatException {
+    this(2, 2);
   }
 
   /**
@@ -70,7 +71,7 @@ public class NGramFeatureGenerator implements FeatureGenerator {
    * @return a collection of n gram features
    */
   public Collection<String> extractFeatures(String[] text, Map<String, Object> extraInfo) {
-
+    Objects.requireNonNull(text, "text must not be null");
     List<String> features = new ArrayList<>();
 
     for (int i = 0; i <= text.length - minGram; i++) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java
new file mode 100644
index 0000000..2b128d9
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/BagOfWordsFeatureGeneratorTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.doccat;
+
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+public class BagOfWordsFeatureGeneratorTest {
+
+  @Test
+  public void testNull() {
+    BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
+    try {
+      generator.extractFeatures(null, Collections.emptyMap());
+      Assert.fail("NullPointerException must be thrown");
+    }
+    catch (NullPointerException expected) {
+    }
+  }
+
+  @Test
+  public void testEmpty() {
+    BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
+
+    Assert.assertEquals(0, generator.extractFeatures(new String[]{}, Collections.emptyMap()).size());
+  }
+
+  @Test
+  public void testUseAllTokens() {
+    BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator();
+
+    Assert.assertArrayEquals(new String[]{"bow=it", "bow=is", "bow=12.345", "bow=feet", "bow=long"},
+        generator.extractFeatures(new String[]{"it", "is", "12.345", "feet", "long"},
+            Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void testOnlyLetterTokens() {
+    BagOfWordsFeatureGenerator generator = new BagOfWordsFeatureGenerator(true);
+
+    Assert.assertArrayEquals(new String[]{"bow=it", "bow=is", "bow=feet", "bow=long"},
+            generator.extractFeatures(new String[]{"it", "is", "12.345", "feet", "long"},
+                    Collections.emptyMap()).toArray());
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/580e0d1e/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java
new file mode 100644
index 0000000..0aef3ea
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/doccat/NGramFeatureGeneratorTest.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.doccat;
+
+import java.util.Collections;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.InvalidFormatException;
+
+public class NGramFeatureGeneratorTest {
+
+  static final String[] TOKENS = new String[]{"a", "b", "c", "d", "e", "f", "g"};
+
+  @Test
+  public void testNull() throws Exception {
+    NGramFeatureGenerator generator = new NGramFeatureGenerator();
+    try {
+      generator.extractFeatures(null, Collections.emptyMap());
+      Assert.fail("NullPointerException must be thrown");
+    }
+    catch (NullPointerException expected) {
+    }
+  }
+
+  @Test
+  public void testEmpty() throws Exception {
+    NGramFeatureGenerator generator = new NGramFeatureGenerator();
+
+    Assert.assertEquals(0, generator.extractFeatures(new String[]{}, Collections.emptyMap()).size());
+  }
+
+  @Test
+  public void testInvalidGramSize1() {
+    try {
+      new NGramFeatureGenerator(0, 1);
+      Assert.fail("InvalidFormatException must be thrown");
+    }
+    catch (InvalidFormatException expected) {
+    }
+  }
+
+  @Test
+  public void testInvalidGramSize2() {
+    try {
+      new NGramFeatureGenerator(2, 1);
+      Assert.fail("InvalidFormatException must be thrown");
+    }
+    catch (InvalidFormatException expected) {
+    }
+  }
+
+  @Test
+  public void testUnigram() throws Exception {
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 1);
+
+    Assert.assertArrayEquals(
+            new String[]{"ng=:a", "ng=:b", "ng=:c", "ng=:d", "ng=:e", "ng=:f", "ng=:g"},
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void testBigram() throws Exception {
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(2, 2);
+
+    Assert.assertArrayEquals(
+            new String[]{"ng=:a:b", "ng=:b:c", "ng=:c:d", "ng=:d:e", "ng=:e:f", "ng=:f:g"},
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void testTrigram() throws Exception {
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(3, 3);
+
+    Assert.assertArrayEquals(
+            new String[]{"ng=:a:b:c", "ng=:b:c:d", "ng=:c:d:e", "ng=:d:e:f", "ng=:e:f:g"},
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void test12gram() throws Exception {
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 2);
+
+    Assert.assertArrayEquals(
+            new String[]{
+                "ng=:a", "ng=:a:b",
+                "ng=:b", "ng=:b:c",
+                "ng=:c", "ng=:c:d",
+                "ng=:d", "ng=:d:e",
+                "ng=:e", "ng=:e:f",
+                "ng=:f", "ng=:f:g",
+                "ng=:g"
+            },
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+
+  @Test
+  public void test13gram() throws Exception {
+    NGramFeatureGenerator generator = new NGramFeatureGenerator(1, 3);
+
+    Assert.assertArrayEquals(
+            new String[]{
+                "ng=:a", "ng=:a:b", "ng=:a:b:c",
+                "ng=:b", "ng=:b:c", "ng=:b:c:d",
+                "ng=:c", "ng=:c:d", "ng=:c:d:e",
+                "ng=:d", "ng=:d:e", "ng=:d:e:f",
+                "ng=:e", "ng=:e:f", "ng=:e:f:g",
+                "ng=:f", "ng=:f:g",
+                "ng=:g"
+            },
+        generator.extractFeatures(TOKENS, Collections.emptyMap()).toArray());
+  }
+}


[15/50] opennlp git commit: OPENNLP-1043: Ensure files are always loaded in same order

Posted by co...@apache.org.
OPENNLP-1043: Ensure files are always loaded in same order

Closes #181


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/065a6297
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/065a6297
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/065a6297

Branch: refs/heads/LangDetect
Commit: 065a62975b1f6f9da50b2efe0c02346d30332968
Parents: 735b1b5
Author: Jörn Kottmann <jo...@apache.org>
Authored: Fri Apr 21 16:14:28 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon Apr 24 11:54:02 2017 +0200

----------------------------------------------------------------------
 .../main/java/opennlp/tools/formats/DirectorySampleStream.java  | 5 +++--
 .../test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java  | 2 +-
 .../test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java   | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/065a6297/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
index da73507..8d63acc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/DirectorySampleStream.java
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.FileFilter;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.Collections;
 import java.util.List;
 import java.util.Stack;
@@ -95,6 +96,8 @@ public class DirectorySampleStream implements ObjectStream<File> {
         files = dir.listFiles();
       }
 
+      Arrays.sort(files);
+
       for (File file : files) {
         if (file.isFile()) {
           textFiles.push(file);
@@ -128,7 +131,5 @@ public class DirectorySampleStream implements ObjectStream<File> {
    */
   @Override
   public void close() throws IOException {
-
   }
-  
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/065a6297/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index d654014..e0e3912 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -68,7 +68,7 @@ public class OntoNotes4NameFinderEval {
   @Test
   public void evalEnglishPersonNameFinder() throws IOException {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
-    crossEval(params, "person", 0.8299903903167106d);
+    crossEval(params, "person", 0.8286204642039883d);
   }
 
   @Test

http://git-wip-us.apache.org/repos/asf/opennlp/blob/065a6297/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
index 31b42d1..ab33568 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
@@ -62,6 +62,6 @@ public class OntoNotes4PosTaggerEval {
 
   @Test
   public void evalEnglishMaxentTagger() throws IOException {
-    crossEval(ModelUtil.createDefaultTrainingParameters(), 0.9699561275750962d);
+    crossEval(ModelUtil.createDefaultTrainingParameters(), 0.9698145168879707d);
   }
 }


[30/50] opennlp git commit: OPENNLP-1021: Change xv folds from 10 to 5 to reduce runtime

Posted by co...@apache.org.
OPENNLP-1021: Change xv folds from 10 to 5 to reduce runtime

Closes #184


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/62d9fd29
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/62d9fd29
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/62d9fd29

Branch: refs/heads/LangDetect
Commit: 62d9fd2988f64afd6790626b11cb769dddc95994
Parents: ca9a1d9
Author: Jörn Kottmann <jo...@apache.org>
Authored: Mon Apr 24 16:08:13 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon May 8 15:10:04 2017 +0200

----------------------------------------------------------------------
 .../opennlp/tools/eval/OntoNotes4NameFinderEval.java     | 11 +++++++----
 .../java/opennlp/tools/eval/OntoNotes4ParserEval.java    |  6 +++---
 .../java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java |  7 +++++--
 3 files changed, 15 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/62d9fd29/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index ef018cd..ac9509c 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -71,7 +71,7 @@ public class OntoNotes4NameFinderEval {
         filteredSamples = samples;
       }
 
-      cv.evaluate(filteredSamples, 10);
+      cv.evaluate(filteredSamples, 5);
 
       Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
     }
@@ -100,18 +100,21 @@ public class OntoNotes4NameFinderEval {
   @Test
   public void evalEnglishPersonNameFinder() throws IOException {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
-    crossEval(params, "person", 0.8286204642039883d);
+    params.put("Threads", "4");
+    crossEval(params, "person", 0.822014580552418d);
   }
 
   @Test
   public void evalEnglishDateNameFinder() throws IOException {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
-    crossEval(params, "date", 0.8065329969459567);
+    params.put("Threads", "4");
+    crossEval(params, "date", 0.8043873255040994d);
   }
 
   @Test
   public void evalAllTypesNameFinder() throws IOException {
     TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
-    crossEval(params, null, 0.8061722553169423d);
+    params.put("Threads", "4");
+    crossEval(params, null, 0.8014054850253551d);
   }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/62d9fd29/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
index 3a5b30d..f7e1046 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4ParserEval.java
@@ -65,9 +65,9 @@ public class OntoNotes4ParserEval {
       throws IOException {
     try (ObjectStream<Parse> samples = createParseSampleStream()) {
       ParserCrossValidator cv = new ParserCrossValidator("en", params, rules, ParserType.CHUNKING);
-      cv.evaluate(samples, 10);
+      cv.evaluate(samples, 5);
 
-      Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.001d);
+      Assert.assertEquals(expectedScore, cv.getFMeasure().getFMeasure(), 0.0001d);
     }
   }
 
@@ -108,6 +108,6 @@ public class OntoNotes4ParserEval {
     params.put("check.Threads", 4);
 
 
-    crossEval(params, headRules, 0.937987617163142d);
+    crossEval(params, headRules, 0.9373673649973432d);
   }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/62d9fd29/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
index b171978..6236507 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4PosTaggerEval.java
@@ -62,7 +62,7 @@ public class OntoNotes4PosTaggerEval {
       throws IOException {
     try (ObjectStream<POSSample> samples = createPOSSampleStream()) {
       POSTaggerCrossValidator cv = new POSTaggerCrossValidator("en", params, new POSTaggerFactory());
-      cv.evaluate(samples, 10);
+      cv.evaluate(samples, 5);
 
       Assert.assertEquals(expectedScore, cv.getWordAccuracy(), 0.0001d);
     }
@@ -89,6 +89,9 @@ public class OntoNotes4PosTaggerEval {
   }
   @Test
   public void evalEnglishMaxentTagger() throws IOException {
-    crossEval(ModelUtil.createDefaultTrainingParameters(), 0.9698145168879707d);
+    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+    params.put("Threads", "4");
+
+    crossEval(params, 0.969345319453096d);
   }
 }


[31/50] opennlp git commit: OPENNLP-958: Add POS Name Finder feature generator

Posted by co...@apache.org.
OPENNLP-958: Add POS Name Finder feature generator

closes apache/opennlp#170


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3ab6698b
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3ab6698b
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3ab6698b

Branch: refs/heads/LangDetect
Commit: 3ab6698b6ae590ae1ac5758d988837d89ea74b3e
Parents: 62d9fd2
Author: William D C M SILVA <co...@apache.org>
Authored: Mon May 8 19:48:09 2017 -0300
Committer: William D C M SILVA <co...@apache.org>
Committed: Mon May 8 19:48:09 2017 -0300

----------------------------------------------------------------------
 opennlp-docs/src/docbkx/namefinder.xml          |  5 ++
 .../java/opennlp/tools/parser/ParserModel.java  | 52 +--------------
 .../tools/util/featuregen/GeneratorFactory.java | 37 +++++++++++
 .../POSTaggerNameFeatureGenerator.java          | 68 ++++++++++++++++++++
 .../util/model/ChunkerModelSerializer.java      | 49 ++++++++++++++
 .../tools/util/model/POSModelSerializer.java    | 51 +++++++++++++++
 .../tools/eval/OntoNotes4NameFinderEval.java    | 63 ++++++++++++++++++
 .../opennlp/tools/postag/POSTaggerMETest.java   |  2 +-
 .../POSTaggerNameFeatureGeneratorTest.java      | 45 +++++++++++++
 .../opennlp/tools/eval/ner-en_pos-features.xml  | 37 +++++++++++
 10 files changed, 358 insertions(+), 51 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-docs/src/docbkx/namefinder.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/namefinder.xml b/opennlp-docs/src/docbkx/namefinder.xml
index 2f68c47..abcd094 100644
--- a/opennlp-docs/src/docbkx/namefinder.xml
+++ b/opennlp-docs/src/docbkx/namefinder.xml
@@ -439,6 +439,11 @@ new NameFinderME(model);]]>
 					<entry>no</entry>
 					<entry>none</entry>
 			      </row>
+						<row>
+							<entry>tokenpos</entry>
+							<entry>no</entry>
+							<entry><emphasis>model</emphasis> is the file name of the POS Tagger model to use</entry>
+						</row>
 			      <row>
 				<entry>wordcluster</entry>
 				<entry>no</entry>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
index 61ac401..c290d9f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
@@ -30,15 +30,14 @@ import java.util.Map;
 import java.util.Objects;
 
 import opennlp.tools.chunker.ChunkerModel;
-import opennlp.tools.ml.BeamSearch;
 import opennlp.tools.ml.model.AbstractModel;
 import opennlp.tools.ml.model.MaxentModel;
 import opennlp.tools.postag.POSModel;
 import opennlp.tools.util.InvalidFormatException;
-import opennlp.tools.util.Version;
 import opennlp.tools.util.model.ArtifactSerializer;
 import opennlp.tools.util.model.BaseModel;
-import opennlp.tools.util.model.UncloseableInputStream;
+import opennlp.tools.util.model.ChunkerModelSerializer;
+import opennlp.tools.util.model.POSModelSerializer;
 
 /**
  * This is an abstract base class for {@link ParserModel} implementations.
@@ -46,53 +45,6 @@ import opennlp.tools.util.model.UncloseableInputStream;
 // TODO: Model should validate the artifact map
 public class ParserModel extends BaseModel {
 
-  private static class POSModelSerializer implements ArtifactSerializer<POSModel> {
-
-    public POSModel create(InputStream in) throws IOException {
-      POSModel posModel = new POSModel(new UncloseableInputStream(in));
-
-      // The 1.6.x models write the non-default beam size into the model itself.
-      // In 1.5.x the parser configured the beam size when the model was loaded,
-      // this is not possible anymore with the new APIs
-      Version version = posModel.getVersion();
-      if (version.getMajor() == 1 && version.getMinor() == 5) {
-        if (posModel.getManifestProperty(BeamSearch.BEAM_SIZE_PARAMETER) == null) {
-          posModel = new POSModel(posModel.getLanguage(), posModel.getPosModel(), 10,
-              null, posModel.getFactory());
-        }
-      }
-
-      return posModel;
-    }
-
-    public void serialize(POSModel artifact, OutputStream out)
-        throws IOException {
-      artifact.serialize(out);
-    }
-  }
-
-  private static class ChunkerModelSerializer implements ArtifactSerializer<ChunkerModel> {
-
-    public ChunkerModel create(InputStream in) throws IOException {
-
-      ChunkerModel model = new ChunkerModel(new UncloseableInputStream(in));
-
-      Version version = model.getVersion();
-      if (version.getMajor() == 1 && version.getMinor() == 5) {
-
-        model = new ChunkerModel(model.getLanguage(), model.getChunkerModel(), new ParserChunkerFactory());
-
-      }
-
-      return model;
-    }
-
-    public void serialize(ChunkerModel artifact, OutputStream out)
-        throws IOException {
-      artifact.serialize(out);
-    }
-  }
-
   private static class HeadRulesSerializer implements
       ArtifactSerializer<opennlp.tools.parser.lang.en.HeadRules> {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
index 5060961..11cad42 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/GeneratorFactory.java
@@ -43,9 +43,11 @@ import org.w3c.dom.NodeList;
 import org.xml.sax.SAXException;
 
 import opennlp.tools.dictionary.Dictionary;
+import opennlp.tools.postag.POSModel;
 import opennlp.tools.util.InvalidFormatException;
 import opennlp.tools.util.ext.ExtensionLoader;
 import opennlp.tools.util.model.ArtifactSerializer;
+import opennlp.tools.util.model.POSModelSerializer;
 
 /**
  * Creates a set of feature generators based on a provided XML descriptor.
@@ -607,6 +609,30 @@ public class GeneratorFactory {
     }
   }
 
+
+
+  /**
+   * @see TokenPatternFeatureGenerator
+   */
+  static class POSTaggerNameFeatureGeneratorFactory implements XmlFeatureGeneratorFactory {
+
+    public AdaptiveFeatureGenerator create(Element generatorElement,
+                                           FeatureGeneratorResourceProvider resourceManager)
+        throws  InvalidFormatException {
+
+      String modelResourceKey = generatorElement.getAttribute("model");
+
+      POSModel model = (POSModel)resourceManager.getResource(modelResourceKey);
+
+      return new POSTaggerNameFeatureGenerator(model);
+
+    }
+
+    static void register(Map<String, XmlFeatureGeneratorFactory> factoryMap) {
+      factoryMap.put("tokenpos", new POSTaggerNameFeatureGeneratorFactory());
+    }
+  }
+
   // TODO: We have to support custom resources here. How does it work ?!
   // Attributes get into a Map<String, String> properties
 
@@ -678,6 +704,7 @@ public class GeneratorFactory {
     BrownClusterTokenClassFeatureGeneratorFactory.register(factories);
     BrownClusterBigramFeatureGeneratorFactory.register(factories);
     CustomFeatureGeneratorFactory.register(factories);
+    POSTaggerNameFeatureGeneratorFactory.register(factories);
   }
 
   /**
@@ -820,6 +847,16 @@ public class GeneratorFactory {
               break;
           }
         }
+
+        String modelName = xmlElement.getAttribute("model");
+        if (modelName != null) {
+
+          switch (xmlElement.getTagName()) {
+            case "tokenpos":
+              mapping.put(modelName, new POSModelSerializer());
+              break;
+          }
+        }
       }
     }
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGenerator.java
new file mode 100644
index 0000000..39c6335
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGenerator.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.featuregen;
+
+import java.util.List;
+import java.util.Objects;
+
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.postag.POSTagger;
+import opennlp.tools.postag.POSTaggerME;
+
+/**
+ * Adds the token POS Tag as feature. Requires a POS Tag model.
+ */
+public class POSTaggerNameFeatureGenerator implements AdaptiveFeatureGenerator {
+
+  private POSTagger posTagger;
+
+  private String[] cachedTokens;
+  private String[] cachedTags;
+
+  /**
+   * Initializes a new instance.
+   *
+   * @param aPosTagger a POSTagger implementation.
+   */
+  public POSTaggerNameFeatureGenerator(POSTagger aPosTagger) {
+    this.posTagger = aPosTagger;
+  }
+
+  /**
+   * Initializes a new instance.
+   *
+   * @param aPosModel a POSTagger model.
+   */
+  public POSTaggerNameFeatureGenerator(POSModel aPosModel) {
+
+    this.posTagger = new POSTaggerME(aPosModel);
+  }
+
+
+  public void createFeatures(List<String> feats, String[] toks, int index, String[] preds) {
+    if (!Objects.equals(this.cachedTokens, toks)) {
+      this.cachedTokens = toks;
+      this.cachedTags = this.posTagger.tag(toks);
+    }
+
+    feats.add("pos=" + this.cachedTags[index]);
+  }
+
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/util/model/ChunkerModelSerializer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/ChunkerModelSerializer.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/ChunkerModelSerializer.java
new file mode 100644
index 0000000..c32cc69
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/ChunkerModelSerializer.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.model;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import opennlp.tools.chunker.ChunkerModel;
+import opennlp.tools.parser.ParserChunkerFactory;
+import opennlp.tools.util.Version;
+
+
+public class ChunkerModelSerializer implements ArtifactSerializer<ChunkerModel> {
+
+  public ChunkerModel create(InputStream in) throws IOException {
+
+    ChunkerModel model = new ChunkerModel(new UncloseableInputStream(in));
+
+    Version version = model.getVersion();
+    if (version.getMajor() == 1 && version.getMinor() == 5) {
+
+      model = new ChunkerModel(model.getLanguage(), model.getChunkerModel(), new ParserChunkerFactory());
+
+    }
+
+    return model;
+  }
+
+  public void serialize(ChunkerModel artifact, OutputStream out)
+      throws IOException {
+    artifact.serialize(out);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java
new file mode 100644
index 0000000..a82319c
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/POSModelSerializer.java
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.model;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+
+import opennlp.tools.ml.BeamSearch;
+import opennlp.tools.postag.POSModel;
+import opennlp.tools.util.Version;
+
+public class POSModelSerializer implements ArtifactSerializer<POSModel> {
+
+  public POSModel create(InputStream in) throws IOException {
+    POSModel posModel = new POSModel(new UncloseableInputStream(in));
+
+    // The 1.6.x models write the non-default beam size into the model itself.
+    // In 1.5.x the parser configured the beam size when the model was loaded,
+    // this is not possible anymore with the new APIs
+    Version version = posModel.getVersion();
+    if (version.getMajor() == 1 && version.getMinor() == 5) {
+      if (posModel.getManifestProperty(BeamSearch.BEAM_SIZE_PARAMETER) == null) {
+        posModel = new POSModel(posModel.getLanguage(), posModel.getPosModel(), 10,
+            null, posModel.getFactory());
+      }
+    }
+
+    return posModel;
+  }
+
+  public void serialize(POSModel artifact, OutputStream out)
+      throws IOException {
+    artifact.serialize(out);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
index ac9509c..a696787 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/OntoNotes4NameFinderEval.java
@@ -17,17 +17,27 @@
 
 package opennlp.tools.eval;
 
+import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.IOException;
+import java.io.InputStream;
 import java.math.BigInteger;
+import java.net.URISyntaxException;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+import java.nio.file.StandardCopyOption;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
+import java.util.Map;
 
 import org.junit.Assert;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.namefind.TokenNameFinderTrainerTool;
 import opennlp.tools.formats.DirectorySampleStream;
 import opennlp.tools.formats.convert.FileToStringSampleStream;
 import opennlp.tools.formats.ontonotes.OntoNotesNameSampleStream;
@@ -117,4 +127,57 @@ public class OntoNotes4NameFinderEval {
     params.put("Threads", "4");
     crossEval(params, null, 0.8014054850253551d);
   }
+
+  @Test
+  public void evalAllTypesWithPOSNameFinder() throws IOException {
+    TrainingParameters params = ModelUtil.createDefaultTrainingParameters();
+    params.put("Threads", "4");
+
+    // load the feature generator
+    ByteArrayOutputStream bytes = new ByteArrayOutputStream();
+    try (InputStream in = this.getClass().getResourceAsStream(
+        "ner-en_pos-features.xml")) {
+      byte[] buf = new byte[1024];
+      int len;
+      while ((len = in.read(buf)) > 0) {
+        bytes.write(buf, 0, len);
+      }
+    }
+    catch (IOException e) {
+      throw new IllegalStateException("Failed reading from ner-default-features.xml file on classpath!");
+    }
+
+    byte[] featureGen = bytes.toByteArray();
+
+    // create a temp resource folder and copy the pos model there
+    Path resourcesPath = Files.createTempDirectory("opennlp_resources");
+    Files.copy(new File(EvalUtil.getOpennlpDataDir(), "models-sf/en-pos-perceptron.bin").toPath(),
+        new File(resourcesPath.toFile(), "en-pos-perceptron.bin").toPath(),
+        StandardCopyOption.REPLACE_EXISTING);
+
+    Map<String, Object> resources;
+
+    try {
+      resources = TokenNameFinderTrainerTool.loadResources(resourcesPath.toFile(),
+          Paths.get(this.getClass().getResource("ner-en_pos-features.xml").toURI()).toFile());
+    }
+    catch (IOException | URISyntaxException e) {
+      throw new TerminateToolException(-1,"IO error while loading resources", e);
+    }
+
+
+    try (ObjectStream<NameSample> samples = createNameSampleStream()) {
+
+      TokenNameFinderCrossValidator cv = new TokenNameFinderCrossValidator("en", null,
+          params, featureGen, resources);
+
+      ObjectStream<NameSample> filteredSamples;
+
+      filteredSamples = samples;
+
+      cv.evaluate(filteredSamples, 5);
+
+      Assert.assertEquals(0.8044097625338349d, cv.getFMeasure().getFMeasure(), 0.001d);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
index 6d0785b..838150e 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/postag/POSTaggerMETest.java
@@ -48,7 +48,7 @@ public class POSTaggerMETest {
    *
    * @return {@link POSModel}
    */
-  static POSModel trainPOSModel(ModelType type) throws IOException {
+  public static POSModel trainPOSModel(ModelType type) throws IOException {
     TrainingParameters params = new TrainingParameters();
     params.put(TrainingParameters.ALGORITHM_PARAM, type.toString());
     params.put(TrainingParameters.ITERATIONS_PARAM, 100);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGeneratorTest.java
new file mode 100644
index 0000000..0514c26
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/featuregen/POSTaggerNameFeatureGeneratorTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.featuregen;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.postag.POSTaggerMETest;
+import opennlp.tools.util.model.ModelType;
+
+public class POSTaggerNameFeatureGeneratorTest {
+
+
+  @Test
+  public void testFeatureGeneration() throws IOException {
+    POSTaggerNameFeatureGenerator fg = new POSTaggerNameFeatureGenerator(
+        POSTaggerMETest.trainPOSModel(ModelType.MAXENT));
+
+    String[] tokens = {"Hi", "Mike", ",", "it", "'s", "Stefanie", "Schmidt", "."};
+    for (int i = 0; i < tokens.length; i++) {
+      List<String> feats = new ArrayList<>();
+      fg.createFeatures(feats, tokens, i, null);
+      Assert.assertTrue(feats.get(0).startsWith("pos="));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/3ab6698b/opennlp-tools/src/test/resources/opennlp/tools/eval/ner-en_pos-features.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/eval/ner-en_pos-features.xml b/opennlp-tools/src/test/resources/opennlp/tools/eval/ner-en_pos-features.xml
new file mode 100644
index 0000000..b850904
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/eval/ner-en_pos-features.xml
@@ -0,0 +1,37 @@
+<!--
+	Licensed to the Apache Software Foundation (ASF) under one
+	or more contributor license agreements.  See the NOTICE file
+	distributed with this work for additional information
+	regarding copyright ownership.  The ASF licenses this file
+	to you under the Apache License, Version 2.0 (the
+	"License"); you may not use this file except in compliance
+	with the License.  You may obtain a copy of the License at
+
+	http://www.apache.org/licenses/LICENSE-2.0
+
+	Unless required by applicable law or agreed to in writing,
+	software distributed under the License is distributed on an
+	"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+	KIND, either express or implied.  See the License for the
+	specific language governing permissions and limitations
+	under the License.
+-->
+
+<!-- Default name finder feature generator configuration -->
+<generators>
+  <cache>
+    <generators>
+      <window prevLength = "2" nextLength = "2">
+        <tokenclass/>
+      </window>
+      <window prevLength = "2" nextLength = "2">
+        <token/>
+      </window>
+      <definition/>
+      <prevmap/>
+      <bigram/>
+      <sentence begin="true" end="false"/>
+      <tokenpos model="en-pos-perceptron.bin"/>
+    </generators>
+  </cache>
+</generators>
\ No newline at end of file


[42/50] opennlp git commit: OPENNLP-1057: Add all Eval tests to the Eval profile

Posted by co...@apache.org.
OPENNLP-1057: Add all Eval tests to the Eval profile

closes #200


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d7b3b96b
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d7b3b96b
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d7b3b96b

Branch: refs/heads/LangDetect
Commit: d7b3b96b26623516514b21125c07d27031e5d291
Parents: 068b1f3
Author: thygesen <th...@apache.org>
Authored: Mon May 15 20:07:14 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon May 15 22:19:12 2017 +0200

----------------------------------------------------------------------
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/d7b3b96b/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 2190a26..695b95c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -434,7 +434,7 @@
 						<configuration>
 							<includes>
 								<include>**/*Test.java</include>
-								<include>**/SourceForgeModelEval.java</include>
+								<include>**/*Eval.java</include>
 							</includes>
 						</configuration>
 					</plugin>


[48/50] opennlp git commit: OPENNLP-1064: Disable evalDutchMaxentQn test

Posted by co...@apache.org.
OPENNLP-1064: Disable evalDutchMaxentQn test


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/08b2c429
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/08b2c429
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/08b2c429

Branch: refs/heads/LangDetect
Commit: 08b2c429e8cd65205a9776a7b9d033bb60c420c6
Parents: 1713b44
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed May 17 10:11:57 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 17 16:52:36 2017 +0200

----------------------------------------------------------------------
 .../src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java      | 2 ++
 1 file changed, 2 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/08b2c429/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
index af53878..8ac7b3a 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/ConllXPosTaggerEval.java
@@ -22,6 +22,7 @@ import java.io.IOException;
 import java.nio.charset.StandardCharsets;
 
 import org.junit.Assert;
+import org.junit.Ignore;
 import org.junit.Test;
 
 import opennlp.tools.formats.ConllXPOSSampleStream;
@@ -110,6 +111,7 @@ public class ConllXPosTaggerEval {
   }
 
   @Test
+  @Ignore
   public void evalDutchMaxentQn() throws IOException {
     TrainingParameters params = EvalUtil.createMaxentQnParams();
 


[32/50] opennlp git commit: OPENNLP-1052: Update README and CLI docbook before release

Posted by co...@apache.org.
OPENNLP-1052: Update README and CLI docbook before release

closes apache/opennlp#195


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/db9c511e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/db9c511e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/db9c511e

Branch: refs/heads/LangDetect
Commit: db9c511e8d5c3665eb2bb31cf0b11c0302252d45
Parents: 3ab6698
Author: William D C M SILVA <co...@apache.org>
Authored: Tue May 9 13:09:46 2017 -0300
Committer: William D C M SILVA <co...@apache.org>
Committed: Tue May 9 13:09:46 2017 -0300

----------------------------------------------------------------------
 opennlp-distr/README            |  29 +-
 opennlp-docs/src/docbkx/cli.xml | 582 +++++++++++++++++++++--------------
 2 files changed, 364 insertions(+), 247 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/db9c511e/opennlp-distr/README
----------------------------------------------------------------------
diff --git a/opennlp-distr/README b/opennlp-distr/README
index 12dc8ec..975c651 100644
--- a/opennlp-distr/README
+++ b/opennlp-distr/README
@@ -19,22 +19,25 @@ What is new in Apache OpenNLP ${pom.version}
 ---------------------------------------
 
 This release introduces many new features, improvements and bug fixes. The API
-has been improved for a better consistency and 1.4 deprecated methods were
-removed. Now Java 1.8 is required.
+has been improved for a better consistency and many deprecated methods were
+removed. Java 1.8 is required.
 
 Additionally the release contains the following noteworthy changes:
 
-- Name Finder evaluation can now show a confusion matrix
-- The default evaluation output contains more details
-- Added a Language Model CLI tool
-- Add Moses format support
-- More refactoring and cleanup, specially in Machine Learning package and Dictionary
-- Removed deprecated trainers from UIMA integration
-- Fixed potential localization issues and added maven plugin to prevent it (ForbiddenAPI)
-- Fixed issues with the BRAT corpus reader
-- Deprecated GIS class, will be removed in a future 1.8.x release
+- POS Tagger context generator now supports feature generation XML
+- Add a Name Finder feature generator that adds POS Tag features
+- Add CONLL-U format support
+- Improve default Name Finder settings
+- TokenNameFinderEvaluator CLI now support nameTypes argument
+- Stupid backoff is now the default in NGramLanguageModel
+- Language codes now are ISO 639-3 compliant
+- Add many unit tests
+- Distribution package now includes example parameters file
+- Now prefix and suffix feature generators are configurable
+- Remove API in Document Categorizer for user specified tokenizer
+- Learnable lemmatizer now returns all possible lemmas for a given word and pos tag
+- Add stemmer, detokenizer and sentence detection abbreviations for Irish
+- Chunker SequenceValidator signature changed to allow access to both token and POS tag
 
 A detailed list of the issues related to this release can be found in the release
 notes.
-
-

http://git-wip-us.apache.org/repos/asf/opennlp/blob/db9c511e/opennlp-docs/src/docbkx/cli.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/cli.xml b/opennlp-docs/src/docbkx/cli.xml
index 3dc66b7..1a8c326 100644
--- a/opennlp-docs/src/docbkx/cli.xml
+++ b/opennlp-docs/src/docbkx/cli.xml
@@ -42,7 +42,7 @@ under the License.
 
 <title>Doccat</title>
 
-<para>Learnable document categorizer</para>
+<para>Learned document categorizer</para>
 
 <screen>
 <![CDATA[
@@ -60,15 +60,15 @@ Usage: opennlp Doccat model < documents
 
 <screen>
 <![CDATA[
-Usage: opennlp DoccatTrainer[.leipzig] [-factory factoryName] [-tokenizer tokenizer] [-featureGenerators fg] 
+Usage: opennlp DoccatTrainer[.leipzig] [-factory factoryName] [-featureGenerators fg] [-tokenizer tokenizer] 
         [-params paramsFile] -lang language -model modelFile -data sampleData [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
 		A sub-class of DoccatFactory where to get implementation and resources.
-	-tokenizer tokenizer
-		Tokenizer implementation. WhitespaceTokenizer is used if not specified.
 	-featureGenerators fg
 		Comma separated feature generator classes. Bag of words is used if not specified.
+	-tokenizer tokenizer
+		Tokenizer implementation. WhitespaceTokenizer is used if not specified.
 	-params paramsFile
 		training parameters file.
 	-lang language
@@ -113,13 +113,13 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp DoccatEvaluator[.leipzig] [-misclassified true|false] -model model [-reportOutputFile 
+Usage: opennlp DoccatEvaluator[.leipzig] -model model [-misclassified true|false] [-reportOutputFile 
         outputFile] -data sampleData [-encoding charsetName] 
 Arguments description:
-	-misclassified true|false
-		if true will print false negatives and false positives.
 	-model model
 		the model file to be evaluated.
+	-misclassified true|false
+		if true will print false negatives and false positives.
 	-reportOutputFile outputFile
 		the path of the fine-grained report file.
 	-data sampleData
@@ -160,20 +160,20 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp DoccatCrossValidator[.leipzig] [-folds num] [-misclassified true|false] [-factory factoryName] 
-        [-tokenizer tokenizer] [-featureGenerators fg] [-params paramsFile] -lang language [-reportOutputFile 
+Usage: opennlp DoccatCrossValidator[.leipzig] [-misclassified true|false] [-folds num] [-factory factoryName] 
+        [-featureGenerators fg] [-tokenizer tokenizer] [-params paramsFile] -lang language [-reportOutputFile 
         outputFile] -data sampleData [-encoding charsetName] 
 Arguments description:
-	-folds num
-		number of folds, default is 10.
 	-misclassified true|false
 		if true will print false negatives and false positives.
+	-folds num
+		number of folds, default is 10.
 	-factory factoryName
 		A sub-class of DoccatFactory where to get implementation and resources.
-	-tokenizer tokenizer
-		Tokenizer implementation. WhitespaceTokenizer is used if not specified.
 	-featureGenerators fg
 		Comma separated feature generator classes. Bag of words is used if not specified.
+	-tokenizer tokenizer
+		Tokenizer implementation. WhitespaceTokenizer is used if not specified.
 	-params paramsFile
 		training parameters file.
 	-lang language
@@ -351,18 +351,18 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>splitHyphenatedTokens</entry>
 <entry>split</entry>
 <entry>Yes</entry>
 <entry>If true all hyphenated tokens will be separated (default true)</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -463,13 +463,13 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp TokenizerMEEvaluator[.ad|.pos|.conllx|.namefinder|.parse] [-misclassified true|false] -model 
-        model -data sampleData [-encoding charsetName] 
+Usage: opennlp TokenizerMEEvaluator[.ad|.pos|.conllx|.namefinder|.parse] -model model [-misclassified 
+        true|false] -data sampleData [-encoding charsetName] 
 Arguments description:
-	-misclassified true|false
-		if true will print false negatives and false positives.
 	-model model
 		the model file to be evaluated.
+	-misclassified true|false
+		if true will print false negatives and false positives.
 	-data sampleData
 		data to be used, usually a file name.
 	-encoding charsetName
@@ -490,18 +490,18 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>splitHyphenatedTokens</entry>
 <entry>split</entry>
 <entry>Yes</entry>
 <entry>If true all hyphenated tokens will be separated (default true)</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -602,14 +602,14 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp TokenizerCrossValidator[.ad|.pos|.conllx|.namefinder|.parse] [-folds num] [-misclassified 
-        true|false] [-factory factoryName] [-abbDict path] [-alphaNumOpt isAlphaNumOpt] [-params paramsFile] 
+Usage: opennlp TokenizerCrossValidator[.ad|.pos|.conllx|.namefinder|.parse] [-misclassified true|false] 
+        [-folds num] [-factory factoryName] [-abbDict path] [-alphaNumOpt isAlphaNumOpt] [-params paramsFile] 
         -lang language -data sampleData [-encoding charsetName] 
 Arguments description:
-	-folds num
-		number of folds, default is 10.
 	-misclassified true|false
 		if true will print false negatives and false positives.
+	-folds num
+		number of folds, default is 10.
 	-factory factoryName
 		A sub-class of TokenizerFactory where to get implementation and resources.
 	-abbDict path
@@ -640,18 +640,18 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>splitHyphenatedTokens</entry>
 <entry>split</entry>
 <entry>Yes</entry>
 <entry>If true all hyphenated tokens will be separated (default true)</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -769,18 +769,18 @@ Usage: opennlp TokenizerConverter help|ad|pos|conllx|namefinder|parse [help|opti
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>splitHyphenatedTokens</entry>
 <entry>split</entry>
 <entry>Yes</entry>
 <entry>If true all hyphenated tokens will be separated (default true)</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -916,15 +916,15 @@ Usage: opennlp SentenceDetector model < sentences
 <screen>
 <![CDATA[
 Usage: opennlp SentenceDetectorTrainer[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] [-factory 
-        factoryName] [-eosChars string] [-abbDict path] [-params paramsFile] -lang language -model modelFile 
+        factoryName] [-abbDict path] [-eosChars string] [-params paramsFile] -lang language -model modelFile 
         -data sampleData [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
 		A sub-class of SentenceDetectorFactory where to get implementation and resources.
-	-eosChars string
-		EOS characters.
 	-abbDict path
 		abbreviation dictionary in XML format.
+	-eosChars string
+		EOS characters.
 	-params paramsFile
 		training parameters file.
 	-lang language
@@ -951,18 +951,18 @@ Arguments description:
 <entry>Encoding for reading and writing text.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>includeTitles</entry>
 <entry>includeTitles</entry>
 <entry>Yes</entry>
 <entry>If true will include sentences marked as headlines.</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -1089,13 +1089,13 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp SentenceDetectorEvaluator[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] [-misclassified 
-        true|false] -model model -data sampleData [-encoding charsetName] 
+Usage: opennlp SentenceDetectorEvaluator[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] -model model 
+        [-misclassified true|false] -data sampleData [-encoding charsetName] 
 Arguments description:
-	-misclassified true|false
-		if true will print false negatives and false positives.
 	-model model
 		the model file to be evaluated.
+	-misclassified true|false
+		if true will print false negatives and false positives.
 	-data sampleData
 		data to be used, usually a file name.
 	-encoding charsetName
@@ -1116,18 +1116,18 @@ Arguments description:
 <entry>Encoding for reading and writing text.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>includeTitles</entry>
 <entry>includeTitles</entry>
 <entry>Yes</entry>
 <entry>If true will include sentences marked as headlines.</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -1255,23 +1255,23 @@ Arguments description:
 <screen>
 <![CDATA[
 Usage: opennlp SentenceDetectorCrossValidator[.ad|.pos|.conllx|.namefinder|.parse|.moses|.letsmt] [-factory 
-        factoryName] [-eosChars string] [-abbDict path] [-params paramsFile] -lang language [-folds num] 
-        [-misclassified true|false] -data sampleData [-encoding charsetName] 
+        factoryName] [-abbDict path] [-eosChars string] [-params paramsFile] -lang language [-misclassified 
+        true|false] [-folds num] -data sampleData [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
 		A sub-class of SentenceDetectorFactory where to get implementation and resources.
-	-eosChars string
-		EOS characters.
 	-abbDict path
 		abbreviation dictionary in XML format.
+	-eosChars string
+		EOS characters.
 	-params paramsFile
 		training parameters file.
 	-lang language
 		language which is being processed.
-	-folds num
-		number of folds, default is 10.
 	-misclassified true|false
 		if true will print false negatives and false positives.
+	-folds num
+		number of folds, default is 10.
 	-data sampleData
 		data to be used, usually a file name.
 	-encoding charsetName
@@ -1292,18 +1292,18 @@ Arguments description:
 <entry>Encoding for reading and writing text.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>includeTitles</entry>
 <entry>includeTitles</entry>
 <entry>Yes</entry>
 <entry>If true will include sentences marked as headlines.</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -1447,18 +1447,18 @@ Usage: opennlp SentenceDetectorConverter help|ad|pos|conllx|namefinder|parse|mos
 <entry>Encoding for reading and writing text.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>includeTitles</entry>
 <entry>includeTitles</entry>
 <entry>Yes</entry>
 <entry>If true will include sentences marked as headlines.</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -1642,14 +1642,14 @@ Arguments description:
 <tbody>
 <row>
 <entry morerows='3' valign='middle'>evalita</entry>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1673,18 +1673,18 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>splitHyphenatedTokens</entry>
 <entry>split</entry>
 <entry>Yes</entry>
 <entry>If true all hyphenated tokens will be separated (default true)</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -1692,14 +1692,14 @@ Arguments description:
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll03</entry>
-<entry>lang</entry>
-<entry>en|de</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1736,14 +1736,14 @@ Arguments description:
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll02</entry>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>es|nl</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1836,17 +1836,17 @@ Arguments description:
 <screen>
 <![CDATA[
 Usage: opennlp TokenNameFinderEvaluator[.evalita|.ad|.conll03|.bionlp2004|.conll02|.muc6|.ontonotes|.brat] 
-        [-nameTypes types] [-misclassified true|false] -model model [-detailedF true|false] 
+        [-nameTypes types] -model model [-misclassified true|false] [-detailedF true|false] 
         [-reportOutputFile outputFile] -data sampleData [-encoding charsetName] 
 Arguments description:
 	-nameTypes types
 		name types to use for evaluation
-	-misclassified true|false
-		if true will print false negatives and false positives.
 	-model model
 		the model file to be evaluated.
+	-misclassified true|false
+		if true will print false negatives and false positives.
 	-detailedF true|false
-		if true will print detailed FMeasure results.
+		if true (default) will print detailed FMeasure results.
 	-reportOutputFile outputFile
 		the path of the fine-grained report file.
 	-data sampleData
@@ -1863,14 +1863,14 @@ Arguments description:
 <tbody>
 <row>
 <entry morerows='3' valign='middle'>evalita</entry>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1894,18 +1894,18 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>splitHyphenatedTokens</entry>
 <entry>split</entry>
 <entry>Yes</entry>
 <entry>If true all hyphenated tokens will be separated (default true)</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -1913,14 +1913,14 @@ Arguments description:
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll03</entry>
-<entry>lang</entry>
-<entry>en|de</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -1957,14 +1957,14 @@ Arguments description:
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll02</entry>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>es|nl</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2059,8 +2059,8 @@ Arguments description:
 Usage: opennlp 
         TokenNameFinderCrossValidator[.evalita|.ad|.conll03|.bionlp2004|.conll02|.muc6|.ontonotes|.brat] 
         [-factory factoryName] [-resources resourcesDir] [-type modelType] [-featuregen featuregenFile] 
-        [-nameTypes types] [-sequenceCodec codec] [-params paramsFile] -lang language [-folds num] 
-        [-misclassified true|false] [-detailedF true|false] [-reportOutputFile outputFile] -data sampleData 
+        [-nameTypes types] [-sequenceCodec codec] [-params paramsFile] -lang language [-misclassified 
+        true|false] [-folds num] [-detailedF true|false] [-reportOutputFile outputFile] -data sampleData 
         [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
@@ -2079,12 +2079,12 @@ Arguments description:
 		training parameters file.
 	-lang language
 		language which is being processed.
-	-folds num
-		number of folds, default is 10.
 	-misclassified true|false
 		if true will print false negatives and false positives.
+	-folds num
+		number of folds, default is 10.
 	-detailedF true|false
-		if true will print detailed FMeasure results.
+		if true (default) will print detailed FMeasure results.
 	-reportOutputFile outputFile
 		the path of the fine-grained report file.
 	-data sampleData
@@ -2101,14 +2101,14 @@ Arguments description:
 <tbody>
 <row>
 <entry morerows='3' valign='middle'>evalita</entry>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2132,18 +2132,18 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>splitHyphenatedTokens</entry>
 <entry>split</entry>
 <entry>Yes</entry>
 <entry>If true all hyphenated tokens will be separated (default true)</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -2151,14 +2151,14 @@ Arguments description:
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll03</entry>
-<entry>lang</entry>
-<entry>en|de</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2195,14 +2195,14 @@ Arguments description:
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll02</entry>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>es|nl</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2305,14 +2305,14 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
 <tbody>
 <row>
 <entry morerows='3' valign='middle'>evalita</entry>
-<entry>lang</entry>
-<entry>it</entry>
+<entry>types</entry>
+<entry>per,loc,org,gpe</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,gpe</entry>
+<entry>lang</entry>
+<entry>it</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2336,18 +2336,18 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>splitHyphenatedTokens</entry>
 <entry>split</entry>
 <entry>Yes</entry>
 <entry>If true all hyphenated tokens will be separated (default true)</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -2355,14 +2355,14 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll03</entry>
-<entry>lang</entry>
-<entry>en|de</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>eng|deu</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2399,14 +2399,14 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
 </row>
 <row>
 <entry morerows='3' valign='middle'>conll02</entry>
-<entry>lang</entry>
-<entry>es|nl</entry>
+<entry>types</entry>
+<entry>per,loc,org,misc</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
 <row>
-<entry>types</entry>
-<entry>per,loc,org,misc</entry>
+<entry>lang</entry>
+<entry>es|nl</entry>
 <entry>No</entry>
 <entry></entry>
 </row>
@@ -2498,13 +2498,13 @@ Usage: opennlp TokenNameFinderConverter help|evalita|ad|conll03|bionlp2004|conll
 
 <screen>
 <![CDATA[
-Usage: opennlp CensusDictionaryCreator [-encoding charsetName] [-lang code] -dict dict -censusData censusDict
+Usage: opennlp CensusDictionaryCreator [-encoding charsetName] [-lang code] -censusData censusDict -dict dict
 
 Arguments description:
 	-encoding charsetName
 	-lang code
-	-dict dict
 	-censusData censusDict
+	-dict dict
 
 ]]>
 </screen> 
@@ -2538,19 +2538,18 @@ Usage: opennlp POSTagger model < sentences
 
 <screen>
 <![CDATA[
-Usage: opennlp POSTaggerTrainer[.ad|.conllx|.parse|.ontonotes] [-factory factoryName] [-type 
-        maxent|perceptron|perceptron_sequence] [-dict dictionaryPath] [-ngram cutoff] [-tagDictCutoff 
-        tagDictCutoff] [-params paramsFile] -lang language -model modelFile -data sampleData [-encoding 
-        charsetName] 
+Usage: opennlp POSTaggerTrainer[.ad|.conllx|.parse|.ontonotes|.conllu] [-factory factoryName] [-resources 
+        resourcesDir] [-featuregen featuregenFile] [-dict dictionaryPath] [-tagDictCutoff tagDictCutoff] 
+        [-params paramsFile] -lang language -model modelFile -data sampleData [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
 		A sub-class of POSTaggerFactory where to get implementation and resources.
-	-type maxent|perceptron|perceptron_sequence
-		The type of the token name finder model. One of maxent|perceptron|perceptron_sequence.
+	-resources resourcesDir
+		The resources directory
+	-featuregen featuregenFile
+		The feature generator descriptor file
 	-dict dictionaryPath
 		The XML tag dictionary file
-	-ngram cutoff
-		NGram cutoff. If not specified will not create ngram dictionary.
 	-tagDictCutoff tagDictCutoff
 		TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
 	-params paramsFile
@@ -2579,12 +2578,6 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>expandME</entry>
 <entry>expandME</entry>
 <entry>Yes</entry>
@@ -2597,6 +2590,12 @@ Arguments description:
 <entry>Combine POS Tags with word features, like number and gender.</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -2635,6 +2634,25 @@ Arguments description:
 <entry>No</entry>
 <entry></entry>
 </row>
+<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>tagset</entry>
+<entry>tagset</entry>
+<entry>Yes</entry>
+<entry>U|x u for unified tags and x for language-specific part-of-speech tags</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -2648,13 +2666,13 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp POSTaggerEvaluator[.ad|.conllx|.parse|.ontonotes] [-misclassified true|false] -model model 
-        [-reportOutputFile outputFile] -data sampleData [-encoding charsetName] 
+Usage: opennlp POSTaggerEvaluator[.ad|.conllx|.parse|.ontonotes|.conllu] -model model [-misclassified 
+        true|false] [-reportOutputFile outputFile] -data sampleData [-encoding charsetName] 
 Arguments description:
-	-misclassified true|false
-		if true will print false negatives and false positives.
 	-model model
 		the model file to be evaluated.
+	-misclassified true|false
+		if true will print false negatives and false positives.
 	-reportOutputFile outputFile
 		the path of the fine-grained report file.
 	-data sampleData
@@ -2677,12 +2695,6 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>expandME</entry>
 <entry>expandME</entry>
 <entry>Yes</entry>
@@ -2695,6 +2707,12 @@ Arguments description:
 <entry>Combine POS Tags with word features, like number and gender.</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -2733,6 +2751,25 @@ Arguments description:
 <entry>No</entry>
 <entry></entry>
 </row>
+<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>tagset</entry>
+<entry>tagset</entry>
+<entry>Yes</entry>
+<entry>U|x u for unified tags and x for language-specific part-of-speech tags</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -2746,23 +2783,23 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp POSTaggerCrossValidator[.ad|.conllx|.parse|.ontonotes] [-folds num] [-misclassified 
-        true|false] [-factory factoryName] [-type maxent|perceptron|perceptron_sequence] [-dict 
-        dictionaryPath] [-ngram cutoff] [-tagDictCutoff tagDictCutoff] [-params paramsFile] -lang language 
-        [-reportOutputFile outputFile] -data sampleData [-encoding charsetName] 
+Usage: opennlp POSTaggerCrossValidator[.ad|.conllx|.parse|.ontonotes|.conllu] [-misclassified true|false] 
+        [-folds num] [-factory factoryName] [-resources resourcesDir] [-featuregen featuregenFile] [-dict 
+        dictionaryPath] [-tagDictCutoff tagDictCutoff] [-params paramsFile] -lang language [-reportOutputFile 
+        outputFile] -data sampleData [-encoding charsetName] 
 Arguments description:
-	-folds num
-		number of folds, default is 10.
 	-misclassified true|false
 		if true will print false negatives and false positives.
+	-folds num
+		number of folds, default is 10.
 	-factory factoryName
 		A sub-class of POSTaggerFactory where to get implementation and resources.
-	-type maxent|perceptron|perceptron_sequence
-		The type of the token name finder model. One of maxent|perceptron|perceptron_sequence.
+	-resources resourcesDir
+		The resources directory
+	-featuregen featuregenFile
+		The feature generator descriptor file
 	-dict dictionaryPath
 		The XML tag dictionary file
-	-ngram cutoff
-		NGram cutoff. If not specified will not create ngram dictionary.
 	-tagDictCutoff tagDictCutoff
 		TagDictionary cutoff. If specified will create/expand a mutable TagDictionary
 	-params paramsFile
@@ -2791,12 +2828,6 @@ Arguments description:
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>expandME</entry>
 <entry>expandME</entry>
 <entry>Yes</entry>
@@ -2809,6 +2840,12 @@ Arguments description:
 <entry>Combine POS Tags with word features, like number and gender.</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -2847,6 +2884,25 @@ Arguments description:
 <entry>No</entry>
 <entry></entry>
 </row>
+<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>tagset</entry>
+<entry>tagset</entry>
+<entry>Yes</entry>
+<entry>U|x u for unified tags and x for language-specific part-of-speech tags</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -2856,11 +2912,11 @@ Arguments description:
 
 <title>POSTaggerConverter</title>
 
-<para>Converts foreign data formats (ad,conllx,parse,ontonotes) to native OpenNLP format</para>
+<para>Converts foreign data formats (ad,conllx,parse,ontonotes,conllu) to native OpenNLP format</para>
 
 <screen>
 <![CDATA[
-Usage: opennlp POSTaggerConverter help|ad|conllx|parse|ontonotes [help|options...]
+Usage: opennlp POSTaggerConverter help|ad|conllx|parse|ontonotes|conllu [help|options...]
 
 ]]>
 </screen> 
@@ -2877,12 +2933,6 @@ Usage: opennlp POSTaggerConverter help|ad|conllx|parse|ontonotes [help|options..
 <entry>Encoding for reading and writing text, if absent the system default is used.</entry>
 </row>
 <row>
-<entry>lang</entry>
-<entry>language</entry>
-<entry>No</entry>
-<entry>Language which is being processed.</entry>
-</row>
-<row>
 <entry>expandME</entry>
 <entry>expandME</entry>
 <entry>Yes</entry>
@@ -2895,6 +2945,12 @@ Usage: opennlp POSTaggerConverter help|ad|conllx|parse|ontonotes [help|options..
 <entry>Combine POS Tags with word features, like number and gender.</entry>
 </row>
 <row>
+<entry>lang</entry>
+<entry>language</entry>
+<entry>No</entry>
+<entry>Language which is being processed.</entry>
+</row>
+<row>
 <entry>data</entry>
 <entry>sampleData</entry>
 <entry>No</entry>
@@ -2933,6 +2989,25 @@ Usage: opennlp POSTaggerConverter help|ad|conllx|parse|ontonotes [help|options..
 <entry>No</entry>
 <entry></entry>
 </row>
+<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>tagset</entry>
+<entry>tagset</entry>
+<entry>Yes</entry>
+<entry>U|x u for unified tags and x for language-specific part-of-speech tags</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -2966,7 +3041,7 @@ Usage: opennlp LemmatizerME model < sentences
 
 <screen>
 <![CDATA[
-Usage: opennlp LemmatizerTrainerME [-factory factoryName] [-params paramsFile] -lang language -model 
+Usage: opennlp LemmatizerTrainerME[.conllu] [-factory factoryName] [-params paramsFile] -lang language -model 
         modelFile -data sampleData [-encoding charsetName] 
 Arguments description:
 	-factory factoryName
@@ -2989,6 +3064,25 @@ Arguments description:
 <informaltable frame='all'><tgroup cols='4' align='left' colsep='1' rowsep='1'>
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
+<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>tagset</entry>
+<entry>tagset</entry>
+<entry>Yes</entry>
+<entry>U|x u for unified tags and x for language-specific part-of-speech tags</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -3002,13 +3096,13 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp LemmatizerEvaluator [-misclassified true|false] -model model [-reportOutputFile outputFile] 
-        -data sampleData [-encoding charsetName] 
+Usage: opennlp LemmatizerEvaluator[.conllu] -model model [-misclassified true|false] [-reportOutputFile 
+        outputFile] -data sampleData [-encoding charsetName] 
 Arguments description:
-	-misclassified true|false
-		if true will print false negatives and false positives.
 	-model model
 		the model file to be evaluated.
+	-misclassified true|false
+		if true will print false negatives and false positives.
 	-reportOutputFile outputFile
 		the path of the fine-grained report file.
 	-data sampleData
@@ -3023,6 +3117,25 @@ Arguments description:
 <informaltable frame='all'><tgroup cols='4' align='left' colsep='1' rowsep='1'>
 <thead><row><entry>Format</entry><entry>Argument</entry><entry>Value</entry><entry>Optional</entry><entry>Description</entry></row></thead>
 <tbody>
+<row>
+<entry morerows='2' valign='middle'>conllu</entry>
+<entry>tagset</entry>
+<entry>tagset</entry>
+<entry>Yes</entry>
+<entry>U|x u for unified tags and x for language-specific part-of-speech tags</entry>
+</row>
+<row>
+<entry>data</entry>
+<entry>sampleData</entry>
+<entry>No</entry>
+<entry>Data to be used, usually a file name.</entry>
+</row>
+<row>
+<entry>encoding</entry>
+<entry>charsetName</entry>
+<entry>Yes</entry>
+<entry>Encoding for reading and writing text, if absent the system default is used.</entry>
+</row>
 </tbody>
 </tgroup></informaltable>
 
@@ -3123,15 +3236,15 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp ChunkerEvaluator[.ad] [-misclassified true|false] -model model [-detailedF true|false] -data 
+Usage: opennlp ChunkerEvaluator[.ad] -model model [-misclassified true|false] [-detailedF true|false] -data 
         sampleData [-encoding charsetName] 
 Arguments description:
-	-misclassified true|false
-		if true will print false negatives and false positives.
 	-model model
 		the model file to be evaluated.
+	-misclassified true|false
+		if true will print false negatives and false positives.
 	-detailedF true|false
-		if true will print detailed FMeasure results.
+		if true (default) will print detailed FMeasure results.
 	-data sampleData
 		data to be used, usually a file name.
 	-encoding charsetName
@@ -3188,8 +3301,9 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp ChunkerCrossValidator[.ad] [-factory factoryName] [-params paramsFile] -lang language [-folds 
-        num] [-misclassified true|false] [-detailedF true|false] -data sampleData [-encoding charsetName] 
+Usage: opennlp ChunkerCrossValidator[.ad] [-factory factoryName] [-params paramsFile] -lang language 
+        [-misclassified true|false] [-folds num] [-detailedF true|false] -data sampleData [-encoding 
+        charsetName] 
 Arguments description:
 	-factory factoryName
 		A sub-class of ChunkerFactory where to get implementation and resources.
@@ -3197,12 +3311,12 @@ Arguments description:
 		training parameters file.
 	-lang language
 		language which is being processed.
-	-folds num
-		number of folds, default is 10.
 	-misclassified true|false
 		if true will print false negatives and false positives.
+	-folds num
+		number of folds, default is 10.
 	-detailedF true|false
-		if true will print detailed FMeasure results.
+		if true (default) will print detailed FMeasure results.
 	-data sampleData
 		data to be used, usually a file name.
 	-encoding charsetName
@@ -3399,13 +3513,13 @@ Arguments description:
 
 <screen>
 <![CDATA[
-Usage: opennlp ParserEvaluator[.ontonotes|.frenchtreebank] [-misclassified true|false] -model model -data 
+Usage: opennlp ParserEvaluator[.ontonotes|.frenchtreebank] -model model [-misclassified true|false] -data 
         sampleData [-encoding charsetName] 
 Arguments description:
-	-misclassified true|false
-		if true will print false negatives and false positives.
 	-model model
 		the model file to be evaluated.
+	-misclassified true|false
+		if true will print false negatives and false positives.
 	-data sampleData
 		data to be used, usually a file name.
 	-encoding charsetName
@@ -3633,15 +3747,15 @@ Usage: opennlp EntityLinker model < sentences
 
 <title>Languagemodel</title>
 
-<section id='tools.cli.languagemodel.LanguageModel'>
+<section id='tools.cli.languagemodel.NGramLanguageModel'>
 
-<title>LanguageModel</title>
+<title>NGramLanguageModel</title>
 
-<para>Gives the probability of a sequence of tokens in a language model</para>
+<para>Gives the probability and most probable next token(s) of a sequence of tokens in a language model</para>
 
 <screen>
 <![CDATA[
-Usage: opennlp LanguageModel model
+Usage: opennlp NGramLanguageModel model
 
 ]]>
 </screen> 


[09/50] opennlp git commit: NoJira: Fix the codestyle violations in NameSampleTypeFilterTest, set severity to 'error' for indent violations; closes apache/opennlp#175

Posted by co...@apache.org.
NoJira: Fix the codestyle violations in NameSampleTypeFilterTest, set severity to 'error' for indent violations; closes apache/opennlp#175


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/041507d3
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/041507d3
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/041507d3

Branch: refs/heads/LangDetect
Commit: 041507d3aaf718ae536f9ab66e4eeb4065cb339d
Parents: 8fb1341
Author: smarthi <sm...@apache.org>
Authored: Wed Apr 19 21:52:46 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Wed Apr 19 21:52:46 2017 -0400

----------------------------------------------------------------------
 checkstyle.xml                                  |  2 +-
 .../namefind/NameSampleTypeFilterTest.java      | 80 ++++++++++----------
 .../tools/util/TrainingParametersTest.java      | 31 +++-----
 3 files changed, 51 insertions(+), 62 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/041507d3/checkstyle.xml
----------------------------------------------------------------------
diff --git a/checkstyle.xml b/checkstyle.xml
index 1bfe788..88e653f 100644
--- a/checkstyle.xml
+++ b/checkstyle.xml
@@ -113,7 +113,7 @@
       <property name="throwsIndent" value="4"/>
       <property name="lineWrappingIndentation" value="4"/>
       <property name="arrayInitIndent" value="2"/>
-      <property name="severity" value="warning"/>
+      <property name="severity" value="error"/>
     </module>
     <module name="EmptyCatchBlock">
       <property name="exceptionVariableName" value="expected|ignore"/>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/041507d3/opennlp-tools/src/test/java/opennlp/tools/namefind/NameSampleTypeFilterTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/namefind/NameSampleTypeFilterTest.java b/opennlp-tools/src/test/java/opennlp/tools/namefind/NameSampleTypeFilterTest.java
index 24ecc9f..6e6095e 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/namefind/NameSampleTypeFilterTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/namefind/NameSampleTypeFilterTest.java
@@ -6,7 +6,7 @@
  * (the "License"); you may not use this file except in compliance with
  * the License. You may obtain a copy of the License at
  *
- *     http://www.apache.org/licenses/LICENSE-2.0
+ * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
@@ -35,68 +35,68 @@ import opennlp.tools.util.Span;
 
 public class NameSampleTypeFilterTest {
 
-    private static NameSampleTypeFilter filter;
+  private static NameSampleTypeFilter filter;
 
-    private static final String text = "<START:organization> NATO <END> Secretary - General " +
-            "<START:person> Anders Fogh Rasmussen <END> made clear that despite an intensifying " +
-            "insurgency and uncertainty over whether <START:location> U . S . <END> President " +
-            "<START:person> Barack Obama <END> will send more troops , <START:location> NATO <END> " +
-            "will remain in <START:location> Afghanistan <END> .";
+  private static final String text = "<START:organization> NATO <END> Secretary - General " +
+      "<START:person> Anders Fogh Rasmussen <END> made clear that despite an intensifying " +
+      "insurgency and uncertainty over whether <START:location> U . S . <END> President " +
+      "<START:person> Barack Obama <END> will send more troops , <START:location> NATO <END> " +
+      "will remain in <START:location> Afghanistan <END> .";
 
-    private static final String person = "person";
-    private static final String organization = "organization";
+  private static final String person = "person";
+  private static final String organization = "organization";
 
-    @Test
-    public void testNoFilter() throws IOException {
+  @Test
+  public void testNoFilter() throws IOException {
 
-        final String[] types = new String[] {};
+    final String[] types = new String[] {};
 
-        filter = new NameSampleTypeFilter(types, sampleStream(text));
+    filter = new NameSampleTypeFilter(types, sampleStream(text));
 
-        NameSample ns = filter.read();
+    NameSample ns = filter.read();
 
-        Assert.assertEquals(0, ns.getNames().length);
+    Assert.assertEquals(0, ns.getNames().length);
 
-    }
+  }
 
-    @Test
-    public void testSingleFilter() throws IOException {
+  @Test
+  public void testSingleFilter() throws IOException {
 
-        final String[] types = new String[] {organization};
+    final String[] types = new String[] {organization};
 
-        filter = new NameSampleTypeFilter(types, sampleStream(text));
+    filter = new NameSampleTypeFilter(types, sampleStream(text));
 
-        NameSample ns = filter.read();
+    NameSample ns = filter.read();
 
-        Assert.assertEquals(1, ns.getNames().length);
-        Assert.assertEquals(organization, ns.getNames()[0].getType());
+    Assert.assertEquals(1, ns.getNames().length);
+    Assert.assertEquals(organization, ns.getNames()[0].getType());
 
-    }
+  }
 
-    @Test
-    public void testMultiFilter() throws IOException {
+  @Test
+  public void testMultiFilter() throws IOException {
 
-        final String[] types = new String[] {person, organization};
+    final String[] types = new String[] {person, organization};
 
-        filter = new NameSampleTypeFilter(types, sampleStream(text));
+    filter = new NameSampleTypeFilter(types, sampleStream(text));
 
-        NameSample ns = filter.read();
+    NameSample ns = filter.read();
 
-        Map<String, List<Span>> collect = Arrays.stream(ns.getNames())
-                .collect(Collectors.groupingBy(Span::getType));
-        Assert.assertEquals(2, collect.size());
-        Assert.assertEquals(2, collect.get(person).size());
-        Assert.assertEquals(1, collect.get(organization).size());
+    Map<String, List<Span>> collect = Arrays.stream(ns.getNames())
+        .collect(Collectors.groupingBy(Span::getType));
+    Assert.assertEquals(2, collect.size());
+    Assert.assertEquals(2, collect.get(person).size());
+    Assert.assertEquals(1, collect.get(organization).size());
 
-    }
+  }
 
-    private ObjectStream<NameSample> sampleStream(String sampleText) throws IOException {
+  private ObjectStream<NameSample> sampleStream(String sampleText) throws IOException {
 
-        InputStreamFactory in = () -> new ByteArrayInputStream(sampleText.getBytes(StandardCharsets.UTF_8));
+    InputStreamFactory in = () -> new ByteArrayInputStream(sampleText.getBytes(StandardCharsets.UTF_8));
 
-        return new NameSampleDataStream(
-                new PlainTextByLineStream(in, StandardCharsets.UTF_8));
+    return new NameSampleDataStream(
+        new PlainTextByLineStream(in, StandardCharsets.UTF_8));
 
-    }
+  }
 
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/041507d3/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
index 27e1695..294dff8 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
@@ -34,21 +34,11 @@ public class TrainingParametersTest {
         new TrainingParameters(build("key1=val1,key2=val2,key3=val3"));
 
     TrainingParameters tp2 = new TrainingParameters(
-      new ByteArrayInputStream("key1=val1\nkey2=val2\nkey3=val3\n".getBytes())
+        new ByteArrayInputStream("key1=val1\nkey2=val2\nkey3=val3\n".getBytes())
     );
 
     TrainingParameters tp3 = new TrainingParameters(tp2);
 
-    /*
-    Assert.assertEquals(3, tp1.getSettings().size());
-    Assert.assertEquals(3, tp3.getSettings().size());
-    Assert.assertEquals(tp1.getStringParameter("key1", "v11"),
-            tp3.getStringParameter("key1", "v22"));   // use different defaults
-    Assert.assertEquals(tp1.getStringParameter("key2", "v11"),
-            tp3.getStringParameter("key2", "v22"));   // use different defaults
-    Assert.assertEquals(tp1.getStringParameter("key2", "v11"),
-            tp3.getStringParameter("key2", "v22"));   // use different defaults
-            */
     assertEquals(tp1, tp2);
     assertEquals(tp2, tp3);
   }
@@ -60,14 +50,14 @@ public class TrainingParametersTest {
     Assert.assertEquals(4, tr.getSettings().size());
     Assert.assertEquals("MAXENT", tr.algorithm());
     Assert.assertEquals(EventTrainer.EVENT_VALUE,
-            tr.getStringParameter(TrainingParameters.TRAINER_TYPE_PARAM,
-                    "v11"));  // use different defaults
+        tr.getStringParameter(TrainingParameters.TRAINER_TYPE_PARAM,
+            "v11"));  // use different defaults
     Assert.assertEquals(100,
-            tr.getIntParameter(TrainingParameters.ITERATIONS_PARAM,
-                    200));  // use different defaults
+        tr.getIntParameter(TrainingParameters.ITERATIONS_PARAM,
+            200));  // use different defaults
     Assert.assertEquals(5,
-            tr.getIntParameter(TrainingParameters.CUTOFF_PARAM,
-                    200));  // use different defaults
+        tr.getIntParameter(TrainingParameters.CUTOFF_PARAM,
+            200));  // use different defaults
   }
 
   @Test
@@ -131,7 +121,7 @@ public class TrainingParametersTest {
   private static Map<String, String> buildMap(String str) {
     String[] pairs = str.split(",");
     Map<String, String> map = new HashMap<>(pairs.length);
-    for (String pair: pairs) {
+    for (String pair : pairs) {
       String[] keyValue = pair.split("=");
       map.put(keyValue[0], keyValue[1]);
     }
@@ -148,7 +138,7 @@ public class TrainingParametersTest {
     Assert.assertNotNull(map1);
     Assert.assertNotNull(map2);
     Assert.assertEquals(map1.size(), map2.size());
-    for (String key: map1.keySet()) {
+    for (String key : map1.keySet()) {
       Assert.assertEquals(map1.get(key), map2.get(key));
     }
   }
@@ -161,8 +151,7 @@ public class TrainingParametersTest {
   private static void assertEquals(TrainingParameters expected, TrainingParameters actual) {
     if (expected == null) {
       Assert.assertNull(actual);
-    }
-    else {
+    } else {
       assertEquals(expected.getSettings(), actual);
     }
   }


[13/50] opennlp git commit: Merge remote-tracking branch 'github/pr/183'

Posted by co...@apache.org.
Merge remote-tracking branch 'github/pr/183'

This closes #183


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/3a22156f
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/3a22156f
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/3a22156f

Branch: refs/heads/LangDetect
Commit: 3a22156fa3dd9d436b006c8621d56dd19fe2d890
Parents: 99cbf0d 95e43b3
Author: Bruno P. Kinoshita <br...@yahoo.com.br>
Authored: Sun Apr 23 21:38:03 2017 +1200
Committer: Bruno P. Kinoshita <br...@yahoo.com.br>
Committed: Sun Apr 23 21:38:03 2017 +1200

----------------------------------------------------------------------
 README.md | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)
----------------------------------------------------------------------



[08/50] opennlp git commit: OPENNLP-1032: Add tests for TrainingParameters. This closes apache/opennlp#171

Posted by co...@apache.org.
OPENNLP-1032: Add tests for TrainingParameters. This closes apache/opennlp#171


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/8fb1341a
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/8fb1341a
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/8fb1341a

Branch: refs/heads/LangDetect
Commit: 8fb1341ad7c460286211f4f2a6be41bb5782653f
Parents: 3ba27e9
Author: koji <ko...@apache.org>
Authored: Thu Apr 20 10:08:43 2017 +0900
Committer: koji <ko...@apache.org>
Committed: Thu Apr 20 10:08:43 2017 +0900

----------------------------------------------------------------------
 .../tools/util/TrainingParametersTest.java      | 169 +++++++++++++++++++
 1 file changed, 169 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/8fb1341a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
new file mode 100644
index 0000000..27e1695
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/TrainingParametersTest.java
@@ -0,0 +1,169 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util;
+
+import java.io.ByteArrayInputStream;
+import java.util.HashMap;
+import java.util.Map;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.ml.EventTrainer;
+
+public class TrainingParametersTest {
+
+  @Test
+  public void testConstructors() throws Exception {
+    TrainingParameters tp1 =
+        new TrainingParameters(build("key1=val1,key2=val2,key3=val3"));
+
+    TrainingParameters tp2 = new TrainingParameters(
+      new ByteArrayInputStream("key1=val1\nkey2=val2\nkey3=val3\n".getBytes())
+    );
+
+    TrainingParameters tp3 = new TrainingParameters(tp2);
+
+    /*
+    Assert.assertEquals(3, tp1.getSettings().size());
+    Assert.assertEquals(3, tp3.getSettings().size());
+    Assert.assertEquals(tp1.getStringParameter("key1", "v11"),
+            tp3.getStringParameter("key1", "v22"));   // use different defaults
+    Assert.assertEquals(tp1.getStringParameter("key2", "v11"),
+            tp3.getStringParameter("key2", "v22"));   // use different defaults
+    Assert.assertEquals(tp1.getStringParameter("key2", "v11"),
+            tp3.getStringParameter("key2", "v22"));   // use different defaults
+            */
+    assertEquals(tp1, tp2);
+    assertEquals(tp2, tp3);
+  }
+
+  @Test
+  public void testDefault() {
+    TrainingParameters tr = TrainingParameters.defaultParams();
+
+    Assert.assertEquals(4, tr.getSettings().size());
+    Assert.assertEquals("MAXENT", tr.algorithm());
+    Assert.assertEquals(EventTrainer.EVENT_VALUE,
+            tr.getStringParameter(TrainingParameters.TRAINER_TYPE_PARAM,
+                    "v11"));  // use different defaults
+    Assert.assertEquals(100,
+            tr.getIntParameter(TrainingParameters.ITERATIONS_PARAM,
+                    200));  // use different defaults
+    Assert.assertEquals(5,
+            tr.getIntParameter(TrainingParameters.CUTOFF_PARAM,
+                    200));  // use different defaults
+  }
+
+  @Test
+  public void testGetAlgorithm() {
+    TrainingParameters tp = build("Algorithm=Perceptron,n1.Algorithm=SVM");
+
+    Assert.assertEquals("Perceptron", tp.algorithm());
+    Assert.assertEquals("SVM", tp.algorithm("n1"));
+  }
+
+  @Test
+  public void testGetSettings() {
+    TrainingParameters tp = build("k1=v1,n1.k2=v2,n2.k3=v3,n1.k4=v4");
+
+    assertEquals(buildMap("k1=v1"), tp.getSettings());
+    assertEquals(buildMap("k2=v2,k4=v4"), tp.getSettings("n1"));
+    assertEquals(buildMap("k3=v3"), tp.getSettings("n2"));
+    Assert.assertTrue(tp.getSettings("n3").isEmpty());
+  }
+
+  @Test
+  public void testGetParameters() {
+    TrainingParameters tp = build("k1=v1,n1.k2=v2,n2.k3=v3,n1.k4=v4");
+
+    assertEquals(build("k1=v1"), tp.getParameters(null));
+    assertEquals(build("k2=v2,k4=v4"), tp.getParameters("n1"));
+    assertEquals(build("k3=v3"), tp.getParameters("n2"));
+    Assert.assertTrue(tp.getParameters("n3").getSettings().isEmpty());
+  }
+
+  @Test
+  public void testPutGet() {
+    TrainingParameters tp =
+        build("k1=v1,int.k2=123,str.k2=v3,str.k3=v4,boolean.k4=false,double.k5=123.45,k21=234.5");
+
+    Assert.assertEquals("v1", tp.getStringParameter("k1", "def"));
+    Assert.assertEquals("def", tp.getStringParameter("k2", "def"));
+    Assert.assertEquals("v3", tp.getStringParameter("str", "k2", "def"));
+    Assert.assertEquals("def", tp.getStringParameter("str", "k4", "def"));
+
+    Assert.assertEquals(-100, tp.getIntParameter("k11", -100));
+    tp.put("k11", "234");
+    Assert.assertEquals(234, tp.getIntParameter("k11", -100));
+    Assert.assertEquals(123, tp.getIntParameter("int", "k2", -100));
+    Assert.assertEquals(-100, tp.getIntParameter("int", "k4", -100));
+
+    Assert.assertEquals(234.5, tp.getDoubleParameter("k21", -100), 0.001);
+    tp.put("k21", "345.6");
+    Assert.assertEquals(345.6, tp.getDoubleParameter("k21", -100), 0.001); // should be changed
+    tp.putIfAbsent("k21", "456.7");
+    Assert.assertEquals(345.6, tp.getDoubleParameter("k21", -100), 0.001); // should be unchanged
+    Assert.assertEquals(123.45, tp.getDoubleParameter("double", "k5", -100), 0.001);
+
+    Assert.assertEquals(true, tp.getBooleanParameter("k31", true));
+    tp.put("k31", "false");
+    Assert.assertEquals(false, tp.getBooleanParameter("k31", true));
+    Assert.assertEquals(false, tp.getBooleanParameter("boolean", "k4", true));
+  }
+
+  // format: k1=v1,k2=v2,...
+  private static Map<String, String> buildMap(String str) {
+    String[] pairs = str.split(",");
+    Map<String, String> map = new HashMap<>(pairs.length);
+    for (String pair: pairs) {
+      String[] keyValue = pair.split("=");
+      map.put(keyValue[0], keyValue[1]);
+    }
+
+    return map;
+  }
+
+  // format: k1=v1,k2=v2,...
+  private static TrainingParameters build(String str) {
+    return new TrainingParameters(buildMap(str));
+  }
+
+  private static void assertEquals(Map<String, String> map1, Map<String, String> map2) {
+    Assert.assertNotNull(map1);
+    Assert.assertNotNull(map2);
+    Assert.assertEquals(map1.size(), map2.size());
+    for (String key: map1.keySet()) {
+      Assert.assertEquals(map1.get(key), map2.get(key));
+    }
+  }
+
+  private static void assertEquals(Map<String, String> map, TrainingParameters actual) {
+    Assert.assertNotNull(actual);
+    assertEquals(map, actual.getSettings());
+  }
+
+  private static void assertEquals(TrainingParameters expected, TrainingParameters actual) {
+    if (expected == null) {
+      Assert.assertNull(actual);
+    }
+    else {
+      assertEquals(expected.getSettings(), actual);
+    }
+  }
+}


[45/50] opennlp git commit: OPENNLP-1063: Update Morfologik dependency version

Posted by co...@apache.org.
OPENNLP-1063: Update Morfologik dependency version

closes apache/opennlp#204


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/c434b3af
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/c434b3af
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/c434b3af

Branch: refs/heads/LangDetect
Commit: c434b3af4008281a96154403094b20cefa7dd15b
Parents: d372ad1
Author: William D C M SILVA <co...@apache.org>
Authored: Tue May 16 10:07:57 2017 -0300
Committer: William D C M SILVA <co...@apache.org>
Committed: Tue May 16 10:07:57 2017 -0300

----------------------------------------------------------------------
 opennlp-morfologik-addon/pom.xml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/c434b3af/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index c46f101..612b27b 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -36,19 +36,20 @@
 
 	<properties>
 		<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+		<morfologik.version>2.1.3</morfologik.version>
 	</properties>
 
 	<dependencies>
 		<dependency>
 			<groupId>org.carrot2</groupId>
 			<artifactId>morfologik-stemming</artifactId>
-			<version>2.1.0</version>
+			<version>${morfologik.version}</version>
 			<scope>compile</scope>
 		</dependency>
 		<dependency>
 			<groupId>org.carrot2</groupId>
 			<artifactId>morfologik-tools</artifactId>
-			<version>2.1.0</version>
+			<version>${morfologik.version}</version>
 			<scope>compile</scope>
 		</dependency>
 


[43/50] opennlp git commit: OPENNLP-1060: Fix computation of hash for the parser

Posted by co...@apache.org.
OPENNLP-1060: Fix computation of hash for the parser


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/c0880fb6
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/c0880fb6
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/c0880fb6

Branch: refs/heads/LangDetect
Commit: c0880fb689ce188c7616de65a63444c8595a0549
Parents: d7b3b96
Author: Jörn Kottmann <jo...@apache.org>
Authored: Mon May 15 21:54:40 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon May 15 23:41:23 2017 +0200

----------------------------------------------------------------------
 .../test/java/opennlp/tools/eval/SourceForgeModelEval.java  | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/c0880fb6/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
index d3ea980..25b6f54 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/eval/SourceForgeModelEval.java
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.IOException;
 import java.math.BigInteger;
 import java.nio.charset.Charset;
+import java.nio.charset.StandardCharsets;
 import java.security.MessageDigest;
 import java.security.NoSuchAlgorithmException;
 
@@ -339,14 +340,16 @@ public class SourceForgeModelEval {
       while ((line = lines.read()) != null) {
         Parse[] parse = ParserTool.parseLine(String.join(" ", line.getText()), parser, 1);
         if (parse.length > 0) {
-          digest.update(parse[0].toString().getBytes("UTF-8"));
+          StringBuffer sb = new StringBuffer();
+          parse[0].show(sb);
+          digest.update(sb.toString().getBytes(StandardCharsets.UTF_8));
         } else {
-          digest.update("empty".getBytes("UTF-8"));
+          digest.update("empty".getBytes(StandardCharsets.UTF_8));
         }
       }
     }
 
-    Assert.assertEquals(new BigInteger("13162568910062822351942983467905626940"),
+    Assert.assertEquals(new BigInteger("312218841713337505306598301082074515847"),
         new BigInteger(1, digest.digest()));
   }
 }


[22/50] opennlp git commit: OPENNLP-1047: Add detokenizer and sent detect abbreviations for Irish

Posted by co...@apache.org.
OPENNLP-1047: Add detokenizer and sent detect abbreviations for Irish

Closes #188


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/caeaaeea
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/caeaaeea
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/caeaaeea

Branch: refs/heads/LangDetect
Commit: caeaaeea61e88fe4222b997b2dad49728b91ba68
Parents: 3df659b
Author: Jim O'Regan <ja...@tcd.ie>
Authored: Sat Apr 29 00:06:42 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 3 12:05:16 2017 +0200

----------------------------------------------------------------------
 opennlp-tools/lang/ga/sentdetect/abb.xml        | 164 +++++++++++++++++++
 .../lang/ga/tokenizer/ga-detokenizer.xml        | 113 +++++++++++++
 2 files changed, 277 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/caeaaeea/opennlp-tools/lang/ga/sentdetect/abb.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/lang/ga/sentdetect/abb.xml b/opennlp-tools/lang/ga/sentdetect/abb.xml
new file mode 100644
index 0000000..9d15aed
--- /dev/null
+++ b/opennlp-tools/lang/ga/sentdetect/abb.xml
@@ -0,0 +1,164 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one
+   or more contributor license agreements.  See the NOTICE file
+   distributed with this work for additional information
+   regarding copyright ownership.  The ASF licenses this file
+   to you under the Apache License, Version 2.0 (the
+   "License"); you may not use this file except in compliance
+   with the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing,
+   software distributed under the License is distributed on an
+   "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+   KIND, either express or implied.  See the License for the
+   specific language governing permissions and limitations
+   under the License.    
+-->
+
+<dictionary case_sensitive="false">
+<entry>
+<token>tel.</token>
+</entry>
+<entry>
+<token>Mr.</token>
+</entry>
+<entry>
+<token>Mrs.</token>
+</entry>
+<entry>
+<token>.i.</token>
+</entry>
+<entry>
+<token>Uacht.</token>
+</entry>
+<entry>
+<token>m.sh.</token>
+</entry>
+<entry>
+<token>lch.</token>
+</entry>
+<entry>
+<token>lgh.</token>
+</entry>
+<entry>
+<token>Dr.</token>
+</entry>
+<entry>
+<token>uimh.</token>
+</entry>
+<entry>
+<token>Co.</token>
+</entry>
+<entry>
+<token>gCo.</token>
+</entry>
+<entry>
+<token>tUacht.</token>
+</entry>
+<entry>
+<token>Uas.</token>
+</entry>
+<entry>
+<token>tUas.</token>
+</entry>
+<entry>
+<token>Msc.</token>
+</entry>
+<entry>
+<token>Ms.</token>
+</entry>
+<entry>
+<token>Sr.</token>
+</entry>
+<entry>
+<token>Jr.</token>
+</entry>
+<entry>
+<token>Bros.</token>
+</entry>
+<entry>
+<token>fig.</token>
+</entry>
+<entry>
+<token>Jan.</token>
+</entry>
+<entry>
+<token>Feb.</token>
+</entry>
+<entry>
+<token>Mar.</token>
+</entry>
+<entry>
+<token>Apr.</token>
+</entry>
+<entry>
+<token>Jun.</token>
+</entry>
+<entry>
+<token>Jul.</token>
+</entry>
+<entry>
+<token>Aug.</token>
+</entry>
+<entry>
+<token>Sep.</token>
+</entry>
+<entry>
+<token>Sept.</token>
+</entry>
+<entry>
+<token>Oct.</token>
+</entry>
+<entry>
+<token>Nov.</token>
+</entry>
+<entry>
+<token>Dec.</token>
+</entry>
+<entry>
+<token>Ean.</token>
+</entry>
+<entry>
+<token>Fea.</token>
+</entry>
+<entry>
+<token>Már.</token>
+</entry>
+<entry>
+<token>Aib.</token>
+</entry>
+<entry>
+<token>Bea.</token>
+</entry>
+<entry>
+<token>Mei.</token>
+</entry>
+<entry>
+<token>Iúl.</token>
+</entry>
+<entry>
+<token>Lún.</token>
+</entry>
+<entry>
+<token>M.Fr.</token>
+</entry>
+<entry>
+<token>D.Fr.</token>
+</entry>
+<entry>
+<token>Sam.</token>
+</entry>
+<entry>
+<token>Nol.</token>
+</entry>
+<entry>
+<token>Ltd.</token>
+</entry>
+<entry>
+<token>Teo.</token>
+</entry>
+</dictionary>

http://git-wip-us.apache.org/repos/asf/opennlp/blob/caeaaeea/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml b/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml
new file mode 100644
index 0000000..23fe96a
--- /dev/null
+++ b/opennlp-tools/lang/ga/tokenizer/ga-detokenizer.xml
@@ -0,0 +1,113 @@
+<?xml version="1.0" encoding="UTF-8"?>
+
+<!--
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing,
+software distributed under the License is distributed on an
+"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+KIND, either express or implied.  See the License for the
+specific language governing permissions and limitations
+under the License.    
+-->
+
+<dictionary>
+  <entry operation="RIGHT_LEFT_MATCHING">
+    <token>"</token>
+  </entry>
+  <entry operation="RIGHT_LEFT_MATCHING">
+    <token>'</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>?</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>!</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>,</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>;</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>:</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>(</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>)</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>}</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>{</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>]</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>[</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>»</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>«</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>``</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>''</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>%</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.org</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.com</token>
+  </entry>
+  <entry operation="MOVE_LEFT">
+    <token>.net</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>#</token>
+  </entry>
+  <entry operation="MOVE_BOTH">
+    <token>-</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>m'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>d'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>b'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>mb'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>dh'</token>
+  </entry>
+  <entry operation="MOVE_RIGHT">
+    <token>lem'</token>
+  </entry>
+</dictionary>