Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/06 10:09:45 UTC

[01/21] opennlp git commit: closes apache/opennlp#157 *Won't fix* [Forced Update!]

Repository: opennlp
Updated Branches:
  refs/heads/LangDetect b406dbe13 -> a98532846 (forced update)


closes apache/opennlp#157 *Won't fix*


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/839ff109
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/839ff109
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/839ff109

Branch: refs/heads/LangDetect
Commit: 839ff10997d83b83e794707a80a770feadce0b87
Parents: 911d59f
Author: smarthi <sm...@apache.org>
Authored: Wed May 17 13:47:43 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Wed May 17 13:47:43 2017 -0400

----------------------------------------------------------------------

----------------------------------------------------------------------



[18/21] opennlp git commit: OPENNLP-1087: Add convenience methods to load from Path

Posted by jo...@apache.org.
OPENNLP-1087: Add convenience methods to load from Path


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/15ac7bd1
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/15ac7bd1
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/15ac7bd1

Branch: refs/heads/LangDetect
Commit: 15ac7bd178a0f06e7a1fa18a85d3dee631bc7fc9
Parents: ac1e0fd
Author: Jörn Kottmann <jo...@apache.org>
Authored: Thu Jun 1 00:17:57 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu Jun 1 09:44:02 2017 +0200

----------------------------------------------------------------------
 .../opennlp/tools/chunker/ChunkerModel.java     |  5 +++++
 .../java/opennlp/tools/doccat/DoccatModel.java  |  5 +++++
 .../tools/lemmatizer/DictionaryLemmatizer.java  | 22 ++++++++++++++++----
 .../tools/lemmatizer/LemmatizerModel.java       |  5 +++++
 .../tools/namefind/TokenNameFinderModel.java    |  5 +++++
 .../java/opennlp/tools/parser/ParserModel.java  |  5 +++++
 .../java/opennlp/tools/postag/POSModel.java     |  5 +++++
 .../opennlp/tools/sentdetect/SentenceModel.java |  5 +++++
 .../opennlp/tools/tokenize/TokenizerModel.java  |  5 +++++
 9 files changed, 58 insertions(+), 4 deletions(-)
----------------------------------------------------------------------
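
A minimal usage sketch of the new Path overloads introduced by this commit; the model file name is illustrative (not part of this change) and the ChunkerME call only shows the loaded model in use:

import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

import opennlp.tools.chunker.ChunkerME;
import opennlp.tools.chunker.ChunkerModel;
import opennlp.tools.util.InvalidFormatException;

public class LoadModelFromPath {

  public static void main(String[] args) throws IOException, InvalidFormatException {
    // Hypothetical model location; before this change callers had to call toFile() themselves.
    Path modelPath = Paths.get("en-chunker.bin");

    // New convenience constructor, equivalent to new ChunkerModel(modelPath.toFile()).
    ChunkerModel model = new ChunkerModel(modelPath);
    ChunkerME chunker = new ChunkerME(model);
  }
}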


http://git-wip-us.apache.org/repos/asf/opennlp/blob/15ac7bd1/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java
index 12c8bbe..393c2fa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/chunker/ChunkerModel.java
@@ -22,6 +22,7 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.file.Path;
 import java.util.Map;
 import java.util.Properties;
 
@@ -80,6 +81,10 @@ public class ChunkerModel extends BaseModel {
     super(COMPONENT_NAME, modelFile);
   }
 
+  public ChunkerModel(Path modelPath) throws IOException, InvalidFormatException {
+    this(modelPath.toFile());
+  }
+
   public ChunkerModel(URL modelURL) throws IOException, InvalidFormatException {
     super(COMPONENT_NAME, modelURL);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15ac7bd1/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
index e71b625..1b5c164 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/doccat/DoccatModel.java
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.file.Path;
 import java.util.Map;
 
 import opennlp.tools.ml.model.AbstractModel;
@@ -53,6 +54,10 @@ public class DoccatModel extends BaseModel {
     super(COMPONENT_NAME, modelFile);
   }
 
+  public DoccatModel(Path modelPath) throws IOException {
+    this(modelPath.toFile());
+  }
+
   public DoccatModel(URL modelURL) throws IOException {
     super(COMPONENT_NAME, modelURL);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15ac7bd1/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
index 97d6854..b480ad1 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/DictionaryLemmatizer.java
@@ -18,9 +18,12 @@
 package opennlp.tools.lemmatizer;
 
 import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
+import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.HashMap;
@@ -37,7 +40,7 @@ public class DictionaryLemmatizer implements Lemmatizer {
   /**
    * The hashmap containing the dictionary.
    */
-  private final Map<List<String>, List<String>> dictMap;
+  private final Map<List<String>, List<String>> dictMap = new HashMap<>();
 
   /**
    * Construct a hashmap from the input tab separated dictionary.
@@ -50,7 +53,20 @@ public class DictionaryLemmatizer implements Lemmatizer {
    *          the input dictionary via inputstream
    */
   public DictionaryLemmatizer(final InputStream dictionary) throws IOException {
-    this.dictMap = new HashMap<>();
+    init(dictionary);
+  }
+
+  public DictionaryLemmatizer(File dictionaryFile) throws IOException {
+    try (InputStream in = new FileInputStream(dictionaryFile)) {
+      init(in);
+    }
+  }
+
+  public DictionaryLemmatizer(Path dictionaryFile) throws IOException {
+    this(dictionaryFile.toFile());
+  }
+
+  private void init(InputStream dictionary) throws IOException {
     final BufferedReader breader = new BufferedReader(
         new InputStreamReader(dictionary));
     String line;
@@ -60,8 +76,6 @@ public class DictionaryLemmatizer implements Lemmatizer {
       this.dictMap.put(Arrays.asList(elems[0], elems[1]), Arrays.asList(lemmas));
     }
   }
-
-
   /**
    * Get the Map containing the dictionary.
    *

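A small sketch of the reworked DictionaryLemmatizer construction; the dictionary file name and entries are made up, and the lemmatize signature is the String[] variant of the Lemmatizer interface as of 1.8.x:

import java.io.File;
import java.io.IOException;

import opennlp.tools.lemmatizer.DictionaryLemmatizer;

public class DictionaryLemmatizerSketch {

  public static void main(String[] args) throws IOException {
    // Hypothetical dictionary; each line is word<TAB>postag<TAB>lemma, as parsed by init() above.
    File dictFile = new File("en-lemmatizer.dict");

    // The new File constructor opens and closes the stream itself (try-with-resources in the diff);
    // the Path constructor simply delegates to it.
    DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictFile);

    // Assumed Lemmatizer interface method: one lemma per token/tag pair.
    String[] lemmas = lemmatizer.lemmatize(
        new String[] {"houses"}, new String[] {"NNS"});
  }
}
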
http://git-wip-us.apache.org/repos/asf/opennlp/blob/15ac7bd1/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
index 2f5f6ef..cb11e8b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/lemmatizer/LemmatizerModel.java
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.file.Path;
 import java.util.Map;
 import java.util.Properties;
 
@@ -77,6 +78,10 @@ public class LemmatizerModel extends BaseModel {
     super(COMPONENT_NAME, modelFile);
   }
 
+  public LemmatizerModel(Path modelPath) throws IOException, InvalidFormatException {
+    this(modelPath.toFile());
+  }
+
   public LemmatizerModel(URL modelURL) throws IOException, InvalidFormatException {
     super(COMPONENT_NAME, modelURL);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15ac7bd1/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
index 5b72449..98dad06 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/namefind/TokenNameFinderModel.java
@@ -22,6 +22,7 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.file.Path;
 import java.util.Map;
 import java.util.Properties;
 
@@ -107,6 +108,10 @@ public class TokenNameFinderModel extends BaseModel {
     super(COMPONENT_NAME, modelFile);
   }
 
+  public TokenNameFinderModel(Path modelPath) throws IOException {
+    this(modelPath.toFile());
+  }
+
   public TokenNameFinderModel(URL modelURL) throws IOException {
     super(COMPONENT_NAME, modelURL);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15ac7bd1/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
index c290d9f..86ea2f0 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/parser/ParserModel.java
@@ -26,6 +26,7 @@ import java.io.OutputStream;
 import java.io.OutputStreamWriter;
 import java.net.URL;
 import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
 import java.util.Map;
 import java.util.Objects;
 
@@ -133,6 +134,10 @@ public class ParserModel extends BaseModel {
     super(COMPONENT_NAME, modelFile);
   }
 
+  public ParserModel(Path modelPath) throws IOException {
+    this(modelPath.toFile());
+  }
+
   public ParserModel(URL modelURL) throws IOException {
     super(COMPONENT_NAME, modelURL);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15ac7bd1/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java b/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java
index f81092b..95a41a8 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/postag/POSModel.java
@@ -21,6 +21,7 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.file.Path;
 import java.util.Map;
 import java.util.Objects;
 import java.util.Properties;
@@ -98,6 +99,10 @@ public final class POSModel extends BaseModel {
     super(COMPONENT_NAME, modelFile);
   }
 
+  public POSModel(Path modelPath) throws IOException {
+    this(modelPath.toFile());
+  }
+
   public POSModel(URL modelURL) throws IOException {
     super(COMPONENT_NAME, modelURL);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15ac7bd1/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceModel.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceModel.java
index a716210..e8df1d6 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceModel.java
@@ -22,6 +22,7 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.file.Path;
 import java.util.Map;
 
 import opennlp.tools.dictionary.Dictionary;
@@ -95,6 +96,10 @@ public class SentenceModel extends BaseModel {
     super(COMPONENT_NAME, modelFile);
   }
 
+  public SentenceModel(Path modelPath) throws IOException {
+    this(modelPath.toFile());
+  }
+
   public SentenceModel(URL modelURL) throws IOException {
     super(COMPONENT_NAME, modelURL);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/15ac7bd1/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
index 04db0ce..1db7c49 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/tokenize/TokenizerModel.java
@@ -22,6 +22,7 @@ import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
+import java.nio.file.Path;
 import java.util.Map;
 
 import opennlp.tools.dictionary.Dictionary;
@@ -81,6 +82,10 @@ public final class TokenizerModel extends BaseModel {
     super(COMPONENT_NAME, modelFile);
   }
 
+  public TokenizerModel(Path modelPath) throws IOException {
+    this(modelPath.toFile());
+  }
+
   /**
    * Initializes the current instance.
    *


[03/21] opennlp git commit: Update README for release 1.8.0 RC

Posted by jo...@apache.org.
Update README for release 1.8.0 RC

closes #209


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/17800716
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/17800716
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/17800716

Branch: refs/heads/LangDetect
Commit: 17800716c8bcfe33e5508dda955dc39d15863b71
Parents: ee9fdb8
Author: Rodrigo Agerri <ra...@apache.org>
Authored: Wed May 17 22:00:10 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 17 23:12:08 2017 +0200

----------------------------------------------------------------------
 opennlp-distr/README | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/17800716/opennlp-distr/README
----------------------------------------------------------------------
diff --git a/opennlp-distr/README b/opennlp-distr/README
index 975c651..7f9bc4d 100644
--- a/opennlp-distr/README
+++ b/opennlp-distr/README
@@ -36,6 +36,7 @@ Additionally the release contains the following noteworthy changes:
 - Now prefix and suffix feature generators are configurable
 - Remove API in Document Categorizer for user specified tokenizer
 - Learnable lemmatizer now returns all possible lemmas for a given word and pos tag
+- Lemmatizer API backward compatibility break: no need to encode/decode lemmas anymore, now LemmatizerME lemmatize method returns the actual lemma
 - Add stemmer, detokenizer and sentence detection abbreviations for Irish
 - Chunker SequenceValidator signature changed to allow access to both token and POS tag
 

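A hedged illustration of the Lemmatizer API note above: since 1.8.0 the LemmatizerME lemmatize method returns the lemmas directly, with no separate decode step. Model path, tokens, and tags below are made up:

import java.io.File;
import java.io.IOException;

import opennlp.tools.lemmatizer.LemmatizerME;
import opennlp.tools.lemmatizer.LemmatizerModel;

public class LemmatizeSketch {

  public static void main(String[] args) throws IOException {
    // Hypothetical statistical lemmatizer model.
    LemmatizerModel model = new LemmatizerModel(new File("en-lemmatizer.bin"));
    LemmatizerME lemmatizer = new LemmatizerME(model);

    String[] tokens = {"He", "was", "running"};
    String[] tags = {"PRP", "VBD", "VBG"};

    // Returns the actual lemmas (e.g. "run" for "running") instead of encoded lemma classes.
    String[] lemmas = lemmatizer.lemmatize(tokens, tags);
  }
}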

[10/21] opennlp git commit: OPENNLP-1076: Add validation of spans to SentenceSample

Posted by jo...@apache.org.
OPENNLP-1076: Add validation of spans to SentenceSample


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/d378c065
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/d378c065
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/d378c065

Branch: refs/heads/LangDetect
Commit: d378c0656ff2374a867abe0383aa841275a47d8d
Parents: 226612f
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed May 24 12:10:37 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 24 12:10:37 2017 +0200

----------------------------------------------------------------------
 .../main/java/opennlp/tools/sentdetect/SentenceSample.java  | 9 +++++++++
 .../java/opennlp/tools/sentdetect/SentenceSampleTest.java   | 7 ++++++-
 2 files changed, 15 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
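
A short sketch of the new fail-fast behaviour; the document text and spans are the same illustrative values used in the test below:

import opennlp.tools.sentdetect.SentenceSample;
import opennlp.tools.util.Span;

public class SentenceSampleValidationSketch {

  public static void main(String[] args) {
    // Both spans lie inside "1. 2." (length 5), so this sample is accepted.
    new SentenceSample("1. 2.", new Span(0, 2), new Span(3, 5));

    // Span (5, 7) ends past the document text, so the constructor
    // now throws IllegalArgumentException instead of silently accepting it.
    try {
      new SentenceSample("1. 2.", new Span(0, 2), new Span(5, 7));
    } catch (IllegalArgumentException expected) {
      System.out.println("rejected: " + expected.getMessage());
    }
  }
}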


http://git-wip-us.apache.org/repos/asf/opennlp/blob/d378c065/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java
index dbbd193..7891cfd 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/sentdetect/SentenceSample.java
@@ -45,6 +45,15 @@ public class SentenceSample {
   public SentenceSample(CharSequence document, Span... sentences) {
     this.document = document.toString();
     this.sentences = Collections.unmodifiableList(new ArrayList<>(Arrays.asList(sentences)));
+
+    // validate that all spans are inside the document text
+    for (Span sentence : sentences) {
+      if (sentence.getEnd() > document.length()) {
+        throw new IllegalArgumentException(
+            String.format("Sentence span is outside of document text [len %d] and span %s",
+            document.length(), sentence));
+      }
+    }
   }
 
   public SentenceSample(Detokenizer detokenizer, String[][] sentences) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/d378c065/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java
index 163cb73..2ec0978 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/sentdetect/SentenceSampleTest.java
@@ -29,7 +29,6 @@ public class SentenceSampleTest {
 
   @Test
   public void testRetrievingContent() {
-
     SentenceSample sample = new SentenceSample("1. 2.",
         new Span(0, 2), new Span(3, 5));
 
@@ -38,6 +37,12 @@ public class SentenceSampleTest {
     Assert.assertEquals(new Span(3, 5), sample.getSentences()[1]);
   }
 
+  @Test(expected = IllegalArgumentException.class)
+  public void testInvalidSpansFailFast() {
+    SentenceSample sample = new SentenceSample("1. 2.",
+        new Span(0, 2), new Span(5, 7));
+  }
+
   @Test
   public void testEquals() {
     Assert.assertFalse(createGoldSample() == createGoldSample());


[17/21] opennlp git commit: OPENNLP-1085: Add methods to write model to File or Path

Posted by jo...@apache.org.
OPENNLP-1085: Add methods to write model to File or Path


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/ac1e0fd3
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/ac1e0fd3
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/ac1e0fd3

Branch: refs/heads/LangDetect
Commit: ac1e0fd303361803d2ceb789d568e7b43ba25e9d
Parents: cc173c2
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed May 31 23:31:32 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Thu Jun 1 00:21:51 2017 +0200

----------------------------------------------------------------------
 .../main/java/opennlp/tools/util/model/BaseModel.java  | 13 +++++++++++++
 1 file changed, 13 insertions(+)
----------------------------------------------------------------------
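
A minimal sketch of the new serialize overloads; the model and output file names are illustrative, and any concrete model type inherits these methods from BaseModel:

import java.io.File;
import java.io.IOException;
import java.nio.file.Path;
import java.nio.file.Paths;

import opennlp.tools.tokenize.TokenizerModel;

public class SerializeModelSketch {

  public static void main(String[] args) throws IOException {
    // Hypothetical pre-trained model loaded from disk.
    TokenizerModel model = new TokenizerModel(new File("en-token.bin"));

    // New in this change: write the model back out without opening the stream yourself.
    model.serialize(new File("en-token-copy.bin"));

    // The Path overload simply delegates to the File variant.
    Path target = Paths.get("en-token-copy2.bin");
    model.serialize(target);
  }
}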


http://git-wip-us.apache.org/repos/asf/opennlp/blob/ac1e0fd3/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
index f70fb03..31f5079 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/model/BaseModel.java
@@ -18,9 +18,11 @@
 package opennlp.tools.util.model;
 
 import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.File;
 import java.io.FileInputStream;
+import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.ObjectInputStream;
@@ -28,6 +30,7 @@ import java.io.ObjectOutputStream;
 import java.io.OutputStream;
 import java.io.Serializable;
 import java.net.URL;
+import java.nio.file.Path;
 import java.util.HashMap;
 import java.util.Map;
 import java.util.Objects;
@@ -598,6 +601,16 @@ public abstract class BaseModel implements ArtifactProvider, Serializable {
     zip.flush();
   }
 
+  public final void serialize(File model) throws IOException {
+    try (OutputStream out = new BufferedOutputStream(new FileOutputStream(model))) {
+      serialize(out);
+    }
+  }
+
+  public final void serialize(Path model) throws IOException {
+    serialize(model.toFile());
+  }
+
   @SuppressWarnings("unchecked")
   public <T> T getArtifact(String key) {
     Object artifact = artifactMap.get(key);


[05/21] opennlp git commit: [maven-release-plugin] prepare for next development iteration

Posted by jo...@apache.org.
[maven-release-plugin] prepare for next development iteration


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6d2c8fca
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6d2c8fca
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6d2c8fca

Branch: refs/heads/LangDetect
Commit: 6d2c8fca1650892e022a5cf5caec87fcd3c77d9e
Parents: 73c8e5b
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed May 17 23:29:50 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 17 23:29:50 2017 +0200

----------------------------------------------------------------------
 opennlp-brat-annotator/pom.xml   | 2 +-
 opennlp-distr/pom.xml            | 2 +-
 opennlp-docs/pom.xml             | 2 +-
 opennlp-morfologik-addon/pom.xml | 2 +-
 opennlp-tools/pom.xml            | 2 +-
 opennlp-uima/pom.xml             | 2 +-
 pom.xml                          | 4 ++--
 7 files changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/6d2c8fca/opennlp-brat-annotator/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 008fd65..0791e6b 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -17,7 +17,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0</version>
+		<version>1.8.1-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6d2c8fca/opennlp-distr/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-distr/pom.xml b/opennlp-distr/pom.xml
index 317c37f..4428240 100644
--- a/opennlp-distr/pom.xml
+++ b/opennlp-distr/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0</version>
+		<version>1.8.1-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6d2c8fca/opennlp-docs/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/pom.xml b/opennlp-docs/pom.xml
index 6b407b8..312f6b8 100644
--- a/opennlp-docs/pom.xml
+++ b/opennlp-docs/pom.xml
@@ -24,7 +24,7 @@
   <parent>
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0</version>
+	<version>1.8.1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
   

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6d2c8fca/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index 26ba3bd..3ce5e7c 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0</version>
+		<version>1.8.1-SNAPSHOT</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6d2c8fca/opennlp-tools/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index 573861b..a2cf596 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -25,7 +25,7 @@
   <parent>
     <groupId>org.apache.opennlp</groupId>
     <artifactId>opennlp</artifactId>
-    <version>1.8.0</version>
+    <version>1.8.1-SNAPSHOT</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6d2c8fca/opennlp-uima/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-uima/pom.xml b/opennlp-uima/pom.xml
index 1db9c38..d8f5246 100644
--- a/opennlp-uima/pom.xml
+++ b/opennlp-uima/pom.xml
@@ -25,7 +25,7 @@
 	<parent>
 	    <groupId>org.apache.opennlp</groupId>
 	    <artifactId>opennlp</artifactId>
-	    <version>1.8.0</version>
+	    <version>1.8.1-SNAPSHOT</version>
 	    <relativePath>../pom.xml</relativePath>
     </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6d2c8fca/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 737752c..41a0ed4 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
 
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0</version>
+	<version>1.8.1-SNAPSHOT</version>
 	<packaging>pom</packaging>
 
 	<name>Apache OpenNLP Reactor</name>
@@ -40,7 +40,7 @@
 		<connection>scm:git:git@github.com:apache/opennlp.git</connection>
 		<developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/opennlp.git</developerConnection>
 		<url>https://git-wip-us.apache.org/repos/asf?p=opennlp.git</url>
-		<tag>opennlp-1.8.0</tag>
+		<tag>HEAD</tag>
 	</scm>
 
 	<mailingLists>


[19/21] opennlp git commit: OPENNLP-788: Add LanguageDetector tool

Posted by jo...@apache.org.
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
new file mode 100644
index 0000000..7d12581
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageSampleTest.java
@@ -0,0 +1,89 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageSampleTest {
+
+  @Test
+  public void testConstructor() {
+    Language lang = new Language("aLang");
+    CharSequence context = "aContext";
+
+    LanguageSample sample = new LanguageSample(lang, context);
+
+    Assert.assertEquals(lang, sample.getLanguage());
+    Assert.assertEquals(context, sample.getContext());
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void testNullLang() throws Exception {
+    CharSequence context = "aContext";
+
+    new LanguageSample(null, context);
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void testNullContext() {
+    Language lang = new Language("aLang");
+
+    new LanguageSample(lang, null);
+  }
+
+  @Test
+  public void testToString() {
+    Language lang = new Language("aLang");
+    CharSequence context = "aContext";
+
+    LanguageSample sample = new LanguageSample(lang, context);
+
+    Assert.assertEquals(lang.getLang() + "\t" + context, sample.toString());
+  }
+
+  @Test
+  public void testHash() {
+
+    int hashA = new LanguageSample(new Language("aLang"), "aContext").hashCode();
+    int hashB = new LanguageSample(new Language("bLang"), "aContext").hashCode();
+    int hashC = new LanguageSample(new Language("aLang"), "bContext").hashCode();
+
+    Assert.assertNotEquals(hashA, hashB);
+    Assert.assertNotEquals(hashA, hashC);
+    Assert.assertNotEquals(hashB, hashC);
+  }
+
+  @Test
+  public void testEquals() throws Exception {
+
+    LanguageSample sampleA = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample sampleA1 = new LanguageSample(new Language("aLang"), "aContext");
+    LanguageSample sampleB = new LanguageSample(new Language("bLang"), "aContext");
+    LanguageSample sampleC = new LanguageSample(new Language("aLang"), "bContext");
+
+    Assert.assertEquals(sampleA, sampleA);
+    Assert.assertEquals(sampleA, sampleA1);
+    Assert.assertNotEquals(sampleA, sampleB);
+    Assert.assertNotEquals(sampleA, sampleC);
+    Assert.assertNotEquals(sampleB, sampleC);
+    Assert.assertFalse(sampleA.equals("something else"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
new file mode 100644
index 0000000..dc25bc6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageTest.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageTest {
+
+
+  @Test
+  public void emptyConfidence() throws Exception {
+    String languageCode = "aLanguage";
+    Language lang = new Language(languageCode);
+
+    Assert.assertEquals(languageCode, lang.getLang());
+    Assert.assertEquals(0, lang.getConfidence(), 0);
+  }
+
+  @Test
+  public void nonEmptyConfidence() throws Exception {
+    String languageCode = "aLanguage";
+    double confidence = 0.05;
+    Language lang = new Language(languageCode, confidence);
+
+    Assert.assertEquals(languageCode, lang.getLang());
+    Assert.assertEquals(confidence, lang.getConfidence(), 0);
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguage() throws Exception {
+    new Language(null);
+  }
+
+  @Test(expected = NullPointerException.class)
+  public void emptyLanguageConfidence() throws Exception {
+    new Language(null, 0.05);
+  }
+
+  @Test
+  public void testToString() {
+    Language lang = new Language("aLang");
+
+    Assert.assertEquals("aLang (0.0)", lang.toString());
+
+    lang = new Language("aLang", 0.0886678);
+
+    Assert.assertEquals("aLang (0.0886678)", lang.toString());
+  }
+
+
+  @Test
+  public void testHash() {
+    int hashA = new Language("aLang").hashCode();
+    int hashAA = new Language("aLang").hashCode();
+    int hashB = new Language("BLang").hashCode();
+    int hashA5 = new Language("aLang", 5.0).hashCode();
+    int hashA6 = new Language("BLang", 6.0).hashCode();
+
+    Assert.assertEquals(hashA, hashAA);
+
+    Assert.assertNotEquals(hashA, hashB);
+    Assert.assertNotEquals(hashA, hashA5);
+    Assert.assertNotEquals(hashB, hashA5);
+    Assert.assertNotEquals(hashA5, hashA6);
+  }
+
+  @Test
+  public void testEquals() {
+    Language langA = new Language("langA");
+    Language langB = new Language("langB");
+    Language langA5 = new Language("langA5", 5.0);
+    Language langA6 = new Language("langA5", 6.0);
+
+    Assert.assertEquals(langA, langA);
+    Assert.assertEquals(langA5, langA5);
+
+    Assert.assertNotEquals(langA, langA5);
+    Assert.assertNotEquals(langA, langB);
+
+    Assert.assertEquals(langA6, langA5);
+
+    Assert.assertNotEquals(langA, "something else");
+  }
+}
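
For orientation, a small sketch of the Language and LanguageSample value objects these tests cover; the language code and context text are made up:

import opennlp.tools.langdetect.Language;
import opennlp.tools.langdetect.LanguageSample;

public class LanguageSampleSketch {

  public static void main(String[] args) {
    // A detected language carries a code and an optional confidence.
    Language lang = new Language("pob", 0.9886);
    System.out.println(lang);    // "pob (0.9886)", per testToString above

    // A training/evaluation sample pairs a language with the text it labels.
    LanguageSample sample = new LanguageSample(lang, "estava em uma marcenaria");
    System.out.println(sample);  // language code, a tab, then the context text
  }
}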

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..0f8dfe7
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizerTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class EmojiCharSequenceNormalizerTest {
+
+  public EmojiCharSequenceNormalizer normalizer = EmojiCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeEmoji() throws Exception {
+
+    String s = new StringBuilder()
+        .append("Any funny text goes here ")
+        .appendCodePoint(0x1F606)
+        .appendCodePoint(0x1F606)
+        .appendCodePoint(0x1F606)
+        .append(" ")
+        .appendCodePoint(0x1F61B)
+        .toString();
+    Assert.assertEquals(
+        "Any funny text goes here    ", normalizer.normalize(s));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..50b1f0c
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizerTest.java
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class NumberCharSequenceNormalizerTest {
+
+  public NumberCharSequenceNormalizer normalizer = NumberCharSequenceNormalizer.getInstance();
+
+
+  @Test
+  public void normalize() throws Exception {
+    Assert.assertEquals("absc  ,  abcd", normalizer.normalize("absc 123,0123 abcd"));
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..95cf300
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizerTest.java
@@ -0,0 +1,41 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class ShrinkCharSequenceNormalizerTest {
+
+  public ShrinkCharSequenceNormalizer normalizer = ShrinkCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeSpace() throws Exception {
+    Assert.assertEquals(
+        "a text extra space", normalizer.normalize("a text    extra space"));
+  }
+
+  @Test
+  public void normalizeChar() throws Exception {
+    Assert.assertEquals("Helloo", normalizer.normalize("Helllllloooooo"));
+    Assert.assertEquals("Hello", normalizer.normalize("Hello"));
+    Assert.assertEquals("HHello", normalizer.normalize("HHello"));
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..f0bd517
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizerTest.java
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class TwitterCharSequenceNormalizerTest {
+
+  public TwitterCharSequenceNormalizer normalizer = TwitterCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeHashtag() throws Exception {
+    Assert.assertEquals("asdf   2nnfdf", normalizer.normalize("asdf #hasdk23 2nnfdf"));
+  }
+
+  @Test
+  public void normalizeUser() throws Exception {
+    Assert.assertEquals("asdf   2nnfdf", normalizer.normalize("asdf @hasdk23 2nnfdf"));
+  }
+
+  @Test
+  public void normalizeRT() throws Exception {
+    Assert.assertEquals(" 2nnfdf", normalizer.normalize("RT RT RT 2nnfdf"));
+  }
+
+  @Test
+  public void normalizeLaugh() throws Exception {
+    Assert.assertEquals("ahahah", normalizer.normalize("ahahahah"));
+    Assert.assertEquals("haha", normalizer.normalize("hahha"));
+    Assert.assertEquals("haha", normalizer.normalize("hahaa"));
+    Assert.assertEquals("ahaha", normalizer.normalize("ahahahahhahahhahahaaaa"));
+    Assert.assertEquals("jaja", normalizer.normalize("jajjajajaja"));
+  }
+
+
+
+  @Test
+  public void normalizeFace() throws Exception {
+    Assert.assertEquals("hello   hello", normalizer.normalize("hello :-) hello"));
+    Assert.assertEquals("hello   hello", normalizer.normalize("hello ;) hello"));
+    Assert.assertEquals("  hello", normalizer.normalize(":) hello"));
+    Assert.assertEquals("hello  ", normalizer.normalize("hello :P"));
+  }
+
+}
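
A combined usage sketch for the normalizers exercised above; the input text is made up, each normalizer is a singleton obtained via getInstance(), and the CharSequence signatures are assumed from the tests:

import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;

public class NormalizerSketch {

  public static void main(String[] args) {
    String text = "RT @someone soooo funny #lol :-)";

    // Strip retweet markers, user mentions, hashtags and smileys ...
    CharSequence t = TwitterCharSequenceNormalizer.getInstance().normalize(text);
    // ... then collapse repeated characters and extra whitespace.
    t = ShrinkCharSequenceNormalizer.getInstance().normalize(t.toString());

    System.out.println(t);
  }
}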

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..f654c74
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizerTest.java
@@ -0,0 +1,263 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+/**
+ * Tests for the @{@link UnicodeCharSequenceNormalizer} based on
+ * https://github.com/shuyo/language-detection
+ */
+public class UnicodeCharSequenceNormalizerTest {
+
+  public UnicodeCharSequenceNormalizer normalizer = UnicodeCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void getMessage() throws Exception {
+    Assert.assertEquals("\u4F7C\u6934", UnicodeCharSequenceNormalizer.getMessage("NGram.KANJI_1_0"));
+    Assert.assertEquals("!blah!", UnicodeCharSequenceNormalizer.getMessage("blah"));
+  }
+
+  @Test
+  public final void testNormalize() {
+    Assert.assertEquals("a b c d á é í ó ú ã",
+        normalizer.normalize("a b c d á é í ó ú ã"));
+
+  }
+
+  /**
+   * Test method for {@link UnicodeCharSequenceNormalizer#normalize(char)} with Latin characters
+   */
+  @Test
+  public final void testNormalizeWithLatin() {
+    Assert.assertEquals(' ', normalizer.normalize('\u0000'));
+    Assert.assertEquals(' ', normalizer.normalize('\u0020'));
+    Assert.assertEquals(' ', normalizer.normalize('\u0030'));
+    Assert.assertEquals(' ', normalizer.normalize('\u0040'));
+    Assert.assertEquals('\u0041', normalizer.normalize('\u0041'));
+    Assert.assertEquals('\u005a', normalizer.normalize('\u005a'));
+    Assert.assertEquals(' ', normalizer.normalize('\u005b'));
+    Assert.assertEquals(' ', normalizer.normalize('\u0060'));
+    Assert.assertEquals('\u0061', normalizer.normalize('\u0061'));
+    Assert.assertEquals('\u007a', normalizer.normalize('\u007a'));
+    Assert.assertEquals(' ', normalizer.normalize('\u007b'));
+    Assert.assertEquals(' ', normalizer.normalize('\u007f'));
+    Assert.assertEquals('\u0080', normalizer.normalize('\u0080'));
+    Assert.assertEquals(' ', normalizer.normalize('\u00a0'));
+    Assert.assertEquals('\u00a1', normalizer.normalize('\u00a1'));
+    // LATIN_EXTENDED_ADDITIONAL
+    Assert.assertEquals('\u1ec3', normalizer.normalize('\u1EA0'));
+    Assert.assertEquals('\u1ec3', normalizer.normalize('\u1EA1'));
+
+    Assert.assertEquals(' ', normalizer.normalize('\u2012'));
+    // Arabic
+    Assert.assertEquals('\u064a', normalizer.normalize('\u06cc'));
+    // Hiragana
+    Assert.assertEquals('\u3042', normalizer.normalize('\u3041'));
+    // Katakana
+    Assert.assertEquals('\u30a2', normalizer.normalize('\u30A1'));
+    // Bopomofo
+    Assert.assertEquals('\u3105', normalizer.normalize('\u31A0'));
+    // Bopomofo Ex
+    Assert.assertEquals('\u3105', normalizer.normalize('\u3106'));
+    //HANGUL_SYLLABLES
+    Assert.assertEquals('\uac00', normalizer.normalize('\uAC01'));
+  }
+
+  /**
+   * Test method for {@link UnicodeCharSequenceNormalizer#normalize(char)} with CJK Kanji characters
+   */
+  @Test
+  public final void testNormalizeWithCJKKanji() {
+    Assert.assertEquals('\u4E00', normalizer.normalize('\u4E00'));
+    Assert.assertEquals('\u4E01', normalizer.normalize('\u4E01'));
+    Assert.assertEquals('\u4E02', normalizer.normalize('\u4E02'));
+    Assert.assertEquals('\u4E01', normalizer.normalize('\u4E03'));
+    Assert.assertEquals('\u4E04', normalizer.normalize('\u4E04'));
+    Assert.assertEquals('\u4E05', normalizer.normalize('\u4E05'));
+    Assert.assertEquals('\u4E06', normalizer.normalize('\u4E06'));
+    Assert.assertEquals('\u4E07', normalizer.normalize('\u4E07'));
+    Assert.assertEquals('\u4E08', normalizer.normalize('\u4E08'));
+    Assert.assertEquals('\u4E09', normalizer.normalize('\u4E09'));
+    Assert.assertEquals('\u4E10', normalizer.normalize('\u4E10'));
+    Assert.assertEquals('\u4E11', normalizer.normalize('\u4E11'));
+    Assert.assertEquals('\u4E12', normalizer.normalize('\u4E12'));
+    Assert.assertEquals('\u4E13', normalizer.normalize('\u4E13'));
+    Assert.assertEquals('\u4E14', normalizer.normalize('\u4E14'));
+    Assert.assertEquals('\u4E15', normalizer.normalize('\u4E15'));
+    Assert.assertEquals('\u4E1e', normalizer.normalize('\u4E1e'));
+    Assert.assertEquals('\u4E1f', normalizer.normalize('\u4E1f'));
+    Assert.assertEquals('\u4E20', normalizer.normalize('\u4E20'));
+    Assert.assertEquals('\u4E21', normalizer.normalize('\u4E21'));
+    Assert.assertEquals('\u4E22', normalizer.normalize('\u4E22'));
+    Assert.assertEquals('\u4E23', normalizer.normalize('\u4E23'));
+    Assert.assertEquals('\u4E13', normalizer.normalize('\u4E24'));
+    Assert.assertEquals('\u4E13', normalizer.normalize('\u4E25'));
+    Assert.assertEquals('\u4E30', normalizer.normalize('\u4E30'));
+  }
+
+
+  /**
+   * Test method for {@link UnicodeCharSequenceNormalizer#normalize(char)} for Romanian characters
+   */
+  @Test
+  public final void testNormalizeForRomanian() {
+    Assert.assertEquals('\u015f', normalizer.normalize('\u015f'));
+    Assert.assertEquals('\u0163', normalizer.normalize('\u0163'));
+    Assert.assertEquals('\u015f', normalizer.normalize('\u0219'));
+    Assert.assertEquals('\u0163', normalizer.normalize('\u021b'));
+  }
+
+  /**
+   * Test method for {@link UnicodeCharSequenceNormalizer#normalize_vi(CharSequence)}
+   */
+  @Test
+  public final void testNormalizeVietnamese() {
+    Assert.assertEquals("", normalizer.normalize_vi(""));
+    Assert.assertEquals("ABC", normalizer.normalize_vi("ABC"));
+    Assert.assertEquals("012", normalizer.normalize_vi("012"));
+    Assert.assertEquals("\u00c0", normalizer.normalize_vi("\u00c0"));
+
+    Assert.assertEquals("\u00C0", normalizer.normalize_vi("\u0041\u0300"));
+    Assert.assertEquals("\u00C8", normalizer.normalize_vi("\u0045\u0300"));
+    Assert.assertEquals("\u00CC", normalizer.normalize_vi("\u0049\u0300"));
+    Assert.assertEquals("\u00D2", normalizer.normalize_vi("\u004F\u0300"));
+    Assert.assertEquals("\u00D9", normalizer.normalize_vi("\u0055\u0300"));
+    Assert.assertEquals("\u1EF2", normalizer.normalize_vi("\u0059\u0300"));
+    Assert.assertEquals("\u00E0", normalizer.normalize_vi("\u0061\u0300"));
+    Assert.assertEquals("\u00E8", normalizer.normalize_vi("\u0065\u0300"));
+    Assert.assertEquals("\u00EC", normalizer.normalize_vi("\u0069\u0300"));
+    Assert.assertEquals("\u00F2", normalizer.normalize_vi("\u006F\u0300"));
+    Assert.assertEquals("\u00F9", normalizer.normalize_vi("\u0075\u0300"));
+    Assert.assertEquals("\u1EF3", normalizer.normalize_vi("\u0079\u0300"));
+    Assert.assertEquals("\u1EA6", normalizer.normalize_vi("\u00C2\u0300"));
+    Assert.assertEquals("\u1EC0", normalizer.normalize_vi("\u00CA\u0300"));
+    Assert.assertEquals("\u1ED2", normalizer.normalize_vi("\u00D4\u0300"));
+    Assert.assertEquals("\u1EA7", normalizer.normalize_vi("\u00E2\u0300"));
+    Assert.assertEquals("\u1EC1", normalizer.normalize_vi("\u00EA\u0300"));
+    Assert.assertEquals("\u1ED3", normalizer.normalize_vi("\u00F4\u0300"));
+    Assert.assertEquals("\u1EB0", normalizer.normalize_vi("\u0102\u0300"));
+    Assert.assertEquals("\u1EB1", normalizer.normalize_vi("\u0103\u0300"));
+    Assert.assertEquals("\u1EDC", normalizer.normalize_vi("\u01A0\u0300"));
+    Assert.assertEquals("\u1EDD", normalizer.normalize_vi("\u01A1\u0300"));
+    Assert.assertEquals("\u1EEA", normalizer.normalize_vi("\u01AF\u0300"));
+    Assert.assertEquals("\u1EEB", normalizer.normalize_vi("\u01B0\u0300"));
+
+    Assert.assertEquals("\u00C1", normalizer.normalize_vi("\u0041\u0301"));
+    Assert.assertEquals("\u00C9", normalizer.normalize_vi("\u0045\u0301"));
+    Assert.assertEquals("\u00CD", normalizer.normalize_vi("\u0049\u0301"));
+    Assert.assertEquals("\u00D3", normalizer.normalize_vi("\u004F\u0301"));
+    Assert.assertEquals("\u00DA", normalizer.normalize_vi("\u0055\u0301"));
+    Assert.assertEquals("\u00DD", normalizer.normalize_vi("\u0059\u0301"));
+    Assert.assertEquals("\u00E1", normalizer.normalize_vi("\u0061\u0301"));
+    Assert.assertEquals("\u00E9", normalizer.normalize_vi("\u0065\u0301"));
+    Assert.assertEquals("\u00ED", normalizer.normalize_vi("\u0069\u0301"));
+    Assert.assertEquals("\u00F3", normalizer.normalize_vi("\u006F\u0301"));
+    Assert.assertEquals("\u00FA", normalizer.normalize_vi("\u0075\u0301"));
+    Assert.assertEquals("\u00FD", normalizer.normalize_vi("\u0079\u0301"));
+    Assert.assertEquals("\u1EA4", normalizer.normalize_vi("\u00C2\u0301"));
+    Assert.assertEquals("\u1EBE", normalizer.normalize_vi("\u00CA\u0301"));
+    Assert.assertEquals("\u1ED0", normalizer.normalize_vi("\u00D4\u0301"));
+    Assert.assertEquals("\u1EA5", normalizer.normalize_vi("\u00E2\u0301"));
+    Assert.assertEquals("\u1EBF", normalizer.normalize_vi("\u00EA\u0301"));
+    Assert.assertEquals("\u1ED1", normalizer.normalize_vi("\u00F4\u0301"));
+    Assert.assertEquals("\u1EAE", normalizer.normalize_vi("\u0102\u0301"));
+    Assert.assertEquals("\u1EAF", normalizer.normalize_vi("\u0103\u0301"));
+    Assert.assertEquals("\u1EDA", normalizer.normalize_vi("\u01A0\u0301"));
+    Assert.assertEquals("\u1EDB", normalizer.normalize_vi("\u01A1\u0301"));
+    Assert.assertEquals("\u1EE8", normalizer.normalize_vi("\u01AF\u0301"));
+    Assert.assertEquals("\u1EE9", normalizer.normalize_vi("\u01B0\u0301"));
+
+    Assert.assertEquals("\u00C3", normalizer.normalize_vi("\u0041\u0303"));
+    Assert.assertEquals("\u1EBC", normalizer.normalize_vi("\u0045\u0303"));
+    Assert.assertEquals("\u0128", normalizer.normalize_vi("\u0049\u0303"));
+    Assert.assertEquals("\u00D5", normalizer.normalize_vi("\u004F\u0303"));
+    Assert.assertEquals("\u0168", normalizer.normalize_vi("\u0055\u0303"));
+    Assert.assertEquals("\u1EF8", normalizer.normalize_vi("\u0059\u0303"));
+    Assert.assertEquals("\u00E3", normalizer.normalize_vi("\u0061\u0303"));
+    Assert.assertEquals("\u1EBD", normalizer.normalize_vi("\u0065\u0303"));
+    Assert.assertEquals("\u0129", normalizer.normalize_vi("\u0069\u0303"));
+    Assert.assertEquals("\u00F5", normalizer.normalize_vi("\u006F\u0303"));
+    Assert.assertEquals("\u0169", normalizer.normalize_vi("\u0075\u0303"));
+    Assert.assertEquals("\u1EF9", normalizer.normalize_vi("\u0079\u0303"));
+    Assert.assertEquals("\u1EAA", normalizer.normalize_vi("\u00C2\u0303"));
+    Assert.assertEquals("\u1EC4", normalizer.normalize_vi("\u00CA\u0303"));
+    Assert.assertEquals("\u1ED6", normalizer.normalize_vi("\u00D4\u0303"));
+    Assert.assertEquals("\u1EAB", normalizer.normalize_vi("\u00E2\u0303"));
+    Assert.assertEquals("\u1EC5", normalizer.normalize_vi("\u00EA\u0303"));
+    Assert.assertEquals("\u1ED7", normalizer.normalize_vi("\u00F4\u0303"));
+    Assert.assertEquals("\u1EB4", normalizer.normalize_vi("\u0102\u0303"));
+    Assert.assertEquals("\u1EB5", normalizer.normalize_vi("\u0103\u0303"));
+    Assert.assertEquals("\u1EE0", normalizer.normalize_vi("\u01A0\u0303"));
+    Assert.assertEquals("\u1EE1", normalizer.normalize_vi("\u01A1\u0303"));
+    Assert.assertEquals("\u1EEE", normalizer.normalize_vi("\u01AF\u0303"));
+    Assert.assertEquals("\u1EEF", normalizer.normalize_vi("\u01B0\u0303"));
+
+    Assert.assertEquals("\u1EA2", normalizer.normalize_vi("\u0041\u0309"));
+    Assert.assertEquals("\u1EBA", normalizer.normalize_vi("\u0045\u0309"));
+    Assert.assertEquals("\u1EC8", normalizer.normalize_vi("\u0049\u0309"));
+    Assert.assertEquals("\u1ECE", normalizer.normalize_vi("\u004F\u0309"));
+    Assert.assertEquals("\u1EE6", normalizer.normalize_vi("\u0055\u0309"));
+    Assert.assertEquals("\u1EF6", normalizer.normalize_vi("\u0059\u0309"));
+    Assert.assertEquals("\u1EA3", normalizer.normalize_vi("\u0061\u0309"));
+    Assert.assertEquals("\u1EBB", normalizer.normalize_vi("\u0065\u0309"));
+    Assert.assertEquals("\u1EC9", normalizer.normalize_vi("\u0069\u0309"));
+    Assert.assertEquals("\u1ECF", normalizer.normalize_vi("\u006F\u0309"));
+    Assert.assertEquals("\u1EE7", normalizer.normalize_vi("\u0075\u0309"));
+    Assert.assertEquals("\u1EF7", normalizer.normalize_vi("\u0079\u0309"));
+    Assert.assertEquals("\u1EA8", normalizer.normalize_vi("\u00C2\u0309"));
+    Assert.assertEquals("\u1EC2", normalizer.normalize_vi("\u00CA\u0309"));
+    Assert.assertEquals("\u1ED4", normalizer.normalize_vi("\u00D4\u0309"));
+    Assert.assertEquals("\u1EA9", normalizer.normalize_vi("\u00E2\u0309"));
+    Assert.assertEquals("\u1EC3", normalizer.normalize_vi("\u00EA\u0309"));
+    Assert.assertEquals("\u1ED5", normalizer.normalize_vi("\u00F4\u0309"));
+    Assert.assertEquals("\u1EB2", normalizer.normalize_vi("\u0102\u0309"));
+    Assert.assertEquals("\u1EB3", normalizer.normalize_vi("\u0103\u0309"));
+    Assert.assertEquals("\u1EDE", normalizer.normalize_vi("\u01A0\u0309"));
+    Assert.assertEquals("\u1EDF", normalizer.normalize_vi("\u01A1\u0309"));
+    Assert.assertEquals("\u1EEC", normalizer.normalize_vi("\u01AF\u0309"));
+    Assert.assertEquals("\u1EED", normalizer.normalize_vi("\u01B0\u0309"));
+
+    Assert.assertEquals("\u1EA0", normalizer.normalize_vi("\u0041\u0323"));
+    Assert.assertEquals("\u1EB8", normalizer.normalize_vi("\u0045\u0323"));
+    Assert.assertEquals("\u1ECA", normalizer.normalize_vi("\u0049\u0323"));
+    Assert.assertEquals("\u1ECC", normalizer.normalize_vi("\u004F\u0323"));
+    Assert.assertEquals("\u1EE4", normalizer.normalize_vi("\u0055\u0323"));
+    Assert.assertEquals("\u1EF4", normalizer.normalize_vi("\u0059\u0323"));
+    Assert.assertEquals("\u1EA1", normalizer.normalize_vi("\u0061\u0323"));
+    Assert.assertEquals("\u1EB9", normalizer.normalize_vi("\u0065\u0323"));
+    Assert.assertEquals("\u1ECB", normalizer.normalize_vi("\u0069\u0323"));
+    Assert.assertEquals("\u1ECD", normalizer.normalize_vi("\u006F\u0323"));
+    Assert.assertEquals("\u1EE5", normalizer.normalize_vi("\u0075\u0323"));
+    Assert.assertEquals("\u1EF5", normalizer.normalize_vi("\u0079\u0323"));
+    Assert.assertEquals("\u1EAC", normalizer.normalize_vi("\u00C2\u0323"));
+    Assert.assertEquals("\u1EC6", normalizer.normalize_vi("\u00CA\u0323"));
+    Assert.assertEquals("\u1ED8", normalizer.normalize_vi("\u00D4\u0323"));
+    Assert.assertEquals("\u1EAD", normalizer.normalize_vi("\u00E2\u0323"));
+    Assert.assertEquals("\u1EC7", normalizer.normalize_vi("\u00EA\u0323"));
+    Assert.assertEquals("\u1ED9", normalizer.normalize_vi("\u00F4\u0323"));
+    Assert.assertEquals("\u1EB6", normalizer.normalize_vi("\u0102\u0323"));
+    Assert.assertEquals("\u1EB7", normalizer.normalize_vi("\u0103\u0323"));
+    Assert.assertEquals("\u1EE2", normalizer.normalize_vi("\u01A0\u0323"));
+    Assert.assertEquals("\u1EE3", normalizer.normalize_vi("\u01A1\u0323"));
+    Assert.assertEquals("\u1EF0", normalizer.normalize_vi("\u01AF\u0323"));
+    Assert.assertEquals("\u1EF1", normalizer.normalize_vi("\u01B0\u0323"));
+
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
new file mode 100644
index 0000000..72eb83a
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizerTest.java
@@ -0,0 +1,47 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.util.normalizer;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class UrlCharSequenceNormalizerTest {
+
+  public UrlCharSequenceNormalizer normalizer = UrlCharSequenceNormalizer.getInstance();
+
+  @Test
+  public void normalizeUrl() throws Exception {
+    Assert.assertEquals(
+        "asdf   2nnfdf", normalizer.normalize("asdf http://asdf.com/dfa/cxs 2nnfdf"));
+
+
+    Assert.assertEquals(
+        "asdf   2nnfdf  ", normalizer.normalize("asdf http://asdf.com/dfa/cx" +
+            "s 2nnfdf http://asdf.com/dfa/cxs"));
+  }
+
+  @Test
+  public void normalizeEmail() throws Exception {
+    Assert.assertEquals(
+        "asdf   2nnfdf", normalizer.normalize("asdf asd.fdfa@hasdk23.com.br 2nnfdf"));
+    Assert.assertEquals(
+        "asdf   2nnfdf  ", normalizer.normalize("asdf asd.fdfa@hasdk23.com.br" +
+            " 2nnfdf asd.fdfa@hasdk23.com.br"));
+  }
+}


[04/21] opennlp git commit: [maven-release-plugin] prepare release opennlp-1.8.0

Posted by jo...@apache.org.
[maven-release-plugin] prepare release opennlp-1.8.0


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/73c8e5b9
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/73c8e5b9
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/73c8e5b9

Branch: refs/heads/LangDetect
Commit: 73c8e5b9d8e055fefb53f7f3c2487d05c9788c6a
Parents: 1780071
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed May 17 23:19:47 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 17 23:19:47 2017 +0200

----------------------------------------------------------------------
 opennlp-brat-annotator/pom.xml   | 2 +-
 opennlp-distr/pom.xml            | 2 +-
 opennlp-docs/pom.xml             | 2 +-
 opennlp-morfologik-addon/pom.xml | 2 +-
 opennlp-tools/pom.xml            | 2 +-
 opennlp-uima/pom.xml             | 2 +-
 pom.xml                          | 4 ++--
 7 files changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/73c8e5b9/opennlp-brat-annotator/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-brat-annotator/pom.xml b/opennlp-brat-annotator/pom.xml
index 6c7be0d..008fd65 100644
--- a/opennlp-brat-annotator/pom.xml
+++ b/opennlp-brat-annotator/pom.xml
@@ -17,7 +17,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0-SNAPSHOT</version>
+		<version>1.8.0</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/73c8e5b9/opennlp-distr/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-distr/pom.xml b/opennlp-distr/pom.xml
index 3f838cd..317c37f 100644
--- a/opennlp-distr/pom.xml
+++ b/opennlp-distr/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0-SNAPSHOT</version>
+		<version>1.8.0</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/73c8e5b9/opennlp-docs/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/pom.xml b/opennlp-docs/pom.xml
index fbf0b5c..6b407b8 100644
--- a/opennlp-docs/pom.xml
+++ b/opennlp-docs/pom.xml
@@ -24,7 +24,7 @@
   <parent>
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0-SNAPSHOT</version>
+	<version>1.8.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
   

http://git-wip-us.apache.org/repos/asf/opennlp/blob/73c8e5b9/opennlp-morfologik-addon/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-morfologik-addon/pom.xml b/opennlp-morfologik-addon/pom.xml
index 612b27b..26ba3bd 100644
--- a/opennlp-morfologik-addon/pom.xml
+++ b/opennlp-morfologik-addon/pom.xml
@@ -24,7 +24,7 @@
 	<parent>
 		<groupId>org.apache.opennlp</groupId>
 		<artifactId>opennlp</artifactId>
-		<version>1.8.0-SNAPSHOT</version>
+		<version>1.8.0</version>
 		<relativePath>../pom.xml</relativePath>
 	</parent>
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/73c8e5b9/opennlp-tools/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/pom.xml b/opennlp-tools/pom.xml
index a499375..573861b 100644
--- a/opennlp-tools/pom.xml
+++ b/opennlp-tools/pom.xml
@@ -25,7 +25,7 @@
   <parent>
     <groupId>org.apache.opennlp</groupId>
     <artifactId>opennlp</artifactId>
-    <version>1.8.0-SNAPSHOT</version>
+    <version>1.8.0</version>
     <relativePath>../pom.xml</relativePath>
   </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/73c8e5b9/opennlp-uima/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-uima/pom.xml b/opennlp-uima/pom.xml
index 7cfdb72..1db9c38 100644
--- a/opennlp-uima/pom.xml
+++ b/opennlp-uima/pom.xml
@@ -25,7 +25,7 @@
 	<parent>
 	    <groupId>org.apache.opennlp</groupId>
 	    <artifactId>opennlp</artifactId>
-	    <version>1.8.0-SNAPSHOT</version>
+	    <version>1.8.0</version>
 	    <relativePath>../pom.xml</relativePath>
     </parent>
     

http://git-wip-us.apache.org/repos/asf/opennlp/blob/73c8e5b9/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index 695b95c..737752c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -31,7 +31,7 @@
 
 	<groupId>org.apache.opennlp</groupId>
 	<artifactId>opennlp</artifactId>
-	<version>1.8.0-SNAPSHOT</version>
+	<version>1.8.0</version>
 	<packaging>pom</packaging>
 
 	<name>Apache OpenNLP Reactor</name>
@@ -40,7 +40,7 @@
 		<connection>scm:git:git@github.com:apache/opennlp.git</connection>
 		<developerConnection>scm:git:https://git-wip-us.apache.org/repos/asf/opennlp.git</developerConnection>
 		<url>https://git-wip-us.apache.org/repos/asf?p=opennlp.git</url>
-		<tag>HEAD</tag>
+		<tag>opennlp-1.8.0</tag>
 	</scm>
 
 	<mailingLists>


[16/21] opennlp git commit: OPENNLP-1083: Conll-U Sample contraction handling

Posted by jo...@apache.org.
OPENNLP-1083: Conll-U Sample contraction handling

closes apache/opennlp#222


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/cc173c2e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/cc173c2e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/cc173c2e

Branch: refs/heads/LangDetect
Commit: cc173c2e4d47d6ee49b4b6050a0fea779d691429
Parents: f418eed
Author: William D C M SILVA <co...@apache.org>
Authored: Tue May 30 12:56:20 2017 -0300
Committer: William D C M SILVA <co...@apache.org>
Committed: Tue May 30 12:56:20 2017 -0300

----------------------------------------------------------------------
 .../tools/formats/conllu/ConlluStream.java      | 86 ++++++++++++++++++++
 .../formats/conllu/ConlluTokenSampleStream.java | 11 +--
 .../tools/formats/conllu/ConlluWordLine.java    | 14 ++++
 .../conllu/ConlluLemmaSampleStreamTest.java     | 49 +++++++++++
 .../conllu/ConlluPOSSampleStreamTest.java       | 77 ++++++++++++++++++
 .../conllu/ConlluTokenSampleStreamTest.java     | 51 +++++++++++-
 .../tools/formats/conllu/es-ud-sample.conllu    | 62 ++++++++++++++
 .../tools/formats/conllu/pt_br-ud-sample.conllu | 76 +++++++++++++++++
 8 files changed, 417 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
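As a rough illustration of what this change means for consumers of the stream: a multi-word token line such as "1-3 Digámoslo" is now kept as a single token, and its lemma, UPOS, XPOS and feats are the values of its expanded parts joined with '+'. The sketch below mirrors the ConlluLemmaSampleStreamTest added by this commit; the class name ConlluContractionSketch and the main method are illustrative only, and it assumes it lives in the opennlp.tools.formats.conllu package with the es-ud-sample.conllu test resource (and the ResourceAsStreamFactory test helper) on the classpath. It is a sketch, not part of the commit.

    package opennlp.tools.formats.conllu;

    import opennlp.tools.formats.ResourceAsStreamFactory;
    import opennlp.tools.lemmatizer.LemmaSample;
    import opennlp.tools.util.InputStreamFactory;
    import opennlp.tools.util.ObjectStream;

    // Illustrative sketch only, not part of the commit.
    public class ConlluContractionSketch {

      public static void main(String[] args) throws Exception {
        InputStreamFactory in =
            new ResourceAsStreamFactory(ConlluContractionSketch.class, "es-ud-sample.conllu");

        try (ObjectStream<LemmaSample> lemmas =
                 new ConlluLemmaSampleStream(new ConlluStream(in), ConlluTagset.U)) {
          LemmaSample sample = lemmas.read();
          // The contraction "Digámoslo" (word lines 1-3) stays a single token whose lemma
          // is the expanded parts joined with '+', i.e. "digám+tú+él" per the new test below.
          System.out.println(sample.getTokens()[0] + " -> " + sample.getLemmas()[0]);
        }
      }
    }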


http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
index cbac450..4dd204f 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
@@ -22,7 +22,10 @@ import java.io.IOException;
 import java.io.StringReader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
+import java.util.HashMap;
 import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
 
 import opennlp.tools.util.InputStreamFactory;
 import opennlp.tools.util.ObjectStream;
@@ -81,12 +84,95 @@ public class ConlluStream implements ObjectStream<ConlluSentence> {
         }
       }
 
+      wordLines = postProcessContractions(wordLines);
+
       return new ConlluSentence(wordLines, sentenceId, text);
     }
 
     return null;
   }
 
+  private List<ConlluWordLine> postProcessContractions(List<ConlluWordLine> lines) {
+
+
+    // 1. Find contractions
+    Map<String, Integer> index = new HashMap<>();
+    Map<String, List<String>> contractions = new HashMap<>();
+    List<String> linesToDelete = new ArrayList<>();
+
+    for (int i = 0; i < lines.size(); i++) {
+      ConlluWordLine line = lines.get(i);
+      index.put(line.getId(), i);
+      if (line.getId().contains("-")) {
+        List<String> expandedContractions = new ArrayList<>();
+        String[] ids = line.getId().split("-");
+        int start = Integer.parseInt(ids[0]);
+        int end = Integer.parseInt(ids[1]);
+        for (int j = start; j <= end; j++) {
+          String js = Integer.toString(j);
+          expandedContractions.add(js);
+          linesToDelete.add(js);
+        }
+        contractions.put(line.getId(), expandedContractions);
+      }
+    }
+
+    // 2. Merge annotation
+    for (String contractionId: contractions.keySet()) {
+      ConlluWordLine contraction = lines.get(index.get(contractionId));
+      List<ConlluWordLine> expandedParts = new ArrayList<>();
+      for (String id : contractions.get(contractionId)) {
+        expandedParts.add(lines.get(index.get(id)));
+      }
+      ConlluWordLine merged = mergeAnnotation(contraction, expandedParts);
+      lines.set(index.get(contractionId), merged);
+    }
+
+    // 3. Delete the expanded parts
+    for (int i = linesToDelete.size() - 1; i >= 0; i--) {
+      lines.remove(index.get(linesToDelete.get(i)).intValue());
+    }
+    return lines;
+  }
+
+  /**
+   * Merges the token-level annotations of a contraction with those of its expanded parts.
+   * @param contraction the line that receives the merged annotation
+   * @param expandedParts the expanded lines whose annotation is merged into the contraction
+   * @return the merged line
+   */
+  private ConlluWordLine mergeAnnotation(ConlluWordLine contraction,
+                                         List<ConlluWordLine> expandedParts) {
+    String id = contraction.getId();
+    String form = contraction.getForm();
+    String lemma = expandedParts.stream()
+        .filter(p -> !"_".equals(p.getLemma()))
+        .map(p -> p.getLemma())
+        .collect(Collectors.joining("+"));
+
+    String uPosTag = expandedParts.stream()
+        .filter(p -> !"_".equals(p.getPosTag(ConlluTagset.U)))
+        .map(p -> p.getPosTag(ConlluTagset.U))
+        .collect(Collectors.joining("+"));
+
+    String xPosTag = expandedParts.stream()
+        .filter(p -> !"_".equals(p.getPosTag(ConlluTagset.X)))
+        .map(p -> p.getPosTag(ConlluTagset.X))
+        .collect(Collectors.joining("+"));
+
+    String feats = expandedParts.stream()
+        .filter(p -> !"_".equals(p.getFeats()))
+        .map(p -> p.getFeats())
+        .collect(Collectors.joining("+"));
+
+    String head = contraction.getHead();
+    String deprel = contraction.getDeprel();
+    String deps = contraction.getDeps();
+    String misc = contraction.getMisc();
+
+    return new ConlluWordLine(id, form, lemma, uPosTag, xPosTag, feats, head, deprel, deps, misc);
+  }
+
   @Override
   public void close() throws IOException {
     sentenceStream.close();

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
index a9ad937..bc6907b 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
@@ -53,15 +53,12 @@ public class ConlluTokenSampleStream extends FilterObjectStream<ConlluSentence,
                 token, sentence.getSentenceIdComment(), text));
           }
 
-          int charAfterTokenIndex = tokenIndex + token.length();
-          if (charAfterTokenIndex < text.length()) {
-            if (!StringUtil.isWhitespace(text.charAt(charAfterTokenIndex))) {
-              text.insert(charAfterTokenIndex,
+          searchIndex = tokenIndex + token.length();
+          if (searchIndex < text.length()) {
+            if (!StringUtil.isWhitespace(text.charAt(searchIndex))) {
+              text.insert(searchIndex,
                   TokenSample.DEFAULT_SEPARATOR_CHARS);
-              searchIndex += TokenSample.DEFAULT_SEPARATOR_CHARS.length();
             }
-
-            searchIndex += token.length();
           }
         }
         return TokenSample.parse(text.toString(), TokenSample.DEFAULT_SEPARATOR_CHARS);

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
index 9881bf1..4e626be 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluWordLine.java
@@ -32,6 +32,20 @@ public class ConlluWordLine {
   private final String deps;
   private final String misc;
 
+  ConlluWordLine(String id, String form, String lemma, String uPosTag, String xPosTag,
+                 String feats, String head, String deprel, String deps, String misc) {
+    this.id = id;
+    this.form = form;
+    this.lemma = lemma;
+    this.uPosTag = uPosTag;
+    this.xPosTag = xPosTag;
+    this.feats = feats;
+    this.head = head;
+    this.deprel = deprel;
+    this.deps = deps;
+    this.misc = misc;
+  }
+
   ConlluWordLine(String line) throws InvalidFormatException {
 
     String[] fields = line.split("\t");

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java
new file mode 100644
index 0000000..5d58cf1
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamTest.java
@@ -0,0 +1,49 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.lemmatizer.LemmaSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluLemmaSampleStreamTest {
+
+
+  @Test
+  public void testParseSpanishS300() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "es-ud-sample.conllu");
+
+    try (ObjectStream<LemmaSample> stream = new ConlluLemmaSampleStream(
+        new ConlluStream(streamFactory), ConlluTagset.U)) {
+
+      LemmaSample predicted = stream.read();
+      System.out.println(predicted);
+      Assert.assertEquals("digám+tú+él", predicted.getLemmas()[0]);
+      Assert.assertEquals("la", predicted.getTokens()[3]);
+      Assert.assertEquals("el", predicted.getLemmas()[3]);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java
new file mode 100644
index 0000000..f6bef72
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluPOSSampleStreamTest.java
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.postag.POSSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluPOSSampleStreamTest {
+  @Test
+  public void testParseContraction() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "pt_br-ud-sample.conllu");
+
+    try (ObjectStream<POSSample> stream = new ConlluPOSSampleStream(
+        new ConlluStream(streamFactory), ConlluTagset.U)) {
+
+      POSSample expected = POSSample.parse("Numa_ADP+DET reunião_NOUN entre_ADP " +
+          "representantes_NOUN da_ADP+DET Secretaria_PROPN da_ADP+DET Criança_PROPN do_ADP+DET " +
+          "DF_PROPN ea_CCONJ juíza_NOUN da_ADP+DET Vara_PROPN de_ADP Execuções_PROPN de_ADP " +
+          "Medidas_PROPN Socioeducativas_PROPN ,_PUNCT Lavínia_PROPN Tupi_PROPN Vieira_PROPN " +
+          "Fonseca_PROPN ,_PUNCT ficou_VERB acordado_ADJ que_CCONJ dos_ADP+DET 25_NUM " +
+          "internos_NOUN ,_PUNCT 12_NUM serão_AUX internados_VERB na_ADP+DET Unidade_PROPN " +
+          "de_ADP Planaltina_PROPN e_CCONJ os_DET outros_DET 13_NUM devem_AUX retornar_VERB " +
+          "para_ADP a_DET Unidade_PROPN do_ADP+DET Recanto_NOUN das_ADP+DET Emas_PROPN ,_PUNCT " +
+          "antigo_ADJ Ciago_PROPN ._PUNCT");
+
+      POSSample predicted = stream.read();
+      Assert.assertEquals(expected, predicted);
+    }
+  }
+
+
+  @Test
+  public void testParseSpanishS300() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "es-ud-sample.conllu");
+
+    try (ObjectStream<POSSample> stream = new ConlluPOSSampleStream(new ConlluStream(streamFactory),
+        ConlluTagset.U)) {
+
+      POSSample expected1 = POSSample.parse(
+          "Digámoslo_VERB+PRON+PRON claramente_ADV ,_PUNCT la_DET insurgencia_NOUN se_PRON " +
+              "ha_AUX pronunciado_VERB mucho_PRON más_ADV claramente_ADV respecto_NOUN " +
+              "al_ADP+DET tema_NOUN de_ADP la_DET paz_NOUN que_CCONJ el_DET Estado_NOUN ,_PUNCT " +
+              "como_SCONJ lo_PRON demuestra_VERB el_DET fragmento_NOUN que_SCONJ Bermúdez_PROPN " +
+              "cita_VERB de_ADP la_DET respuesta_NOUN de_ADP \"_PUNCT Gabino_PROPN \"_PUNCT " +
+              "a_ADP Piedad_PROPN Córdoba_PROPN ,_PUNCT en_ADP la_DET cual_PRON no_ADV se_PRON " +
+              "plantea_VERB ni_CCONJ siquiera_ADV \"_PUNCT esperar_VERB un_DET mejor_ADJ " +
+              "gobierno_NOUN \"_PUNCT ._PUNCT");
+      POSSample predicted = stream.read();
+      Assert.assertEquals(expected1, predicted);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
index 62cb9a6..be32a3b 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
@@ -38,8 +38,8 @@ public class ConlluTokenSampleStreamTest {
 
       TokenSample expected1 = TokenSample.parse(
           "Fachlich kompetent" + TokenSample.DEFAULT_SEPARATOR_CHARS
-          + ", sehr gute Beratung und ein freundliches Team" + TokenSample.DEFAULT_SEPARATOR_CHARS
-          + ".", TokenSample.DEFAULT_SEPARATOR_CHARS);
+              + ", sehr gute Beratung und ein freundliches Team" + TokenSample.DEFAULT_SEPARATOR_CHARS
+              + ".", TokenSample.DEFAULT_SEPARATOR_CHARS);
       Assert.assertEquals(expected1, stream.read());
 
       TokenSample expected2 = TokenSample.parse("Beiden Zahnärzten verdanke ich einen " +
@@ -50,4 +50,51 @@ public class ConlluTokenSampleStreamTest {
       Assert.assertNull("Stream must be exhausted", stream.read());
     }
   }
+
+  @Test
+  public void testParseContraction() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "pt_br-ud-sample.conllu");
+
+    try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new ConlluStream(streamFactory))) {
+
+      TokenSample expected1 = TokenSample.parse(
+          "Numa reunião entre representantes da Secretaria da Criança do DF " +
+              "ea juíza da Vara de Execuções de Medidas Socioeducativas" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", Lavínia Tupi Vieira Fonseca" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", ficou acordado que dos 25 internos" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", 12 serão internados na Unidade de " +
+              "Planaltina e os outros 13 devem retornar para a Unidade do Recanto das Emas" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", antigo Ciago" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "."
+          , TokenSample.DEFAULT_SEPARATOR_CHARS);
+      TokenSample predicted = stream.read();
+      Assert.assertEquals(expected1, predicted);
+    }
+  }
+
+  @Test
+  public void testParseSpanishS300() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "es-ud-sample.conllu");
+
+    try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new ConlluStream(streamFactory))) {
+
+      TokenSample expected1 = TokenSample.parse(
+          "Digámoslo claramente" + TokenSample.DEFAULT_SEPARATOR_CHARS +
+              ", la insurgencia se ha pronunciado mucho más claramente respecto al " +
+              "tema de la paz que el Estado" + TokenSample.DEFAULT_SEPARATOR_CHARS +
+              ", como lo demuestra el fragmento que Bermúdez cita de la respuesta de \"" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "Gabino" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "\" a Piedad Córdoba" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + ", en la cual no se plantea ni siquiera \"" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "esperar un mejor gobierno" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "\"" +
+              TokenSample.DEFAULT_SEPARATOR_CHARS + "."
+
+          , TokenSample.DEFAULT_SEPARATOR_CHARS);
+      TokenSample predicted = stream.read();
+      Assert.assertEquals(expected1, predicted);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu
new file mode 100644
index 0000000..e30c52b
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/es-ud-sample.conllu
@@ -0,0 +1,62 @@
+# sent_id = es-train-001-s300
+# text = Digámoslo claramente, la insurgencia se ha pronunciado mucho más claramente respecto al tema de la paz que el Estado, como lo demuestra el fragmento que Bermúdez cita de la respuesta de "Gabino" a Piedad Córdoba, en la cual no se plantea ni siquiera "esperar un mejor gobierno".
+1-3	Digámoslo	_	_	_	_	_	_	_	_
+1	Digám	digám	VERB	_	VerbForm=Fin	0	root	_	_
+2	os	tú	PRON	_	Case=Acc,Dat|Number=Plur|Person=2|PrepCase=Npr|PronType=Prs	1	iobj	_	_
+3	lo	él	PRON	_	Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs	1	obj	_	_
+4	claramente	claramente	ADV	_	_	1	advmod	_	SpaceAfter=No
+5	,	,	PUNCT	_	_	1	punct	_	_
+6	la	el	DET	_	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	7	det	_	_
+7	insurgencia	insurgencia	NOUN	_	Gender=Fem|Number=Sing	10	nsubj	_	_
+8	se	él	PRON	_	Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes	10	iobj	_	_
+9	ha	haber	AUX	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	10	aux	_	_
+10	pronunciado	pronunciar	VERB	_	Gender=Masc|Number=Sing|Tense=Past|VerbForm=Part	1	parataxis	_	_
+11	mucho	mucho	PRON	_	NumType=Card|PronType=Ind	12	nmod	_	_
+12	más	más	ADV	_	Degree=Cmp	13	advmod	_	_
+13	claramente	claramente	ADV	_	_	10	advmod	_	_
+14	respecto	respecto	NOUN	_	Gender=Masc|Number=Sing	17	nmod	_	_
+15-16	al	_	_	_	_	_	_	_	_
+15	a	a	ADP	_	_	14	fixed	_	_
+16	el	el	DET	_	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	14	det	_	_
+17	tema	tema	NOUN	_	Gender=Masc|Number=Sing	10	obl	_	_
+18	de	de	ADP	_	_	20	case	_	_
+19	la	el	DET	_	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	20	det	_	_
+20	paz	paz	NOUN	_	Gender=Fem|Number=Sing	17	nmod	_	_
+21	que	que	CCONJ	_	_	23	case	_	_
+22	el	el	DET	_	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	23	det	_	_
+23	Estado	estado	NOUN	_	_	12	nmod	_	SpaceAfter=No
+24	,	,	PUNCT	_	_	27	punct	_	_
+25	como	como	SCONJ	_	_	27	mark	_	_
+26	lo	él	PRON	_	Case=Acc|Gender=Masc|Number=Sing|Person=3|PrepCase=Npr|PronType=Prs	27	obj	_	_
+27	demuestra	demostrar	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	10	advcl	_	_
+28	el	el	DET	_	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	29	det	_	_
+29	fragmento	fragmento	NOUN	_	Gender=Masc|Number=Sing	27	nsubj	_	_
+30	que	que	SCONJ	_	_	32	mark	_	_
+31	Bermúdez	bermúdez	PROPN	_	_	32	nsubj	_	_
+32	cita	cita	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	29	acl:relcl	_	_
+33	de	de	ADP	_	_	35	case	_	_
+34	la	el	DET	_	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	35	det	_	_
+35	respuesta	respuesta	NOUN	_	Gender=Fem|Number=Sing	29	nmod	_	_
+36	de	de	ADP	_	_	38	case	_	_
+37	"	"	PUNCT	_	_	38	punct	_	SpaceAfter=No
+38	Gabino	gabino	PROPN	_	_	35	nmod	_	SpaceAfter=No
+39	"	"	PUNCT	_	_	38	punct	_	_
+40	a	a	ADP	_	_	41	case	_	_
+41	Piedad	piedad	PROPN	_	_	35	nmod	_	_
+42	Córdoba	córdoba	PROPN	_	_	41	flat	_	SpaceAfter=No
+43	,	,	PUNCT	_	_	49	punct	_	_
+44	en	en	ADP	_	_	46	case	_	_
+45	la	el	DET	_	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	46	det	_	_
+46	cual	cual	PRON	_	Number=Sing|PronType=Int,Rel	49	mark	_	_
+47	no	no	ADV	_	Polarity=Neg	49	advmod	_	_
+48	se	él	PRON	_	Case=Acc,Dat|Person=3|PrepCase=Npr|PronType=Prs|Reflex=Yes	49	iobj	_	_
+49	plantea	plantear	VERB	_	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	35	acl:relcl	_	_
+50	ni	ni	CCONJ	_	Polarity=Neg	53	advmod	_	_
+51	siquiera	siquiera	ADV	_	_	50	fixed	_	_
+52	"	"	PUNCT	_	_	53	punct	_	SpaceAfter=No
+53	esperar	esperar	VERB	_	VerbForm=Inf	49	csubj	_	_
+54	un	uno	DET	_	Definite=Ind|Gender=Masc|Number=Sing|PronType=Art	56	det	_	_
+55	mejor	mejor	ADJ	_	Degree=Cmp|Number=Sing	56	amod	_	_
+56	gobierno	gobierno	NOUN	_	Gender=Masc|Number=Sing	53	obj	_	SpaceAfter=No
+57	"	"	PUNCT	_	_	53	punct	_	SpaceAfter=No
+58	.	.	PUNCT	_	_	1	punct	_	_

http://git-wip-us.apache.org/repos/asf/opennlp/blob/cc173c2e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu
new file mode 100644
index 0000000..f616044
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/pt_br-ud-sample.conllu
@@ -0,0 +1,76 @@
+# sent_id = train-s2
+# text = Numa reunião entre representantes da Secretaria da Criança do DF ea juíza da Vara de Execuções de Medidas Socioeducativas, Lavínia Tupi Vieira Fonseca, ficou acordado que dos 25 internos, 12 serão internados na Unidade de Planaltina e os outros 13 devem retornar para a Unidade do Recanto das Emas, antigo Ciago.
+1-2	Numa	_	_	_	_	_	_	_	_
+1	Em	_	ADP	ADP	_	3	case	_	_
+2	uma	_	DET	DET	_	3	det	_	_
+3	reunião	_	NOUN	NOUN	_	31	nmod	_	_
+4	entre	_	ADP	ADP	_	5	case	_	_
+5	representantes	_	NOUN	NOUN	_	3	nmod	_	_
+6-7	da	_	_	_	_	_	_	_	_
+6	de	de	ADP	ADP	_	8	case	_	_
+7	a	o	DET	DET	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	8	det	_	_
+8	Secretaria	_	PROPN	PNOUN	_	5	nmod	_	_
+9-10	da	_	_	_	_	_	_	_	_
+9	de	de	ADP	ADP	_	11	case	_	_
+10	a	o	DET	DET	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	11	det	_	_
+11	Criança	_	PROPN	PNOUN	_	8	nmod	_	_
+12-13	do	_	_	_	_	_	_	_	_
+12	de	de	ADP	ADP	_	14	case	_	_
+13	o	o	DET	DET	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	14	det	_	_
+14	DF	_	PROPN	PNOUN	_	8	nmod	_	_
+15	ea	_	CCONJ	CONJ	_	16	cc	_	_
+16	juíza	_	NOUN	NOUN	_	5	conj	_	_
+17-18	da	_	_	_	_	_	_	_	_
+17	de	de	ADP	ADP	_	19	case	_	_
+18	a	o	DET	DET	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	19	det	_	_
+19	Vara	_	PROPN	PNOUN	_	16	nmod	_	_
+20	de	_	ADP	ADP	_	21	case	_	_
+21	Execuções	_	PROPN	PNOUN	_	19	nmod	_	_
+22	de	_	ADP	ADP	_	23	case	_	_
+23	Medidas	_	PROPN	PNOUN	_	21	nmod	_	_
+24	Socioeducativas	_	PROPN	PNOUN	_	23	amod	_	SpaceAfter=No
+25	,	_	PUNCT	.	_	26	punct	_	_
+26	Lavínia	_	PROPN	PNOUN	_	16	appos	_	_
+27	Tupi	_	PROPN	PNOUN	_	26	flat	_	_
+28	Vieira	_	PROPN	PNOUN	_	26	flat	_	_
+29	Fonseca	_	PROPN	PNOUN	_	26	flat	_	SpaceAfter=No
+30	,	_	PUNCT	.	_	3	punct	_	_
+31	ficou	_	VERB	VERB	_	0	root	_	_
+32	acordado	_	ADJ	ADJ	_	31	xcomp:adj	_	_
+33	que	_	CCONJ	CONJ	_	41	mark	_	_
+34-35	dos	_	_	_	_	_	_	_	_
+34	de	de	ADP	ADP	_	37	case	_	_
+35	os	o	DET	DET	Definite=Def|Gender=Masc|Number=Plur|PronType=Art	37	det	_	_
+36	25	_	NUM	NUM	NumType=Card	37	nummod	_	_
+37	internos	_	NOUN	NOUN	_	41	nmod	_	SpaceAfter=No
+38	,	_	PUNCT	.	_	37	punct	_	_
+39	12	_	NUM	NUM	NumType=Card	41	nsubj:pass	_	_
+40	serão	_	AUX	AUX	_	41	aux:pass	_	_
+41	internados	_	VERB	VERB	_	31	csubj	_	_
+42-43	na	_	_	_	_	_	_	_	_
+42	en	en	ADP	ADP	_	44	case	_	_
+43	a	o	DET	DET	Definite=Def|Gender=Fem|Number=Sing|PronType=Art	44	det	_	_
+44	Unidade	_	PROPN	PNOUN	_	41	nmod	_	_
+45	de	_	ADP	ADP	_	46	case	_	_
+46	Planaltina	_	PROPN	PNOUN	_	44	nmod	_	_
+47	e	_	CCONJ	CONJ	_	52	cc	_	_
+48	os	_	DET	DET	_	50	det	_	_
+49	outros	_	DET	DET	_	50	det	_	_
+50	13	_	NUM	NUM	NumType=Card	52	nsubj	_	_
+51	devem	_	AUX	AUX	_	52	aux	_	_
+52	retornar	_	VERB	VERB	_	41	conj	_	_
+53	para	_	ADP	ADP	_	55	case	_	_
+54	a	_	DET	DET	_	55	det	_	_
+55	Unidade	_	PROPN	PNOUN	_	52	nmod	_	_
+56-57	do	_	_	_	_	_	_	_	_
+56	de	de	ADP	ADP	_	58	case	_	_
+57	o	o	DET	DET	Definite=Def|Gender=Masc|Number=Sing|PronType=Art	58	det	_	_
+58	Recanto	_	NOUN	NOUN	_	55	nmod	_	_
+59-60	das	_	_	_	_	_	_	_	_
+59	de	de	ADP	ADP	_	61	case	_	_
+60	as	o	DET	DET	Definite=Def|Gender=Fem|Number=Plur|PronType=Art	61	det	_	_
+61	Emas	_	PROPN	PNOUN	_	58	nmod	_	SpaceAfter=No
+62	,	_	PUNCT	.	_	64	punct	_	_
+63	antigo	_	ADJ	ADJ	_	64	amod	_	_
+64	Ciago	_	PROPN	PNOUN	_	55	appos	_	SpaceAfter=No
+65	.	_	PUNCT	.	_	31	punct	_	_


[12/21] opennlp git commit: OPENNLP-1050: Add formats support for Irish Sentence Bank

Posted by jo...@apache.org.
OPENNLP-1050: Add formats support for Irish Sentence Bank

closes #191


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/6f80a897
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/6f80a897
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/6f80a897

Branch: refs/heads/LangDetect
Commit: 6f80a89705d84dd74da902d512ca4682aed07a57
Parents: 5bf5366
Author: Jim O'Regan <ja...@tcd.ie>
Authored: Sun Apr 30 21:25:03 2017 +0100
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 24 16:52:42 2017 +0200

----------------------------------------------------------------------
 .../tools/cmdline/StreamFactoryRegistry.java    |   5 +
 .../IrishSentenceBankDocument.java              | 271 +++++++++++++++++++
 .../IrishSentenceBankSentenceStream.java        |  72 +++++
 .../IrishSentenceBankSentenceStreamFactory.java |  61 +++++
 .../IrishSentenceBankTokenSampleStream.java     |  52 ++++
 ...ishSentenceBankTokenSampleStreamFactory.java |  60 ++++
 .../IrishSentenceBankDocumentTest.java          |  67 +++++
 .../irishsentencebank-sample.xml                |  25 ++
 8 files changed, 613 insertions(+)
----------------------------------------------------------------------
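Besides the command line registration under the format name "irishsentencebank" in StreamFactoryRegistry, the new classes can be used directly, since IrishSentenceBankDocument.parse(InputStream) and the sentence accessors are public. The sketch below is illustrative only and not part of the commit; the class name IrishSentenceBankSketch and the input file name are assumptions, standing in for a sentence bank XML file downloaded from the site referenced in the Javadoc.

    import java.io.FileInputStream;
    import java.io.InputStream;

    import opennlp.tools.formats.irishsentencebank.IrishSentenceBankDocument;

    // Illustrative sketch only; the input file name is an assumption.
    public class IrishSentenceBankSketch {

      public static void main(String[] args) throws Exception {
        try (InputStream in = new FileInputStream("sentencebank-ga.xml")) {
          IrishSentenceBankDocument doc = IrishSentenceBankDocument.parse(in);
          for (IrishSentenceBankDocument.IrishSentenceBankSentence sentence : doc.getSentences()) {
            // getOriginal() is the raw Irish text, getTokens() the token spans over it,
            // and getTranslation() the English translation from the XML.
            System.out.println(sentence.getOriginal());
            System.out.println(sentence.getTokens().length
                + " tokens; translation: " + sentence.getTranslation());
          }
        }
      }
    }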


http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 2cff212..3d68945 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -54,6 +54,8 @@ import opennlp.tools.formats.convert.ParseToPOSSampleStreamFactory;
 import opennlp.tools.formats.convert.ParseToSentenceSampleStreamFactory;
 import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
 import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
+import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory;
+import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
 import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
 import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -119,6 +121,9 @@ public final class StreamFactoryRegistry {
     ConlluSentenceSampleStreamFactory.registerFactory();
     ConlluPOSSampleStreamFactory.registerFactory();
     ConlluLemmaSampleStreamFactory.registerFactory();
+
+    IrishSentenceBankSentenceStreamFactory.registerFactory();
+    IrishSentenceBankTokenSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
new file mode 100644
index 0000000..91ab650
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.lang.StringBuilder;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+import javax.xml.parsers.ParserConfigurationException;
+
+import org.w3c.dom.Document;
+import org.w3c.dom.Node;
+import org.w3c.dom.NodeList;
+import org.xml.sax.SAXException;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.Span;
+
+/**
+ * A structure to hold an Irish Sentence Bank document, which is a collection
+ * of tokenized sentences.
+ * <p>
+ * The sentence bank is described, and available for download,
+ * <a href="http://www.lexiconista.com/datasets/sentencebank-ga/">here</a>.
+ */
+public class IrishSentenceBankDocument {
+
+  public static class IrishSentenceBankFlex {
+    String surface;
+    String[] flex;
+    public String getSurface() {
+      return surface;
+    }
+    public String[] getFlex() {
+      return flex;
+    }
+    public IrishSentenceBankFlex(String sf, String[] fl) {
+      this.surface = sf;
+      this.flex = fl;
+    }
+  }
+
+  public static class IrishSentenceBankSentence {
+    private String source;
+    private String translation;
+    private String original;
+    private Span[] tokens;
+    private IrishSentenceBankFlex[] flex;
+    public String getSource() {
+      return source;
+    }
+    public String getTranslation() {
+      return translation;
+    }
+    public String getOriginal() {
+      return original;
+    }
+    public Span[] getTokens() {
+      return tokens;
+    }
+    public IrishSentenceBankFlex[] getFlex() {
+      return flex;
+    }
+    public TokenSample getTokenSample() {
+      return new TokenSample(original, tokens);
+    }
+    public IrishSentenceBankSentence(String src, String trans, String orig, 
+                                     Span[] toks, IrishSentenceBankFlex[] flx) {
+      this.source = src;
+      this.translation = trans;
+      this.original = orig;
+      this.tokens = toks;
+      this.flex = flx;
+    }
+  }
+
+  private List<IrishSentenceBankSentence> sentences;
+
+  public IrishSentenceBankDocument() {
+    sentences = new ArrayList<IrishSentenceBankSentence>();
+  }
+
+  public void add(IrishSentenceBankSentence sent) {
+    this.sentences.add(sent);
+  }
+
+  public List<IrishSentenceBankSentence> getSentences() {
+    return Collections.unmodifiableList(sentences);
+  }
+
+  /**
+   * Helper to adjust the span of punctuation tokens: ignores spaces to the left of the string
+   * @param s the string to check
+   * @param start the offset of the start of the string
+   * @return the offset adjusted to ignore spaces to the left
+   */
+  private static int advanceLeft(String s, int start) {
+    int ret = start;
+    for (char c : s.toCharArray()) {
+      if (c == ' ') {
+        ret++;
+      } else {
+        return ret;
+      }
+    }
+    return ret;
+  }
+
+  /**
+   * Helper to adjust the span of punctuation tokens: ignores spaces to the right of the string
+   * @param s the string to check
+   * @param start the offset of the start of the string
+   * @return the offset of the end of the string, adjusted to ignore spaces to the right
+   */
+  private static int advanceRight(String s, int start) {
+    int end = s.length() - 1;
+    int ret = start + end + 1;
+    for (int i = end; i > 0; i--) {
+      if (s.charAt(i) == ' ') {
+        ret--;
+      } else {
+        return ret;
+      }
+    }
+    return ret;
+  }
+
+  public static IrishSentenceBankDocument parse(InputStream is) throws IOException {
+    IrishSentenceBankDocument document = new IrishSentenceBankDocument();
+
+    try {
+      DocumentBuilderFactory docBuilderFactory = DocumentBuilderFactory.newInstance();
+      DocumentBuilder docBuilder = docBuilderFactory.newDocumentBuilder();
+      Document doc = docBuilder.parse(is);
+
+      String root = doc.getDocumentElement().getNodeName();
+      if (!root.equalsIgnoreCase("sentences")) {
+        throw new IOException("Expected root node " + root);
+      }
+
+      NodeList nl = doc.getDocumentElement().getChildNodes();
+      for (int i = 0; i < nl.getLength(); i++) {
+        Node sentnode = nl.item(i);
+        if (sentnode.getNodeName().equals("sentence")) {
+          String src = sentnode.getAttributes().getNamedItem("source").getNodeValue();
+          String trans = "";
+          Map<Integer, String> toks = new HashMap<>();
+          Map<Integer, List<String>> flx = new HashMap<>();
+          List<Span> spans = new ArrayList<>();
+          NodeList sentnl = sentnode.getChildNodes();
+          int flexes = 1;
+          StringBuilder orig = new StringBuilder();
+
+          for (int j = 0; j < sentnl.getLength(); j++) {
+            final String name = sentnl.item(j).getNodeName();
+            switch (name) {
+              case "flex":
+                String slottmpa = sentnl.item(j).getAttributes().getNamedItem("slot").getNodeValue();
+                Integer flexslot = Integer.parseInt(slottmpa);
+                if (flexslot > flexes) {
+                  flexes = flexslot;
+                }
+
+                flx.computeIfAbsent(flexslot, k -> new ArrayList<>());
+                String tkn = sentnl.item(j).getAttributes().getNamedItem("lemma").getNodeValue();
+                flx.get(flexslot).add(tkn);
+                break;
+
+              case "translation":
+                trans = sentnl.item(j).getFirstChild().getTextContent();
+                break;
+
+              case "original":
+                int last = 0;
+                NodeList orignl = sentnl.item(j).getChildNodes();
+                for (int k = 0; k < orignl.getLength(); k++) {
+                  switch (orignl.item(k).getNodeName()) {
+                    case "token":
+                      String tmptok = orignl.item(k).getFirstChild().getTextContent();
+                      spans.add(new Span(last, last + tmptok.length()));
+
+                      String slottmpb = orignl.item(k).getAttributes().getNamedItem("slot").getNodeValue();
+                      Integer tokslot = Integer.parseInt(slottmpb);
+                      if (tokslot > flexes) {
+                        flexes = tokslot;
+                      }
+
+                      toks.put(tokslot, tmptok);
+                      orig.append(tmptok);
+                      last += tmptok.length();
+                      break;
+
+                    case "#text":
+                      String tmptxt = orignl.item(k).getTextContent();
+                      orig.append(tmptxt);
+
+                      if (!" ".equals(tmptxt)) {
+                        spans.add(new Span(advanceLeft(tmptxt, last), advanceRight(tmptxt, last)));
+                      }
+
+                      last += tmptxt.length();
+                      break;
+
+                    default:
+                      throw new IOException("Unexpected node: " + orignl.item(k).getNodeName());
+                  }
+                }
+                break;
+
+              case "#text":
+              case "#comment":
+                break;
+
+              default:
+                throw new IOException("Unexpected node: " + name);
+            }
+          }
+          IrishSentenceBankFlex[] flexa = new IrishSentenceBankFlex[flexes];
+          for (Integer flexidx : toks.keySet()) {
+            String left = toks.get(flexidx);
+            int rsize = flx.get(flexidx).size();
+            String[] right = new String[rsize];
+            right = flx.get(flexidx).toArray(right);
+            flexa[flexidx - 1] = new IrishSentenceBankFlex(left, right);
+          }
+
+          Span[] spanout = new Span[spans.size()];
+          spanout = spans.toArray(spanout);
+          document.add(new IrishSentenceBankSentence(src, trans, orig.toString(), spanout, flexa));
+        } else if (!sentnode.getNodeName().equals("#text") && !sentnode.getNodeName().equals("#comment")) {
+          throw new IOException("Unexpected node: " + sentnode.getNodeName());
+        }
+      }
+      return document;
+    } catch (ParserConfigurationException e) {
+      throw new IllegalStateException(e);
+    } catch (SAXException e) {
+      throw new IOException("Failed to parse IrishSentenceBank document", e);
+    }
+  }
+
+  static IrishSentenceBankDocument parse(File file) throws IOException {
+    try (InputStream in = new FileInputStream(file)) {
+      return parse(in);
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java
new file mode 100644
index 0000000..e7c06d1
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStream.java
@@ -0,0 +1,72 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+class IrishSentenceBankSentenceStream implements ObjectStream<SentenceSample>  {
+
+  private final IrishSentenceBankDocument source;
+
+  private Iterator<IrishSentenceBankDocument.IrishSentenceBankSentence> sentenceIt;
+
+  IrishSentenceBankSentenceStream(IrishSentenceBankDocument source) {
+    this.source = source;
+    reset();
+  }
+
+  @Override
+  public SentenceSample read() throws IOException {
+
+    StringBuilder sentencesString = new StringBuilder();
+    List<Span> sentenceSpans = new LinkedList<>();
+
+    while (sentenceIt.hasNext()) {
+      IrishSentenceBankDocument.IrishSentenceBankSentence sentence = sentenceIt.next();
+
+      int begin = sentencesString.length();
+
+      if (sentence.getOriginal() != null) {
+        sentencesString.append(sentence.getOriginal());
+      }
+
+      sentenceSpans.add(new Span(begin, sentencesString.length()));
+      sentencesString.append(' ');
+    }
+
+    // end of stream is reached, indicate that with null return value
+    if (sentenceSpans.size() == 0) {
+      return null;
+    }
+
+    return new SentenceSample(sentencesString.toString(),
+        sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+  }
+
+  @Override
+  public void reset() {
+    sentenceIt = source.getSentences().iterator();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
new file mode 100644
index 0000000..e26dc56
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankSentenceStreamFactory.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.ObjectStream;
+
+public class IrishSentenceBankSentenceStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(SentenceSample.class,
+        "irishsentencebank", new IrishSentenceBankSentenceStreamFactory(
+        IrishSentenceBankSentenceStreamFactory.Parameters.class));
+  }
+
+  protected <P> IrishSentenceBankSentenceStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+
+    IrishSentenceBankDocument isbDoc = null;
+    try {
+      isbDoc = IrishSentenceBankDocument.parse(params.getData());
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new IrishSentenceBankSentenceStream(isbDoc);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
new file mode 100644
index 0000000..8cbfac2
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStream.java
@@ -0,0 +1,52 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+class IrishSentenceBankTokenSampleStream implements ObjectStream<TokenSample>  {
+
+  private final IrishSentenceBankDocument source;
+
+  private Iterator<IrishSentenceBankDocument.IrishSentenceBankSentence> sentenceIt;
+
+  IrishSentenceBankTokenSampleStream(IrishSentenceBankDocument source) {
+    this.source = source;
+    reset();
+  }
+
+  @Override
+  public TokenSample read() throws IOException {
+
+    if (sentenceIt.hasNext()) {
+      IrishSentenceBankDocument.IrishSentenceBankSentence sentence = sentenceIt.next();
+      return sentence.getTokenSample();
+    } else {
+      return null;
+    }
+  }
+
+  @Override
+  public void reset() {
+    sentenceIt = source.getSentences().iterator();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
new file mode 100644
index 0000000..86d1225
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankTokenSampleStreamFactory.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.DetokenizerSampleStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.ObjectStream;
+
+public class IrishSentenceBankTokenSampleStreamFactory extends DetokenizerSampleStreamFactory<TokenSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(TokenSample.class,
+        "irishsentencebank", new IrishSentenceBankTokenSampleStreamFactory(
+        IrishSentenceBankTokenSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> IrishSentenceBankTokenSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  public ObjectStream<TokenSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+
+    IrishSentenceBankDocument isbDoc = null;
+    try {
+      isbDoc = IrishSentenceBankDocument.parse(params.getData());
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new IrishSentenceBankTokenSampleStream(isbDoc);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java
new file mode 100644
index 0000000..671fea0
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocumentTest.java
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.irishsentencebank;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.List;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.Span;
+
+public class IrishSentenceBankDocumentTest {
+
+  @Test
+  public void testParsingSimpleDoc() throws IOException {
+    try (InputStream irishSBXmlIn = 
+          IrishSentenceBankDocumentTest.class.getResourceAsStream("irishsentencebank-sample.xml")) {
+
+      IrishSentenceBankDocument doc = IrishSentenceBankDocument.parse(irishSBXmlIn);
+
+      List<IrishSentenceBankDocument.IrishSentenceBankSentence> sents = doc.getSentences();
+
+      Assert.assertEquals(2, sents.size());
+
+      IrishSentenceBankDocument.IrishSentenceBankSentence sent1 = sents.get(0);
+      IrishSentenceBankDocument.IrishSentenceBankSentence sent2 = sents.get(1);
+
+      Assert.assertEquals("A Dhia, tá mé ag iompar clainne!", sent1.getOriginal());
+
+      IrishSentenceBankDocument.IrishSentenceBankFlex[] flex = sent1.getFlex();
+      Assert.assertEquals(7, flex.length);
+      Assert.assertEquals("A", flex[0].getSurface());
+      Assert.assertArrayEquals(new String[]{"a"}, flex[0].getFlex());
+
+      IrishSentenceBankDocument.IrishSentenceBankFlex[] flex2 = sent2.getFlex();
+      Assert.assertEquals("ón", flex2[4].getSurface());
+      Assert.assertArrayEquals(new String[]{"ó", "an"}, flex2[4].getFlex());
+
+      Assert.assertEquals("Excuse me, are you from the stone age?", sent2.getTranslation());
+
+      TokenSample ts = sent1.getTokenSample();
+      Span[] spans = ts.getTokenSpans();
+      Assert.assertEquals(9, spans.length);
+      Assert.assertEquals(24, spans[7].getStart());
+      Assert.assertEquals(31, spans[7].getEnd());
+      Assert.assertEquals("clainne", ts.getText().substring(spans[7].getStart(), spans[7].getEnd()));
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/6f80a897/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml
new file mode 100644
index 0000000..91e84c1
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/irishsentencebank/irishsentencebank-sample.xml
@@ -0,0 +1,25 @@
+<sentences datestamp="2015-03-10">
+<sentence source='potaL'>
+	<original xml:space="preserve"><token slot='1'>A</token> <token slot='2'>Dhia</token>, <token slot='3'>tá</token> <token slot='4'>mé</token> <token slot='5'>ag</token> <token slot='6'>iompar</token> <token slot='7'>clainne</token>!</original>
+	<translation>Oh my God, I&apos;m pregnant!</translation>
+	<flex slot='1' lemma='a'/>
+	<flex slot='2' lemma='dia'/>
+	<flex slot='3' lemma='bí'/>
+	<flex slot='4' lemma='mé'/>
+	<flex slot='5' lemma='ag'/>
+	<flex slot='6' lemma='iompair'/>
+	<flex slot='7' lemma='clann'/>
+</sentence>
+<sentence source='potaL'>
+	<original xml:space="preserve"><token slot='1'>Gabh</token> <token slot='2'>mo</token> <token slot='3'>leithscéal</token>, <token slot='4'>an</token> <token slot='5'>ón</token> <token slot='6'>chlochaois</token> <token slot='7'>thú</token>?</original>
+	<translation>Excuse me, are you from the stone age?</translation>
+	<flex slot='1' lemma='gabh'/>
+	<flex slot='2' lemma='mo'/>
+	<flex slot='3' lemma='leithscéal'/>
+	<flex slot='4' lemma='an'/>
+	<flex slot='5' lemma='ó'/>
+	<flex slot='5' lemma='an'/>
+	<flex slot='6' lemma='clochaois'/>
+	<flex slot='7' lemma='thú'/>
+</sentence>
+</sentences>


[02/21] opennlp git commit: OPENNLP-979 Update lemmatizer doc after API change

Posted by jo...@apache.org.
OPENNLP-979 Update lemmatizer doc after API change


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/ee9fdb8a
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/ee9fdb8a
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/ee9fdb8a

Branch: refs/heads/LangDetect
Commit: ee9fdb8aad0e4c43bba85e50be3687475bf2221d
Parents: 839ff10
Author: Rodrigo Agerri <ra...@apache.org>
Authored: Wed May 17 23:04:23 2017 +0200
Committer: Rodrigo Agerri <ra...@apache.org>
Committed: Wed May 17 23:04:23 2017 +0200

----------------------------------------------------------------------
 opennlp-docs/src/docbkx/lemmatizer.xml | 54 ++++++++++++++++-------------
 1 file changed, 30 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/ee9fdb8a/opennlp-docs/src/docbkx/lemmatizer.xml
----------------------------------------------------------------------
diff --git a/opennlp-docs/src/docbkx/lemmatizer.xml b/opennlp-docs/src/docbkx/lemmatizer.xml
index 1fa5540..630b04d 100644
--- a/opennlp-docs/src/docbkx/lemmatizer.xml
+++ b/opennlp-docs/src/docbkx/lemmatizer.xml
@@ -121,10 +121,9 @@ String[] postags = new String[] { "NNP", "NNP", "NNP", "POS", "NNP", "NN",
     "NNP", "NNP", "TO", "VB", "JJ", "NNS", "IN", "NNP", "POS", "CD", "NNS",
     "." };
 
-String[] lemmas = lemmatizer.lemmatize(tokens, postags);
-String[] decodedLemmas = lemmatizer.decodeLemmas(tokens, lemmas);]]>
+String[] lemmas = lemmatizer.lemmatize(tokens, postags);]]>
 		</programlisting>
-				The decodedLemmas array contains one lemma for each token in the
+				The lemmas array contains one lemma for each token in the
 				input array. The corresponding
 				tag and lemma can be found at the same index as the token has in the
 				input array.
@@ -133,29 +132,37 @@ String[] decodedLemmas = lemmatizer.decodeLemmas(tokens, lemmas);]]>
 			<para>
 				The DictionaryLemmatizer is constructed
 				by passing the InputStream of a lemmatizer dictionary. Such dictionary
-				consists of a
-				text file containing, for each row, a word, its postag and the
-				corresponding lemma:
+				consists of a text file containing, for each row, a word, its postag and the
+				corresponding lemma, each column separated by a tab character.
 				<screen>
 		<![CDATA[
-show    NN      show
-showcase        NN      showcase
-showcases       NNS     showcase
-showdown        NN      showdown
-showdowns       NNS     showdown
-shower  NN      shower
-showers NNS     shower
-showman NN      showman
-showmanship     NN      showmanship
-showmen NNS     showman
-showroom        NN      showroom
-showrooms       NNS     showroom
-shows   NNS     show
-showstopper     NN      showstopper
-showstoppers    NNS     showstopper
-shrapnel        NN      shrapnel
+show		NN	show
+showcase	NN	showcase
+showcases	NNS	showcase
+showdown	NN	showdown
+showdowns	NNS	showdown
+shower		NN	shower
+showers		NNS	shower
+showman		NN	showman
+showmanship	NN	showmanship
+showmen		NNS	showman
+showroom	NN	showroom
+showrooms	NNS	showroom
+shows		NNS	show
+shrapnel	NN	shrapnel
 		]]>
 		</screen>
+				Alternatively, if a (word,postag) pair can output multiple lemmas, the
+				lemmatizer dictionary consists of a text file containing, for
+				each row, a word, its postag and the corresponding lemmas separated by "#":
+				<screen>
+		<![CDATA[
+muestras	NN	muestra
+cantaba		V	cantar
+fue		V	ir#ser
+entramos	V	entrar
+		]]>
+					</screen>
 				First the dictionary must be loaded into memory from disk or another
 				source.
 				In the sample below it is loaded from disk.
@@ -180,8 +187,7 @@ DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);]]>
 			</para>
 			<para>
 				The following code shows how to find a lemma using a
-				DictionaryLemmatizer. There is no need to decode the
-				lemmas when using the DictionaryLemmatizer.
+				DictionaryLemmatizer.
 				<programlisting language="java">
 		  <![CDATA[
 String[] tokens = new String[]{"Most", "large", "cities", "in", "the", "US", "had",
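
As a quick point of reference for the documentation change above, the snippet below is a
minimal, self-contained sketch of the DictionaryLemmatizer usage it describes. The dictionary
file name en-lemmatizer.dict and the token/tag arrays are illustrative only; the InputStream
constructor and the array-based lemmatize(String[], String[]) call are the ones the updated
lemmatizer.xml shows.

  import java.io.FileInputStream;
  import java.io.IOException;
  import java.io.InputStream;

  import opennlp.tools.lemmatizer.DictionaryLemmatizer;

  public class DictionaryLemmatizerSketch {

    public static void main(String[] args) throws IOException {
      // Load a tab-separated dictionary (word<TAB>postag<TAB>lemma per row),
      // e.g. the sample rows shown in the documentation above.
      try (InputStream dictIn = new FileInputStream("en-lemmatizer.dict")) {
        DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictIn);

        String[] tokens = {"showcases", "shows"};
        String[] postags = {"NNS", "NNS"};

        // One lemma per token, aligned by index with the input arrays.
        String[] lemmas = lemmatizer.lemmatize(tokens, postags);

        for (int i = 0; i < tokens.length; i++) {
          System.out.println(tokens[i] + "\t" + postags[i] + "\t" + lemmas[i]);
        }
      }
    }
  }

The expected output is one word/tag/lemma triple per line, looked up from the dictionary rows.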


[08/21] opennlp git commit: OPENNLP-1068: Use current version to generate changes list

Posted by jo...@apache.org.
OPENNLP-1068: Use current version to generate changes list


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/b581c20a
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/b581c20a
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/b581c20a

Branch: refs/heads/LangDetect
Commit: b581c20a9eda9bbbda8b002461409b7fa08ccea4
Parents: dd25a69
Author: Jörn Kottmann <jo...@apache.org>
Authored: Fri May 19 11:14:02 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Fri May 19 12:37:50 2017 +0200

----------------------------------------------------------------------
 opennlp-distr/pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/b581c20a/opennlp-distr/pom.xml
----------------------------------------------------------------------
diff --git a/opennlp-distr/pom.xml b/opennlp-distr/pom.xml
index 4428240..881b92b 100644
--- a/opennlp-distr/pom.xml
+++ b/opennlp-distr/pom.xml
@@ -127,7 +127,7 @@
               <phase>generate-resources</phase> 
               <goals><goal>jira-report</goal></goals>
                 <configuration>
-                  <fixVersionIds>12339249</fixVersionIds>
+                  <onlyCurrentVersion>true</onlyCurrentVersion>
                   <outputDirectory>${basedir}/target/issuesFixed/</outputDirectory>
                   <maxEntries>1000</maxEntries> <!-- hopefully, bigger than ever needed -->
                 </configuration>


[15/21] opennlp git commit: OPENNLP-1078: Fix NPE in irishsentencebank reader

Posted by jo...@apache.org.
OPENNLP-1078: Fix NPE in irishsentencebank reader

closes apache/opennlp#219


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/f418eed3
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/f418eed3
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/f418eed3

Branch: refs/heads/LangDetect
Commit: f418eed3089d74caa11affaf947a40b774e9c8cb
Parents: e972869
Author: Jim Regan <jo...@gmail.com>
Authored: Mon May 29 10:05:04 2017 -0300
Committer: William D C M SILVA <co...@apache.org>
Committed: Mon May 29 10:05:04 2017 -0300

----------------------------------------------------------------------
 .../formats/irishsentencebank/IrishSentenceBankDocument.java     | 4 ++++
 1 file changed, 4 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/f418eed3/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
index 91ab650..2fe9231 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/irishsentencebank/IrishSentenceBankDocument.java
@@ -242,6 +242,10 @@ public class IrishSentenceBankDocument {
           IrishSentenceBankFlex[] flexa = new IrishSentenceBankFlex[flexes];
           for (Integer flexidx : toks.keySet()) {
             String left = toks.get(flexidx);
+            if (flx.get(flexidx) == null) {
+              flexa = null;
+              break;
+            }
             int rsize = flx.get(flexidx).size();
             String[] right = new String[rsize];
             right = flx.get(flexidx).toArray(right);


[07/21] opennlp git commit: OPENNLP-1054: Remove deprecated Heap and HeapList

Posted by jo...@apache.org.
OPENNLP-1054: Remove deprecated Heap and HeapList


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/dd25a691
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/dd25a691
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/dd25a691

Branch: refs/heads/LangDetect
Commit: dd25a69102d3f3b17663763fc4172764622c3d4c
Parents: 217f5eb
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed May 10 16:48:09 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Fri May 19 12:02:24 2017 +0200

----------------------------------------------------------------------
 .../src/main/java/opennlp/tools/util/Heap.java  |  80 --------
 .../main/java/opennlp/tools/util/ListHeap.java  | 197 -------------------
 2 files changed, 277 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/dd25a691/opennlp-tools/src/main/java/opennlp/tools/util/Heap.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/Heap.java b/opennlp-tools/src/main/java/opennlp/tools/util/Heap.java
deleted file mode 100644
index 83f3315..0000000
--- a/opennlp-tools/src/main/java/opennlp/tools/util/Heap.java
+++ /dev/null
@@ -1,80 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.util;
-
-import java.util.Iterator;
-
-/** Interface for interacting with a Heap data structure.
- * This implementation extract objects from smallest to largest based on either
- * their natural ordering or the comparator provided to an implementation.
- * While this is a typical of a heap it allows this objects natural ordering to
- * match that of other sorted collections.
- *
- * This is now deprecated and will be removed in Release 1.8.1
- * */
-@Deprecated
-public interface Heap<E>  {
-
-  /**
-   * Removes the smallest element from the heap and returns it.
-   * @return The smallest element from the heap.
-   */
-  E extract();
-
-  /**
-   * Returns the smallest element of the heap.
-   * @return The top element of the heap.
-   */
-  E first();
-
-  /**
-   * Returns the largest element of the heap.
-   * @return The largest element of the heap.
-   */
-  E last();
-
-  /**
-   * Adds the specified object to the heap.
-   * @param o The object to add to the heap.
-   */
-  void add(E o);
-
-  /**
-   * Returns the size of the heap.
-   * @return The size of the heap.
-   */
-  int size();
-
-  /**
-   * Returns whether the heap is empty.
-   * @return true if the heap is empty; false otherwise.
-   */
-  boolean isEmpty();
-
-  /**
-   * Returns an iterator over the elements of the heap.  No specific ordering of these
-   * elements is guaranteed.
-   * @return An iterator over the elements of the heap.
-   */
-  Iterator<E> iterator();
-
-  /**
-   * Clears the contents of the heap.
-   */
-  void clear();
-}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/dd25a691/opennlp-tools/src/main/java/opennlp/tools/util/ListHeap.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/ListHeap.java b/opennlp-tools/src/main/java/opennlp/tools/util/ListHeap.java
deleted file mode 100644
index 92744e0..0000000
--- a/opennlp-tools/src/main/java/opennlp/tools/util/ListHeap.java
+++ /dev/null
@@ -1,197 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package opennlp.tools.util;
-
-import java.util.ArrayList;
-import java.util.Comparator;
-import java.util.Iterator;
-import java.util.List;
-
-/**
- * This class implements the heap interface using a {@link java.util.List} as the underlying
- * data structure.  This heap allows values which are equals to be inserted.  The heap will
- * return the top K values which have been added where K is specified by the size passed to
- * the constructor. K+1 values are not gaurenteed to be kept in the heap or returned in a
- * particular order.
- *
- * This is now deprecated and will be removed in Release 1.8.1
- */
-@Deprecated
-public class ListHeap<E extends Comparable<E>> implements Heap<E> {
-  private List<E> list;
-
-  private Comparator<E> comp;
-
-  private int size;
-
-  private E max = null;
-
-  /**
-   * Creates a new heap with the specified size using the sorted based on the
-   * specified comparator.
-   * @param sz The size of the heap.
-   * @param c The comparator to be used to sort heap elements.
-   */
-  public ListHeap(int sz, Comparator<E> c) {
-    size = sz;
-    comp = c;
-    list = new ArrayList<>(sz);
-  }
-
-  /**
-   * Creates a new heap of the specified size.
-   * @param sz The size of the new heap.
-   */
-  public ListHeap(int sz) {
-    this(sz, null);
-  }
-
-  private int parent(int i) {
-    return (i - 1) / 2;
-  }
-
-  private int left(int i) {
-    return (i + 1) * 2 - 1;
-  }
-
-  private int right(int i) {
-    return (i + 1) * 2;
-  }
-
-  public int size() {
-    return list.size();
-  }
-
-  private void swap(int x, int y) {
-    E ox = list.get(x);
-    E oy = list.get(y);
-
-    list.set(y, ox);
-    list.set(x, oy);
-  }
-
-  private boolean lt(E o1, E o2) {
-    if (comp != null) {
-      return comp.compare(o1, o2) < 0;
-    }
-    else {
-      return o1.compareTo(o2) < 0;
-    }
-  }
-
-  private boolean gt(E o1, E o2) {
-    if (comp != null) {
-      return comp.compare(o1, o2) > 0;
-    }
-    else {
-      return o1.compareTo(o2) > 0;
-    }
-  }
-
-  private void heapify(int i) {
-    while (true) {
-      int l = left(i);
-      int r = right(i);
-      int smallest;
-
-      if (l < list.size() && lt(list.get(l), list.get(i)))
-        smallest = l;
-      else
-        smallest = i;
-
-      if (r < list.size() && lt(list.get(r), list.get(smallest)))
-        smallest = r;
-
-      if (smallest != i) {
-        swap(smallest, i);
-        i = smallest;
-      }
-      else
-        break;
-    }
-  }
-
-  public E extract() {
-    if (list.size() == 0) {
-      throw new RuntimeException("Heap Underflow");
-    }
-    E top = list.get(0);
-    int last = list.size() - 1;
-    if (last != 0) {
-      list.set(0, list.remove(last));
-      heapify(0);
-    }
-    else {
-      list.remove(last);
-    }
-
-    return top;
-  }
-
-  public E first() {
-    if (list.size() == 0) {
-      throw new RuntimeException("Heap Underflow");
-    }
-    return list.get(0);
-  }
-
-  public E last() {
-    if (list.size() == 0) {
-      throw new RuntimeException("Heap Underflow");
-    }
-    return max;
-  }
-
-  public void add(E o) {
-    /* keep track of max to prevent unnecessary insertion */
-    if (max == null) {
-      max = o;
-    }
-    else if (gt(o, max)) {
-      if (list.size() < size) {
-        max = o;
-      }
-      else {
-        return;
-      }
-    }
-    list.add(o);
-
-    int i = list.size() - 1;
-
-    //percolate new node to correct position in heap.
-    while (i > 0 && gt(list.get(parent(i)), o)) {
-      list.set(i, list.get(parent(i)));
-      i = parent(i);
-    }
-
-    list.set(i, o);
-  }
-
-  public void clear() {
-    list.clear();
-  }
-
-  public Iterator<E> iterator() {
-    return list.iterator();
-  }
-
-  public boolean isEmpty() {
-    return this.list.isEmpty();
-  }
-}
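
The removed Heap/ListHeap pair retained the k smallest values added (k being the size passed to
the constructor) and extracted them smallest-first. Nothing in this commit replaces them; purely
for illustration, a rough equivalent can be built on java.util.PriorityQueue along these lines
(the class and method names below are made up and are not part of the commit):

  import java.util.Collections;
  import java.util.PriorityQueue;

  /**
   * Illustrative stand-in for the removed ListHeap: retains only the
   * k smallest elements added and extracts them smallest-first.
   */
  class TopKSmallest<E extends Comparable<E>> {

    private final int k;

    // Max-heap over the retained elements, so the largest retained
    // element can be inspected and evicted cheaply.
    private final PriorityQueue<E> retained;

    TopKSmallest(int k) {
      this.k = k;
      this.retained = new PriorityQueue<>(k, Collections.reverseOrder());
    }

    void add(E e) {
      if (retained.size() < k) {
        retained.add(e);
      } else if (e.compareTo(retained.peek()) < 0) {
        retained.poll();      // drop the current largest retained element
        retained.add(e);
      }
    }

    /** Removes and returns the smallest retained element. */
    E extract() {
      E smallest = Collections.min(retained);
      retained.remove(smallest);
      return smallest;
    }

    int size() {
      return retained.size();
    }
  }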


[20/21] opennlp git commit: OPENNLP-788: Add LanguageDetector tool

Posted by jo...@apache.org.
http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
new file mode 100644
index 0000000..771be19
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/AggregateCharSequenceNormalizer.java
@@ -0,0 +1,39 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+public class AggregateCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private final CharSequenceNormalizer[] normalizers;
+
+  public AggregateCharSequenceNormalizer(CharSequenceNormalizer ... normalizers) {
+    this.normalizers = normalizers;
+  }
+
+  public CharSequence normalize (CharSequence text) {
+
+    for (CharSequenceNormalizer normalizer :
+        normalizers) {
+      text = normalizer.normalize(text);
+    }
+
+    return text;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
new file mode 100644
index 0000000..b5c1f3f
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/CharSequenceNormalizer.java
@@ -0,0 +1,23 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+public interface CharSequenceNormalizer {
+  CharSequence normalize(CharSequence text);
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
new file mode 100644
index 0000000..d1c161c
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/EmojiCharSequenceNormalizer.java
@@ -0,0 +1,38 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+public class EmojiCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final EmojiCharSequenceNormalizer INSTANCE = new EmojiCharSequenceNormalizer();
+
+  public static EmojiCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  private static final Pattern EMOJI_REGEX =
+      Pattern.compile("[\\uD83C-\\uDBFF\\uDC00-\\uDFFF]+");
+
+  public CharSequence normalize (CharSequence text) {
+    String modified = EMOJI_REGEX.matcher(text).replaceAll(" ");
+    return modified;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
new file mode 100644
index 0000000..6b0452d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/NumberCharSequenceNormalizer.java
@@ -0,0 +1,36 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+public class NumberCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final Pattern NUMBER_REGEX = Pattern.compile("\\d+");
+
+  private static final NumberCharSequenceNormalizer INSTANCE = new NumberCharSequenceNormalizer();
+
+  public static NumberCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  public CharSequence normalize (CharSequence text) {
+    return NUMBER_REGEX.matcher(text).replaceAll(" ");
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
new file mode 100644
index 0000000..6183367
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/ShrinkCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+public class ShrinkCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final Pattern REPEATED_CHAR_REGEX = Pattern.compile("(.)\\1{2,}",
+      Pattern.CASE_INSENSITIVE);
+  private static final Pattern SPACE_REGEX = Pattern.compile("\\s{2,}",
+      Pattern.CASE_INSENSITIVE);
+
+  private static final ShrinkCharSequenceNormalizer INSTANCE = new ShrinkCharSequenceNormalizer();
+
+  public static ShrinkCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  public CharSequence normalize (CharSequence text) {
+    text = SPACE_REGEX.matcher(text).replaceAll(" ");
+    return REPEATED_CHAR_REGEX.matcher(text).replaceAll("$1$1").trim();
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
new file mode 100644
index 0000000..b5a8625
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/TwitterCharSequenceNormalizer.java
@@ -0,0 +1,50 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+public class TwitterCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final Pattern HASH_USER_REGEX =
+      Pattern.compile("[#@]\\S+");
+
+  private static final Pattern RT_REGEX =
+      Pattern.compile("\\b(rt[ :])+", Pattern.CASE_INSENSITIVE);
+
+  private static final Pattern FACE_REGEX =
+      Pattern.compile("[:;x]-?[()dop]", Pattern.CASE_INSENSITIVE);
+
+  private static final Pattern LAUGH_REGEX =
+      Pattern.compile("([hj])+([aieou])+(\\1+\\2+)+", Pattern.CASE_INSENSITIVE);
+
+  private static final TwitterCharSequenceNormalizer INSTANCE = new TwitterCharSequenceNormalizer();
+
+  public static TwitterCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  public CharSequence normalize (CharSequence text) {
+    String modified = HASH_USER_REGEX.matcher(text).replaceAll(" ");
+    modified = RT_REGEX.matcher(modified).replaceAll(" ");
+    modified = FACE_REGEX.matcher(modified).replaceAll(" ");
+    modified = LAUGH_REGEX.matcher(modified).replaceAll("$1$2$1$2");
+    return modified;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizer.java
new file mode 100644
index 0000000..bea5f9a
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UnicodeCharSequenceNormalizer.java
@@ -0,0 +1,297 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Map;
+import java.util.MissingResourceException;
+import java.util.ResourceBundle;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Unicode normalizer based on https://github.com/shuyo/language-detection
+ */
+public class UnicodeCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final UnicodeCharSequenceNormalizer INSTANCE = new UnicodeCharSequenceNormalizer();
+
+  public static UnicodeCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  private static final ResourceBundle RESOURCE_BUNDLE =
+      ResourceBundle.getBundle("opennlp.tools.util.normalizer.unicode_normalizer");
+
+  private static final String LATIN1_EXCLUDED = getMessage("NGram.LATIN1_EXCLUDE");
+
+  public CharSequence normalize(CharSequence text) {
+    StringBuilder ret = new StringBuilder();
+
+
+    CharSequence modified = normalize_vi(text);
+
+    char previous = 0;
+    
+    for (int i = 0; i < modified.length(); i++) {
+      char current = normalize(modified.charAt(i));
+      if (current != ' ' || previous != ' ') {
+        ret.append(current);
+      }
+      previous = current;
+    }
+
+
+    return ret.toString();
+  }
+
+  public char normalize(char ch) {
+    Character.UnicodeBlock block = Character.UnicodeBlock.of(ch);
+    if (block == Character.UnicodeBlock.BASIC_LATIN) {
+      if (ch < 'A' || (ch < 'a' && ch > 'Z') || ch > 'z') {
+        ch = ' ';
+      }
+    } else if (block == Character.UnicodeBlock.LATIN_1_SUPPLEMENT) {
+      if (LATIN1_EXCLUDED.indexOf(ch) >= 0) {
+        ch = ' ';
+      }
+    } else if (block == Character.UnicodeBlock.LATIN_EXTENDED_B) {
+      // normalization for Romanian
+      if (ch == '\u0219') {
+        ch = '\u015f';  // Small S with comma below => with cedilla
+      }
+      if (ch == '\u021b') {
+        ch = '\u0163';  // Small T with comma below => with cedilla
+      }
+    } else if (block == Character.UnicodeBlock.GENERAL_PUNCTUATION) {
+      ch = ' ';
+    } else if (block == Character.UnicodeBlock.ARABIC) {
+      if (ch == '\u06cc') {
+        ch = '\u064a';  // Farsi yeh => Arabic yeh
+      }
+    } else if (block == Character.UnicodeBlock.LATIN_EXTENDED_ADDITIONAL) {
+      if (ch >= '\u1ea0') {
+        ch = '\u1ec3';
+      }
+    } else if (block == Character.UnicodeBlock.HIRAGANA) {
+      ch = '\u3042';
+    } else if (block == Character.UnicodeBlock.KATAKANA) {
+      ch = '\u30a2';
+    } else if (block == Character.UnicodeBlock.BOPOMOFO ||
+        block == Character.UnicodeBlock.BOPOMOFO_EXTENDED) {
+      ch = '\u3105';
+    } else if (block == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS) {
+      if (CJK_MAP.containsKey(ch)) {
+        ch = CJK_MAP.get(ch);
+      }
+    } else if (block == Character.UnicodeBlock.HANGUL_SYLLABLES) {
+      ch = '\uac00';
+    }
+    return ch;
+  }
+
+  /**
+   * Normalizer for Vietnamese (from Shuyo)
+   *
+   * @param text text to be normalized
+   * @return normalized text
+   */
+  static CharSequence normalize_vi(CharSequence text) {
+    Matcher m = ALPHABET_WITH_DMARK.matcher(text);
+    StringBuffer buf = new StringBuffer();
+    while (m.find()) {
+      int alphabet = TO_NORMALIZE_VI_CHARS.indexOf(m.group(1));
+      int dmark = DMARK_CLASS.indexOf(m.group(2)); // Diacritical Mark
+      m.appendReplacement(buf, NORMALIZED_VI_CHARS[dmark].substring(alphabet, alphabet + 1));
+    }
+    if (buf.length() == 0) {
+      return text;
+    }
+    m.appendTail(buf);
+    return buf.toString();
+  }
+
+
+  static String getMessage(String key) {
+    try {
+      return RESOURCE_BUNDLE.getString(key);
+    } catch (MissingResourceException e) {
+      return '!' + key + '!';
+    }
+  }
+
+  private static final String[] NORMALIZED_VI_CHARS = {
+      getMessage("NORMALIZED_VI_CHARS_0300"),
+      getMessage("NORMALIZED_VI_CHARS_0301"),
+      getMessage("NORMALIZED_VI_CHARS_0303"),
+      getMessage("NORMALIZED_VI_CHARS_0309"),
+      getMessage("NORMALIZED_VI_CHARS_0323")};
+
+  private static final String TO_NORMALIZE_VI_CHARS = getMessage("TO_NORMALIZE_VI_CHARS");
+  private static final String DMARK_CLASS = getMessage("DMARK_CLASS");
+  private static final Pattern ALPHABET_WITH_DMARK =
+      Pattern.compile("([" + TO_NORMALIZE_VI_CHARS + "])([" + DMARK_CLASS + "])");
+
+  /**
+   * CJK Kanji Normalization Mapping
+   */
+  static final String[] CJK_CLASS = {
+      getMessage("NGram.KANJI_1_0"),
+      getMessage("NGram.KANJI_1_2"),
+      getMessage("NGram.KANJI_1_4"),
+      getMessage("NGram.KANJI_1_8"),
+      getMessage("NGram.KANJI_1_11"),
+      getMessage("NGram.KANJI_1_12"),
+      getMessage("NGram.KANJI_1_13"),
+      getMessage("NGram.KANJI_1_14"),
+      getMessage("NGram.KANJI_1_16"),
+      getMessage("NGram.KANJI_1_18"),
+      getMessage("NGram.KANJI_1_22"),
+      getMessage("NGram.KANJI_1_27"),
+      getMessage("NGram.KANJI_1_29"),
+      getMessage("NGram.KANJI_1_31"),
+      getMessage("NGram.KANJI_1_35"),
+      getMessage("NGram.KANJI_2_0"),
+      getMessage("NGram.KANJI_2_1"),
+      getMessage("NGram.KANJI_2_4"),
+      getMessage("NGram.KANJI_2_9"),
+      getMessage("NGram.KANJI_2_10"),
+      getMessage("NGram.KANJI_2_11"),
+      getMessage("NGram.KANJI_2_12"),
+      getMessage("NGram.KANJI_2_13"),
+      getMessage("NGram.KANJI_2_15"),
+      getMessage("NGram.KANJI_2_16"),
+      getMessage("NGram.KANJI_2_18"),
+      getMessage("NGram.KANJI_2_21"),
+      getMessage("NGram.KANJI_2_22"),
+      getMessage("NGram.KANJI_2_23"),
+      getMessage("NGram.KANJI_2_28"),
+      getMessage("NGram.KANJI_2_29"),
+      getMessage("NGram.KANJI_2_30"),
+      getMessage("NGram.KANJI_2_31"),
+      getMessage("NGram.KANJI_2_32"),
+      getMessage("NGram.KANJI_2_35"),
+      getMessage("NGram.KANJI_2_36"),
+      getMessage("NGram.KANJI_2_37"),
+      getMessage("NGram.KANJI_2_38"),
+      getMessage("NGram.KANJI_3_1"),
+      getMessage("NGram.KANJI_3_2"),
+      getMessage("NGram.KANJI_3_3"),
+      getMessage("NGram.KANJI_3_4"),
+      getMessage("NGram.KANJI_3_5"),
+      getMessage("NGram.KANJI_3_8"),
+      getMessage("NGram.KANJI_3_9"),
+      getMessage("NGram.KANJI_3_11"),
+      getMessage("NGram.KANJI_3_12"),
+      getMessage("NGram.KANJI_3_13"),
+      getMessage("NGram.KANJI_3_15"),
+      getMessage("NGram.KANJI_3_16"),
+      getMessage("NGram.KANJI_3_18"),
+      getMessage("NGram.KANJI_3_19"),
+      getMessage("NGram.KANJI_3_22"),
+      getMessage("NGram.KANJI_3_23"),
+      getMessage("NGram.KANJI_3_27"),
+      getMessage("NGram.KANJI_3_29"),
+      getMessage("NGram.KANJI_3_30"),
+      getMessage("NGram.KANJI_3_31"),
+      getMessage("NGram.KANJI_3_32"),
+      getMessage("NGram.KANJI_3_35"),
+      getMessage("NGram.KANJI_3_36"),
+      getMessage("NGram.KANJI_3_37"),
+      getMessage("NGram.KANJI_3_38"),
+      getMessage("NGram.KANJI_4_0"),
+      getMessage("NGram.KANJI_4_9"),
+      getMessage("NGram.KANJI_4_10"),
+      getMessage("NGram.KANJI_4_16"),
+      getMessage("NGram.KANJI_4_17"),
+      getMessage("NGram.KANJI_4_18"),
+      getMessage("NGram.KANJI_4_22"),
+      getMessage("NGram.KANJI_4_24"),
+      getMessage("NGram.KANJI_4_28"),
+      getMessage("NGram.KANJI_4_34"),
+      getMessage("NGram.KANJI_4_39"),
+      getMessage("NGram.KANJI_5_10"),
+      getMessage("NGram.KANJI_5_11"),
+      getMessage("NGram.KANJI_5_12"),
+      getMessage("NGram.KANJI_5_13"),
+      getMessage("NGram.KANJI_5_14"),
+      getMessage("NGram.KANJI_5_18"),
+      getMessage("NGram.KANJI_5_26"),
+      getMessage("NGram.KANJI_5_29"),
+      getMessage("NGram.KANJI_5_34"),
+      getMessage("NGram.KANJI_5_39"),
+      getMessage("NGram.KANJI_6_0"),
+      getMessage("NGram.KANJI_6_3"),
+      getMessage("NGram.KANJI_6_9"),
+      getMessage("NGram.KANJI_6_10"),
+      getMessage("NGram.KANJI_6_11"),
+      getMessage("NGram.KANJI_6_12"),
+      getMessage("NGram.KANJI_6_16"),
+      getMessage("NGram.KANJI_6_18"),
+      getMessage("NGram.KANJI_6_20"),
+      getMessage("NGram.KANJI_6_21"),
+      getMessage("NGram.KANJI_6_22"),
+      getMessage("NGram.KANJI_6_23"),
+      getMessage("NGram.KANJI_6_25"),
+      getMessage("NGram.KANJI_6_28"),
+      getMessage("NGram.KANJI_6_29"),
+      getMessage("NGram.KANJI_6_30"),
+      getMessage("NGram.KANJI_6_32"),
+      getMessage("NGram.KANJI_6_34"),
+      getMessage("NGram.KANJI_6_35"),
+      getMessage("NGram.KANJI_6_37"),
+      getMessage("NGram.KANJI_6_39"),
+      getMessage("NGram.KANJI_7_0"),
+      getMessage("NGram.KANJI_7_3"),
+      getMessage("NGram.KANJI_7_6"),
+      getMessage("NGram.KANJI_7_7"),
+      getMessage("NGram.KANJI_7_9"),
+      getMessage("NGram.KANJI_7_11"),
+      getMessage("NGram.KANJI_7_12"),
+      getMessage("NGram.KANJI_7_13"),
+      getMessage("NGram.KANJI_7_16"),
+      getMessage("NGram.KANJI_7_18"),
+      getMessage("NGram.KANJI_7_19"),
+      getMessage("NGram.KANJI_7_20"),
+      getMessage("NGram.KANJI_7_21"),
+      getMessage("NGram.KANJI_7_23"),
+      getMessage("NGram.KANJI_7_25"),
+      getMessage("NGram.KANJI_7_28"),
+      getMessage("NGram.KANJI_7_29"),
+      getMessage("NGram.KANJI_7_32"),
+      getMessage("NGram.KANJI_7_33"),
+      getMessage("NGram.KANJI_7_35"),
+      getMessage("NGram.KANJI_7_37"),
+  };
+
+  private static final Map<Character, Character> CJK_MAP;
+
+  static {
+    Map<Character, Character> _cjk_map = new HashMap<>();
+    for (String cjk_list : CJK_CLASS) {
+      char representative = cjk_list.charAt(0);
+      for (int i = 0; i < cjk_list.length(); ++i) {
+        _cjk_map.put(cjk_list.charAt(i), representative);
+      }
+    }
+    CJK_MAP = Collections.unmodifiableMap(_cjk_map);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
new file mode 100644
index 0000000..4be9b63
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/util/normalizer/UrlCharSequenceNormalizer.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+
+package opennlp.tools.util.normalizer;
+
+import java.util.regex.Pattern;
+
+public class UrlCharSequenceNormalizer implements CharSequenceNormalizer {
+
+  private static final Pattern URL_REGEX =
+      Pattern.compile("https?://[-_.?&~;+=/#0-9A-Za-z]+");
+  private static final Pattern MAIL_REGEX =
+      Pattern.compile("[-_.0-9A-Za-z]+@[-_0-9A-Za-z]+[-_.0-9A-Za-z]+");
+
+  private static final UrlCharSequenceNormalizer INSTANCE = new UrlCharSequenceNormalizer();
+
+  public static UrlCharSequenceNormalizer getInstance() {
+    return INSTANCE;
+  }
+
+  public CharSequence normalize (CharSequence text) {
+    String modified = URL_REGEX.matcher(text).replaceAll(" ");
+    return MAIL_REGEX.matcher(modified).replaceAll(" ");
+  }
+}
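
For context, the normalizer classes added in this commit are designed to be chained. The sketch
below composes them through AggregateCharSequenceNormalizer, using only the constructors and
getInstance() accessors shown above; the input string and the class name NormalizerSketch are
illustrative only.

  import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
  import opennlp.tools.util.normalizer.CharSequenceNormalizer;
  import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
  import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
  import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
  import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
  import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;

  public class NormalizerSketch {

    public static void main(String[] args) {
      // Chain the individual normalizers; they are applied in the order given.
      CharSequenceNormalizer normalizer = new AggregateCharSequenceNormalizer(
          UrlCharSequenceNormalizer.getInstance(),
          TwitterCharSequenceNormalizer.getInstance(),
          EmojiCharSequenceNormalizer.getInstance(),
          NumberCharSequenceNormalizer.getInstance(),
          ShrinkCharSequenceNormalizer.getInstance());

      CharSequence raw = "RT @user soooo cooool!!! see http://example.com in 2017";
      CharSequence clean = normalizer.normalize(raw);

      System.out.println(clean);
    }
  }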

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/resources/opennlp/tools/util/normalizer/unicode_normalizer.properties
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/resources/opennlp/tools/util/normalizer/unicode_normalizer.properties b/opennlp-tools/src/main/resources/opennlp/tools/util/normalizer/unicode_normalizer.properties
new file mode 100644
index 0000000..75cffac
--- /dev/null
+++ b/opennlp-tools/src/main/resources/opennlp/tools/util/normalizer/unicode_normalizer.properties
@@ -0,0 +1,154 @@
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#  This is derived from Shuyo work
+#  https://github.com/shuyo/language-detection
+
+NGram.CJK_KANJI_EXCLUDE=\u0020\uFF08\uFF09
+NGram.LATIN1_EXCLUDE=\u00A0\u00AB\u00B0\u00BB
+NGram.KANJI_1_0=\u4F7C\u6934
+NGram.KANJI_1_2=\u88CF\u95B2
+NGram.KANJI_1_4=\u7027\u7DCB
+NGram.KANJI_1_8=\u4E80\u4E9C\u4EEE\u5263\u5264\u5270\u52C5\u52E7\u52F2\u53B3\u5449\u58CA\u58CC\u5968\u59C9\u59EB\u5D8B\u5DE3\u5E30\u6075\u622F\u623B\u6255\u629C\u629E\u62DD\u62E1\u633F\u635C\u63FA\u6442\u6589\u658E\u6669\u66A6\u66FD\u6804\u685C\u6B69\u6B6F\u6BBB\u6C37\u6C5A\u6D44\u6E09\u6E0B\u6E13\u6EDD\u713C\u72A0\u731F\u7363\u7A32\u7A42\u7A93\u7ADC\u7C8B\u7C9B\u7DD1\u7E01\u7E04\u7E26\u7E4A\u7E4B\u7E70\u8074\u8107\u8133\u81D3\u820E\u8217\u8358\u83D3\u85AC\u8987\u899A\u8B21\u8B72\u8B83\u8CDB\u9045\u90F7\u91C8\u9271\u9283\u92AD\u9665\u967A\u96A0\u96A3\u96B7\u970A\u983C\u9854\u9855\u99C6\u9A12\u9ED9\u9F62
+NGram.KANJI_1_11=\u67D8\u831C
+NGram.KANJI_1_12=\u5742\u57FC\u5800
+NGram.KANJI_1_13=\u4E3C\u4E98\u4FE3\u4FF5\u5072\u51A8\u53A9\u5451\u546A\u5504\u5516\u55A9\u55B0\u5618\u5642\u565B\u567A\u56A2\u57F4\u5840\u5841\u58F1\u59F6\u5A2F\u5B22\u5B8D\u5DCC\u5EFB\u5F10\u60A9\u60E3\u61D0\u62F6\u63B4\u63BB\u63C3\u6681\u685F\u6955\u6962\u696F\u698A\u698E\u69FB\u6A2B\u6A7F\u6B53\u6BD8\u6D99\u6E07\u7460\u7473\u7560\u7573\u758E\u7690\u7815\u783A\u7962\u7A4F\u7A63\u7AEA\u7BED\u7CA7\u7D18\u7D3A\u7E4D\u8061\u8218\u8276\u82C5\u8597\u85AB\u86CD\u874B\u88FE\u8ACF\u8B90\u8D0B\u8FBF\u9013\u9061\u914E\u9154\u918D\u9190\u91A4\u91B8\u9262\u929A\u92ED\u92F3\u932C\u96EB\u96F0\u976D\u97EE\u981A\u99C4\u9A28\u9AC4\u9B8E\u9C10\u9D0E\u9D5C\u9D8F\u9E78\u9EB9\u9EBA\u9EBF
+NGram.KANJI_1_14=\u5F66\u7984\u7985
+NGram.KANJI_1_16=\u5861\u7B25\u844E\u9419\u9D07
+NGram.KANJI_1_18=\u5039\u514E\u51E7\u51EA\u5301\u5302\u5859\u58F7\u59AC\u5C2D\u5CA8\u5EFC\u6357\u64B9\u67CA\u6802\u6834\u68BC\u6900\u6919\u691B\u69D9\u6AE8\u6D9C\u6E8C\u6F09\u6F45\u701E\u7026\u7114\u72DB\u7577\u75E9\u783F\u7895\u7A50\u7AC3\u7B48\u7B86\u7BAA\u7C7E\u7C82\u7C8D\u7CCE\u7D2C\u7F6B\u7FEB\u8557\u85AE\u86CE\u877F\u8997\u8ACC\u8CB0\u8CCE\u8FE9\u9197\u920E\u9266\u927E\u92F2\u9306\u9453\u9784\u982C\u9834\u99C8\u9BF5\u9C2F\u9D2C
+NGram.KANJI_1_22=\u6762\u6A17\u887F
+NGram.KANJI_1_27=\u4E21\u4E57\u4ECF\u4F1D\u4FA1\u4FF3\u5024\u50CD\u5150\u5186\u51E6\u52B4\u52B9\u5358\u53CE\u55B6\u56E3\u56F2\u56F3\u570F\u5727\u5869\u5897\u58F2\u5909\u5B9F\u5BDB\u5BFE\u5C02\u5DFB\u5E2F\u5E81\u5E83\u5EC3\u5F3E\u5F93\u5FB3\u5FB4\u5FDC\u60AA\u6226\u6238\u6271\u62E0\u6319\u63B2\u6483\u64AE\u67A0\u67FB\u691C\u697D\u69D8\u6A29\u6B73\u6B74\u6BCE\u6C17\u6CA2\u6D5C\u6E08\u6E80\u702C\u7523\u767A\u770C\u7D4C\u7D75\u7D76\u7D99\u7D9A\u7DCF\u8535\u8846\u89A7\u89B3\u8A33\u8AAC\u8AAD\u8C4A\u8EE2\u8EFD\u8FBA\u8FBC\u9244\u9332\u95A2\u95D8\u96D1\u99C5\u9A13\u9ED2
+NGram.KANJI_1_29=\u4F0E\u4FFA\u5036\u53E1\u54B2\u5506\u583A\u5C3B\u5CAC\u5CE0\u5CEF\u6803\u68B6\u6A0B\u6A8E\u73C2\u7551\u7826\u7881\u79B0\u7B39\u8429\u8599\u8FBB\u9162\u95C7\u9688\u96BC\u9AEA\u9DF2
+NGram.KANJI_1_31=\u5553\u938C
+NGram.KANJI_1_35=\u51B4\u564C\u57DC\u5B2C\u6822\u685D\u690B\u6973\u6C93\u7511\u7887\u7A17\u83D6\u847A\u8494\u8526\u854E\u85C1\u86F8\u88B4\u93A7\u9B92\u9C39\u9C48\u9C52
+NGram.KANJI_2_0=\u4E2B\u4EC3\u4F09\u4F57\u4F6F\u4F70\u4FD1\u4FDA\u500C\u5043\u516E\u5189\u5241\u530D\u5310\u5412\u54AB\u54AF\u5514\u5556\u55B1\u561F\u573B\u586D\u587D\u58C5\u58D1\u5914\u5A62\u5A6A\u5AE6\u5B40\u5B5B\u5B70\u5BB8\u5CD2\u5D01\u5D34\u5E11\u5EA0\u5F0B\u5F2D\u5F87\u607F\u621B\u6221\u6289\u63A3\u6452\u646D\u64D8\u652B\u6600\u6631\u6641\u66F7\u6773\u67B8\u67DD\u67DE\u6829\u68FB\u69AD\u6A47\u6C10\u6C68\u6C74\u6C85\u6CD3\u6D31\u6D93\u6D94\u6DB8\u6DBF\u6DC5\u6E6E\u6EA7\u6EB4\u6EC2\u6F2A\u6F2F\u6FB9\u6FC2\u6FDB\u6FEE\u70AF\u70FD\u7166\u726F\u729B\u739F\u73DE\u740A\u746D\u749C\u749F\u74E0\u759D\u75A3\u75CD\u75DE\u7600\u7620\u7688\u7738\u7762\u776B\u777D\u77E3\u781D\u7837\u78A3\u7946\u7B60\u7F44\u7F54\u7F5F\u7FAF\u8026\u807F\u80C4\u80DB\u80ED\u81E7\u824B\u82B7\u82E3\u8392\u846D\u84D3\u8548\u85B9\u86DE\u873F\u8753\u8782\u87AB\u87B3\u87D1\u87E0\u87FE\u8821\u88D8\u88E8\u8913\u891A\u892B\u8983\u8C3F\u8C49\u8C82\u8D6D\u8DE4\u8E1D\u8E1E\u8E7C\u8FE5\u8FE8\u9005\u9035\u9050\u9082\u9083\u9
 095\u90E2\u911E\u91AE\u91B4\u93D6\u9621\u968D\u96B9\u96D2\u9711\u9713\u973E\u9AB0\u9AB7\u9AE6\u9B03\u9B23\u9EDC\u9EEF
+NGram.KANJI_2_1=\u4E82\u4F48\u4F54\u50F9\u5167\u528D\u52DE\u532F\u537B\u53C3\u5433\u555F\u55AE\u56B4\u570D\u5716\u58D3\u58DE\u5920\u5967\u5A1B\u5BEB\u5BEC\u5C08\u5C0D\u5C46\u5C6C\u5CFD\u5E36\u5E6B\u5EC8\u5EF3\u5F48\u5F91\u5F9E\u5FB5\u6046\u60E1\u61F7\u6232\u6236\u64C7\u64CA\u64D4\u64DA\u64F4\u651D\u6578\u65B7\u6649\u6A13\u6A23\u6A6B\u6A94\u6AA2\u6B0A\u6B50\u6B61\u6B72\u6B77\u6B78\u6C92\u6EAB\u6EFF\u6FD5\u6FDF\u71DF\u722D\u72C0\u734E\u737B\u746A\u7522\u773E\u78BC\u7A69\u7C3D\u7CB5\u7D55\u7D72\u7DA0\u7DAB\u7DE3\u7E5E\u7E6A\u7E7C\u7E8C\u8072\u807D\u8085\u812B\u8166\u8173\u81D8\u8209\u820A\u8332\u838A\u840A\u85E5\u860B\u8655\u865B\u88DD\u89BA\u89BD\u89C0\u8AAA\u8B6F\u8B7D\u8B8A\u8B93\u8C50\u8CF4\u8E64\u8F15\u8F49\u8FA6\u8FAD\u9109\u9130\u91AB\u91CB\u92B7\u9304\u9322\u95CA\u96A8\u96AA\u96B1\u96B8\u96D6\u96D9\u96DC\u9748\u975C\u986F\u9918\u99DB\u9A57\u9B25\u9EA5\u9EC3\u9EDE\u9F52
+NGram.KANJI_2_4=\u514C\u51AA\u5614\u56AE\u56C2\u582F\u58FA\u5B0C\u5D11\u5DD2\u5DD6\u5E40\u5E5F\u5EEC\u6137\u6417\u6488\u64F2\u652A\u6582\u6689\u689F\u68D7\u69D3\u6A97\u6AB8\u6ABB\u6AC3\u6ADA\u6B7F\u6BB2\u6EA5\u6EC4\u6EF2\u7009\u701D\u7028\u703E\u7165\u71BE\u721B\u7463\u7464\u7469\u7515\u7526\u75FA\u7621\u779E\u79B1\u7A1F\u7AC4\u7AC7\u7B8F\u7BE9\u7D2E\u7D68\u7D8F\u7DB8\u7DBA\u7E46\u7E79\u7F4C\u7F88\u8070\u8073\u8076\u81BE\u82BB\u83A2\u858A\u8591\u861A\u8778\u87EC\u8805\u880D\u893B\u8A1B\u8A25\u8A36\u8A85\u8AA6\u8B17\u8B28\u8CB6\u8CE4\u8D16\u8D1B\u8ECB\u9112\u9214\u9249\u93AC\u9594\u9598\u95BB\u95D5\u965E\u96B4\u97DC\u9821\u9824\u9921\u9952\u9A55\u9A5B\u9B1A\u9C13\u9D09\u9DAF\u9E1A\u9E75\u9F67
+NGram.KANJI_2_9=\u4E9F\u4F6C\u4FDE\u4FFE\u5029\u5140\u51A2\u5345\u539D\u53FB\u54C7\u5599\u560E\u561B\u563B\u566C\u5676\u5729\u574D\u57E4\u595A\u598D\u5A1F\u5A25\u5A77\u5AB2\u5AD6\u5BF0\u5C2C\u5CEA\u5E37\u5F08\u6059\u606A\u6096\u609A\u62A8\u6555\u6556\u66E6\u675E\u68E3\u69BB\u6BCB\u6BD3\u6C1F\u6C26\u6C81\u6DC4\u6DDE\u6E32\u6E44\u6E4D\u6F33\u6F7C\u6FA7\u701A\u701B\u715C\u741B\u7428\u7480\u74A8\u7504\u752C\u768B\u76CE\u78CA\u78FA\u79BA\u7C27\u8046\u81FB\u8331\u8393\u83C1\u8403\u8438\u843C\u8446\u85B0\u87D2\u8862\u8DC6\u9074\u9131\u9672\u96EF\u9704\u9706\u977C\u9ABC\u9E92\u9ECF
+NGram.KANJI_2_10=\u51BD\u5704\u7350\u73A5
+NGram.KANJI_2_11=\u4E15\u4EA2\u4F5A\u50D6\u5349\u53DF\u5484\u5958\u5B34\u5B5A\u5C91\u5E1B\u5F77\u61CB\u61FF\u620C\u620D\u622E\u6248\u6538\u660A\u664F\u678B\u67E9\u69B7\u69C3\u6CB1\u6CD7\u6D5A\u6DAA\u6DC7\u7099\u71EE\u7325\u7425\u7455\u747E\u749E\u75B5\u7678\u7693\u76C2\u77B0\u77BF\u78CB\u7957\u795A\u797A\u7A79\u7B08\u7B75\u7BB4\u7F9A\u7FB2\u7FDF\u80E5\u81BA\u8340\u837C\u8398\u8559\u85A8\u86DF\u8734\u8882\u88F4\u8936\u900D\u907D\u9642\u96C9\u9AFB\u9E9D\u9EBE
+NGram.KANJI_2_12=\u5F57\u7940
+NGram.KANJI_2_13=\u5191\u7791\u792C\u7D46
+NGram.KANJI_2_15=\u5713\u58FD\u5D17\u5D19\u5DBC\u5F4C\u6191\u64A5\u687F\u69AE\u6AFB\u6EEC\u6F3F\u6FE4\u6FF1\u6FFE\u700B\u74CA\u76E1\u76E7\u7926\u792B\u79AE\u7AA9\u7C43\u7C4C\u7C64\u7DBD\u81A0\u856D\u8594\u8606\u8A62\u8AF7\u8CC8\u8CE3\u8D99\u8F1B\u8F3B\u9059\u9127\u9264\u947D\u95A9\u97CB\u980C\u9838\u9846\u99AE\u9A19\u9B06\u9B91\u9F4A\u9F4B
+NGram.KANJI_2_16=\u4E69\u4EC4\u4EDF\u4EF3\u4F0B\u4F5E\u5000\u5028\u50E5\u513B\u5157\u51DC\u52D7\u530F\u5379\u53F5\u5471\u5477\u5555\u555C\u557B\u5594\u55B2\u55C9\u560D\u5616\u562E\u5630\u5653\u5657\u566F\u56A8\u56B6\u5820\u5880\u58CE\u58D9\u5950\u5969\u596D\u599E\u59B3\u59CD\u59D2\u5A40\u5AA7\u5ABC\u5AD7\u5AD8\u5B0B\u5B24\u5B38\u5B53\u5C5C\u5D06\u5D47\u5D94\u5D9D\u5E57\u5EC4\u5F46\u5FAC\u60BD\u60D8\u6123\u615D\u615F\u6175\u618A\u61AB\u61E3\u623E\u6308\u636B\u645F\u6519\u6595\u6698\u66B8\u67D9\u6840\u695D\u696E\u6979\u69C1\u69E8\u6AEC\u6AFA\u6B5F\u6CAC\u6CE0\u6CEF\u6D0C\u6D36\u6DD2\u6DD9\u6DE6\u6DEC\u6E5F\u6FA0\u6FEC\u7156\u71C4\u71DC\u71EC\u71FC\u720D\u7230\u7292\u7296\u72A2\u72CE\u7357\u737A\u7380\u7386\u73A8\u73EE\u743F\u74A6\u74CF\u74D4\u74DA\u755A\u75A5\u75B3\u75C2\u75E0\u75F1\u75FF\u7601\u7609\u7646\u7658\u769A\u76B0\u774F\u775C\u778B\u77BD\u77C7\u7843\u787F\u78F4\u79C8\u7A88\u7A95\u7AFD\u7B1E\u7B67\u7B9D\u7BCC\u7C0D\u7C11\u7C37\u7C40\u7C6E\u7CB3\u7CBD\u7D09\u7D31\u7D40\u7D5B\u
 7D70\u7D91\u7D9E\u7DB0\u7DD9\u7DF9\u7E08\u7E11\u7E1D\u7E35\u7E52\u7FB6\u7FBF\u7FEE\u8012\u801C\u8028\u8052\u8123\u8188\u81C3\u81DA\u81FE\u8210\u82BE\u83A0\u83D4\u8407\u8435\u8477\u849E\u84C6\u84CA\u85F9\u867A\u86B5\u86B6\u86C4\u8706\u8707\u870A\u8768\u87BB\u8831\u8839\u8879\u8921\u8938\u8964\u89A6\u89AC\u8A10\u8A3E\u8AC2\u8ADB\u8AF3\u8B2B\u8B41\u8B4E\u8B5F\u8B6B\u8B92\u8C55\u8C62\u8C73\u8C8A\u8C8D\u8CB2\u8CB3\u8CD2\u8CE1\u8CFB\u8D0D\u8E34\u8E7A\u8E8A\u8ED4\u8EFE\u8F0A\u8F1C\u8F1E\u8F26\u8FAE\u9088\u90C3\u90FE\u9134\u9148\u91D9\u91E9\u9238\u9239\u923D\u924D\u925A\u9296\u92AC\u92BB\u9315\u9319\u931A\u9321\u9370\u9394\u93A2\u93D8\u93E4\u943A\u9477\u9582\u958E\u95A1\u95C8\u95CC\u95D4\u9658\u966C\u970F\u973D\u9744\u975B\u9766\u97A3\u97A6\u97C1\u97C6\u980A\u9837\u9853\u9870\u98AF\u98B3\u98BA\u98E9\u98ED\u9912\u991B\u991E\u993D\u993F\u99D1\u99DF\u9A01\u9A3E\u9A43\u9A4D\u9ACF\u9AE1\u9B22\u9B58\u9C25\u9C3E\u9C54\u9C56\u9D15\u9D23\u9D89\u9DC2\u9DD3\u9E82\u9E8B\u9EA9\u9EE0\u9EF7\u9F07\u9F2F\u9
 F34\u9F3E\u9F5F\u9F6C
+NGram.KANJI_2_18=\u5155\u520E\u55DF\u56C0\u56C1\u5793\u5FD6\u5FF8\u6029\u60FA\u613E\u6147\u615A\u62C8\u6384\u6883\u6894\u68F9\u6AA3\u6AAE\u6AC2\u6E63\u7032\u70A4\u7146\u71FB\u7228\u72F7\u7370\u7441\u74BF\u75B8\u75E3\u7622\u76CD\u7768\u79E3\u7A60\u7B6E\u7BC1\u7C5F\u7D06\u7E2F\u7E39\u8146\u81CF\u8703\u8729\u8737\u87EF\u88D2\u8A22\u8AC4\u8AF6\u8E59\u8F33\u8F42\u9169\u91B1\u9278\u93C3\u93DD\u9460\u946A\u9785\u9AD1\u9B4D\u9B4E\u9C31\u9D12\u9ECC
+NGram.KANJI_2_21=\u502A\u544E\u59AE\u59EC\u5D1B\u66A8\u6BD7\u6C76\u6E1D\u70EF\u742A\u7459\u7FE1\u82EF\u8343\u85C9\u8A79\u90DD
+NGram.KANJI_2_22=\u4EDE\u4F7B\u504C\u50EE\u52E3\u52F0\u536E\u54A9\u54BB\u54BF\u54C2\u54E6\u550F\u556A\u55E8\u564E\u5664\u5671\u568F\u56DD\u572F\u57A0\u5809\u5924\u59A3\u59A4\u59E3\u5A13\u5A23\u5B51\u5B73\u5C50\u5C8C\u6035\u60C6\u6106\u6215\u62CE\u62FD\u64ED\u6549\u6554\u655D\u659B\u65CE\u65D6\u6615\u6624\u665E\u6677\u669D\u66E9\u6772\u677C\u696B\u6A84\u6AA0\u6BFD\u6C16\u6C86\u6C94\u6CD6\u6D2E\u6D39\u6F78\u6FB6\u705E\u70CA\u7168\u723B\u7256\u7284\u73B3\u740D\u742F\u7498\u74A9\u752D\u75F3\u7634\u768E\u76B4\u76E5\u77A0\u77DC\u781F\u782D\u7AA0\u7BFE\u7FF1\u80AB\u8174\u81EC\u8202\u8222\u8228\u82DC\u8306\u83FD\u8469\u84FF\u859C\u8617\u86B1\u8722\u8C89\u8D67\u8DCE\u8E49\u8E76\u8E87\u8FE2\u8FE4\u8FF8\u9016\u905B\u9174\u982B\u98E7\u9955\u9B32
+NGram.KANJI_2_23=\u4F8F\u5055\u524C\u548E\u5583\u594E\u5CB7\u5ED6\u5F5D\u6021\u66B9\u66F0\u6C55\u6C7E\u6C82\u6E2D\u6EC7\u6ED5\u70B3\u71B9\u72C4\u73C0\u7426\u745C\u748B\u7696\u777F\u79A7\u79B9\u7F8C\u8153\u8339\u8386\u8725\u90B5\u9102\u962E\u9716\u97F6
+NGram.KANJI_2_28=\u5733\u57D4\u838E\u8FEA
+NGram.KANJI_2_29=\u50ED\u5F29\u62EE\u6A9C\u7BC6\u80F1\u8129\u8171\u822B\u8AEB
+NGram.KANJI_2_30=\u4EB3\u4F15\u4FB7\u5006\u509A\u50A2\u5102\u5109\u5115\u5137\u5138\u513C\u524B\u524E\u5277\u528A\u52E6\u52FB\u5331\u5436\u5443\u54FD\u5538\u555E\u55C6\u55C7\u5679\u5690\u5695\u56C9\u56D1\u56EA\u588A\u58E2\u5AFB\u5B2A\u5B43\u5B7F\u5BE2\u5C37\u5D27\u5D84\u5D87\u5DD4\u5EC1\u5EDD\u5F12\u5FA0\u60F1\u616B\u61F5\u61F6\u61FE\u62DA\u6371\u6399\u63C0\u6451\u647B\u6493\u64BB\u64BF\u64C4\u64F1\u64F7\u650F\u652C\u665D\u6684\u6688\u66EC\u672E\u68E7\u69A6\u69ED\u69F3\u6A01\u6AAF\u6AE5\u6BA4\u6BAE\u6BAF\u6BC6\u6C08\u6C2C\u6C59\u6D87\u6EBC\u6ECC\u6EF7\u6F6F\u6F80\u6F86\u6FD8\u6FF0\u6FFA\u7006\u7018\u7030\u7051\u7192\u71C9\u71D9\u71F4\u71FE\u7274\u7377\u74A3\u750C\u7613\u7627\u7661\u7662\u7665\u766E\u7671\u7672\u76BA\u775E\u776A\u778C\u78E7\u7955\u7A08\u7AC5\u7B4D\u7C2B\u7C6C\u7CF0\u7D02\u7D1C\u7D73\u7DA2\u7DB5\u7DDE\u7E09\u7E0A\u7E37\u7E43\u7E61\u7E7D\u7E93\u7F3D\u7FF9\u81A9\u8271\u83F8\u84C0\u8514\u85BA\u86A9\u86FB\u879E\u8814\u8836\u889E\u8932\u896A\u896F\u8993\u89B2\u8A15\u8A16\u
 8A1D\u8A5B\u8A6C\u8A6D\u8A7C\u8AA1\u8AA3\u8AA5\u8B0A\u8B4F\u8B59\u8B96\u8C48\u8C54\u8CBD\u8CFA\u8D13\u8E89\u8E8B\u8EAA\u8EC0\u8EDB\u8EFC\u8F12\u8F1F\u8F3E\u8F45\u8FFA\u9015\u9183\u919E\u91A3\u91D7\u91F5\u9209\u9215\u923E\u9240\u9251\u9257\u927B\u9293\u92A8\u92C5\u92C7\u92F0\u9333\u935A\u9382\u938A\u9398\u93B3\u93D7\u93DF\u93E2\u93FD\u942B\u942E\u9433\u9463\u9470\u9472\u947E\u95D0\u96CB\u97C3\u97CC\u981C\u9839\u986B\u98B6\u98EA\u9909\u991A\u9935\u993E\u9951\u99A5\u99B1\u99D9\u99DD\u99F1\u9A2B\u9A62\u9A65\u9AAF\u9AD2\u9AEF\u9B0D\u9B28\u9B77\u9BFD\u9C49\u9C5F\u9C78\u9D3F\u9D72\u9DD7\u9E1B\u9EB4\u9EF4\u9F66\u9F94
+NGram.KANJI_2_31=\u5DBD\u63C6\u6E3E\u7587\u8AF1\u8B5A\u9695
+NGram.KANJI_2_32=\u53A5\u589F\u5CD9\u7109\u7F79\u8006\u8654\u8944\u968B\u96CD
+NGram.KANJI_2_35=\u4F47\u4F91\u4FCE\u4FDF\u527D\u535E\u55DA\u56A5\u5879\u5A11\u5B7A\u5CAB\u5CF4\u5EBE\u5F7F\u5FA8\u601B\u606B\u60B8\u610D\u6134\u619A\u61FA\u6369\u6523\u65CC\u66C4\u6727\u6968\u6A05\u6A48\u6B59\u6BEC\u6D35\u6D38\u6E19\u701F\u7064\u711C\u716C\u71A8\u71E7\u7258\u743A\u746F\u75BD\u75D9\u75F2\u7669\u766C\u76DE\u7729\u77BC\u78EC\u792A\u7A37\u7A62\u7BE6\u7C2A\u7C50\u7D07\u7DD8\u7E5A\u7F8B\u7FD5\u7FF3\u8151\u81CD\u8317\u83F4\u85EA\u85FA\u8823\u895E\u89F4\u8A0C\u8A41\u8AA8\u8ACD\u8B10\u8CC1\u8D05\u8D73\u8E4A\u8E85\u8E91\u8EFB\u8F13\u9087\u914A\u91C9\u923F\u93B0\u9403\u95A8\u95AD\u9730\u9865\u9903\u9945\u9949\u99AD\u99E2\u9A6A\u9D26\u9E1E\u9EDD\u9F2C\u9F72
+NGram.KANJI_2_36=\u4E9E\u4F86\u5011\u50B3\u5152\u5169\u5340\u5718\u5B78\u5BE6\u5BF6\u5C07\u5EE3\u61C9\u6230\u6703\u689D\u6A02\u6C23\u7063\u7368\u756B\u7576\u767C\u7A31\u7D93\u7E23\u7E3D\u81FA\u8207\u842C\u85DD\u865F\u8B49\u8B80\u8CFD\u908A\u9435\u95DC\u965D\u9AD4\u9EE8
+NGram.KANJI_2_37=\u5480\u5580\u5C39\u67EF\u68B5\u6D85\u8521\u90B1
+NGram.KANJI_2_38=\u4E1F\u4F96\u4FE0\u50F1\u5118\u522A\u5291\u52C1\u52DB\u52F3\u52F5\u52F8\u53B2\u55CE\u562F\u580A\u5862\u58AE\u58D8\u58DF\u58E9\u58EF\u5925\u593E\u599D\u5ABD\u5C62\u5EC2\u5EDA\u5EE2\u5F4E\u5F65\u6085\u6158\u61FC\u6200\u62CB\u633E\u6416\u6436\u6490\u64CB\u64E0\u64FA\u6514\u651C\u6524\u6558\u6583\u66B1\u66C6\u66C9\u66E0\u6A11\u6A1E\u6A38\u6A62\u6AB3\u6B16\u6B98\u6BBC\u6C2B\u6DDA\u6DE8\u6DEA\u6DFA\u6EEF\u6EFE\u6F32\u6F51\u6F5B\u700F\u71D2\u7210\u7246\u7260\u72A7\u72F9\u7375\u7378\u758A\u760B\u76DC\u76EA\u77DA\u77FD\u78DA\u7919\u797F\u79AA\u7A05\u7A4C\u7ACA\u7C72\u7D81\u7DDD\u7E31\u7E69\u7E6B\u7E73\u7E96\u7E9C\u81BD\u81C9\u81DF\u8259\u8277\u8396\u83A7\u8523\u8525\u860A\u863F\u8667\u87A2\u87F2\u881F\u883B\u89F8\u8B20\u8B74\u8B9A\u8C4E\u8C6C\u8C93\u8CEC\u8D0A\u8D0F\u8D95\u8E10\u8F4E\u8FAF\u8FF4\u905E\u9072\u9081\u908F\u91AC\u91C0\u91C1\u91D0\u921E\u9223\u9245\u929C\u92B3\u92C1\u9336\u934A\u93C8\u9444\u9452\u947C\u947F\u9592\u95B1\u95C6\u95D6\u95E1\u95E2\u96DE\u9742\u978F\u
 984F\u9871\u98B1\u98C4\u99ED\u9A37\u9A45\u9A5F\u9AEE\u9B27\u9BCA\u9C77\u9D51\u9D5D\u9E79\u9E7C\u9E7D\u9EB5\u9EBC\u9F61\u9F63\u9F90\u9F9C
+NGram.KANJI_3_1=\u5283\u7562\u7DEC\u88E1\u8F2F
+NGram.KANJI_3_2=\u5009\u502B\u5049\u5075\u507D\u5091\u5098\u50B5\u50B7\u50BE\u5100\u5104\u511F\u518A\u525B\u5289\u5442\u5805\u589C\u58C7\u5922\u596A\u5A66\u5B6B\u5BE7\u5BE9\u5DBA\u5E63\u5E7E\u5FB9\u6163\u616E\u6176\u61B2\u61B6\u61F8\u639B\u63DA\u63EE\u640D\u64B2\u64C1\u64EC\u6557\u6575\u6607\u66AB\u68C4\u6A39\u6C96\u6CC1\u6E1B\u6E6F\u6E9D\u6EC5\u6F01\u6F64\u6FC3\u7058\u707D\u7344\u7642\u76E4\u7832\u790E\u7B46\u7D05\u7D0B\u7D14\u7D19\u7D1B\u7D39\u7D61\u7DB1\u7DCA\u7DD2\u7DE0\u7DE9\u7DEF\u7DF4\u7E2E\u7E3E\u8105\u8108\u81E8\u8266\u84CB\u84EE\u85A9\u885D\u88DC\u8972\u8A02\u8A0E\u8A13\u8A17\u8A2A\u8A34\u8A3A\u8A3C\u8A69\u8A73\u8A95\u8AA0\u8AA4\u8AB2\u8AC7\u8ACB\u8B00\u8B1B\u8B1D\u8B5C\u8C9D\u8C9E\u8CA2\u8CA8\u8CA9\u8CAB\u8CAC\u8CB7\u8CBF\u8CC0\u8CDE\u8CE2\u8CFC\u8D08\u8DE1\u8E8D\u8ECC\u8EDF\u8EF8\u8F14\u8F1D\u8F2A\u8F44\u9055\u9069\u9077\u907C\u90F5\u91DD\u9285\u92FC\u9326\u932F\u9375\u9396\u93AE\u93E1\u9451\u9589\u95A3\u9663\u9670\u9673\u96BB\u9801\u9802\u9803\u9806\u9808\u9810\u983B\u9
 84D\u9858\u9867\u98EF\u98F2\u98FE\u990A\u99D0\u9A0E\u9A5A\u9B5A\u9CE5\u9DB4\u9E97\u9F8D
+NGram.KANJI_3_3=\u543E\u5BEE\u5F18\u6590\u725F\u83C5\u85E9\u9E93
+NGram.KANJI_3_4=\u5016\u53AD\u5606\u5629\u58BE\u5F14\u6065\u6144\u646F\u647A\u67F5\u6953\u6C3E\u6F2C\u6F97\u6FB1\u7169\u71E6\u71ED\u74BD\u79BF\u7A1C\u7A4E\u7AAF\u7CDE\u7D17\u7D43\u7E55\u7FA8\u807E\u8139\u8490\u8569\u856A\u87FB\u8A23\u8AB9\u8AE6\u8AFA\u8B2C\u8CD1\u91D8\u92F8\u9318\u96DB\u99B4\u9BC9\u9C2D\u9CF6\u9D61\u9DFA
+NGram.KANJI_3_5=\u4E26\u4F75\u4FC2\u500B\u5074\u5099\u512A\u5225\u5247\u5275\u5287\u52D5\u52D9\u52DD\u52E2\u5354\u54E1\u554F\u5712\u57F7\u5831\u5834\u5BAE\u5C0E\u5C64\u5CA1\u5CF6\u5E2B\u5E79\u5EAB\u5F35\u5F37\u5F8C\u5FA9\u611B\u614B\u63A1\u63DB\u6642\u66F8\u6771\u696D\u6975\u69CB\u6A19\u6A4B\u6A5F\u6BBA\u6C7A\u6E2C\u6E96\u6F22\u70BA\u7121\u71B1\u7372\u73FE\u74B0\u7570\u76E3\u78BA\u7A2E\u7A4D\u7AF6\u7BC0\u7BC4\u7BC9\u7C21\u7D00\u7D04\u7D0D\u7D1A\u7D30\u7D42\u7D44\u7D50\u7D66\u7D71\u7DAD\u7DDA\u7DE8\u7E54\u7F85\u7FA9\u7FD2\u8056\u805E\u8077\u8208\u83EF\u8449\u8853\u885B\u88FD\u8907\u898B\u898F\u8996\u89AA\u8A08\u8A18\u8A2D\u8A31\u8A55\u8A5E\u8A66\u8A71\u8A72\u8A8C\u8A8D\u8A9E\u8ABF\u8AD6\u8AF8\u8B58\u8B70\u8B77\u8CA0\u8CA1\u8CB4\u8CBB\u8CC7\u8CEA\u8ECA\u8ECD\u8F03\u8F09\u8F38\u8FB2\u9023\u9031\u9032\u904A\u904B\u904E\u9054\u9060\u9078\u907A\u9084\u9280\u9577\u9580\u958B\u9593\u9678\u967D\u968A\u968E\u969B\u96E2\u96E3\u96F2\u96FB\u97D3\u97FF\u9805\u9818\u982D\u984C\u985E\u98A8\u98DB\u9
 928\u99AC\u9BAE
+NGram.KANJI_3_8=\u5F6B\u6C4E\u7B87\u8A70
+NGram.KANJI_3_9=\u540B\u5B5C\u826E
+NGram.KANJI_3_11=\u4F83\u4FF8\u51CB\u52BE\u53F1\u548B\u558B\u5CB1\u5D69\u5F3C\u620E\u621F\u64E2\u67DA\u6854\u69CC\u6A35\u6C8C\u6E1A\u6F15\u6FE0\u717D\u7252\u7AFA\u82D3\u83DF\u8431\u9041\u9149\u9798
+NGram.KANJI_3_12=\u4ED5\u55E3\u572D\u57A3\u587E\u5983\u5A9B\u5C90\u5E61\u672D\u6960\u6F5F\u72D9\u72E9\u757F\u7949\u7950\u7E82\u7FCC\u82B8\u90B8\u91DC\u961C\u9B45
+NGram.KANJI_3_13=\u55AB\u6249\u643E\u6841\u68B1\u725D\u7B8B\u7C95\u7E1E\u7F36\u8A03\u8A6B\u8E74\u95A4
+NGram.KANJI_3_15=\u50AD\u50D1\u5132\u51F1\u55AC\u5617\u5687\u584A\u59EA\u5B30\u5BF5\u5C0B\u5C4D\u5EDF\u6182\u61A4\u64AB\u64FE\u66A2\u6897\u694A\u69CD\u6B3D\u6BC0\u6D29\u6F38\u7015\u7149\u71C8\u723A\u7336\u7345\u755D\u76C3\u78A9\u798D\u7AAE\u7DFB\u7E2B\u7F75\u7F77\u81E5\u834A\u852D\u85CD\u8755\u8A3B\u8A54\u8AE7\u8B02\u8B39\u8CAA\u8CE6\u8DA8\u8E5F\u8F5F\u905C\u912D\u919C\u92D2\u932B\u937E\u9418\u9583\u9812\u985B\u9905\u99B3\u99C1\u99D5\u9A30\u9CF3\u9D3B\u9D6C
+NGram.KANJI_3_16=\u6D6C\u72FD\u77A5\u8956\u9C0D
+NGram.KANJI_3_18=\u5919\u5F4A\u6063\u63AC\u649A\u6715\u6AD3\u71D0\u758B\u834F\u85F7\u88DF\u8F61\u93D1\u98F4\u9D60
+NGram.KANJI_3_19=\u4F50\u7DB2\u962A
+NGram.KANJI_3_22=\u5E96\u75D4\u91C6
+NGram.KANJI_3_23=\u5E9A\u6C40\u821C\u839E\u8FED\u9EDB
+NGram.KANJI_3_27=\u5F01\u66DC
+NGram.KANJI_3_29=\u5023\u5208\u531D\u536F\u53E9\u54C9\u598A\u59BE\u5A20\u5D6F\u5DF3\u66C7\u66D6\u66F3\u6775\u6A3D\u6ADB\u6B86\u6C72\u6E25\u73EA\u7435\u760D\u7656\u7825\u78D0\u7A14\u7A6B\u7B20\u7BE0\u7CF8\u7DAC\u7DBB\u7DBE\u80E4\u80F4\u837B\u8466\u8568\u867B\u8A63\u91E7\u9320\u935B\u9591\u965B\u98E2\u990C\u9913\u9BAB
+NGram.KANJI_3_30=\u60B6\u8AD2\u8CC2\u9237\u9328\u934D\u9397\u9830
+NGram.KANJI_3_31=\u4FB6\u50D5\u51CD\u559A\u55AA\u5674\u5857\u585A\u5875\u58B3\u596E\u59E6\u5A41\u5D50\u5E25\u5E33\u5F59\u61C7\u61F2\u6368\u6383\u65AC\u68DF\u68F2\u6A3A\u6B04\u6DBC\u6DF5\u6E26\u6E4A\u6E67\u6F54\u6F70\u6FC1\u6FEB\u7159\u727D\u7652\u77EF\u78EF\u798E\u7A40\u7AAA\u7BE4\u7C60\u7CE7\u7CFE\u7D21\u7D33\u7D5E\u7D79\u7DB4\u7DBF\u7E1B\u7E8F\u7F70\u814E\u816B\u8178\u819A\u84BC\u85A6\u865C\u8766\u8A1F\u8A50\u8A60\u8A6E\u8A87\u8A98\u8AB0\u8ADC\u8AED\u8AEE\u8B0E\u8B19\u8CA7\u8CAF\u8CB8\u8CBC\u8CC3\u8CC4\u8CCA\u8CDC\u8CE0\u8CED\u8ED2\u8F29\u8F3F\u91E3\u920D\u9234\u925B\u9298\u9310\u934B\u958F\u95A5\u9727\u97FB\u9811\u984E\u98FC\u98FD\u99D2\u99FF\u9B31\u9BE8\u9C57\u9CE9\u9CF4\u9D28\u9DF9
+NGram.KANJI_3_32=\u4E1E\u502D\u51A5\u5321\u58EC\u5A3C\u5BC5\u5CE8\u61A9\u620A\u65A1\u6714\u6853\u6893\u6C50\u6C5D\u7436\u745A\u745B\u773A\u7941\u7947\u8543\u865E\u8C5A\u914B\u99A8\u9AB8
+NGram.KANJI_3_35=\u4E99\u5BA5\u5DFD\u608C\u60C7\u60DA\u6190\u61A7\u6753\u6777\u6787\u6B4E\u6F23\u6FE1\u6FEF\u7337\u7827\u786F\u7893\u7ABA\u7B94\u7BB8\u7C3E\u7D62\u7E6D\u80B1\u81BF\u81C6\u821B\u82E7\u83F0\u84D1\u86ED\u8888\u8B01\u8B04\u8F4D\u9291\u92E4\u932E\u9354\u936C\u939A\u9957\u9AED\u9BAA\u9BAD\u9BD6\u9BDB\u9C3B\u9D1B
+NGram.KANJI_3_36=\u50C5\u53E2\u5EE0\u65BC\u70CF\u723E\u7D10\u7D9C\u806F\u8607\u862D\u8A0A\u8AFE\u8CD3\u9019\u9813\u9B6F
+NGram.KANJI_3_37=\u4EA8\u4F3D\u5384\u5EFF\u60DF\u66DD\u6E5B\u8087\u82D1\u8FE6\u9640\u9E9F
+NGram.KANJI_3_38=\u5147\u525D\u5678\u617E\u6372\u79A6\u8ABC\u92EA\u9438\u9817
+NGram.KANJI_4_0=\u6D3C\u718F\u74EE\u8712
+NGram.KANJI_4_9=\u4F84\u54C6\u5565\u68F1\u6D82\u83C7
+NGram.KANJI_4_10=\u4FE9\u4FED\u51FF\u523D\u5300\u5364\u538C\u5450\u5455\u545C\u54D1\u54D7\u5578\u56A3\u58F6\u592F\u5CE6\u5D2D\u5E90\u6073\u607C\u60EB\u61D2\u62E2\u62E3\u631A\u6320\u6323\u6361\u63B7\u63B8\u63BA\u6405\u65A9\u65F7\u6619\u6655\u67A3\u67E0\u6805\u6808\u6866\u6868\u6869\u6A71\u6BE1\u6C79\u6CA5\u6CDE\u6DA4\u6DA7\u6DA9\u6E85\u70DB\u70E6\u70EB\u7115\u724D\u7410\u759F\u75AE\u75EA\u75F9\u762B\u763E\u76B1\u77EB\u783E\u79C3\u7A8D\u7A9C\u7B5D\u7BF1\u7EC5\u7ED2\u7EDE\u7EE3\u7EF7\u7EF8\u7EFD\u7F00\u7F0E\u7F15\u7F1A\u7F20\u7F24\u7F28\u7FA1\u7FD8\u8038\u803B\u804B\u80AE\u817B\u82C7\u8327\u835E\u8367\u83BA\u8424\u864F\u8681\u8682\u8715\u8717\u8721\u8747\u874E\u8845\u886C\u889C\u88E4\u89C5\u8BB6\u8BB9\u8BC0\u8BC5\u8BE1\u8BEB\u8BEC\u8BF5\u8C0E\u8C1A\u8D2E\u8D31\u8D43\u8D4E\u8D58\u8F67\u8F7F\u9489\u9499\u949D\u94A0\u94A5\u94AE\u94BE\u94D0\u94DB\u94F2\u9508\u950C\u951A\u9525\u952D\u952F\u9530\u953B\u9540\u9550\u9570\u9576\u95F0\u960E\u9668\u96CF\u97E7\u9885\u988A\u98A4\u9965\u9975\u997A\u
 997F\u9985\u998D\u998F\u9A6E\u9A6F\u9A74\u9A79\u9A7C\u9A82\u9A87\u9CA4\u9CC4\u9CCD\u9CD6\u9E20\u9E25\u9E35\u9E3D\u9E45\u9E49\u9E4A\u9E66
+NGram.KANJI_4_16=\u576F\u579B\u6345\u78B4\u79EB\u79F8
+NGram.KANJI_4_17=\u4E13\u4E1A\u4E1C\u4E24\u4E25\u4E2A\u4E3E\u4E49\u4E50\u4E66\u4E9A\u4EA7\u4EBF\u4ECE\u4EEC\u4EF7\u4F17\u4F20\u5170\u5173\u519B\u51B3\u51E4\u51FB\u5219\u521B\u522B\u529E\u52A1\u52A8\u52BF\u534F\u5355\u536B\u5386\u53BF\u53D1\u53D8\u542F\u5458\u54CD\u56E2\u56ED\u56F4\u56FE\u573A\u5904\u590D\u5934\u5B81\u5B9E\u5BF9\u5BFC\u5C14\u5C9B\u5E26\u5E7F\u5E94\u5F00\u5F20\u5F3A\u603B\u6218\u65E0\u65F6\u663E\u672F\u6743\u6784\u6807\u6C14\u6C49\u707E\u70ED\u73AF\u73B0\u7535\u76D1\u786E\u79CD\u79EF\u7B80\u7C7B\u7EA2\u7EA6\u7EA7\u7EAA\u7EBF\u7EC4\u7EC7\u7ED3\u7EDF\u7EE7\u7EED\u7EF4\u7F16\u7F57\u804C\u8054\u817E\u8282\u82CF\u83B7\u8425\u89C1\u89C2\u89C4\u89C6\u8BA1\u8BA4\u8BAE\u8BAF\u8BB0\u8BB8\u8BBA\u8BBE\u8BC1\u8BC4\u8BD1\u8BDD\u8BE5\u8BED\u8BF4\u8C03\u8D22\u8D23\u8D28\u8D39\u8D44\u8D5B\u8F66\u8F6C\u8F83\u8FBE\u8FC7\u8FD0\u8FD8\u8FD9\u8FDB\u8FDE\u9009\u94C1\u957F\u95E8\u95EE\u95F4\u95FB\u961F\u9633\u9645\u9646\u96BE\u9879\u9884\u9886\u9898\u98CE\u9A6C\u9F99
+NGram.KANJI_4_18=\u51DB\u67B7
+NGram.KANJI_4_22=\u4FA5\u545B\u5499\u5520\u5570\u56F1\u5A76\u5C96\u60AF\u60ED\u618B\u61A8\u62A0\u62A1\u62E7\u6363\u6390\u63B0\u6400\u6402\u6512\u6748\u70C1\u732C\u765E\u7663\u76CF\u7741\u781A\u7980\u79C6\u79FD\u7AA5\u7B0B\u7B8D\u7BA9\u7BAB\u7BD3\u7CAA\u7EAB\u7ECA\u7EE2\u7F2D\u7F30\u8110\u8113\u81CA\u835A\u8360\u84D6\u852B\u87E5\u8869\u8A8A\u8BA5\u8BF2\u8C05\u8C12\u8D30\u8D4A\u8D61\u8DF7\u8E6D\u8E8F\u8F95\u8F99\u8FAB\u94B3\u94C6\u94E3\u9504\u954A\u9563\u95FA\u9893\u9981\u9992\u9AA1\u9CAB\u9E2F\u9E33\u9EB8
+NGram.KANJI_4_24=\u4E22\u4E8F\u4F1E\u4FA3\u5151\u517D\u51BB\u51D1\u5220\u529D\u52CB\u5367\u5389\u5395\u53E0\u53F9\u5413\u548F\u5524\u575E\u575F\u5784\u5792\u57A6\u57AB\u58F3\u5986\u5988\u5A04\u5A07\u5BA0\u5C18\u5C82\u5DE9\u5E10\u5E1C\u5F2F\u60E9\u6124\u629B\u6321\u6324\u635E\u63FD\u6401\u644A\u6491\u655B\u658B\u6635\u67AB\u67DC\u680B\u692D\u6984\u6A31\u6B7C\u6BD9\u6C22\u6CA6\u6CA7\u6CEA\u6CFB\u6CFC\u6D46\u6D47\u6D4A\u6D51\u6DA1\u6E0A\u6E83\u6EE4\u6EE5\u6F9C\u6FD2\u70C2\u7237\u727A\u730E\u7574\u75AF\u7792\u7816\u7845\u78B1\u7A77\u7A91\u7A9D\u7AD6\u7B3C\u7B5B\u7CAE\u7EA4\u7EB1\u7EBA\u7ECE\u7ED1\u7EF0\u7EF3\u7F14\u7F1D\u7F34\u7F62\u8042\u806A\u80A0\u80A4\u80BE\u80BF\u80C0\u810F\u8138\u8231\u8270\u829C\u82CD\u8350\u83B9\u841D\u8574\u8680\u8BB3\u8BBC\u8BBD\u8BC8\u8BF1\u8BFD\u8C0A\u8C0D\u8C1C\u8C24\u8C26\u8C2C\u8C2D\u8C34\u8D1E\u8D2C\u8D3C\u8D41\u8D42\u8D4C\u8D50\u8D5A\u8F69\u8F88\u8F90\u8FA9\u915D\u9171\u9493\u949E\u94A7\u94A9\u94BB\u94C3\u94C5\u94DD\u94F8\u9505\u9510\u9523\u9524\u95EF\u
 95F7\u95F9\u9600\u9610\u96F3\u97F5\u987D\u9882\u9888\u9896\u98D8\u9971\u9972\u9976\u997C\u9A84\u9A86\u9A8F\u9A97\u9A9A\u9AA4\u9CB8\u9CDE\u9E26\u9E43\u9E64\u9E70\u9F7F\u9F9F
+NGram.KANJI_4_28=\u534E\u62A5\u7ECF\u7F51
+NGram.KANJI_4_34=\u4E34\u4E3D\u4E4C\u4E54\u4E60\u4E61\u4E70\u4EB2\u4EC5\u4EEA\u4F18\u4F1F\u4F24\u4F26\u4FA7\u50A8\u513F\u5174\u517B\u518C\u519C\u51B5\u51CF\u5218\u521A\u5267\u52B3\u5356\u5382\u5385\u538B\u53A6\u5434\u5706\u5723\u5757\u575A\u575B\u575D\u5907\u591F\u593A\u5956\u5B59\u5BA1\u5BAB\u5BBD\u5BBE\u5BFB\u5C42\u5C81\u5E01\u5E08\u5E86\u5E93\u5F02\u5F39\u5F52\u5F55\u5F7B\u6000\u6001\u6076\u620F\u6237\u6267\u6269\u626C\u62A2\u62A4\u62DF\u62E5\u62E9\u6325\u635F\u6362\u6444\u6653\u6682\u6740\u6742\u6768\u6781\u6811\u6837\u6865\u68C0\u6B22\u6BC1\u6BD5\u6C47\u6C9F\u6CAA\u6CFD\u6D4B\u6DA8\u6E10\u6EE1\u6EE8\u706D\u7075\u70DF\u7231\u739B\u7597\u76D6\u76D8\u77FF\u7801\u7840\u79BB\u7A33\u7ADE\u7B14\u7B7E\u7CA4\u7D27\u7EB3\u7EBD\u7EC3\u7EC6\u7EC8\u7ECD\u7ED5\u7ED9\u7EDC\u7EDD\u7EE9\u7EFC\u7EFF\u7F13\u7F29\u8083\u80DC\u8111\u814A\u8230\u827A\u8363\u836F\u8428\u84DD\u867D\u8865\u88AD\u89C8\u8BA2\u8BA8\u8BA9\u8BAD\u8BB2\u8BBF\u8BC6\u8BCD\u8BD5\u8BEF\u8BF7\u8BF8\u8BFA\u8BFB\u8C08\u8D1D\u8D1F\u
 8D21\u8D25\u8D27\u8D2D\u8D2F\u8D35\u8D38\u8DC3\u8F6E\u8F6F\u8F7B\u8F7D\u8F86\u8F91\u8F93\u8F96\u8FB9\u8FBD\u8FC1\u8FDC\u8FDD\u9002\u9057\u90BB\u90D1\u91CA\u9488\u949F\u94A2\u94B1\u94F6\u9500\u9526\u9547\u9614\u9634\u9635\u9636\u9648\u9655\u9669\u9690\u97E9\u9875\u9876\u987A\u987B\u987E\u987F\u9891\u989D\u98DE\u9986\u9A7B\u9A8C\u9C81\u9C9C\u9F50
+NGram.KANJI_4_39=\u4E1B\u4E1D\u4E27\u4EA9\u4ED1\u4ED3\u4F2A\u4FA6\u4FA8\u503A\u503E\u507F\u5188\u51AF\u51C0\u51C9\u51ED\u51EF\u5242\u5251\u52B2\u5362\u53A2\u5415\u5417\u5428\u55B7\u5760\u5899\u5939\u594B\u5987\u5A31\u5A74\u5BAA\u5C1D\u5C7F\u5C97\u5CAD\u5E05\u5E2E\u5E99\u5E9E\u5E9F\u5F03\u5FC6\u5FE7\u60AC\u60CA\u60EF\u626B\u6270\u629A\u62E6\u62E8\u6446\u6447\u654C\u67AA\u680F\u6863\u68A6\u6C64\u6D01\u6D53\u6D9D\u6DA6\u6E14\u6E17\u6EDA\u6EE9\u707F\u70BC\u70E7\u7275\u72B9\u72EE\u72F1\u743C\u7545\u76D0\u7855\u7978\u7B79\u7BEE\u7EA0\u7EAC\u7EAF\u7EB2\u7EB5\u7EB7\u7EB8\u7EB9\u7ED8\u7EEA\u7EF5\u7F05\u7F06\u7F18\u7F5A\u80C1\u80F6\u8109\u8206\u8273\u82F9\u8346\u8361\u83B2\u8427\u8651\u867E\u8854\u89C9\u8BC9\u8BCA\u8BD7\u8BDA\u8BDE\u8BE2\u8BE6\u8BFE\u8C01\u8C0B\u8C10\u8C13\u8C22\u8C23\u8C28\u8C31\u8D24\u8D26\u8D29\u8D2A\u8D2B\u8D34\u8D37\u8D3A\u8D3E\u8D3F\u8D4B\u8D4F\u8D54\u8D56\u8D5E\u8D60\u8D62\u8D75\u8D76\u8D8B\u8F68\u8F70\u8F74\u8F85\u8F89\u8FC8\u8FDF\u900A\u9012\u903B\u9093\u90AE\u917F\u
 9274\u94A6\u94DC\u94ED\u94FA\u94FE\u9501\u950B\u9519\u9521\u952E\u955C\u95EA\u95ED\u95F2\u95F8\u95FD\u9601\u9605\u9647\u96B6\u96FE\u9877\u9881\u9887\u9897\u989C\u98A0\u996D\u996E\u9970\u9A70\u9A71\u9A73\u9A76\u9A7E\u9A91\u9C7C\u9E1F\u9E21\u9E23\u9E2D\u9E3F\u9E4F\u9F84
+NGram.KANJI_5_10=\u5239\u8EAF
+NGram.KANJI_5_11=\u51C4\u8471
+NGram.KANJI_5_12=\u6DC0\u7C98
+NGram.KANJI_5_13=\u5631\u5815\u8695
+NGram.KANJI_5_14=\u4E71\u4FA0\u5265\u52B1\u5374\u53A8\u53D9\u58EE\u5BDD\u5BFF\u5C3D\u5C4A\u5CE1\u5F25\u5F84\u604B\u60A6\u60E7\u60E8\u631F\u636E\u643A\u663C\u664B\u67A2\u6816\u697C\u6B8B\u6BB4\u6D45\u6E7F\u6EDE\u6F5C\u706F\u7089\u72ED\u732A\u732B\u76D7\u793C\u7977\u7A0E\u7A83\u80C6\u811A\u8131\u82A6\u830E\u848B\u865A\u866B\u86EE\u89E6\u8A89\u8DF5\u8E0A\u8E2A\u8F9E\u9065\u968F\u9759\u9EA6
+NGram.KANJI_5_18=\u601C\u75D2
+NGram.KANJI_5_26=\u4E07\u4E0E\u4E89\u4F1A\u4F53\u515A\u5185\u5199\u533A\u533B\u53C2\u53CC\u53F7\u58F0\u5965\u5B66\u5B9D\u5C06\u5C5E\u5F53\u62C5\u6570\u65AD\u65E7\u6761\u6765\u6A2A\u6B27\u6CA1\u6E29\u6E7E\u70B9\u72B6\u72EC\u732E\u753B\u79F0\u88C5\u9EC4
+NGram.KANJI_5_29=\u693F\u82EB
+NGram.KANJI_5_34=\u53F6\u6D9B\u83B1
+NGram.KANJI_5_39=\u5C61\u788D
+NGram.KANJI_6_0=\u4E10\u4E52\u4EC6\u4F88\u4FD0\u51F3\u533E\u53ED\u53EE\u5406\u541D\u5429\u5435\u5440\u5490\u5495\u54B1\u54C4\u54FC\u557C\u55D3\u5669\u56E4\u5777\u5992\u59E8\u5B7D\u5BDE\u5BE5\u5C79\u5C94\u5DCD\u5E18\u5E1A\u5E54\u5FF1\u604D\u6064\u60F6\u6127\u6177\u6233\u6252\u625B\u6273\u6296\u62C2\u62C7\u62F4\u638F\u6396\u63E3\u63EA\u6413\u6479\u64A9\u64C2\u659F\u667E\u6760\u6845\u6963\u6A90\u6B83\u6C13\u6C5E\u6D8E\u6D95\u6DCC\u6ED4\u6F13\u6F3E\u6FA1\u7076\u70D8\u710A\u71CE\u7239\u72E1\u73B7\u7599\u759A\u75A4\u75CA\u7629\u7682\u76C5\u76EF\u778E\u77AA\u787C\u7889\u788C\u78BE\u79E7\u7A96\u7A98\u7B77\u7C7D\u7CB1\u7D0A\u7D6E\u7F94\u7FCE\u8116\u814B\u814C\u819B\u828D\u82DF\u8301\u83E0\u85D5\u8611\u86A3\u8708\u8822\u8C4C\u8DB4\u8DEA\u8E42\u8E66\u8E72\u8EBA\u901B\u9157\u970E\u97ED
+NGram.KANJI_6_3=\u62FC\u88D4\u9B4F
+NGram.KANJI_6_9=\u4ED7\u4F63\u4FCF\u5018\u50BB\u50F5\u5154\u5201\u522E\u5254\u527F\u5306\u5462\u5492\u5496\u54A8\u54AA\u554A\u5561\u5564\u5566\u5885\u5938\u5AC2\u5AE9\u5CED\u5F64\u6084\u608D\u60A8\u60D5\u61C2\u61C8\u6254\u626F\u62AC\u6346\u634D\u640F\u6454\u6487\u6495\u64D2\u6746\u6789\u68B3\u68F5\u695E\u6986\u6995\u69A8\u6A44\u6AAC\u6B79\u6C28\u6C2E\u6CF5\u6DE4\u6E34\u6E3A\u6E89\u6F29\u70AB\u70AC\u7130\u715E\u7184\u71AC\u7238\u7281\u72E0\u74E3\u74F7\u7529\u7578\u761F\u7626\u76D4\u775B\u7779\u7784\u77BB\u780C\u780D\u7838\u7898\u78C5\u78F7\u7AED\u7B28\u7BE1\u7C07\u7CD5\u7CD9\u7CEF\u7F38\u800D\u8084\u809A\u8165\u816E\u832B\u8334\u840D\u8774\u886B\u888D\u88D9\u88F9\u8C41\u8D81\u8D9F\u8E22\u8E29\u8EB2\u8F9C\u9165\u918B\u9631\u964B\u964C\u9661\u9709\u9739\u9776\u9AD3\u9ED4
+NGram.KANJI_6_10=\u4E53\u5582\u5600\u6342\u7B06
+NGram.KANJI_6_11=\u5288\u543C\u5475\u5486\u54EE\u5598\u56BC\u5962\u5A36\u5A9A\u5B75\u5BA6\u5C38\u5C4E\u5F8A\u5F98\u627C\u62CC\u62D7\u63C9\u6930\u6954\u69D0\u6BEF\u6C90\u6CBD\u6CBE\u6F31\u6F88\u70D9\u7329\u75BC\u75F0\u7737\u77D7\u7B19\u7FB9\u803F\u80D6\u813E\u81C0\u8205\u8309\u83BD\u846B\u8517\u868C\u8759\u8815\u8859\u8B6C\u8E81\u8EAC\u90A2\u9698\u9B44
+NGram.KANJI_6_12=\u722C\u7FD4
+NGram.KANJI_6_16=\u5228\u5315\u542E\u54CE\u5509\u5527\u5543\u55B3\u55E1\u5636\u568E\u5FFF\u61E6\u6376\u642A\u6726\u74E4\u76F9\u7736\u7BD9\u8019\u80F0\u80F3\u812F\u818A\u8200\u8214\u8638\u869C\u86C0\u86C6\u86D4\u87C6\u88B1\u8902\u8C7A\u8E4B\u9119
+NGram.KANJI_6_18=\u67D2\u6ED3\u87C0\u87CB\u8DDB\u901E\u9163
+NGram.KANJI_6_20=\u4F5B\u52D2\u54C8\u62FF\u66FC\u6D59\u704C\u7586\u9ECE
+NGram.KANJI_6_21=\u4E48\u4EFF\u4F19\u4FF1\u5021\u5077\u5195\u5212\u5269\u5401\u541E\u5427\u54EA\u5587\u558A\u55BB\u566A\u573E\u574E\u5783\u57AE\u584C\u58E4\u5960\u5976\u59CA\u5A1C\u5DE2\u5F99\u600E\u6015\u6263\u626D\u6293\u62C6\u62D6\u62EF\u62F1\u6316\u632A\u6380\u6389\u63D2\u641E\u64C5\u64CE\u65F1\u6664\u6735\u6770\u67EC\u6846\u684C\u68AD\u6B47\u6B49\u6B67\u6C1B\u6C27\u6C2F\u6C5B\u6C89\u6DF9\u6EAF\u70AE\u70E4\u731C\u7334\u73BB\u7470\u76FC\u788E\u789F\u78B0\u78B3\u7A0D\u7A3B\u7A57\u7CB9\u7F69\u8335\u8354\u84BF\u8DCC\u8DD1\u904F\u90A8\u9189\u9677\u9738\u978B
+NGram.KANJI_6_22=\u5162\u53E8\u542D\u5501\u552C\u5639\u563F\u56B7\u6043\u60B4\u6194\u61CA\u634E\u63CD\u6414\u64AC\u6DAE\u6E43\u6F66\u7095\u7316\u733E\u7728\u7830\u78D5\u7ABF\u7FE9\u8018\u80EF\u8198\u8693\u86AA\u86AF\u874C\u8783\u879F\u8892\u8E6C
+NGram.KANJI_6_23=\u4FD8\u4FEF\u501A\u5085\u5180\u526A\u5323\u54ED\u5634\u56CA\u58A9\u58F9\u5955\u5978\u59DA\u5A49\u5B55\u5BC7\u5BE8\u5D4C\u5E62\u6467\u64BC\u6500\u655E\u6572\u658C\u6670\u68CD\u68D5\u68E0\u6912\u6A0A\u6BB7\u6C9B\u6D3D\u6DC6\u6E23\u6F8E\u7011\u7092\u714C\u73AB\u7405\u7624\u76D2\u7960\u79C9\u7A20\u7BF7\u7F50\u804A\u8086\u81C2\u8292\u82DE\u852C\u857E\u859B\u8760\u8C6B\u8DBE\u8E48\u8F9F\u96A7
+NGram.KANJI_6_25=\u4E8E\u5DF2\u5FB7\u7AD9
+NGram.KANJI_6_28=\u4E58\u4ECD\u4EFD\u4F30\u4F60\u4F69\u503C\u5047\u51B0\u51F0\u5361\u5377\u53E6\u54E5\u552E\u5708\u5740\u5761\u57C3\u5821\u589E\u5979\u59C6\u5B69\u5B83\u5E15\u5E76\u5F17\u5F88\u6208\u622A\u624E\u627E\u62D4\u62DC\u63ED\u641C\u6536\u6548\u65C1\u665A\u6668\u67E5\u6B65\u6BCF\u6C61\u6CDB\u6D4E\u6D89\u6DB5\u6E38\u6EAA\u6FB3\u70B8\u745F\u7538\u7A97\u7F3A\u7F55\u805A\u8258\u827E\u82AC\u8303\u83F2\u8482\u85CF\u8DDF\u903E\u9080\u970D\u9760\u9ED1\u9ED8
+NGram.KANJI_6_29=\u634F\u6518\u7B50\u809B
+NGram.KANJI_6_30=\u54A7\u57C2\u5AB3\u60CB\u6886\u8378\u85D0\u8671
+NGram.KANJI_6_32=\u5080\u5121\u51A4\u54AC\u55DC\u592D\u5DEB\u6292\u68D8\u69B4\u6A59\u6E24\u7FC5\u80DA\u8180\u86DB\u8700\u8DCB\u9761
+NGram.KANJI_6_34=\u4E30\u51E0\u542C\u613F
+NGram.KANJI_6_35=\u4E56\u547B\u55FD\u5C41\u606C\u6115\u6CAE\u7119\u795F\u7CDC\u86C9\u86F9\u8713\u873B\u8757\u8925\u892A\u96F9
+NGram.KANJI_6_37=\u51B2\u5308\u5398\u54B8\u59DC\u5C4F\u5D14\u5F6D\u60E0\u6241\u6350\u699C\u6BEB\u6C6A\u6CC4\u6DEE\u6F58\u6F6D\u7199\u77EE\u7ADF\u8058\u820D\u8212\u8389\u8587\u884D\u8881\u8FA8\u8FF9\u96D5
+NGram.KANJI_6_39=\u574F\u6251\u6302
+NGram.KANJI_7_0=\u52FA\u5544\u60F0\u6994\u86A4\u86E4
+NGram.KANJI_7_3=\u4E59\u4E7E\u4EAD\u4EF0\u4EF2\u4F0F\u4F10\u4FAF\u4FCA\u500D\u501F\u5076\u508D\u50E7\u5112\u5146\u5192\u51AC\u51DD\u51FD\u5200\u5237\u524A\u52A3\u52C3\u52C7\u52DF\u5351\u5352\u5353\u5378\u537F\u53E5\u5439\u54FA\u574A\u5782\u57CB\u5893\u58C1\u5915\u5937\u5949\u5951\u5974\u59B9\u5A18\u5A5A\u5ACC\u5B54\u5B5D\u5B64\u5B8F\u5BBF\u5BD2\u5C3A\u5C6F\u5CB3\u5D07\u5DE7\u5E84\u5E8A\u5F26\u5F69\u5F70\u5F90\u5FAA\u5FCD\u6012\u6016\u602A\u60A0\u60B2\u60BC\u6148\u6162\u6170\u6291\u6298\u62AB\u62BC\u62BD\u62D2\u62D3\u62D8\u62F3\u6311\u638C\u6398\u63E1\u642C\u6458\u64A4\u654F\u656C\u659C\u65E2\u65E8\u65EC\u6606\u6614\u6676\u6691\u6696\u66F9\u6749\u676F\u679A\u679D\u67CF\u67D4\u67F1\u67F3\u67F4\u6817\u6842\u6843\u6851\u68A8\u68CB\u68D2\u6B20\u6B32\u6BBF\u6C57\u6C88\u6CCA\u6D17\u6D1E\u6D69\u6D6E\u6D78\u6DE1\u6DFB\u6E58\u6EB6\u6F0F\u6F20\u7070\u708E\u70AD\u7126\u718A\u71C3\u7267\u72C2\u731B\u7384\u73A9\u73CD\u7434\u75AB\u75DB\u76C6\u76FE\u773C\u7891\u78C1\u795D\u7965\u79D2\u79DF\u79E6\u7
 A00\u7B11\u7B51\u7B54\u7C89\u7C92\u7CD6\u7D2B\u7F8A\u7FBD\u7FFC\u8010\u80A5\u80CE\u8150\u8179\u819C\u8247\u829D\u82B3\u82D7\u82E6\u8302\u8336\u8352\u83CA\u83CC\u83DC\u845B\u846C\u84B2\u84B8\u84C4\u8584\u864E\u86C7\u8861\u8863\u8870\u888B\u8896\u88D5\u8986\u8C46\u8DA3\u8E0F\u8F9B\u8FC5\u8FEB\u8FF7\u9003\u9006\u902E\u9042\u9063\u90ED\u963B\u9676\u96EA\u9756\u9B3C\u9B42\u9F3B
+NGram.KANJI_7_6=\u4E01\u4E03\u4E45\u4E5D\u4E88\u4E92\u4EA1\u4ECB\u4EE4\u4F01\u4F0A\u4F2F\u4F3C\u4F4E\u4F4F\u4F55\u4F8B\u4F9D\u4FBF\u4FEE\u505C\u50CF\u516B\u516D\u5175\u5177\u5178\u5207\u520A\u5224\u526F\u529F\u52A9\u5343\u5348\u535A\u5370\u53BB\u53CB\u53F3\u5409\u542B\u544A\u547C\u5584\u5747\u5802\u590F\u592B\u5931\u5947\u597D\u5A01\u5A92\u5B63\u5B8C\u5B97\u5BA2\u5BA3\u5BA4\u5BB3\u5BB9\u5BC6\u5BCC\u5BDF\u5C04\u5C1A\u5C45\u5C4B\u5CB8\u5DE6\u5E0C\u5E1D\u5E2D\u5E55\u5E8F\u5E95\u5E97\u5EA7\u5EB7\u5EF6\u5F8B\u5FAE\u5FC5\u5FD7\u5FF5\u601D\u6025\u606F\u60F3\u611F\u623F\u6253\u6279\u627F\u6295\u6297\u62EC\u6388\u6392\u63F4\u6545\u6551\u6574\u6599\u65C5\u65E9\u6613\u6620\u6625\u666E\u666F\u66B4\u66F4\u670D\u671B\u6728\u672B\u6751\u677E\u67B6\u6838\u6839\u6848\u68EE\u690D\u6982\u6A21\u6B4C\u6B62\u6B66\u6BB5\u6BCD\u6C0F\u6C38\u6C42\u6CBF\u6CE2\u6CE8\u6D0B\u6D3E\u6D88\u6DF1\u6E05\u6E56\u706B\u7167\u7206\u7236\u7247\u7387\u7530\u7537\u7559\u7565\u7591\u75C5\u767B\u767D\u767E\u7687\u76DB\u76DF\u7
 71F\u7763\u77ED\u7834\u79FB\u7A81\u7AE0\u7AEF\u7B56\u7B97\u7C4D\u7CBE\u7D20\u7D22\u7F72\u7FA4\u8001\u8003\u81F4\u822A\u826F\u82B1\u8349\u843D\u878D\u8857\u89D2\u8B66\u8C37\u8D70\u8D85\u8D8A\u8DB3\u8FF0\u8FFD\u9001\u901F\u90A3\u90A6\u914D\u91CE\u9632\u963F\u9644\u964D\u9664\u96C4\u96E8\u9752\u9769\u98DF
+NGram.KANJI_7_7=\u4E09\u4E0A\u4E0B\u4E0D\u4E16\u4E3B\u4E8B\u4E8C\u4EE3\u4EE5\u4F4D\u4F5C\u4F7F\u5165\u5168\u516C\u5171\u51FA\u5206\u5229\u5236\u524D\u529B\u52A0\u5316\u5317\u5357\u539F\u53CA\u53F0\u5408\u540C\u540D\u548C\u5730\u57FA\u5916\u591A\u5929\u5B50\u5B9A\u5BB6\u5C0F\u5C71\u5DDE\u5DE5\u5E02\u5E73\u5EA6\u5EFA\u5F0F\u6027\u6210\u6240\u6307\u653F\u6587\u65B0\u65B9\u660E\u6700\u6709\u671F\u672C\u6B21\u6B63\u6C11\u6CBB\u6CD5\u6D77\u7269\u7279\u7406\u751F\u7528\u7531\u754C\u76EE\u76F8\u793E\u79D1\u7ACB\u7B2C\u7B49\u7CFB\u8005\u80FD\u81EA\u82F1\u884C\u8868\u897F\u8981\u901A\u9053\u90E8\u90FD\u91CD\u9AD8
+NGram.KANJI_7_9=\u4E4D\u4F36\u5319\u6A61\u6DCB\u7194
+NGram.KANJI_7_11=\u4E5E\u4F43\u5026\u50FB\u515C\u5243\u5420\u5446\u54B3\u54BD\u553E\u55A7\u5703\u5984\u5AC9\u5B09\u5C51\u5DFE\u5ED3\u5F1B\u6055\u618E\u62D9\u65A7\u6652\u6977\u6EBA\u707C\u75D8\u79E4\u7AFF\u7B4F\u7CA5\u808B\u8098\u80B4\u8235\u82DB\u849C\u8549\u868A\u86FE\u8718\u914C
+NGram.KANJI_7_12=\u4E08\u4E38\u4F8D\u50DA\u5203\u5256\u52C9\u52D8\u52FE\u5320\u533F\u5375\u53D4\u540F\u54E8\u56DA\u5806\u5996\u5999\u59A5\u59A8\u59FF\u5AE1\u5BB0\u5BF8\u5C09\u5C3F\u5C48\u5C65\u5D29\u5E06\u5E4C\u5EB5\u5EB6\u5EB8\u5F13\u5FCC\u5FD8\u6052\u606D\u609F\u60D1\u614E\u6247\u62B1\u6349\u64E6\u6577\u65ED\u6674\u6734\u67C4\u6850\u690E\u6A58\u6B3A\u6B89\u6C41\u6CBC\u6CCC\u6CF3\u6D74\u6DAF\u6DF3\u6ECB\u6F02\u6F84\u71E5\u7261\u7272\u72AC\u72FC\u733F\u7409\u755C\u76F2\u7720\u77AC\u77E2\u7802\u786B\u78E8\u7901\u7948\u79E9\u7A1A\u7A74\u7AE3\u7B4B\u7B52\u7BB1\u7C3F\u8015\u8096\u809D\u80A2\u80A9\u80AA\u80BA\u80F8\u8102\u810A\u8154\u8155\u8170\u817A\u81A8\u81ED\u820C\u8236\u82BD\u8305\u83E9\u83F1\u840C\u85FB\u8650\u8702\u8A93\u8E44\u8FB0\u9038\u9091\u90AA\u916C\u9175\u9177\u9685\u96C0\u96C7\u96CC\u97AD
+NGram.KANJI_7_13=\u63D6\u803D
+NGram.KANJI_7_16=\u602F\u7566
+NGram.KANJI_7_18=\u634C\u7C38
+NGram.KANJI_7_19=\u4E18\u4E73\u4E95\u4EAB\u4EC1\u4ED8\u4ED9\u4F11\u4F34\u4F38\u4F59\u4FB5\u4FC3\u4FD7\u5012\u5019\u5065\u50AC\u5144\u5145\u514D\u517C\u51A0\u51B7\u5211\u5238\u523A\u523B\u5272\u52E4\u5360\u5371\u539A\u541B\u5426\u5438\u5473\u54F2\u5510\u552F\u5531\u559C\u5609\u56F0\u56FA\u591C\u5948\u594F\u59BB\u59D3\u5B85\u5B87\u5B88\u5B99\u5B9C\u5BC4\u5BFA\u5C0A\u5C3E\u5CA9\u5D0E\u5DE1\u5DE8\u5DEE\u5DF1\u5E45\u5E78\u5E7B\u5E7C\u5EAD\u5EF7\u5F1F\u5F31\u5F79\u5F7C\u5F85\u5F92\u5FA1\u5FE0\u6050\u60A3\u6212\u62DB\u632F\u6355\u63A2\u63AA\u63CF\u642D\u6469\u64CD\u653B\u6563\u660C\u662D\u667A\u6697\u66FF\u6750\u675F\u677F\u6790\u67D3\u682A\u6885\u68B0\u6B8A\u6B96\u6BDB\u6C60\u6CB9\u6CC9\u6D25\u6D66\u6DB2\u6DF7\u6E21\u6ED1\u6F2B\u6F6E\u6FC0\u7235\u725B\u72AF\u7389\u7532\u7533\u756A\u75BE\u75C7\u76AE\u76CA\u7740\u786C\u7956\u7968\u796D\u7981\u79C0\u79C1\u79CB\u79D8\u7A3F\u7AE5\u7AF9\u7E41\u7F6A\u7FFB\u8089\u80CC\u80DE\u81E3\u821E\u8239\u82E5\u8328\u8377\u85E4\u8840\u88C1\u88C2\u8C6A\u8D64\u
 8DDD\u8FCE\u8FD4\u9000\u9014\u907F\u90CA\u90CE\u90E1\u9152\u9178\u9686\u9694\u969C\u9707\u9732\u9AA8\u9B54\u9E7F\u9EBB
+NGram.KANJI_7_20=\u4E39\u4E43\u4EAE\u4F73\u504F\u505A\u51C6\u51CC\u52AA\u5339\u5347\u53EB\u53EC\u5448\u5766\u57F9\u5854\u585E\u58A8\u5B8B\u5C01\u5CF0\u5E72\u5EC9\u5F80\u5F81\u5FBD\u5FEB\u6069\u6211\u624D\u628A\u62B5\u62CD\u6309\u63A7\u64AD\u6566\u6597\u65CB\u65D7\u6628\u6717\u6731\u674E\u675C\u683D\u6881\u6B3E\u6BD2\u6C7D\u6C99\u6CE5\u6CF0\u6D1B\u6D2A\u70C8\u719F\u724C\u7259\u73E0\u73ED\u745E\u74E6\u7518\u751A\u7686\u770B\u7B26\u8033\u80A1\u80E1\u821F\u83AB\u8499\u8D74\u8DE8\u900F\u9010\u9047\u904D\u906D\u9675\u96C5\u96F6\u96F7\u9700\u9F13
+NGram.KANJI_7_21=\u5764\u59D0\u5A03\u6062\u6108\u68C9\u7164\u79BE\u7BAD\u903C
+NGram.KANJI_7_23=\u4EA5\u50B2\u532A\u5366\u543B\u54E9\u5632\u59D1\u5BB5\u5DF7\u5F6A\u5F6C\u5FFD\u6070\u6168\u61BE\u63A0\u63A9\u6478\u65A4\u68A7\u6A1F\u6CAB\u70F9\u711A\u723D\u7262\u72F8\u751C\u754F\u75B9\u76C8\u7709\u7897\u7CCA\u7F9E\u8299\u82AD\u82B9\u82D4\u8304\u84C9\u84EC\u854A\u85AF\u86D9\u8FA3\u9187\u97A0
+NGram.KANJI_7_25=\u4E14\u4E5F\u4F46\u514B\u5176\u5230\u5373\u53EA\u540E\u5982\u5C3C\u5DF4\u6216\u62C9\u65AF\u66FE\u6B64\u6D32\u6D6A\u7BC7\u800C
+NGram.KANJI_7_28=\u4E4E\u4E9B\u4EA6\u4EC0\u4FC4\u5403\u5957\u5C24\u6089\u6258\u67D0\u758F\u7FF0\u8D6B
+NGram.KANJI_7_29=\u4FAE\u5944\u5A29\u6101\u62ED\u6328\u637B\u6666\u6687\u66AE\u673D\u6756\u67FF\u6813\u68A2\u699B\u7078\u708A\u7396\u7422\u7525\u75E2\u76BF\u7766\u77B3\u7A3C\u7A92\u819D\u81FC\u8237\u8338\u8511\u88F3\u8FC2
+NGram.KANJI_7_32=\u4E11\u4F3A\u4F51\u5197\u51B6\u51F9\u52FF\u541F\u5507\u5589\u5993\u5A7F\u5AC1\u5B9B\u5BC2\u5BE1\u5F04\u5F0A\u5F27\u6020\u6028\u6068\u6094\u6109\u611A\u614C\u621A\u62B9\u62D0\u62F7\u62FE\u632B\u633D\u6367\u660F\u6627\u6643\u66D9\u674F\u6795\u67AF\u67D1\u6876\u68DA\u68FA\u6905\u69FD\u6A80\u6B6A\u6CB8\u6CE3\u6DD1\u6DEB\u6E9C\u6EA2\u6EF4\u6F06\u714E\u716E\u722A\u7280\u74A7\u752B\u75B2\u75D5\u75F4\u77AD\u77E9\u785D\u79BD\u7A3D\u7A9F\u7B1B\u7B95\u7C9F\u7CDF\u80C3\u8106\u817F\u818F\u81B3\u828B\u82A5\u82AF\u840E\u851A\u853D\u8776\u87F9\u8877\u8910\u8912\u8C79\u8D66\u8FB1\u9017\u90C1\u916A\u9699\u96C1\u971C\u9774\u978D
+NGram.KANJI_7_33=\u4E4B\u4E86\u4E94\u4EA4\u4EAC\u4ECA\u4ED6\u4EF6\u4EFB\u4F9B\u4FDD\u4FE1\u5143\u5148\u5149\u518D\u5217\u521D\u5305\u5341\u534A\u53C8\u53CD\u53D6\u53D7\u53E3\u53E4\u53EF\u53F2\u53F8\u5404\u5411\u5468\u547D\u54C1\u5546\u5668\u56DB\u56DE\u56E0\u571F\u578B\u57CE\u57DF\u5883\u58EB\u592A\u592E\u5973\u59CB\u59D4\u5B57\u5B58\u5B89\u5B98\u5C11\u5C31\u5C40\u5C55\u5DDD\u5E03\u5E38\u5E9C\u5F15\u5F62\u5F71\u5F97\u5FC3\u60C5\u610F\u624B\u6280\u6301\u63A5\u63A8\u63D0\u652F\u6539\u653E\u6559\u65BD\u65CF\u661F\u66F2\u671D\u672A\u6797\u679C\u6821\u683C\u6B7B\u6BD4\u6C34\u6C5F\u6CB3\u6D3B\u6D41\u6E2F\u6E90\u6F14\u7136\u7248\u738B\u7403\u76F4\u7701\u77E5\u77F3\u7814\u793A\u795E\u798F\u7A0B\u7A76\u7A7A\u7BA1\u7C73\u7F6E\u7F8E\u80B2\u81F3\u822C\u8272\u8457\u88AB\u89E3\u8A00\u8C61\u8D77\u8DEF\u8EAB\u8FD1\u9020\u91CC\u91CF\u91D1\u9650\u9662\u96C6\u975E\u9762\u97F3\u9996\u9999
+NGram.KANJI_7_35=\u55C5\u57A2\u58D5\u59E5\u637A\u74E2\u7CE0\u895F
+NGram.KANJI_7_37=\u4E19\u4E32\u4E4F\u4E91\u4EC7\u4ED4\u4F0D\u5141\u51E1\u51F6\u51F8\u52AB\u535C\u53C9\u53DB\u540A\u5410\u54C0\u559D\u5750\u5751\u576A\u57E0\u5824\u582A\u5830\u5835\u5851\u5858\u586B\u5954\u59FB\u5A46\u5B5F\u5BB4\u5BD3\u5C16\u5C60\u5CFB\u5D16\u5E16\u5E3D\u5E7D\u5E87\u5ECA\u5FD9\u60DC\u60F9\u6155\u6167\u6234\u626E\u6276\u6284\u633A\u6377\u6492\u649E\u64B0\u6562\u6591\u65A5\u65E6\u65FA\u6602\u670B\u676D\u68AF\u695A\u6B23\u6BC5\u6C70\u6C83\u6CE1\u6D8C\u6DD8\u6E20\u71D5\u72D0\u72D7\u73B2\u73CA\u7433\u7483\u74DC\u74F6\u7554\u764C\u7761\u77DB\u78A7\u7A46\u7A7F\u7A84\u7C97\u7D2F\u7FC1\u7FE0\u8000\u8017\u808C\u80AF\u8404\u8461\u8463\u8475\u8513\u85AA\u8679\u86CB\u871C\u87BA\u88F8\u8C8C\u8DF3\u8FC4\u901D\u9022\u906E\u9075\u9192\u91C7\u966A\u971E\u9910\u9B41\u9F0E\u9F20
+TO_NORMALIZE_VI_CHARS=AEIOUYaeiouy\u00c2\u00ca\u00d4\u00e2\u00ea\u00f4\u0102\u0103\u01a0\u01a1\u01af\u01b0
+DMARK_CLASS=\u0300\u0301\u0303\u0309\u0323
+NORMALIZED_VI_CHARS_0300=\u00C0\u00C8\u00CC\u00D2\u00D9\u1EF2\u00E0\u00E8\u00EC\u00F2\u00F9\u1EF3\u1EA6\u1EC0\u1ED2\u1EA7\u1EC1\u1ED3\u1EB0\u1EB1\u1EDC\u1EDD\u1EEA\u1EEB
+NORMALIZED_VI_CHARS_0301=\u00C1\u00C9\u00CD\u00D3\u00DA\u00DD\u00E1\u00E9\u00ED\u00F3\u00FA\u00FD\u1EA4\u1EBE\u1ED0\u1EA5\u1EBF\u1ED1\u1EAE\u1EAF\u1EDA\u1EDB\u1EE8\u1EE9
+NORMALIZED_VI_CHARS_0303=\u00C3\u1EBC\u0128\u00D5\u0168\u1EF8\u00E3\u1EBD\u0129\u00F5\u0169\u1EF9\u1EAA\u1EC4\u1ED6\u1EAB\u1EC5\u1ED7\u1EB4\u1EB5\u1EE0\u1EE1\u1EEE\u1EEF
+NORMALIZED_VI_CHARS_0309=\u1EA2\u1EBA\u1EC8\u1ECE\u1EE6\u1EF6\u1EA3\u1EBB\u1EC9\u1ECF\u1EE7\u1EF7\u1EA8\u1EC2\u1ED4\u1EA9\u1EC3\u1ED5\u1EB2\u1EB3\u1EDE\u1EDF\u1EEC\u1EED
+NORMALIZED_VI_CHARS_0323=\u1EA0\u1EB8\u1ECA\u1ECC\u1EE4\u1EF4\u1EA1\u1EB9\u1ECB\u1ECD\u1EE5\u1EF5\u1EAC\u1EC6\u1ED8\u1EAD\u1EC7\u1ED9\u1EB6\u1EB7\u1EE2\u1EE3\u1EF0\u1EF1
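The five NORMALIZED_VI_CHARS_* rows above form a lookup table for Vietnamese diacritics: the position of a base letter in TO_NORMALIZE_VI_CHARS and the position of a combining mark in DMARK_CLASS select the precomposed character to substitute. A minimal sketch of that lookup, assuming the key strings and rows have already been loaded from this properties file; the method and parameter names are illustrative only and not part of the OpenNLP API:

    // Illustrative only: compose a base letter and a combining diacritic into the
    // precomposed character defined by the tables above.
    static char normalizeVi(char base, char mark, String toNormalizeViChars,
                            String dmarkClass, String[] normalizedRows) {
      int baseIdx = toNormalizeViChars.indexOf(base);  // column in every NORMALIZED_VI_CHARS_* row
      int markIdx = dmarkClass.indexOf(mark);          // selects the row (0300, 0301, 0303, 0309, 0323)
      if (baseIdx < 0 || markIdx < 0) {
        return base;  // combination not covered by the table, leave the character unchanged
      }
      return normalizedRows[markIdx].charAt(baseIdx);
    }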

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
new file mode 100644
index 0000000..cbe7d1a
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/DummyFactory.java
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+public class DummyFactory extends LanguageDetectorFactory {
+
+
+  public DummyFactory() {
+    super();
+  }
+
+  @Override
+  public void init() {
+    super.init();
+  }
+
+}
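The subclass adds no behaviour of its own; it exists so the factory tests further down can verify that the concrete factory class is recorded when a model is serialized and is instantiated again when the model is loaded (see LanguageDetectorFactoryTest below).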

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
new file mode 100644
index 0000000..f6c8b18
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorContextGeneratorTest.java
@@ -0,0 +1,43 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Arrays;
+import java.util.Collection;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageDetectorContextGeneratorTest {
+
+  @Test
+  public void extractContext() throws Exception {
+    String doc = "abcde fghijk";
+
+    LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator();
+
+    Collection<String> features = Arrays.asList(cg.getContext(doc));
+
+    Assert.assertEquals(21, features.size());
+    Assert.assertTrue(features.contains("ab"));
+    Assert.assertTrue(features.contains("abc"));
+    Assert.assertTrue(features.contains("e f"));
+    Assert.assertTrue(features.contains(" fg"));
+  }
+}
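The expected count follows directly from the input length: the 12-character string "abcde fghijk" yields 11 character bigrams and 10 character trigrams, 21 features in total, and the assertions sample a few of them, including n-grams that span the space. A minimal sketch of that kind of extraction, for illustration only; the actual LanguageDetectorContextGenerator may differ in detail (normalization, configurable n-gram sizes):

    import java.util.ArrayList;
    import java.util.List;

    // Illustrative only: all character bigrams and trigrams of the input,
    // enough to reproduce the counts asserted above.
    class CharNgramSketch {
      static List<String> charNgrams(String text) {
        List<String> ngrams = new ArrayList<>();
        for (int n = 2; n <= 3; n++) {
          for (int i = 0; i + n <= text.length(); i++) {
            ngrams.add(text.substring(i, i + n));
          }
        }
        return ngrams;  // "abcde fghijk" -> 11 bigrams + 10 trigrams = 21 features
      }
    }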

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
new file mode 100644
index 0000000..520fc71
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorCrossValidatorTest.java
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.util.TrainingParameters;
+
+public class LanguageDetectorCrossValidatorTest {
+
+  @Test
+  public void evaluate() throws Exception {
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, 100);
+    params.put(TrainingParameters.CUTOFF_PARAM, 5);
+    params.put("PrintMessages", false);
+
+
+    final AtomicInteger correctCount = new AtomicInteger();
+    final AtomicInteger incorrectCount = new AtomicInteger();
+
+    LanguageDetectorCrossValidator cv = new LanguageDetectorCrossValidator(params,
+        new LanguageDetectorFactory(), new LanguageDetectorEvaluationMonitor() {
+          @Override
+          public void correctlyClassified(LanguageSample reference,
+                                          LanguageSample prediction) {
+            correctCount.incrementAndGet();
+          }
+
+          @Override
+          public void missclassified(LanguageSample reference,
+                                     LanguageSample prediction) {
+            incorrectCount.incrementAndGet();
+          }
+        });
+
+    LanguageDetectorSampleStream sampleStream = LanguageDetectorMETest.createSampleStream();
+
+    cv.evaluate(sampleStream, 2);
+
+    Assert.assertEquals(99, cv.getDocumentCount());
+    Assert.assertEquals(0.98989898989899, cv.getDocumentAccuracy(), 0.01);
+  }
+
+}
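For reference, the expected accuracy 0.98989898989899 is 98/99: with two folds over the 99 documents in the sample data, exactly one document ends up misclassified, and the 0.01 delta keeps the assertion tolerant of small training differences.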

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
new file mode 100644
index 0000000..8bdd71b
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorEvaluatorTest.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.concurrent.atomic.AtomicInteger;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+
+public class LanguageDetectorEvaluatorTest {
+
+  @Test
+  public void processSample() throws Exception {
+    LanguageDetectorModel model = LanguageDetectorMETest.trainModel();
+    LanguageDetectorME langdetector = new LanguageDetectorME(model);
+
+    final AtomicInteger correctCount = new AtomicInteger();
+    final AtomicInteger incorrectCount = new AtomicInteger();
+
+    LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(langdetector,
+        new LanguageDetectorEvaluationMonitor() {
+          @Override
+          public void correctlyClassified(LanguageSample reference,
+                                          LanguageSample prediction) {
+            correctCount.incrementAndGet();
+          }
+
+          @Override
+          public void missclassified(LanguageSample reference,
+                                     LanguageSample prediction) {
+            incorrectCount.incrementAndGet();
+          }
+        });
+
+    evaluator.evaluateSample(new LanguageSample(new Language("pob"),
+        "escreve e faz palestras pelo mundo inteiro sobre anjos"));
+
+    evaluator.evaluateSample(new LanguageSample(new Language("fra"),
+        "escreve e faz palestras pelo mundo inteiro sobre anjos"));
+
+    evaluator.evaluateSample(new LanguageSample(new Language("fra"),
+        "escreve e faz palestras pelo mundo inteiro sobre anjos"));
+
+
+    Assert.assertEquals(1, correctCount.get());
+    Assert.assertEquals(2, incorrectCount.get());
+
+    Assert.assertEquals(3, evaluator.getDocumentCount());
+    Assert.assertEquals(0.33, evaluator.getAccuracy(), 0.01);
+  }
+
+}
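All three samples wrap the same Portuguese sentence, but only the first carries the label the detector actually predicts ("pob"); the two samples labelled "fra" are therefore counted as misclassified, which gives the asserted accuracy of 1/3, approximately 0.33.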

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
new file mode 100644
index 0000000..2a6c0ce
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorFactoryTest.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+public class LanguageDetectorFactoryTest {
+
+
+  private LanguageDetectorModel model;
+
+  @Before
+  public void train() throws Exception {
+
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8");
+
+    LanguageDetectorSampleStream sampleStream = new LanguageDetectorSampleStream(lineStream);
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "0");
+
+    this.model = LanguageDetectorME.train(sampleStream, params, new DummyFactory());
+  }
+
+  @Test
+  public void testCorrectFactory() throws IOException {
+    byte[] serialized = LanguageDetectorMETest.serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+
+  }
+
+  @Test
+  public void testDummyFactory() throws Exception {
+    byte[] serialized = LanguageDetectorMETest.serializeModel(
+        LanguageDetectorMETest.trainModel(new DummyFactory()));
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertTrue(myModel.getFactory() instanceof DummyFactory);
+
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
new file mode 100644
index 0000000..8caca1d
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/langdetect/LanguageDetectorMETest.java
@@ -0,0 +1,114 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package opennlp.tools.langdetect;
+
+import java.io.ByteArrayInputStream;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Before;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.PlainTextByLineStream;
+import opennlp.tools.util.TrainingParameters;
+
+
+public class LanguageDetectorMETest {
+
+  private LanguageDetectorModel model;
+
+  @Before
+  public void init() throws Exception {
+
+    this.model = trainModel();
+
+  }
+
+  @Test
+  public void testPredictLanguages() {
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    Language[] languages = ld.predictLanguages("estava em uma marcenaria na Rua Bruno");
+
+    Assert.assertEquals(4, languages.length);
+    Assert.assertEquals("pob", languages[0].getLang());
+    Assert.assertEquals("ita", languages[1].getLang());
+    Assert.assertEquals("spa", languages[2].getLang());
+    Assert.assertEquals("fra", languages[3].getLang());
+  }
+
+  @Test
+  public void testPredictLanguage() {
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    Language language = ld.predictLanguage("Dove è meglio che giochi");
+
+    Assert.assertEquals("ita", language.getLang());
+  }
+
+  @Test
+  public void testSupportedLanguages() {
+
+    LanguageDetector ld = new LanguageDetectorME(this.model);
+    String[] supportedLanguages = ld.getSupportedLanguages();
+
+    Assert.assertEquals(4, supportedLanguages.length);
+  }
+
+  @Test
+  public void testLoadFromSerialized() throws IOException {
+    byte[] serialized = serializeModel(model);
+
+    LanguageDetectorModel myModel = new LanguageDetectorModel(new ByteArrayInputStream(serialized));
+
+    Assert.assertNotNull(myModel);
+
+  }
+
+  protected static byte[] serializeModel(LanguageDetectorModel model) throws IOException {
+
+    ByteArrayOutputStream out = new ByteArrayOutputStream();
+    model.serialize(out);
+    return out.toByteArray();
+  }
+
+  public static LanguageDetectorModel trainModel() throws Exception {
+    return trainModel(new LanguageDetectorFactory());
+  }
+
+  public static LanguageDetectorModel trainModel(LanguageDetectorFactory factory) throws Exception {
+
+
+    LanguageDetectorSampleStream sampleStream = createSampleStream();
+
+    TrainingParameters params = new TrainingParameters();
+    params.put(TrainingParameters.ITERATIONS_PARAM, "100");
+    params.put(TrainingParameters.CUTOFF_PARAM, "2");
+
+    return LanguageDetectorME.train(sampleStream, params, factory);
+  }
+
+  public static LanguageDetectorSampleStream createSampleStream() throws IOException {
+
+    ResourceAsStreamFactory streamFactory = new ResourceAsStreamFactory(
+        LanguageDetectorMETest.class, "/opennlp/tools/doccat/DoccatSample.txt");
+
+    PlainTextByLineStream lineStream = new PlainTextByLineStream(streamFactory, "UTF-8");
+
+    return new LanguageDetectorSampleStream(lineStream);
+  }
+}

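For readers following the LangDetect branch, the test above also serves as a usage sketch of the new API. A minimal end-to-end example, assuming the usual imports from opennlp.tools.langdetect and opennlp.tools.util and a training file in the line-based format expected by LanguageDetectorSampleStream (the file name and variable names below are illustrative, not part of the commit):

  InputStreamFactory inFactory =
      new MarkableFileInputStreamFactory(new File("langdetect-train.txt"));
  LanguageDetectorSampleStream samples = new LanguageDetectorSampleStream(
      new PlainTextByLineStream(inFactory, "UTF-8"));

  TrainingParameters params = new TrainingParameters();
  params.put(TrainingParameters.ITERATIONS_PARAM, "100");
  params.put(TrainingParameters.CUTOFF_PARAM, "2");

  LanguageDetectorModel model =
      LanguageDetectorME.train(samples, params, new LanguageDetectorFactory());

  LanguageDetector detector = new LanguageDetectorME(model);
  Language best = detector.predictLanguage("Dove è meglio che giochi");
  System.out.println(best.getLang());  // "ita" with suitable training data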

[14/21] opennlp git commit: OPENNLP-1079 Added BratDocumentParser. Closed Annotation stream in BratDocument

Posted by jo...@apache.org.
OPENNLP-1079 Added BratDocumentParser. Closed Annotation stream in
BratDocument

Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/e9728694
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/e9728694
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/e9728694

Branch: refs/heads/LangDetect
Commit: e972869486f85c3424875a443eb04bda2eeb6bd3
Parents: 1aa5432
Author: Daniel Russ <dr...@mail.nih.gov>
Authored: Thu May 25 14:57:27 2017 -0400
Committer: Daniel Russ <dr...@mail.nih.gov>
Committed: Thu May 25 14:59:45 2017 -0400

----------------------------------------------------------------------
 .../tools/formats/brat/BratDocument.java        |   1 +
 .../tools/formats/brat/BratDocumentParser.java  | 149 +++++++++++++++++++
 .../formats/brat/BratNameSampleStream.java      | 120 +--------------
 3 files changed, 154 insertions(+), 116 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/e9728694/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
index 1b9aee2..51723be 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocument.java
@@ -91,6 +91,7 @@ public class BratDocument {
     while ((ann = annStream.read()) != null) {
       annotations.add(ann);
     }
+    annStream.close();
 
     return new BratDocument(config, id, text.toString(), annotations);
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/e9728694/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
new file mode 100644
index 0000000..24ba887
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratDocumentParser.java
@@ -0,0 +1,149 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.brat;
+
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+
+import opennlp.tools.namefind.NameSample;
+import opennlp.tools.sentdetect.SentenceDetector;
+import opennlp.tools.tokenize.Tokenizer;
+import opennlp.tools.util.Span;
+
+public class BratDocumentParser {
+
+  private SentenceDetector sentDetector;
+  private Tokenizer tokenizer;
+
+  public BratDocumentParser(SentenceDetector sentenceDetector, Tokenizer tokenizer) {
+    this.sentDetector = sentenceDetector;
+    this.tokenizer = tokenizer;
+  }
+
+  public List<NameSample> parse(BratDocument sample) {
+    // Note: Some entities might not match sentence boundaries.
+    // To be able to print a warning, a set of entity ids is maintained and
+    // checked after the matching is done to see if all entities have been used up.
+
+    Set<String> entityIdSet = new HashSet<>();
+    Map<Integer, Span> coveredIndexes = new HashMap<>();
+
+    for (BratAnnotation ann : sample.getAnnotations()) {
+      if (ann instanceof SpanAnnotation) {
+        entityIdSet.add(ann.getId());
+
+        Span span = ((SpanAnnotation) ann).getSpan();
+        for (int i = span.getStart(); i < span.getEnd(); i++) {
+          coveredIndexes.put(i, span);
+        }
+      }
+    }
+
+    List<Span> sentences = new ArrayList<>();
+    for (Span sentence : sentDetector.sentPosDetect(sample.getText())) {
+      Span conflictingName = coveredIndexes.get(sentence.getStart());
+
+      if (sentences.size() > 0 && conflictingName != null &&
+          conflictingName.getStart() < sentence.getStart()) {
+        Span lastSentence = sentences.remove(sentences.size() - 1);
+        sentences.add(new Span(lastSentence.getStart(), sentence.getEnd()));
+
+        System.out.println("Correcting sentence segmentation in document " +
+            sample.getId());
+      }
+      else {
+        sentences.add(sentence);
+      }
+    }
+
+    // TODO: Token breaks should be enforced on name span boundaries
+    // a) Just split tokens
+    // b) Implement a custom token split validator which can be injected into the Tokenizer
+
+    // Currently neither is implemented; names that do not align with the tokenization are dropped below
+
+    List<NameSample> samples = new ArrayList<>(sentences.size());
+
+    for (Span sentence : sentences) {
+
+      String sentenceText = sentence.getCoveredText(
+          sample.getText()).toString();
+
+      Span[] tokens = tokenizer.tokenizePos(sentenceText);
+
+      // Note:
+      // A begin index and an end index can be identical but map to different
+      // tokens. To distinguish between the two, begin indexes are stored with
+      // a negative sign, and end indexes are stored with a positive sign
+      // in the tokenIndexMap.
+      // The tokenIndexMap maps to the sentence local token index.
+
+      Map<Integer, Integer> tokenIndexMap = new HashMap<>();
+
+      for (int i = 0; i < tokens.length; i++) {
+        tokenIndexMap.put(-(sentence.getStart() + tokens[i].getStart()), i);
+        tokenIndexMap.put(sentence.getStart() + tokens[i].getEnd(), i + 1);
+      }
+
+      List<Span> names = new ArrayList<>();
+
+      for (BratAnnotation ann : sample.getAnnotations()) {
+
+        if (ann instanceof SpanAnnotation) {
+          SpanAnnotation entity = (SpanAnnotation) ann;
+
+          Span entitySpan = entity.getSpan();
+
+          if (sentence.contains(entitySpan)) {
+            entityIdSet.remove(ann.getId());
+
+            entitySpan = entitySpan.trim(sample.getText());
+
+            Integer nameBeginIndex = tokenIndexMap.get(-entitySpan.getStart());
+            Integer nameEndIndex = tokenIndexMap.get(entitySpan.getEnd());
+
+            if (nameBeginIndex != null && nameEndIndex != null) {
+              names.add(new Span(nameBeginIndex, nameEndIndex, entity.getType()));
+            }
+            else {
+              System.err.println("Dropped entity " + entity.getId() + " ("
+                  + entitySpan.getCoveredText(sample.getText()) + ") " + " in document "
+                  + sample.getId() + ", it is not matching tokenization!");
+            }
+          }
+        }
+      }
+
+      samples.add(new NameSample(sample.getId(), Span.spansToStrings(tokens, sentenceText),
+          names.toArray(new Span[names.size()]), null, samples.size() == 0));
+    }
+
+    for (String id : entityIdSet) {
+      System.err.println("Dropped entity " + id + " in document " +
+          sample.getId() + ", is not matching sentence segmentation!");
+    }
+
+    return samples;
+  }
+}
+

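With the parsing logic extracted into BratDocumentParser, the BratDocument-to-NameSample conversion can now be reused outside of BratNameSampleStream. A minimal sketch, assuming sentModel and tokenModel are already loaded SentenceModel/TokenizerModel instances and doc is a BratDocument obtained elsewhere (the variable names are illustrative):

  BratDocumentParser parser = new BratDocumentParser(
      new SentenceDetectorME(sentModel), new TokenizerME(tokenModel));

  // one NameSample per sentence detected in the Brat document
  List<NameSample> samples = parser.parse(doc);

The name sample stream below delegates to exactly this parse(BratDocument) call.
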
http://git-wip-us.apache.org/repos/asf/opennlp/blob/e9728694/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
index 569f450..cc066ad 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
@@ -18,12 +18,7 @@
 package opennlp.tools.formats.brat;
 
 import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
 import java.util.List;
-import java.util.Map;
-import java.util.Set;
 
 import opennlp.tools.namefind.NameSample;
 import opennlp.tools.sentdetect.SentenceDetector;
@@ -33,22 +28,19 @@ import opennlp.tools.tokenize.Tokenizer;
 import opennlp.tools.tokenize.TokenizerME;
 import opennlp.tools.tokenize.TokenizerModel;
 import opennlp.tools.util.ObjectStream;
-import opennlp.tools.util.Span;
 
 /**
  * Generates Name Sample objects for a Brat Document object.
  */
 public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, NameSample> {
 
-  private SentenceDetector sentDetector;
-  private Tokenizer tokenizer;
+  private final BratDocumentParser parser;
 
   public BratNameSampleStream(SentenceDetector sentDetector,
       Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
     super(samples);
 
-    this.sentDetector = sentDetector;
-    this.tokenizer = tokenizer;
+    this.parser = new BratDocumentParser(sentDetector, tokenizer);
   }
 
   public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
@@ -56,115 +48,11 @@ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, Na
     super(samples);
 
     // TODO: We can pass in custom validators here ...
-    this.sentDetector = new SentenceDetectorME(sentModel);
-    this.tokenizer = new TokenizerME(tokenModel);
+    this.parser = new BratDocumentParser(new SentenceDetectorME(sentModel), new TokenizerME(tokenModel)); 
   }
 
   @Override
   protected List<NameSample> read(BratDocument sample) throws IOException {
-
-    // Note: Some entities might not match sentence boundaries,
-    // to be able to print warning a set of entities id must be maintained
-    // to check if all entities have been used up after the matching is done
-
-    Set<String> entityIdSet = new HashSet<>();
-    Map<Integer, Span> coveredIndexes = new HashMap<>();
-
-    for (BratAnnotation ann : sample.getAnnotations()) {
-      if (ann instanceof SpanAnnotation) {
-        entityIdSet.add(ann.getId());
-
-        Span span = ((SpanAnnotation) ann).getSpan();
-        for (int i = span.getStart(); i < span.getEnd(); i++) {
-          coveredIndexes.put(i, span);
-        }
-      }
-    }
-
-    List<Span> sentences = new ArrayList<>();
-    for (Span sentence : sentDetector.sentPosDetect(sample.getText())) {
-      Span conflictingName = coveredIndexes.get(sentence.getStart());
-
-      if (sentences.size() > 0 && conflictingName != null &&
-          conflictingName.getStart() < sentence.getStart()) {
-        Span lastSentence = sentences.remove(sentences.size() - 1);
-        sentences.add(new Span(lastSentence.getStart(), sentence.getEnd()));
-
-        System.out.println("Correcting sentence segmentation in document " +
-            sample.getId());
-      }
-      else {
-        sentences.add(sentence);
-      }
-    }
-
-    // TODO: Token breaks should be enforced on name span boundaries
-    // a) Just split tokens
-    // b) Implement a custom token split validator which can be injected into the Tokenizer
-
-    // Currently we are missing all
-
-    List<NameSample> samples = new ArrayList<>(sentences.size());
-
-    for (Span sentence : sentences) {
-
-      String sentenceText = sentence.getCoveredText(
-          sample.getText()).toString();
-
-      Span[] tokens = tokenizer.tokenizePos(sentenceText);
-
-      // Note:
-      // A begin and end token index can be identical, but map to different
-      // tokens, to distinguish between between the two begin indexes are
-      // stored with a negative sign, and end indexes are stored with a positive sign
-      // in the tokenIndexMap.
-      // The tokenIndexMap maps to the sentence local token index.
-
-      Map<Integer, Integer> tokenIndexMap = new HashMap<>();
-
-      for (int i = 0; i < tokens.length; i++) {
-        tokenIndexMap.put(-(sentence.getStart() + tokens[i].getStart()), i);
-        tokenIndexMap.put(sentence.getStart() + tokens[i].getEnd(), i + 1);
-      }
-
-      List<Span> names = new ArrayList<>();
-
-      for (BratAnnotation ann : sample.getAnnotations()) {
-
-        if (ann instanceof SpanAnnotation) {
-          SpanAnnotation entity = (SpanAnnotation) ann;
-
-          Span entitySpan = entity.getSpan();
-
-          if (sentence.contains(entitySpan)) {
-            entityIdSet.remove(ann.getId());
-
-            entitySpan = entitySpan.trim(sample.getText());
-
-            Integer nameBeginIndex = tokenIndexMap.get(-entitySpan.getStart());
-            Integer nameEndIndex = tokenIndexMap.get(entitySpan.getEnd());
-
-            if (nameBeginIndex != null && nameEndIndex != null) {
-              names.add(new Span(nameBeginIndex, nameEndIndex, entity.getType()));
-            }
-            else {
-              System.err.println("Dropped entity " + entity.getId() + " ("
-                  + entitySpan.getCoveredText(sample.getText()) + ") " + " in document "
-                  + sample.getId() + ", it is not matching tokenization!");
-            }
-          }
-        }
-      }
-
-      samples.add(new NameSample(sample.getId(), Span.spansToStrings(tokens, sentenceText),
-          names.toArray(new Span[names.size()]), null, samples.size() == 0));
-    }
-
-    for (String id : entityIdSet) {
-      System.err.println("Dropped entity " + id + " in document " +
-          sample.getId() + ", is not matching sentence segmentation!");
-    }
-
-    return samples;
+    return parser.parse(sample);
   }
 }


[13/21] opennlp git commit: OPENNLP-1077 Made the constructors for the BratNameSampleStream public.

Posted by jo...@apache.org.
OPENNLP-1077 Made the constructors for the BratNameSampleStream public.

Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/1aa54328
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/1aa54328
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/1aa54328

Branch: refs/heads/LangDetect
Commit: 1aa543286acbcb3c23aebaba003fa024a54aabb2
Parents: 6f80a89
Author: Daniel Russ <dr...@mail.nih.gov>
Authored: Wed May 24 15:47:12 2017 -0400
Committer: Daniel Russ <dr...@mail.nih.gov>
Committed: Wed May 24 15:47:12 2017 -0400

----------------------------------------------------------------------
 .../java/opennlp/tools/formats/brat/BratNameSampleStream.java    | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/1aa54328/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
index 5a96d2d..569f450 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/brat/BratNameSampleStream.java
@@ -43,7 +43,7 @@ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, Na
   private SentenceDetector sentDetector;
   private Tokenizer tokenizer;
 
-  protected BratNameSampleStream(SentenceDetector sentDetector,
+  public BratNameSampleStream(SentenceDetector sentDetector,
       Tokenizer tokenizer, ObjectStream<BratDocument> samples) {
     super(samples);
 
@@ -51,7 +51,7 @@ public class BratNameSampleStream extends SegmenterObjectStream<BratDocument, Na
     this.tokenizer = tokenizer;
   }
 
-  protected BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
+  public BratNameSampleStream(SentenceModel sentModel, TokenizerModel tokenModel,
       ObjectStream<BratDocument> samples) {
     super(samples);
 

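Since the constructors are now public, a BratNameSampleStream can be instantiated directly instead of only through the corresponding stream factory. A brief sketch, assuming loaded sentence and tokenizer models and an existing ObjectStream<BratDocument> (its construction is not shown in this commit; the variable names are illustrative):

  try (ObjectStream<NameSample> nameSamples =
           new BratNameSampleStream(sentModel, tokenModel, bratDocStream)) {
    NameSample sample;
    while ((sample = nameSamples.read()) != null) {
      // pass the sample to a name finder trainer or evaluator
    }
  }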

[09/21] opennlp git commit: OPENNLP-1074: Reduce visibility of eval methods

Posted by jo...@apache.org.
OPENNLP-1074: Reduce visibility of eval methods


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/226612f4
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/226612f4
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/226612f4

Branch: refs/heads/LangDetect
Commit: 226612f48bb40eb55ef5814ab9ee995fe9b30f71
Parents: b581c20
Author: Jörn Kottmann <jo...@apache.org>
Authored: Mon May 22 16:05:33 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Mon May 22 16:05:33 2017 +0200

----------------------------------------------------------------------
 opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISModel.java | 3 +--
 .../main/java/opennlp/tools/ml/maxent/quasinewton/QNModel.java    | 3 +--
 .../main/java/opennlp/tools/ml/naivebayes/NaiveBayesModel.java    | 3 +--
 .../main/java/opennlp/tools/ml/perceptron/PerceptronModel.java    | 3 +--
 4 files changed, 4 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/226612f4/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISModel.java b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISModel.java
index b8b830e..81b2690 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/GISModel.java
@@ -152,8 +152,7 @@ public final class GISModel extends AbstractModel {
    *         string representation of the outcomes can be obtained from the
    *         method getOutcome(int i).
    */
-  @Deprecated // visibility will be reduced in 1.8.1
-  public static double[] eval(int[] context, float[] values, double[] prior,
+  static double[] eval(int[] context, float[] values, double[] prior,
       EvalParameters model) {
     Context[] params = model.getParams();
     int[] numfeats = new int[model.getNumOutcomes()];

http://git-wip-us.apache.org/repos/asf/opennlp/blob/226612f4/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNModel.java b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNModel.java
index f02ee75..d73d6cc 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/maxent/quasinewton/QNModel.java
@@ -103,8 +103,7 @@ public class QNModel extends AbstractModel {
    *          Model parameters
    * @return Normalized probabilities for the outcomes given the context.
    */
-  @Deprecated // visibility will be reduced in 1.8.1
-  public static double[] eval(int[] context, float[] values, double[] probs,
+  static double[] eval(int[] context, float[] values, double[] probs,
       int nOutcomes, int nPredLabels, double[] parameters) {
 
     for (int i = 0; i < context.length; i++) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/226612f4/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesModel.java b/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesModel.java
index 0a28704..98c2735 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/naivebayes/NaiveBayesModel.java
@@ -88,8 +88,7 @@ public class NaiveBayesModel extends AbstractModel {
     return eval(context, null, prior, model, true);
   }
 
-  @Deprecated // visibility will be reduced in 1.8.1
-  public static double[] eval(int[] context, float[] values, double[] prior,
+  static double[] eval(int[] context, float[] values, double[] prior,
                               EvalParameters model, boolean normalize) {
     Probabilities<Integer> probabilities = new LogProbabilities<>();
     Context[] params = model.getParams();

http://git-wip-us.apache.org/repos/asf/opennlp/blob/226612f4/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronModel.java b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronModel.java
index d66b553..39c4891 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronModel.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/ml/perceptron/PerceptronModel.java
@@ -66,8 +66,7 @@ public class PerceptronModel extends AbstractModel {
     return eval(context,null,prior,model,true);
   }
 
-  @Deprecated // visibility will be reduced in 1.8.1
-  public static double[] eval(int[] context, float[] values, double[] prior, EvalParameters model,
+  static double[] eval(int[] context, float[] values, double[] prior, EvalParameters model,
                               boolean normalize) {
     Context[] params = model.getParams();
     double[] activeParameters;


[11/21] opennlp git commit: OPENNLP-1075 Add streams for sentence and token samples for conllu

Posted by jo...@apache.org.
OPENNLP-1075 Add streams for sentence and token samples for conllu


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/5bf5366e
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/5bf5366e
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/5bf5366e

Branch: refs/heads/LangDetect
Commit: 5bf5366e2d5eca700d33d5882b65a5795cb3d656
Parents: d378c06
Author: Jörn Kottmann <jo...@apache.org>
Authored: Tue May 23 17:28:33 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed May 24 16:29:51 2017 +0200

----------------------------------------------------------------------
 .../tools/cmdline/StreamFactoryRegistry.java    |  4 ++
 .../conllu/ConlluLemmaSampleStreamFactory.java  |  5 +-
 .../tools/formats/conllu/ConlluSentence.java    | 15 +++-
 .../conllu/ConlluSentenceSampleStream.java      | 59 +++++++++++++++
 .../ConlluSentenceSampleStreamFactory.java      | 65 +++++++++++++++++
 .../tools/formats/conllu/ConlluStream.java      | 30 +++++++-
 .../formats/conllu/ConlluTokenSampleStream.java | 75 ++++++++++++++++++++
 .../conllu/ConlluTokenSampleStreamFactory.java  | 61 ++++++++++++++++
 .../conllu/ConlluSentenceSampleStreamTest.java  | 69 ++++++++++++++++++
 .../tools/formats/conllu/ConlluStreamTest.java  | 56 +++++++++++++++
 .../conllu/ConlluTokenSampleStreamTest.java     | 53 ++++++++++++++
 .../formats/conllu/ConlluWordLineTest.java      |  4 +-
 .../formats/conllu/de-ud-train-sample.conllu    | 30 ++++++++
 13 files changed, 517 insertions(+), 9 deletions(-)
----------------------------------------------------------------------

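The new tests further down exercise these streams; as a quick orientation, a sketch of wiring them up against a CoNLL-U file (the file name and the sentences-per-sample count are illustrative):

  InputStreamFactory in =
      new MarkableFileInputStreamFactory(new File("de-ud-train.conllu"));

  // sentence detector training material, 5 CoNLL-U sentences per SentenceSample
  try (ObjectStream<SentenceSample> sentenceSamples =
           new ConlluSentenceSampleStream(new ConlluStream(in), 5)) {
    SentenceSample sample;
    while ((sample = sentenceSamples.read()) != null) {
      // feed into sentence detector training or evaluation
    }
  }

  // tokenizer training material, one TokenSample per CoNLL-U sentence
  try (ObjectStream<TokenSample> tokenSamples =
           new ConlluTokenSampleStream(new ConlluStream(in))) {
    TokenSample sample;
    while ((sample = tokenSamples.read()) != null) {
      // feed into tokenizer training or evaluation
    }
  }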

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 9977519..2cff212 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -44,6 +44,8 @@ import opennlp.tools.formats.ad.ADTokenSampleStreamFactory;
 import opennlp.tools.formats.brat.BratNameSampleStreamFactory;
 import opennlp.tools.formats.conllu.ConlluLemmaSampleStreamFactory;
 import opennlp.tools.formats.conllu.ConlluPOSSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluSentenceSampleStreamFactory;
+import opennlp.tools.formats.conllu.ConlluTokenSampleStreamFactory;
 import opennlp.tools.formats.convert.NameToSentenceSampleStreamFactory;
 import opennlp.tools.formats.convert.NameToTokenSampleStreamFactory;
 import opennlp.tools.formats.convert.POSToSentenceSampleStreamFactory;
@@ -113,6 +115,8 @@ public final class StreamFactoryRegistry {
     LetsmtSentenceStreamFactory.registerFactory();
     MosesSentenceSampleStreamFactory.registerFactory();
 
+    ConlluTokenSampleStreamFactory.registerFactory();
+    ConlluSentenceSampleStreamFactory.registerFactory();
     ConlluPOSSampleStreamFactory.registerFactory();
     ConlluLemmaSampleStreamFactory.registerFactory();
   }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
index 4806967..3204d7e 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluLemmaSampleStreamFactory.java
@@ -34,8 +34,6 @@ import opennlp.tools.util.ObjectStream;
  */
 public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<LemmaSample> {
 
-  public static final String CONLLU_FORMAT = "conllu";
-
   interface Parameters extends BasicFormatParams {
     @ArgumentParser.ParameterDescription(valueName = "tagset",
         description = "u|x u for unified tags and x for language-specific part-of-speech tags")
@@ -45,7 +43,8 @@ public class ConlluLemmaSampleStreamFactory extends AbstractSampleStreamFactory<
 
   public static void registerFactory() {
     StreamFactoryRegistry.registerFactory(LemmaSample.class,
-        CONLLU_FORMAT, new ConlluLemmaSampleStreamFactory(Parameters.class));
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+        new ConlluLemmaSampleStreamFactory(Parameters.class));
   }
 
   protected <P> ConlluLemmaSampleStreamFactory(Class<P> params) {

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
index 5d92d89..bbd2b96 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentence.java
@@ -23,11 +23,24 @@ public class ConlluSentence {
 
   private List<ConlluWordLine> wordLines;
 
-  ConlluSentence(List<ConlluWordLine> wordLines) {
+  private String sentenceIdComment;
+  private String textComment;
+
+  ConlluSentence(List<ConlluWordLine> wordLines, String sentenceIdComment, String textComment) {
     this.wordLines = wordLines;
+    this.sentenceIdComment = sentenceIdComment;
+    this.textComment = textComment;
   }
 
   public List<ConlluWordLine> getWordLines() {
     return wordLines;
   }
+
+  public String getSentenceIdComment() {
+    return sentenceIdComment;
+  }
+
+  public String getTextComment() {
+    return textComment;
+  }
 }

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
new file mode 100644
index 0000000..f49e205
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStream.java
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class ConlluSentenceSampleStream extends FilterObjectStream<ConlluSentence, SentenceSample> {
+
+  private final int sentencesPerSample;
+
+  public ConlluSentenceSampleStream(ObjectStream<ConlluSentence> samples, int sentencesPerSample) {
+    super(samples);
+    this.sentencesPerSample = sentencesPerSample;
+  }
+
+  @Override
+  public SentenceSample read() throws IOException {
+    StringBuilder documentText = new StringBuilder();
+
+    List<Span> sentenceSpans = new ArrayList<>();
+
+    ConlluSentence sentence;
+    for (int i = 0; i <  sentencesPerSample && (sentence = samples.read()) != null; i++) {
+
+      int startIndex = documentText.length();
+      documentText.append(sentence.getTextComment()).append(' ');
+      sentenceSpans.add(new Span(startIndex, documentText.length() - 1));
+    }
+
+    if (documentText.length() > 0) {
+      documentText.setLength(documentText.length() - 1);
+      return new SentenceSample(documentText, sentenceSpans.toArray(new Span[sentenceSpans.size()]));
+    }
+
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
new file mode 100644
index 0000000..000af27
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamFactory.java
@@ -0,0 +1,65 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluSentenceSampleStreamFactory extends AbstractSampleStreamFactory<SentenceSample> {
+
+  interface Parameters extends BasicFormatParams {
+    @ArgumentParser.ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(SentenceSample.class,
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+        new ConlluSentenceSampleStreamFactory(ConlluSentenceSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> ConlluSentenceSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  @Override
+  public ObjectStream<SentenceSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    InputStreamFactory inFactory =
+        CmdLineUtil.createInputStreamFactory(params.getData());
+
+    try {
+      return new ConlluSentenceSampleStream(new ConlluStream(inFactory),
+          Integer.parseInt(params.getSentencesPerSample()));
+    } catch (IOException e) {
+      // handleCreateObjectStreamError always throws an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
index 873a9ed..cbac450 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluStream.java
@@ -49,15 +49,39 @@ public class ConlluStream implements ObjectStream<ConlluSentence> {
 
       BufferedReader reader = new BufferedReader(new StringReader(sentence));
 
+      String sentenceId = null;
+      String text = null;
+
       String line;
       while ((line = reader.readLine())  != null) {
-        // # indicates a comment line and should be skipped
-        if (!line.trim().startsWith("#")) {
+        // # indicates a comment line and contains additional data
+        if (line.trim().startsWith("#")) {
+          String commentLine = line.trim().substring(1);
+
+          int separator = commentLine.indexOf('=');
+
+          if (separator != -1) {
+            String firstPart = commentLine.substring(0, separator).trim();
+            String secondPart = commentLine.substring(separator + 1, commentLine.length()).trim();
+
+            if (!secondPart.isEmpty()) {
+              switch (firstPart) {
+                case "sent_id":
+                  sentenceId = secondPart;
+                  break;
+                case "text":
+                  text = secondPart;
+                  break;
+              }
+            }
+          }
+        }
+        else {
           wordLines.add(new ConlluWordLine(line));
         }
       }
 
-      return new ConlluSentence(wordLines);
+      return new ConlluSentence(wordLines, sentenceId, text);
     }
 
     return null;

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
new file mode 100644
index 0000000..a9ad937
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStream.java
@@ -0,0 +1,75 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.StringUtil;
+
+public class ConlluTokenSampleStream extends FilterObjectStream<ConlluSentence, TokenSample> {
+
+  public ConlluTokenSampleStream(ObjectStream<ConlluSentence> samples) {
+    super(samples);
+  }
+
+  @Override
+  public TokenSample read() throws IOException {
+    ConlluSentence sentence = samples.read();
+    if (sentence != null) {
+      if (sentence.getTextComment() != null) {
+        StringBuilder text = new StringBuilder(sentence.getTextComment());
+        int searchIndex = 0;
+
+        for (ConlluWordLine wordLine : sentence.getWordLines()) {
+
+          // skip over inserted words which are not in the source text
+          if (wordLine.getId().contains(".")) {
+            continue;
+          }
+
+          String token = wordLine.getForm();
+          int tokenIndex = text.indexOf(token, searchIndex);
+
+          if (tokenIndex == -1) {
+            throw new IOException(String.format("Failed to match token [%s] in sentence [%s] with text [%s]",
+                token, sentence.getSentenceIdComment(), text));
+          }
+
+          int charAfterTokenIndex = tokenIndex + token.length();
+          if (charAfterTokenIndex < text.length()) {
+            if (!StringUtil.isWhitespace(text.charAt(charAfterTokenIndex))) {
+              text.insert(charAfterTokenIndex,
+                  TokenSample.DEFAULT_SEPARATOR_CHARS);
+              searchIndex += TokenSample.DEFAULT_SEPARATOR_CHARS.length();
+            }
+
+            searchIndex += token.length();
+          }
+        }
+        return TokenSample.parse(text.toString(), TokenSample.DEFAULT_SEPARATOR_CHARS);
+      }
+      else {
+        throw new IOException("Sentence is missing raw text sample!");
+      }
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
new file mode 100644
index 0000000..5db0407
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamFactory.java
@@ -0,0 +1,61 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluTokenSampleStreamFactory extends AbstractSampleStreamFactory<TokenSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(TokenSample.class,
+        ConlluPOSSampleStreamFactory.CONLLU_FORMAT,
+        new ConlluTokenSampleStreamFactory(ConlluTokenSampleStreamFactory.Parameters.class));
+  }
+
+  protected <P> ConlluTokenSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  @Override
+  public ObjectStream<TokenSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    InputStreamFactory inFactory =
+        CmdLineUtil.createInputStreamFactory(params.getData());
+
+    try {
+      return new ConlluTokenSampleStream(new ConlluStream(inFactory));
+    } catch (IOException e) {
+      // handleCreateObjectStreamError always throws an exception
+      CmdLineUtil.handleCreateObjectStreamError(e);
+    }
+    return null;
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
new file mode 100644
index 0000000..d45d38f
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluSentenceSampleStreamTest.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.sentdetect.SentenceSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.Span;
+
+public class ConlluSentenceSampleStreamTest {
+
+  @Test
+  public void testParseTwoSentences() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu");
+
+    try (ObjectStream<SentenceSample> stream =
+             new ConlluSentenceSampleStream(new ConlluStream(streamFactory), 1)) {
+
+      SentenceSample sample1 = stream.read();
+
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team.",
+          sample1.getDocument());
+
+      Assert.assertEquals(new Span(0, 65), sample1.getSentences()[0]);
+
+      SentenceSample sample2 = stream.read();
+
+      Assert.assertEquals("Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch " +
+          "endlich keine Rückenschmerzen mehr.", sample2.getDocument());
+      Assert.assertEquals(new Span(0, 95), sample2.getSentences()[0]);
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+
+    try (ObjectStream<SentenceSample> stream =
+             new ConlluSentenceSampleStream(new ConlluStream(streamFactory), 3)) {
+      SentenceSample sample = stream.read();
+
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team."
+           + " Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine "
+           + "Rückenschmerzen mehr.",
+          sample.getDocument());
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
new file mode 100644
index 0000000..63968a1
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluStreamTest.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluStreamTest {
+
+  @Test
+  public void testParseTwoSentences() throws IOException {
+
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu");
+
+    try (ObjectStream<ConlluSentence> stream = new ConlluStream(streamFactory)) {
+      ConlluSentence sent1 = stream.read();
+
+      Assert.assertEquals("train-s21", sent1.getSentenceIdComment());
+      Assert.assertEquals("Fachlich kompetent, sehr gute Beratung und ein freundliches Team.",
+          sent1.getTextComment());
+      Assert.assertEquals(11, sent1.getWordLines().size());
+
+      ConlluSentence sent2 = stream.read();
+
+      Assert.assertEquals("train-s22", sent2.getSentenceIdComment());
+      Assert.assertEquals(
+          "Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine Rückenschmerzen mehr.",
+          sent2.getTextComment());
+      Assert.assertEquals(14, sent2.getWordLines().size());
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
new file mode 100644
index 0000000..62cb9a6
--- /dev/null
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluTokenSampleStreamTest.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.conllu;
+
+import java.io.IOException;
+
+import org.junit.Assert;
+import org.junit.Test;
+
+import opennlp.tools.formats.ResourceAsStreamFactory;
+import opennlp.tools.tokenize.TokenSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+
+public class ConlluTokenSampleStreamTest {
+
+  @Test
+  public void testParseTwoSentences() throws IOException {
+    InputStreamFactory streamFactory =
+        new ResourceAsStreamFactory(ConlluStreamTest.class, "de-ud-train-sample.conllu");
+
+    try (ObjectStream<TokenSample> stream = new ConlluTokenSampleStream(new ConlluStream(streamFactory))) {
+
+      TokenSample expected1 = TokenSample.parse(
+          "Fachlich kompetent" + TokenSample.DEFAULT_SEPARATOR_CHARS
+          + ", sehr gute Beratung und ein freundliches Team" + TokenSample.DEFAULT_SEPARATOR_CHARS
+          + ".", TokenSample.DEFAULT_SEPARATOR_CHARS);
+      Assert.assertEquals(expected1, stream.read());
+
+      TokenSample expected2 = TokenSample.parse("Beiden Zahnärzten verdanke ich einen " +
+          "neuen Biss und dadurch endlich keine Rückenschmerzen mehr"
+          + TokenSample.DEFAULT_SEPARATOR_CHARS + ".", TokenSample.DEFAULT_SEPARATOR_CHARS);
+      Assert.assertEquals(expected2, stream.read());
+
+      Assert.assertNull("Stream must be exhausted", stream.read());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
index 4676f6f..005ec55 100644
--- a/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
+++ b/opennlp-tools/src/test/java/opennlp/tools/formats/conllu/ConlluWordLineTest.java
@@ -27,10 +27,10 @@ public class ConlluWordLineTest {
   @Test
   public void testParseLine() throws InvalidFormatException {
     ConlluWordLine line = new ConlluWordLine(
-        "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
+        "12\tHänden\tHand\tNOUN\tNN\tCase=Dat|Number=Plur\t5\tnmod\t_\t_");
 
     Assert.assertEquals("12", line.getId());
-    Assert.assertEquals("Händen", line.getForm());
+    Assert.assertEquals("Händen", line.getForm());
     Assert.assertEquals("Hand", line.getLemma());
     Assert.assertEquals("NOUN", line.getPosTag(ConlluTagset.U));
     Assert.assertEquals("NN", line.getPosTag(ConlluTagset.X));

http://git-wip-us.apache.org/repos/asf/opennlp/blob/5bf5366e/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
new file mode 100644
index 0000000..13c19da
--- /dev/null
+++ b/opennlp-tools/src/test/resources/opennlp/tools/formats/conllu/de-ud-train-sample.conllu
@@ -0,0 +1,30 @@
+# sent_id = train-s21
+# text = Fachlich kompetent, sehr gute Beratung und ein freundliches Team.
+1	Fachlich	fachlich	ADV	ADJD	_	2	advmod	_	_
+2	kompetent	kompetent	ADJ	ADJD	Degree=Pos	0	root	_	SpaceAfter=No
+3	,	,	PUNCT	$,	_	2	punct	_	_
+4	sehr	sehr	ADV	ADV	_	5	advmod	_	_
+5	gute	gut	ADJ	ADJA	Degree=Pos	6	amod	_	_
+6	Beratung	Beratung	NOUN	NN	_	2	parataxis	_	_
+7	und	und	CCONJ	KON	_	10	cc	_	_
+8	ein	ein	DET	ART	Definite=Ind|PronType=Art	10	det	_	_
+9	freundliches	freundlich	ADJ	ADJA	Degree=Pos	10	amod	_	_
+10	Team	Team	NOUN	NN	_	6	conj	_	SpaceAfter=No
+11	.	.	PUNCT	$.	_	2	punct	_	_
+
+# sent_id = train-s22
+# text = Beiden Zahnärzten verdanke ich einen neuen Biss und dadurch endlich keine Rückenschmerzen mehr.
+1	Beiden	beide	PRON	PIAT	Case=Dat|Number=Plur|NumType=Card|PronType=Tot	2	det	_	_
+2	Zahnärzten	Zahnarzt	NOUN	NN	Case=Dat|Number=Plur	3	iobj	_	_
+3	verdanke	verdanken	VERB	VVFIN	Number=Sing|Person=1|VerbForm=Fin	0	root	_	_
+4	ich	ich	PRON	PPER	Case=Nom|Number=Sing|Person=1|PronType=Prs	3	nsubj	_	_
+5	einen	ein	DET	ART	Case=Acc|Definite=Ind|Number=Plur|PronType=Art	7	det	_	_
+6	neuen	neu	ADJ	ADJA	Case=Acc|Degree=Pos|Number=Plur	7	amod	_	_
+7	Biss	Biß	NOUN	NN	Case=Acc|Number=Plur	3	obj	_	_
+8	und	und	CCONJ	KON	_	12	cc	_	_
+9	dadurch	dadurch	ADV	PAV	_	7	advmod	_	_
+10	endlich	endlich	ADV	ADV	_	12	advmod	_	_
+11	keine	kein	PRON	PIAT	PronType=Neg	12	advmod	_	_
+12	Rückenschmerzen	Rückenschmerz	NOUN	NN	_	7	conj	_	_
+13	mehr	mehr	ADV	ADV	_	12	advmod	_	SpaceAfter=No
+14	.	.	PUNCT	$.	_	3	punct	_	_
\ No newline at end of file


[21/21] opennlp git commit: OPENNLP-788: Add LanguageDetector tool

Posted by jo...@apache.org.
OPENNLP-788: Add LanguageDetector tool


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/a9853284
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/a9853284
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/a9853284

Branch: refs/heads/LangDetect
Commit: a985328464c130bf516d19eace49a1b8e3095022
Parents: 15ac7bd
Author: William D C M SILVA <co...@apache.org>
Authored: Wed May 17 13:34:21 2017 -0300
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Tue Jun 6 12:07:05 2017 +0200

----------------------------------------------------------------------
 NOTICE                                          |   7 +
 .../main/java/opennlp/tools/cmdline/CLI.java    |  12 +
 .../cmdline/FineGrainedReportListener.java      |  13 +-
 .../tools/cmdline/StreamFactoryRegistry.java    |   4 +
 .../LanguageDetectorConverterTool.java          |  28 ++
 .../LanguageDetectorCrossValidatorTool.java     | 123 ++++++++
 ...LanguageDetectorEvaluationErrorListener.java |  54 ++++
 .../LanguageDetectorEvaluatorTool.java          | 139 +++++++++
 ...nguageDetectorFineGrainedReportListener.java |  70 +++++
 .../langdetect/LanguageDetectorModelLoader.java |  42 +++
 .../langdetect/LanguageDetectorTool.java        |  88 ++++++
 .../langdetect/LanguageDetectorTrainerTool.java |  83 ++++++
 .../cmdline/langdetect/TrainingParams.java      |  40 +++
 .../LanguageDetectorSampleStreamFactory.java    |  66 +++++
 .../formats/LeipzigDoccatSampleStream.java      |   5 +-
 .../LeipzigDocumentSampleStreamFactory.java     |   3 +
 .../leipzig/LeipzigLanguageSampleStream.java    | 136 +++++++++
 .../LeipzigLanguageSampleStreamFactory.java     |  74 +++++
 .../java/opennlp/tools/langdetect/Language.java |  73 +++++
 .../tools/langdetect/LanguageDetector.java      |  31 ++
 .../LanguageDetectorContextGenerator.java       |  80 +++++
 .../LanguageDetectorCrossValidator.java         | 107 +++++++
 .../LanguageDetectorEvaluationMonitor.java      |  28 ++
 .../langdetect/LanguageDetectorEvaluator.java   |  99 +++++++
 .../langdetect/LanguageDetectorEventStream.java |  69 +++++
 .../langdetect/LanguageDetectorFactory.java     |  53 ++++
 .../tools/langdetect/LanguageDetectorME.java    |  97 ++++++
 .../tools/langdetect/LanguageDetectorModel.java |  82 +++++
 .../LanguageDetectorSampleStream.java           |  55 ++++
 .../tools/langdetect/LanguageSample.java        |  68 +++++
 .../AggregateCharSequenceNormalizer.java        |  39 +++
 .../util/normalizer/CharSequenceNormalizer.java |  23 ++
 .../normalizer/EmojiCharSequenceNormalizer.java |  38 +++
 .../NumberCharSequenceNormalizer.java           |  36 +++
 .../ShrinkCharSequenceNormalizer.java           |  40 +++
 .../TwitterCharSequenceNormalizer.java          |  50 ++++
 .../UnicodeCharSequenceNormalizer.java          | 297 +++++++++++++++++++
 .../normalizer/UrlCharSequenceNormalizer.java   |  40 +++
 .../normalizer/unicode_normalizer.properties    | 154 ++++++++++
 .../opennlp/tools/langdetect/DummyFactory.java  |  33 +++
 .../LanguageDetectorContextGeneratorTest.java   |  43 +++
 .../LanguageDetectorCrossValidatorTest.java     |  64 ++++
 .../LanguageDetectorEvaluatorTest.java          |  68 +++++
 .../langdetect/LanguageDetectorFactoryTest.java |  75 +++++
 .../langdetect/LanguageDetectorMETest.java      | 114 +++++++
 .../tools/langdetect/LanguageSampleTest.java    |  89 ++++++
 .../opennlp/tools/langdetect/LanguageTest.java  | 101 +++++++
 .../EmojiCharSequenceNormalizerTest.java        |  43 +++
 .../NumberCharSequenceNormalizerTest.java       |  32 ++
 .../ShrinkCharSequenceNormalizerTest.java       |  41 +++
 .../TwitterCharSequenceNormalizerTest.java      |  62 ++++
 .../UnicodeCharSequenceNormalizerTest.java      | 263 ++++++++++++++++
 .../UrlCharSequenceNormalizerTest.java          |  47 +++
 53 files changed, 3618 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
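
For readers skimming this commit, here is a rough sketch of how the new classes fit together: train a LanguageDetectorModel from a LanguageDetectorSampleStream, then predict with LanguageDetectorME. The training file name, its one-sample-per-line layout, and the null factory name are assumptions for illustration only, not part of this commit:

    import java.io.File;
    import java.nio.charset.StandardCharsets;

    import opennlp.tools.langdetect.Language;
    import opennlp.tools.langdetect.LanguageDetectorFactory;
    import opennlp.tools.langdetect.LanguageDetectorME;
    import opennlp.tools.langdetect.LanguageDetectorModel;
    import opennlp.tools.langdetect.LanguageDetectorSampleStream;
    import opennlp.tools.langdetect.LanguageSample;
    import opennlp.tools.util.MarkableFileInputStreamFactory;
    import opennlp.tools.util.ObjectStream;
    import opennlp.tools.util.PlainTextByLineStream;
    import opennlp.tools.util.model.ModelUtil;

    public class LangDetectUsageSketch {

      public static void main(String[] args) throws Exception {
        // Hypothetical training file; each line is assumed to hold a language label
        // followed by the document text, as read by LanguageDetectorSampleStream.
        ObjectStream<String> lines = new PlainTextByLineStream(
            new MarkableFileInputStreamFactory(new File("langdetect.train")),
            StandardCharsets.UTF_8);

        ObjectStream<LanguageSample> samples = new LanguageDetectorSampleStream(lines);
        LanguageDetectorModel model;
        try {
          // A null factory name is expected to fall back to the default factory,
          // mirroring how the trainer tool passes its optional -factory argument.
          model = LanguageDetectorME.train(samples,
              ModelUtil.createDefaultTrainingParameters(),
              LanguageDetectorFactory.create(null));
        } finally {
          samples.close();
        }

        // Predict the most likely language of a document, as LanguageDetectorTool
        // does for each paragraph read from stdin.
        Language best = new LanguageDetectorME(model).predictLanguage("Bonjour tout le monde");
        System.out.println(best.getLang() + " (" + best.getConfidence() + ")");
      }
    }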


http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/NOTICE
----------------------------------------------------------------------
diff --git a/NOTICE b/NOTICE
index c0b8394..36d90e2 100644
--- a/NOTICE
+++ b/NOTICE
@@ -10,3 +10,10 @@ opennlp-tools/src/main/java/opennlp/tools/stemmer/snowball
 were developed by Martin Porter and Richard Boulton.
 The full snowball package is available from
 http://snowball.tartarus.org/
+
+
+The Language Detector normalizer in
+opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer.java
+and its tests and resources were developed by Shuyo Nakatani.
+The full Language Detector package is available from
+https://github.com/shuyo/language-detection

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
index b575f71..c828e26 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/CLI.java
@@ -37,6 +37,11 @@ import opennlp.tools.cmdline.doccat.DoccatEvaluatorTool;
 import opennlp.tools.cmdline.doccat.DoccatTool;
 import opennlp.tools.cmdline.doccat.DoccatTrainerTool;
 import opennlp.tools.cmdline.entitylinker.EntityLinkerTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorConverterTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorCrossValidatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorEvaluatorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTool;
+import opennlp.tools.cmdline.langdetect.LanguageDetectorTrainerTool;
 import opennlp.tools.cmdline.languagemodel.NGramLanguageModelTool;
 import opennlp.tools.cmdline.lemmatizer.LemmatizerEvaluatorTool;
 import opennlp.tools.cmdline.lemmatizer.LemmatizerMETool;
@@ -90,6 +95,13 @@ public final class CLI {
     tools.add(new DoccatCrossValidatorTool());
     tools.add(new DoccatConverterTool());
 
+    // Language Detector
+    tools.add(new LanguageDetectorTool());
+    tools.add(new LanguageDetectorTrainerTool());
+    tools.add(new LanguageDetectorConverterTool());
+    tools.add(new LanguageDetectorCrossValidatorTool());
+    tools.add(new LanguageDetectorEvaluatorTool());
+
     // Dictionary Builder
     tools.add(new DictionaryBuilderTool());
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
index 714561a..75b84aa 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/FineGrainedReportListener.java
@@ -802,8 +802,8 @@ public abstract class FineGrainedReportListener {
       }
     }
 
-    public void add(String[] text, String ref, String pred) {
-      int length = text.length;
+    public void add(int length, String ref, String pred) {
+
       averageSentenceLength.add(length);
 
       if (minimalSentenceLength > length) {
@@ -820,7 +820,16 @@ public abstract class FineGrainedReportListener {
       updateTagFMeasure(refs, preds);
 
       commit("", ref, pred);
+    }
+
+    public void add(String[] text, String ref, String pred) {
+      int length = text.length;
+      this.add(length, ref, pred);
+    }
 
+    public void add(CharSequence text, String ref, String pred) {
+      int length = text.length();
+      this.add(length, ref, pred);
     }
 
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
index 3d68945..48b8025 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/StreamFactoryRegistry.java
@@ -29,6 +29,7 @@ import opennlp.tools.formats.ConllXSentenceSampleStreamFactory;
 import opennlp.tools.formats.ConllXTokenSampleStreamFactory;
 import opennlp.tools.formats.DocumentSampleStreamFactory;
 import opennlp.tools.formats.EvalitaNameSampleStreamFactory;
+import opennlp.tools.formats.LanguageDetectorSampleStreamFactory;
 import opennlp.tools.formats.LeipzigDocumentSampleStreamFactory;
 import opennlp.tools.formats.LemmatizerSampleStreamFactory;
 import opennlp.tools.formats.NameSampleDataStreamFactory;
@@ -56,6 +57,7 @@ import opennlp.tools.formats.convert.ParseToTokenSampleStreamFactory;
 import opennlp.tools.formats.frenchtreebank.ConstitParseSampleStreamFactory;
 import opennlp.tools.formats.irishsentencebank.IrishSentenceBankSentenceStreamFactory;
 import opennlp.tools.formats.irishsentencebank.IrishSentenceBankTokenSampleStreamFactory;
+import opennlp.tools.formats.leipzig.LeipzigLanguageSampleStreamFactory;
 import opennlp.tools.formats.letsmt.LetsmtSentenceStreamFactory;
 import opennlp.tools.formats.moses.MosesSentenceSampleStreamFactory;
 import opennlp.tools.formats.muc.Muc6NameSampleStreamFactory;
@@ -79,6 +81,7 @@ public final class StreamFactoryRegistry {
     TokenSampleStreamFactory.registerFactory();
     WordTagSampleStreamFactory.registerFactory();
     LemmatizerSampleStreamFactory.registerFactory();
+    LanguageDetectorSampleStreamFactory.registerFactory();
 
     NameToSentenceSampleStreamFactory.registerFactory();
     NameToTokenSampleStreamFactory.registerFactory();
@@ -124,6 +127,7 @@ public final class StreamFactoryRegistry {
 
     IrishSentenceBankSentenceStreamFactory.registerFactory();
     IrishSentenceBankTokenSampleStreamFactory.registerFactory();
+    LeipzigLanguageSampleStreamFactory.registerFactory();
   }
 
   public static final String DEFAULT_FORMAT = "opennlp";

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
new file mode 100644
index 0000000..69d9db7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorConverterTool.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.AbstractConverterTool;
+import opennlp.tools.langdetect.LanguageSample;
+
+public class LanguageDetectorConverterTool extends AbstractConverterTool<LanguageSample> {
+
+  public LanguageDetectorConverterTool() {
+    super(LanguageSample.class);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
new file mode 100644
index 0000000..bf68fbb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorCrossValidatorTool.java
@@ -0,0 +1,123 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractCrossValidatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.CVParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorCrossValidator;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+import opennlp.tools.util.model.ModelUtil;
+
+public final class LanguageDetectorCrossValidatorTool extends
+    AbstractCrossValidatorTool<LanguageSample,
+        LanguageDetectorCrossValidatorTool.CVToolParams> {
+
+  interface CVToolParams extends CVParams, TrainingParams, FineGrainedEvaluatorParams {
+  }
+
+  public LanguageDetectorCrossValidatorTool() {
+    super(LanguageSample.class, CVToolParams.class);
+  }
+
+  public String getShortDescription() {
+    return "K-fold cross validator for the learnable Language Detector";
+  }
+
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new LanguageDetectorEvaluationErrorListener());
+    }
+
+    LanguageDetectorFineGrainedReportListener reportListener = null;
+    File reportFile = params.getReportOutputFile();
+    OutputStream reportOutputStream = null;
+    if (reportFile != null) {
+      CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+      try {
+        reportOutputStream = new FileOutputStream(reportFile);
+        reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+        listeners.add(reportListener);
+      } catch (FileNotFoundException e) {
+        throw createTerminationIOException(e);
+      }
+    }
+
+    LanguageDetectorEvaluationMonitor[] listenersArr = listeners
+        .toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]);
+
+    LanguageDetectorCrossValidator validator;
+    try {
+      LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory());
+      validator = new LanguageDetectorCrossValidator(mlParams,
+          factory, listenersArr);
+
+      validator.evaluate(sampleStream, params.getFolds());
+    } catch (IOException e) {
+      throw new TerminateToolException(-1,
+          "IO error while reading training data or indexing data: " + e.getMessage(), e);
+    } finally {
+      try {
+        sampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    System.out.println("done");
+
+    if (reportListener != null) {
+      System.out.println("Writing fine-grained report to "
+          + params.getReportOutputFile().getAbsolutePath());
+      reportListener.writeReport();
+
+      try {
+        // TODO: is it a problem to close the stream now?
+        reportOutputStream.close();
+      } catch (IOException e) {
+        // nothing to do
+      }
+    }
+
+    System.out.println();
+
+    System.out.println("Accuracy: " + validator.getDocumentAccuracy() + "\n" +
+        "Number of documents: " + validator.getDocumentCount());
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
new file mode 100644
index 0000000..073ef31
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluationErrorListener.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.EvaluationErrorPrinter;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * A default implementation of {@link EvaluationMonitor} that prints to an
+ * output stream.
+ *
+ */
+public class LanguageDetectorEvaluationErrorListener extends
+    EvaluationErrorPrinter<LanguageSample> implements LanguageDetectorEvaluationMonitor {
+
+  /**
+   * Creates a listener that will print to System.err
+   */
+  public LanguageDetectorEvaluationErrorListener() {
+    super(System.err);
+  }
+
+  /**
+   * Creates a listener that will print to a given {@link OutputStream}
+   */
+  public LanguageDetectorEvaluationErrorListener(OutputStream outputStream) {
+    super(outputStream);
+  }
+
+  @Override
+  public void missclassified(LanguageSample reference, LanguageSample prediction) {
+    printError(reference, prediction);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
new file mode 100644
index 0000000..fb929bf
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorEvaluatorTool.java
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.OutputStream;
+import java.util.LinkedList;
+import java.util.List;
+
+import opennlp.tools.cmdline.AbstractEvaluatorTool;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EvaluatorParams;
+import opennlp.tools.cmdline.params.FineGrainedEvaluatorParams;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageDetectorEvaluator;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+public final class LanguageDetectorEvaluatorTool extends
+    AbstractEvaluatorTool<LanguageSample, LanguageDetectorEvaluatorTool.EvalToolParams> {
+
+  interface EvalToolParams extends EvaluatorParams, FineGrainedEvaluatorParams {
+  }
+
+  public LanguageDetectorEvaluatorTool() {
+    super(LanguageSample.class, EvalToolParams.class);
+  }
+
+  public String getShortDescription() {
+    return "Measures the performance of the Language Detector model with the reference data";
+  }
+
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    LanguageDetectorModel model = new LanguageDetectorModelLoader().load(params.getModel());
+
+    List<EvaluationMonitor<LanguageSample>> listeners = new LinkedList<>();
+    if (params.getMisclassified()) {
+      listeners.add(new LanguageDetectorEvaluationErrorListener());
+    }
+
+    LanguageDetectorFineGrainedReportListener reportListener = null;
+    File reportFile = params.getReportOutputFile();
+    OutputStream reportOutputStream = null;
+    if (reportFile != null) {
+      CmdLineUtil.checkOutputFile("Report Output File", reportFile);
+      try {
+        reportOutputStream = new FileOutputStream(reportFile);
+        reportListener = new LanguageDetectorFineGrainedReportListener(reportOutputStream);
+        listeners.add(reportListener);
+      } catch (FileNotFoundException e) {
+        throw new TerminateToolException(-1,
+            "IO error while creating LanguageDetector fine-grained report file: "
+                + e.getMessage());
+      }
+    }
+
+    LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+        new LanguageDetectorME(model),
+        listeners.toArray(new LanguageDetectorEvaluationMonitor[listeners.size()]));
+
+    final PerformanceMonitor monitor = new PerformanceMonitor("doc");
+
+    ObjectStream<LanguageSample> measuredSampleStream = new ObjectStream<LanguageSample>() {
+
+      public LanguageSample read() throws IOException {
+        monitor.incrementCounter();
+        return sampleStream.read();
+      }
+
+      public void reset() throws IOException {
+        sampleStream.reset();
+      }
+
+      public void close() throws IOException {
+        sampleStream.close();
+      }
+    };
+
+    monitor.startAndPrintThroughput();
+
+    try {
+      evaluator.evaluate(measuredSampleStream);
+    } catch (IOException e) {
+      System.err.println("failed");
+      throw new TerminateToolException(-1, "IO error while reading test data: "
+          + e.getMessage(), e);
+    } finally {
+      try {
+        measuredSampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    monitor.stopAndPrintFinalResult();
+
+    System.out.println();
+
+    System.out.println(evaluator);
+
+    if (reportListener != null) {
+      System.out.println("Writing fine-grained report to "
+          + params.getReportOutputFile().getAbsolutePath());
+      reportListener.writeReport();
+
+      try {
+        // TODO: is it a problem to close the stream now?
+        reportOutputStream.close();
+      } catch (IOException e) {
+        // nothing to do
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
new file mode 100644
index 0000000..70bf3eb
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorFineGrainedReportListener.java
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.OutputStream;
+
+import opennlp.tools.cmdline.FineGrainedReportListener;
+import opennlp.tools.langdetect.LanguageDetectorEvaluationMonitor;
+import opennlp.tools.langdetect.LanguageSample;
+
+/**
+ * Generates a detailed report for the Language Detector.
+ * <p>
+ * It is possible to use it from an API and access the statistics using the
+ * provided getters
+ */
+public class LanguageDetectorFineGrainedReportListener
+    extends FineGrainedReportListener implements LanguageDetectorEvaluationMonitor {
+
+  /**
+   * Creates a listener that will print to {@link System#err}
+   */
+  public LanguageDetectorFineGrainedReportListener() {
+    this(System.err);
+  }
+
+  /**
+   * Creates a listener that prints to a given {@link OutputStream}
+   */
+  public LanguageDetectorFineGrainedReportListener(OutputStream outputStream) {
+    super(outputStream);
+  }
+
+  // methods inherited from EvaluationMonitor
+
+  public void missclassified(LanguageSample reference, LanguageSample prediction) {
+    statsAdd(reference, prediction);
+  }
+
+  public void correctlyClassified(LanguageSample reference, LanguageSample prediction) {
+    statsAdd(reference, prediction);
+  }
+
+  private void statsAdd(LanguageSample reference, LanguageSample prediction) {
+    getStats().add(reference.getContext(),
+        reference.getLanguage().getLang(), prediction.getLanguage().getLang());
+  }
+
+  public void writeReport() {
+    printGeneralStatistics();
+    printTagsErrorRank();
+    printGeneralConfusionTable();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
new file mode 100644
index 0000000..c8700fd
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorModelLoader.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.IOException;
+import java.io.InputStream;
+
+import opennlp.tools.cmdline.ModelLoader;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+
+/**
+ * Loads a Language Detector Model for the command line tools.
+ * <p>
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LanguageDetectorModelLoader extends ModelLoader<LanguageDetectorModel> {
+
+  public LanguageDetectorModelLoader() {
+    super("Language Detector");
+  }
+
+  @Override
+  protected LanguageDetectorModel loadModel(InputStream modelIn) throws IOException {
+    return new LanguageDetectorModel(modelIn);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
new file mode 100644
index 0000000..6175fe3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTool.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.BasicCmdLineTool;
+import opennlp.tools.cmdline.CLI;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.PerformanceMonitor;
+import opennlp.tools.cmdline.SystemInputStreamFactory;
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageDetector;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.ParagraphStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+public class LanguageDetectorTool extends BasicCmdLineTool {
+
+  @Override
+  public String getShortDescription() {
+    return "learned language detector";
+  }
+
+  @Override
+  public String getHelp() {
+    return "Usage: " + CLI.CMD + " " + getName() + " model < documents";
+  }
+
+  @Override
+  public void run(String[] args) {
+
+    if (0 == args.length) {
+      System.out.println(getHelp());
+    } else {
+
+      LanguageDetectorModel model = new LanguageDetectorModelLoader().load(new File(args[0]));
+
+      LanguageDetector langDetectME = new LanguageDetectorME(model);
+
+      /*
+       * the document stream is initialized inside the try block so its IOException can be handled below
+       */
+      ObjectStream<String> documentStream;
+
+      PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc");
+      perfMon.start();
+
+      try {
+        documentStream = new ParagraphStream(new PlainTextByLineStream(
+            new SystemInputStreamFactory(), SystemInputStreamFactory.encoding()));
+        String document;
+        while ((document = documentStream.read()) != null) {
+
+          Language lang = langDetectME.predictLanguage(document);
+
+          LanguageSample sample = new LanguageSample(lang, document);
+          System.out.println(sample.toString());
+
+          perfMon.incrementCounter();
+        }
+      } catch (IOException e) {
+        CmdLineUtil.handleStdinIoError(e);
+      }
+
+      perfMon.stopAndPrintFinalResult();
+    }
+  }
+}
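
For reference, a hedged example of running the tool above from the command line (the tool name is assumed to follow the usual OpenNLP convention of the class name minus the "Tool" suffix; the model and input file names are made up):

    opennlp LanguageDetector langdetect.bin < documents.txt

Each paragraph read from standard input is classified and printed as a LanguageSample, while the PerformanceMonitor reports throughput on standard error.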

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
new file mode 100644
index 0000000..6735293
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/LanguageDetectorTrainerTool.java
@@ -0,0 +1,83 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.AbstractTrainerTool;
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.langdetect.LanguageDetectorFactory;
+import opennlp.tools.langdetect.LanguageDetectorME;
+import opennlp.tools.langdetect.LanguageDetectorModel;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.model.ModelUtil;
+
+public class LanguageDetectorTrainerTool
+    extends AbstractTrainerTool<LanguageSample, LanguageDetectorTrainerTool.TrainerToolParams> {
+
+  interface TrainerToolParams extends TrainingParams {
+    @ArgumentParser.ParameterDescription(valueName = "modelFile", description = "output model file.")
+    File getModel();
+
+    @ArgumentParser.ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+    @ArgumentParser.OptionalParameter()
+    String getParams();
+  }
+
+  public LanguageDetectorTrainerTool() {
+    super(LanguageSample.class, TrainerToolParams.class);
+  }
+
+  @Override
+  public String getShortDescription() {
+    return "trainer for the learnable language detector";
+  }
+
+  @Override
+  public void run(String format, String[] args) {
+    super.run(format, args);
+
+    mlParams = CmdLineUtil.loadTrainingParameters(params.getParams(), false);
+    if (mlParams == null) {
+      mlParams = ModelUtil.createDefaultTrainingParameters();
+    }
+
+    File modelOutFile = params.getModel();
+
+    CmdLineUtil.checkOutputFile("language detector model", modelOutFile);
+
+    LanguageDetectorModel model;
+    try {
+      LanguageDetectorFactory factory = LanguageDetectorFactory.create(params.getFactory());
+      model = LanguageDetectorME.train(sampleStream, mlParams, factory);
+    } catch (IOException e) {
+      throw createTerminationIOException(e);
+    }
+    finally {
+      try {
+        sampleStream.close();
+      } catch (IOException e) {
+        // sorry that this can fail
+      }
+    }
+
+    CmdLineUtil.writeModel("language detector", modelOutFile, model);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
new file mode 100644
index 0000000..2937c3d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/cmdline/langdetect/TrainingParams.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.cmdline.langdetect;
+
+import opennlp.tools.cmdline.ArgumentParser.OptionalParameter;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+
+/**
+ * TrainingParams for Language Detector.
+ *
+ * Note: Do not use this class, internal use only!
+ */
+interface TrainingParams {
+
+  @ParameterDescription(valueName = "paramsFile", description = "training parameters file.")
+  @OptionalParameter()
+  String getParams();
+
+  @ParameterDescription(valueName = "factoryName",
+      description = "A sub-class of LanguageDetectorFactory" +
+          " where to get implementation and resources.")
+  @OptionalParameter
+  String getFactory();
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
new file mode 100644
index 0000000..ef60063
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LanguageDetectorSampleStreamFactory.java
@@ -0,0 +1,66 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats;
+
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.CmdLineUtil;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.params.BasicFormatParams;
+import opennlp.tools.doccat.DocumentSampleStream;
+import opennlp.tools.langdetect.LanguageDetectorSampleStream;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.InputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+/**
+ * Factory producing OpenNLP {@link LanguageDetectorSampleStream}s.
+ */
+public class LanguageDetectorSampleStreamFactory
+    extends AbstractSampleStreamFactory<LanguageSample> {
+
+  interface Parameters extends BasicFormatParams {
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LanguageSample.class,
+            StreamFactoryRegistry.DEFAULT_FORMAT,
+            new LanguageDetectorSampleStreamFactory(Parameters.class));
+  }
+
+  protected <P> LanguageDetectorSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  public ObjectStream<LanguageSample> create(String[] args) {
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+
+    CmdLineUtil.checkInputFile("Data", params.getData());
+    InputStreamFactory sampleDataIn = CmdLineUtil.createInputStreamFactory(params.getData());
+    ObjectStream<String> lineStream = null;
+    try {
+      lineStream = new PlainTextByLineStream(sampleDataIn, params.getEncoding());
+    } catch (IOException ex) {
+      CmdLineUtil.handleCreateObjectStreamError(ex);
+    }
+
+    return new LanguageDetectorSampleStream(lineStream);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
index 8ed0036..7059e21 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDoccatSampleStream.java
@@ -39,8 +39,11 @@ import opennlp.tools.util.PlainTextByLineStream;
  * <p>
  * The input text is tokenized with the {@link SimpleTokenizer}. The input text classified
  * by the language model must also be tokenized by the {@link SimpleTokenizer} to produce
- * exactly the same tokenization during testing and training.ø
+ * exactly the same tokenization during testing and training.
+ *
+ * @deprecated will be removed, use the language detector instead
  */
+@Deprecated
 public class LeipzigDoccatSampleStream extends
     FilterObjectStream<String, DocumentSample> {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
index bd2453b..d6ff9ba 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/LeipzigDocumentSampleStreamFactory.java
@@ -33,7 +33,10 @@ import opennlp.tools.util.ObjectStreamUtils;
 
 /**
  * <b>Note:</b> Do not use this class, internal use only!
+ *
+ * @deprecated will be removed, use the language detector instead
  */
+@Deprecated
 public class LeipzigDocumentSampleStreamFactory
     extends AbstractSampleStreamFactory<DocumentSample> {
 

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
new file mode 100644
index 0000000..6c4d009
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStream.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.util.Arrays;
+import java.util.Iterator;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import opennlp.tools.langdetect.Language;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.MarkableFileInputStreamFactory;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.PlainTextByLineStream;
+
+public class LeipzigLanguageSampleStream implements ObjectStream<LanguageSample> {
+
+  private class LeipzigSentencesStream implements ObjectStream<LanguageSample> {
+    private final String lang;
+    private int sentencesPerSample;
+    private int numberOfSamples;
+
+    private ObjectStream<String> lineStream;
+    private int sampleCount;
+
+    LeipzigSentencesStream(String lang, File sentencesFile, int sentencesPerSample, int numberOfSamples)
+        throws IOException {
+      this.lang = sentencesFile.getName().substring(0, 3);
+      this.sentencesPerSample = sentencesPerSample;
+      this.numberOfSamples = numberOfSamples;
+
+      lineStream = new PlainTextByLineStream(new MarkableFileInputStreamFactory(sentencesFile),
+          StandardCharsets.UTF_8);
+    }
+
+    @Override
+    public LanguageSample read() throws IOException {
+
+      if (sampleCount < numberOfSamples) {
+        StringBuilder sampleString = new StringBuilder();
+
+        int count = 0;
+        String line;
+        while (count < sentencesPerSample && (line = lineStream.read()) != null) {
+
+          int textStart = line.indexOf('\t') + 1;
+
+          // TODO: Should this be changed to contain an array of sample strings?
+          sampleString.append(line.substring(textStart) + " ");
+
+          count++;
+        }
+
+        if (sampleString.length() > 0) {
+          sampleCount++;
+          return new LanguageSample(new Language(lang), sampleString);
+        }
+      }
+      return null;
+    }
+  }
+
+  private final int sentencesPerSample;
+
+  private Map<String, Integer> langSampleCounts;
+  private File[] sentencesFiles;
+
+  private Iterator<File> sentencesFilesIt;
+  private ObjectStream<LanguageSample> sampleStream;
+
+  public LeipzigLanguageSampleStream(File leipzigFolder, final int sentencesPerSample,
+                                     final int samplesPerLanguage) throws IOException {
+    this.sentencesPerSample = sentencesPerSample;
+    // TODO: Use a FileFilter to make this more reliable in case there are files which should be ignored
+    sentencesFiles = leipzigFolder.listFiles();
+    Arrays.sort(sentencesFiles);
+
+    Map<String, Integer> langCounts = Arrays.stream(sentencesFiles)
+        .map(file -> file.getName().substring(0, 3))
+        .collect(Collectors.groupingBy(String::toString, Collectors.summingInt(v -> 1)));
+
+    langSampleCounts = langCounts.entrySet().stream()
+        .collect(Collectors.toMap(Map.Entry::getKey, e -> samplesPerLanguage / e.getValue()));
+
+    reset();
+  }
+
+  public LanguageSample read() throws IOException {
+    LanguageSample sample;
+    if (sampleStream != null && (sample = sampleStream.read()) != null) {
+      return sample;
+    }
+    else {
+      if (sentencesFilesIt.hasNext()) {
+        File sentencesFile = sentencesFilesIt.next();
+        System.out.println(sentencesFile);
+        String lang = sentencesFile.getName().substring(0, 3);
+
+        sampleStream = new LeipzigSentencesStream(lang, sentencesFile,
+            sentencesPerSample, langSampleCounts.get(lang));
+
+        return read();
+      }
+    }
+    return null;
+  }
+
+  @Override
+  public void reset() throws IOException {
+    sentencesFilesIt = Arrays.asList(sentencesFiles).iterator();
+    sampleStream = null;
+  }
+
+  public static void main(String[] args) throws Exception {
+    new LeipzigLanguageSampleStream(new File("/home/blue/opennlp-data-dir/leipzig-lang"),
+        10, 100000);
+  }
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
new file mode 100644
index 0000000..59a7551
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/formats/leipzig/LeipzigLanguageSampleStreamFactory.java
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.formats.leipzig;
+
+import java.io.File;
+import java.io.IOException;
+
+import opennlp.tools.cmdline.ArgumentParser;
+import opennlp.tools.cmdline.ArgumentParser.ParameterDescription;
+import opennlp.tools.cmdline.StreamFactoryRegistry;
+import opennlp.tools.cmdline.TerminateToolException;
+import opennlp.tools.cmdline.params.EncodingParameter;
+import opennlp.tools.formats.AbstractSampleStreamFactory;
+import opennlp.tools.langdetect.LanguageSample;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * <b>Note:</b> Do not use this class, internal use only!
+ */
+public class LeipzigLanguageSampleStreamFactory
+    extends AbstractSampleStreamFactory<LanguageSample> {
+
+  interface Parameters extends EncodingParameter {
+    @ParameterDescription(valueName = "sentencesDir",
+        description = "dir with Leipzig sentences to be used")
+    File getSentencesDir();
+
+    @ParameterDescription(valueName = "sentencesPerSample",
+        description = "number of sentences per sample")
+    String getSentencesPerSample();
+
+    @ParameterDescription(valueName = "samplesPerLanguage",
+        description = "number of samples per language")
+    String getSamplesPerLanguage();
+  }
+
+  protected <P> LeipzigLanguageSampleStreamFactory(Class<P> params) {
+    super(params);
+  }
+
+  public static void registerFactory() {
+    StreamFactoryRegistry.registerFactory(LanguageSample.class,
+        "leipzig", new LeipzigLanguageSampleStreamFactory(Parameters.class));
+  }
+
+  public ObjectStream<LanguageSample> create(String[] args) {
+
+    Parameters params = ArgumentParser.parse(args, Parameters.class);
+    File sentencesFileDir = params.getSentencesDir();
+
+    try {
+      return new LeipzigLanguageSampleStream(sentencesFileDir,
+          Integer.parseInt(params.getSentencesPerSample()),
+          Integer.parseInt(params.getSamplesPerLanguage()));
+    } catch (IOException e) {
+      throw new TerminateToolException(-1, "IO error while opening sample data.", e);
+    }
+  }
+}
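
Putting the registration above together with the trainer tool added earlier in this commit, training from a Leipzig corpus directory would look roughly like the following (the command name, format suffix, and flag names are inferred from the "leipzig" registration and the ArgumentParser getter conventions; the directory, counts, and model name are made up):

    opennlp LanguageDetectorTrainer.leipzig -model langdetect.bin \
        -sentencesDir leipzig-sentences -sentencesPerSample 5 -samplesPerLanguage 10000

The stream factory then splits each sentences file into samples of the requested size, capped per language as computed in LeipzigLanguageSampleStream.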

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
new file mode 100644
index 0000000..f780759
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/Language.java
@@ -0,0 +1,73 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class for holding the document language and its confidence
+ */
+public class Language {
+  private final String lang;
+  private final double confidence;
+
+  public Language(String lang) {
+    this(lang, 0);
+  }
+
+  public Language(String lang, double confidence) {
+    Objects.requireNonNull(lang, "lang must not be null");
+    this.lang = lang;
+    this.confidence = confidence;
+  }
+
+  public String getLang() {
+    return lang;
+  }
+
+  public double getConfidence() {
+    return confidence;
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(getLang()).append(" (").append(this.confidence).append(")");
+    return sb.toString();
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getLang(), getConfidence());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof Language) {
+      Language a = (Language) obj;
+
+      return getLang().equals(a.getLang());
+    }
+
+    return false;
+  }
+}
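
A small illustration of the equals()/hashCode() pair above: two Language instances with the same code compare equal regardless of confidence, while their hash codes also mix in the confidence (class and methods as added by this commit; the codes and confidences below are arbitrary):

    import opennlp.tools.langdetect.Language;

    public class LanguageEqualitySketch {
      public static void main(String[] args) {
        Language a = new Language("eng", 0.9);
        Language b = new Language("eng", 0.1);

        System.out.println(a.equals(b));                  // true: equals() compares only the language code
        System.out.println(a.hashCode() == b.hashCode()); // typically false: hashCode() also hashes the confidence
      }
    }

Note that equal Language objects can therefore land in different buckets of a hash-based collection, since hashCode() includes the confidence that equals() ignores.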

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
new file mode 100644
index 0000000..0004494
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetector.java
@@ -0,0 +1,31 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+/**
+ * The interface for a language detector, which provides the detected {@link Language} for a given context.
+ */
+public interface LanguageDetector {
+
+  Language[] predictLanguages(CharSequence content);
+
+  Language predictLanguage(CharSequence content);
+
+  String[] getSupportedLanguages();
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
new file mode 100644
index 0000000..b28c601
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Collection;
+import java.util.LinkedList;
+
+import opennlp.tools.ngram.NGramModel;
+import opennlp.tools.util.StringList;
+import opennlp.tools.util.StringUtil;
+import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.CharSequenceNormalizer;
+import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer;
+import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
+
+/**
+ * Context generator for the language detector
+ */
+class LanguageDetectorContextGenerator {
+
+  private final int minLength;
+  private final int maxLength;
+  private final CharSequenceNormalizer normalizer;
+
+  LanguageDetectorContextGenerator(int minLength, int maxLength) {
+    this.minLength = minLength;
+    this.maxLength = maxLength;
+
+    this.normalizer = new AggregateCharSequenceNormalizer(
+        EmojiCharSequenceNormalizer.getInstance(),
+        UrlCharSequenceNormalizer.getInstance(),
+        TwitterCharSequenceNormalizer.getInstance(),
+        NumberCharSequenceNormalizer.getInstance(),
+        UnicodeCharSequenceNormalizer.getInstance(),
+        ShrinkCharSequenceNormalizer.getInstance()
+    );
+  }
+
+  /**
+   * Initializes the current instance with ngrams of min length 2 and max length 3.
+   */
+  LanguageDetectorContextGenerator() {
+    this(2, 3);
+  }
+
+  public String[] getContext(String document) {
+
+    Collection<String> context = new LinkedList<>();
+
+    NGramModel model = new NGramModel();
+    String normalized = normalizer.normalize(document).toString();
+    model.add(normalized, minLength, maxLength);
+
+    for (StringList tokenList : model) {
+      if (tokenList.size() > 0) {
+        context.add(StringUtil.toLowerCase(tokenList.getToken(0)));
+      }
+    }
+    return context.toArray(new String[context.size()]);
+  }
+}
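
The generator is package-private, so it can only be called from within
opennlp.tools.langdetect; the sketch below (with a made-up input string) simply
illustrates the kind of context it produces, namely lower-cased character
n-grams of the normalized text:

  LanguageDetectorContextGenerator cg = new LanguageDetectorContextGenerator(2, 3);
  String[] ngrams = cg.getContext("A small text");
  // ngrams holds the lower-cased character 2- and 3-grams of the normalized
  // input, e.g. "a ", " s", "sm", ..., "a s", " sm", ...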

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
new file mode 100644
index 0000000..ce1823a
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorCrossValidator.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.doccat.FeatureGenerator;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+import opennlp.tools.util.eval.CrossValidationPartitioner;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * Cross validator for language detector
+ */
+public class LanguageDetectorCrossValidator {
+
+  private final TrainingParameters params;
+
+  private Mean documentAccuracy = new Mean();
+
+  private LanguageDetectorEvaluationMonitor[] listeners;
+
+  private LanguageDetectorFactory factory;
+
+
+  /**
+   * Creates a {@link LanguageDetectorCrossValidator} with the given
+   * training parameters, factory and evaluation listeners.
+   */
+  public LanguageDetectorCrossValidator(TrainingParameters mlParams,
+                                        LanguageDetectorFactory factory,
+                                        LanguageDetectorEvaluationMonitor ... listeners) {
+    this.params = mlParams;
+    this.listeners = listeners;
+    this.factory = factory;
+  }
+
+  /**
+   * Starts the evaluation.
+   *
+   * @param samples
+   *          the data to train and test
+   * @param nFolds
+   *          number of folds
+   *
+   * @throws IOException
+   */
+  public void evaluate(ObjectStream<LanguageSample> samples, int nFolds)
+      throws IOException {
+
+    CrossValidationPartitioner<LanguageSample> partitioner =
+        new CrossValidationPartitioner<>(samples, nFolds);
+
+    while (partitioner.hasNext()) {
+
+      CrossValidationPartitioner.TrainingSampleStream<LanguageSample> trainingSampleStream =
+          partitioner.next();
+
+      LanguageDetectorModel model = LanguageDetectorME.train(
+          trainingSampleStream, params, factory);
+
+      LanguageDetectorEvaluator evaluator = new LanguageDetectorEvaluator(
+          new LanguageDetectorME(model), listeners);
+
+      evaluator.evaluate(trainingSampleStream.getTestSampleStream());
+
+      documentAccuracy.add(evaluator.getAccuracy(),
+          evaluator.getDocumentCount());
+
+    }
+  }
+
+  /**
+   * Retrieves the mean accuracy over all cross validation folds.
+   *
+   * @return the document accuracy
+   */
+  public double getDocumentAccuracy() {
+    return documentAccuracy.mean();
+  }
+
+  /**
+   * Retrieves the number of documents which were validated over all folds.
+   * Each document is evaluated in exactly one test fold.
+   *
+   * @return the document count
+   */
+  public long getDocumentCount() {
+    return documentAccuracy.count();
+  }
+}
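
A short sketch of running the cross validator, assuming params and samples are
set up as in the training sketch further down (the fold count of 10 is
arbitrary) and that the code runs in a method that declares throws IOException:

  LanguageDetectorCrossValidator cv = new LanguageDetectorCrossValidator(
      params, new LanguageDetectorFactory());

  cv.evaluate(samples, 10);   // 10-fold cross validation over the sample stream

  System.out.println("accuracy: " + cv.getDocumentAccuracy()
      + " over " + cv.getDocumentCount() + " documents");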

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
new file mode 100644
index 0000000..30f3313
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluationMonitor.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.eval.EvaluationMonitor;
+
+/**
+ * {@link EvaluationMonitor} for Language Detector.
+ */
+public interface LanguageDetectorEvaluationMonitor extends
+    EvaluationMonitor<LanguageSample> {
+
+}

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
new file mode 100644
index 0000000..bbf73c3
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEvaluator.java
@@ -0,0 +1,99 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.doccat.DocumentCategorizer;
+import opennlp.tools.util.eval.Evaluator;
+import opennlp.tools.util.eval.Mean;
+
+/**
+ * The {@link LanguageDetectorEvaluator} measures the performance of
+ * the given {@link LanguageDetector} with the provided reference
+ * {@link LanguageSample}s.
+ *
+ * @see LanguageDetector
+ * @see LanguageSample
+ */
+public class LanguageDetectorEvaluator extends Evaluator<LanguageSample> {
+
+  private LanguageDetector languageDetector;
+
+  private Mean accuracy = new Mean();
+
+  /**
+   * Initializes the current instance.
+   *
+   * @param langDetect the language detector instance
+   */
+  public LanguageDetectorEvaluator(LanguageDetector langDetect,
+                                   LanguageDetectorEvaluationMonitor ... listeners) {
+    super(listeners);
+    this.languageDetector = langDetect;
+  }
+
+  /**
+   * Evaluates the given reference {@link LanguageSample} object.
+   *
+   * This is done by predicting the language of the document from the
+   * provided {@link LanguageSample}. The predicted language is then
+   * compared to the reference language to update the accuracy score.
+   *
+   * @param sample the reference {@link LanguageSample}.
+   */
+  public LanguageSample processSample(LanguageSample sample) {
+
+    CharSequence document = sample.getContext();
+
+    Language predicted = languageDetector.predictLanguage(document);
+
+
+
+    if (sample.getLanguage().getLang().equals(predicted.getLang())) {
+      accuracy.add(1);
+    }
+    else {
+      accuracy.add(0);
+    }
+
+    return new LanguageSample(predicted, sample.getContext());
+  }
+
+  /**
+   * Retrieves the accuracy of the provided {@link LanguageDetector}.
+   *
+   * accuracy = correctly categorized documents / total documents
+   *
+   * @return the accuracy
+   */
+  public double getAccuracy() {
+    return accuracy.mean();
+  }
+
+  public long getDocumentCount() {
+    return accuracy.count();
+  }
+
+  /**
+   * Represents this object as a human readable {@link String}.
+   */
+  @Override
+  public String toString() {
+    return "Accuracy: " + accuracy.mean() + "\n" +
+        "Number of documents: " + accuracy.count();
+  }
+}
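
A sketch of evaluating a trained detector against held-out samples; model and
testSamples are assumed to come from the training and sample stream sketches
elsewhere in this commit:

  LanguageDetectorEvaluator evaluator =
      new LanguageDetectorEvaluator(new LanguageDetectorME(model));

  evaluator.evaluate(testSamples);   // ObjectStream<LanguageSample> held back for testing

  System.out.println(evaluator);     // prints accuracy and number of documents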

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
new file mode 100644
index 0000000..b556a4d
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorEventStream.java
@@ -0,0 +1,69 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Iterator;
+
+import opennlp.tools.ml.model.Event;
+import opennlp.tools.util.AbstractEventStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * Iterator-like class for modeling language detector events.
+ */
+public class LanguageDetectorEventStream extends AbstractEventStream<LanguageSample> {
+
+  private LanguageDetectorContextGenerator mContextGenerator;
+
+  /**
+   * Initializes the current instance with a stream of {@link LanguageSample}s.
+   *
+   * @param data {@link ObjectStream} of {@link LanguageSample}s
+   */
+  public LanguageDetectorEventStream(ObjectStream<LanguageSample> data) {
+    super(data);
+
+    mContextGenerator =
+        new LanguageDetectorContextGenerator();
+  }
+
+  @Override
+  protected Iterator<Event> createEvents(final LanguageSample sample) {
+
+    return new Iterator<Event>() {
+
+      private boolean isVirgin = true;
+
+      public boolean hasNext() {
+        return isVirgin;
+      }
+
+      public Event next() {
+
+        isVirgin = false;
+
+        return new Event(sample.getLanguage().getLang(),
+            mContextGenerator.getContext(sample.getContext().toString()));
+      }
+
+      public void remove() {
+        throw new UnsupportedOperationException();
+      }
+    };
+  }
+}
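
Each LanguageSample is converted into exactly one training Event whose outcome
is the language code and whose context is the character n-gram feature set; a
small in-memory sketch (the sample line is made up):

  ObjectStream<String> lines = ObjectStreamUtils.createObjectStream(
      "eng\tThis is an English document.");
  ObjectStream<Event> events =
      new LanguageDetectorEventStream(new LanguageDetectorSampleStream(lines));

  Event event = events.read();   // outcome "eng", context = character n-grams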

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
new file mode 100644
index 0000000..5cebbba
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorFactory.java
@@ -0,0 +1,53 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.ext.ExtensionLoader;
+
+
+public class LanguageDetectorFactory extends BaseToolFactory {
+
+  public static LanguageDetectorFactory create(String subclassName)
+      throws InvalidFormatException {
+    if (subclassName == null) {
+      // will create the default factory
+      return new LanguageDetectorFactory();
+    }
+    try {
+      LanguageDetectorFactory theFactory = ExtensionLoader.instantiateExtension(
+          LanguageDetectorFactory.class, subclassName);
+      theFactory.init();
+      return theFactory;
+    } catch (Exception e) {
+      String msg = "Could not instantiate the " + subclassName
+          + ". The initialization throw an exception.";
+      throw new InvalidFormatException(msg, e);
+    }
+  }
+
+  public void init() {
+    // nothing to do
+  }
+
+  @Override
+  public void validateArtifactMap() throws InvalidFormatException {
+    // nothing to validate
+  }
+}
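
The factory is either the default implementation or a subclass resolved by name
through the ExtensionLoader; the subclass name below is hypothetical:

  // default factory
  LanguageDetectorFactory factory = LanguageDetectorFactory.create(null);

  // custom subclass loaded via the ExtensionLoader (hypothetical class name)
  LanguageDetectorFactory custom =
      LanguageDetectorFactory.create("com.example.MyLanguageDetectorFactory");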

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
new file mode 100644
index 0000000..74a1cea
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorME.java
@@ -0,0 +1,97 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.Map;
+
+import opennlp.tools.ml.AbstractEventTrainer;
+import opennlp.tools.ml.EventTrainer;
+import opennlp.tools.ml.TrainerFactory;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.ObjectStream;
+import opennlp.tools.util.TrainingParameters;
+
+/**
+ * Implements a learnable language detector.
+ */
+public class LanguageDetectorME implements LanguageDetector {
+
+  private LanguageDetectorModel model;
+  private LanguageDetectorContextGenerator mContextGenerator;
+
+  /**
+   * Initializes the current instance with a language detector model. Default feature
+   * generation is used.
+   *
+   * @param model the language detector model
+   */
+  public LanguageDetectorME(LanguageDetectorModel model) {
+    this.model = model;
+    this.mContextGenerator = new LanguageDetectorContextGenerator();
+  }
+
+  @Override
+  public Language[] predictLanguages(CharSequence content) {
+    double[] eval = model.getMaxentModel().eval(mContextGenerator.getContext(content.toString()));
+    Language[] arr = new Language[eval.length];
+    for (int i = 0; i < eval.length; i++) {
+      arr[i] = new Language(model.getMaxentModel().getOutcome(i), eval[i]);
+    }
+
+    Arrays.sort(arr, (o1, o2) -> Double.compare(o2.getConfidence(), o1.getConfidence()));
+    return arr;
+  }
+
+  @Override
+  public Language predictLanguage(CharSequence content) {
+    return predictLanguages(content)[0];
+  }
+
+  @Override
+  public String[] getSupportedLanguages() {
+    int numberLanguages = model.getMaxentModel().getNumOutcomes();
+    String[] languages = new String[numberLanguages];
+    for (int i = 0; i < numberLanguages; i++) {
+      languages[i] = model.getMaxentModel().getOutcome(i);
+    }
+    return languages;
+  }
+
+
+  public static LanguageDetectorModel train(ObjectStream<LanguageSample> samples,
+                                            TrainingParameters mlParams,
+                                            LanguageDetectorFactory factory)
+      throws IOException {
+
+    Map<String, String> manifestInfoEntries = new HashMap<>();
+
+    mlParams.putIfAbsent(AbstractEventTrainer.DATA_INDEXER_PARAM,
+        AbstractEventTrainer.DATA_INDEXER_ONE_PASS_VALUE);
+
+    EventTrainer trainer = TrainerFactory.getEventTrainer(
+        mlParams, manifestInfoEntries);
+
+    MaxentModel model = trainer.train(
+        new LanguageDetectorEventStream(samples));
+
+    return new LanguageDetectorModel(model, manifestInfoEntries, factory);
+  }
+}
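
A minimal end-to-end sketch of training and using the detector, assuming a
UTF-8, tab-separated training file at a hypothetical path and a surrounding
method that declares throws IOException:

  InputStreamFactory inFactory =
      new MarkableFileInputStreamFactory(new File("lang-train.txt"));   // hypothetical path
  ObjectStream<LanguageSample> samples = new LanguageDetectorSampleStream(
      new PlainTextByLineStream(inFactory, StandardCharsets.UTF_8));

  TrainingParameters params = new TrainingParameters();
  params.put(TrainingParameters.ITERATIONS_PARAM, "100");
  params.put(TrainingParameters.CUTOFF_PARAM, "5");

  LanguageDetectorModel model =
      LanguageDetectorME.train(samples, params, new LanguageDetectorFactory());

  LanguageDetector detector = new LanguageDetectorME(model);
  Language best = detector.predictLanguage("Dies ist ein kurzer deutscher Text.");
  System.out.println(best.getLang() + " " + best.getConfidence());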

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
new file mode 100644
index 0000000..c0d9703
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorModel.java
@@ -0,0 +1,82 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.URL;
+import java.util.Map;
+
+import opennlp.tools.ml.model.AbstractModel;
+import opennlp.tools.ml.model.MaxentModel;
+import opennlp.tools.util.BaseToolFactory;
+import opennlp.tools.util.InvalidFormatException;
+import opennlp.tools.util.model.BaseModel;
+
+/**
+ * A model for language detection
+ */
+public class LanguageDetectorModel extends BaseModel {
+
+  private static final String COMPONENT_NAME = "LanguageDetectorME";
+  private static final String LANGDETECT_MODEL_ENTRY_NAME = "langdetect.model";
+
+  public LanguageDetectorModel(MaxentModel langdetectModel,
+                               Map<String, String> manifestInfoEntries,
+                               LanguageDetectorFactory factory) {
+    super(COMPONENT_NAME, "und", manifestInfoEntries, factory);
+
+    artifactMap.put(LANGDETECT_MODEL_ENTRY_NAME, langdetectModel);
+    checkArtifactMap();
+  }
+
+  public LanguageDetectorModel(InputStream in) throws IOException {
+    super(COMPONENT_NAME, in);
+  }
+
+  public LanguageDetectorModel(File modelFile) throws IOException {
+    super(COMPONENT_NAME, modelFile);
+  }
+
+  public LanguageDetectorModel(URL modelURL) throws IOException {
+    super(COMPONENT_NAME, modelURL);
+  }
+
+  @Override
+  protected void validateArtifactMap() throws InvalidFormatException {
+    super.validateArtifactMap();
+
+    if (!(artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME) instanceof AbstractModel)) {
+      throw new InvalidFormatException("Language detector model is incomplete!");
+    }
+  }
+
+  public LanguageDetectorFactory getFactory() {
+    return (LanguageDetectorFactory) this.toolFactory;
+  }
+
+  @Override
+  protected Class<? extends BaseToolFactory> getDefaultFactory() {
+    return LanguageDetectorFactory.class;
+  }
+
+  public MaxentModel getMaxentModel() {
+    return (MaxentModel) artifactMap.get(LANGDETECT_MODEL_ENTRY_NAME);
+  }
+}
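
A trained model can be persisted with the serialize methods inherited from
BaseModel and reloaded through the constructors above; the file name is
hypothetical and model is assumed to come from the training sketch:

  try (OutputStream out = new BufferedOutputStream(new FileOutputStream("langdetect.bin"))) {
    model.serialize(out);
  }

  LanguageDetectorModel reloaded = new LanguageDetectorModel(new File("langdetect.bin"));
  System.out.println(Arrays.toString(
      new LanguageDetectorME(reloaded).getSupportedLanguages()));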

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
new file mode 100644
index 0000000..2a407f7
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorSampleStream.java
@@ -0,0 +1,55 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.io.IOException;
+
+import opennlp.tools.util.FilterObjectStream;
+import opennlp.tools.util.ObjectStream;
+
+/**
+ * This class reads in string encoded training samples, parses them and
+ * outputs {@link LanguageSample} objects.
+ * <p>
+ * Format:<br>
+ * Each line contains one sample document.<br>
+ * The language is the first string in the line followed by a tab and the document content.<br>
+ * Sample line: language-code tab-char document-text line-break-char(s)<br>
+ */
+public class LanguageDetectorSampleStream
+    extends FilterObjectStream<String, LanguageSample> {
+
+  public LanguageDetectorSampleStream(ObjectStream<String> samples) {
+    super(samples);
+  }
+
+  public LanguageSample read() throws IOException {
+    String sampleString;
+    while ((sampleString = samples.read()) != null) {
+      int tabIndex = sampleString.indexOf("\t");
+      if (tabIndex > 0) {
+        String lang = sampleString.substring(0, tabIndex);
+        String context = sampleString.substring(tabIndex + 1);
+
+        return new LanguageSample(new Language(lang), context);
+      }
+    }
+
+    return null;
+  }
+}
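
The expected line format is easiest to see with a small in-memory sketch (the
two sample lines are made up):

  ObjectStream<String> lines = ObjectStreamUtils.createObjectStream(
      "eng\tThis is an English document.",
      "deu\tDies ist ein deutsches Dokument.");

  ObjectStream<LanguageSample> samples = new LanguageDetectorSampleStream(lines);

  LanguageSample sample;
  while ((sample = samples.read()) != null) {
    System.out.println(sample.getLanguage().getLang() + " -> " + sample.getContext());
  }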

http://git-wip-us.apache.org/repos/asf/opennlp/blob/a9853284/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
new file mode 100644
index 0000000..f454864
--- /dev/null
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageSample.java
@@ -0,0 +1,68 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package opennlp.tools.langdetect;
+
+import java.util.Objects;
+
+/**
+ * Class which holds a classified document and its {@link Language}.
+ */
+public class LanguageSample {
+
+  private final Language language;
+  private final CharSequence context;
+
+  public LanguageSample(Language language, CharSequence context) {
+    this.language = Objects.requireNonNull(language, "language must not be null");
+    this.context = Objects.requireNonNull(context, "context must not be null");
+  }
+
+  public Language getLanguage() {
+    return language;
+  }
+
+  public CharSequence getContext() {
+    return context;
+  }
+
+  @Override
+  public String toString() {
+    return language.getLang() + '\t' +  context;
+  }
+
+  @Override
+  public int hashCode() {
+    return Objects.hash(getContext(), getLanguage());
+  }
+
+  @Override
+  public boolean equals(Object obj) {
+    if (this == obj) {
+      return true;
+    }
+
+    if (obj instanceof LanguageSample) {
+      LanguageSample a = (LanguageSample) obj;
+
+      return getLanguage().equals(a.getLanguage())
+          && getContext().equals(a.getContext());
+    }
+
+    return false;
+  }
+}


[06/21] opennlp git commit: NoJira: Update link to latest docs

Posted by jo...@apache.org.
NoJira: Update link to latest docs


Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/217f5eb0
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/217f5eb0
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/217f5eb0

Branch: refs/heads/LangDetect
Commit: 217f5eb0dc4c2bc462563f9c2363815e862bc1d8
Parents: 6d2c8fc
Author: smarthi <sm...@apache.org>
Authored: Thu May 18 21:13:48 2017 -0400
Committer: smarthi <sm...@apache.org>
Committed: Thu May 18 21:13:48 2017 -0400

----------------------------------------------------------------------
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/opennlp/blob/217f5eb0/README.md
----------------------------------------------------------------------
diff --git a/README.md b/README.md
index 824afe0..19e3961 100644
--- a/README.md
+++ b/README.md
@@ -21,7 +21,7 @@ Welcome to Apache OpenNLP!
 [![Build Status](https://api.travis-ci.org/apache/opennlp.svg?branch=master)](https://travis-ci.org/apache/opennlp)
 [![Coverage Status](https://coveralls.io/repos/github/apache/opennlp/badge.svg?branch=master)](https://coveralls.io/github/apache/opennlp?branch=master)
 [![Maven Central](https://maven-badges.herokuapp.com/maven-central/org.apache.opennlp/opennlp/badge.svg?style=plastic)](https://maven-badges.herokuapp.com/maven-central/org.apache.opennlp/opennlp)
-[![Documentation Status](https://img.shields.io/:docs-latest-green.svg)](http://opennlp.apache.org/documentation.html)
+[![Documentation Status](https://img.shields.io/:docs-latest-green.svg)](http://opennlp.apache.org/docs/index.html)
 [![GitHub license](https://img.shields.io/badge/license-Apache%202-blue.svg)](https://raw.githubusercontent.com/apache/opennlp/master/LICENSE)
 [![Twitter Follow](https://img.shields.io/twitter/follow/ApacheOpennlp.svg?style=social)](https://twitter.com/ApacheOpenNLP)