You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2017/06/14 10:23:40 UTC
opennlp git commit: Generate 1 - 3 char grams, disable unicode normalizer
Repository: opennlp
Updated Branches:
refs/heads/LangDetect 6e9da1c00 -> 0b5e4a491
Generate 1 - 3 char grams, disable unicode normalizer
Project: http://git-wip-us.apache.org/repos/asf/opennlp/repo
Commit: http://git-wip-us.apache.org/repos/asf/opennlp/commit/0b5e4a49
Tree: http://git-wip-us.apache.org/repos/asf/opennlp/tree/0b5e4a49
Diff: http://git-wip-us.apache.org/repos/asf/opennlp/diff/0b5e4a49
Branch: refs/heads/LangDetect
Commit: 0b5e4a491c5480ce53ebb48632a3c12afcdc8b29
Parents: 6e9da1c
Author: Jörn Kottmann <jo...@apache.org>
Authored: Wed Jun 14 12:23:12 2017 +0200
Committer: Jörn Kottmann <jo...@apache.org>
Committed: Wed Jun 14 12:23:12 2017 +0200
----------------------------------------------------------------------
.../langdetect/LanguageDetectorContextGenerator.java | 14 ++++++--------
1 file changed, 6 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/opennlp/blob/0b5e4a49/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
----------------------------------------------------------------------
diff --git a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
index b28c601..699d2eb 100644
--- a/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
+++ b/opennlp-tools/src/main/java/opennlp/tools/langdetect/LanguageDetectorContextGenerator.java
@@ -17,19 +17,17 @@
package opennlp.tools.langdetect;
+import java.util.ArrayList;
import java.util.Collection;
-import java.util.LinkedList;
import opennlp.tools.ngram.NGramModel;
import opennlp.tools.util.StringList;
-import opennlp.tools.util.StringUtil;
import opennlp.tools.util.normalizer.AggregateCharSequenceNormalizer;
import opennlp.tools.util.normalizer.CharSequenceNormalizer;
import opennlp.tools.util.normalizer.EmojiCharSequenceNormalizer;
import opennlp.tools.util.normalizer.NumberCharSequenceNormalizer;
import opennlp.tools.util.normalizer.ShrinkCharSequenceNormalizer;
import opennlp.tools.util.normalizer.TwitterCharSequenceNormalizer;
-import opennlp.tools.util.normalizer.UnicodeCharSequenceNormalizer;
import opennlp.tools.util.normalizer.UrlCharSequenceNormalizer;
/**
@@ -50,21 +48,21 @@ class LanguageDetectorContextGenerator {
UrlCharSequenceNormalizer.getInstance(),
TwitterCharSequenceNormalizer.getInstance(),
NumberCharSequenceNormalizer.getInstance(),
- UnicodeCharSequenceNormalizer.getInstance(),
+ // UnicodeCharSequenceNormalizer.getInstance(),
ShrinkCharSequenceNormalizer.getInstance()
);
}
/**
- * Initializes the current instance with min 2 length and max 5 length of ngrams.
+ * Initializes the current instance with min 1 length and max 3 length of ngrams.
*/
LanguageDetectorContextGenerator() {
- this(2, 3);
+ this(1, 3);
}
public String[] getContext(String document) {
- Collection<String> context = new LinkedList<>();
+ Collection<String> context = new ArrayList<>();
NGramModel model = new NGramModel();
String normalized = normalizer.normalize(document).toString();
@@ -72,7 +70,7 @@ class LanguageDetectorContextGenerator {
for (StringList tokenList : model) {
if (tokenList.size() > 0) {
- context.add(StringUtil.toLowerCase(tokenList.getToken(0)));
+ context.add(tokenList.getToken(0));
}
}
return context.toArray(new String[context.size()]);