You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2018/05/31 09:46:13 UTC
[opennlp-sandbox] branch master updated: Adjust settings to match namefinder.py trainer
This is an automated email from the ASF dual-hosted git repository.
joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git
The following commit(s) were added to refs/heads/master by this push:
new c136c85 Adjust settings to match namefinder.py trainer
c136c85 is described below
commit c136c857087232f269f0f4c7610ab02a4129caa7
Author: Jörn Kottmann <jo...@apache.org>
AuthorDate: Thu May 31 11:45:41 2018 +0200
Adjust settings to match namefinder.py trainer
---
.../java/org/apache/opennlp/namefinder/WordIndexer.java | 17 +++++++----------
1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
index 3954092..738a952 100644
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
+++ b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
@@ -17,8 +17,6 @@
package org.apache.opennlp.namefinder;
-import opennlp.tools.util.StringUtil;
-
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
@@ -28,6 +26,8 @@ import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
+import opennlp.tools.util.StringUtil;
+
public class WordIndexer {
private final Map<Character, Integer> char2idx;
@@ -35,11 +35,9 @@ public class WordIndexer {
public static String UNK = "$UNK$";
public static String NUM = "$NUM$";
- public static String NONE = "O";
- //private boolean useChars = true;
- private boolean lowerCase = true;
- private boolean allowUnk = true;
+ private boolean lowerCase = false;
+ private boolean allowUnk = false;
private Pattern digitPattern = Pattern.compile("\\d+(,\\d+)*(\\.\\d+)?");
@@ -114,8 +112,9 @@ public class WordIndexer {
if (lowerCase) {
word = StringUtil.toLowerCase(word);
}
- if (digitPattern.matcher(word).find())
- word = NUM;
+
+ // if (digitPattern.matcher(word).find())
+ // word = NUM;
// 2. get id of word
Integer wordId;
@@ -140,7 +139,6 @@ public class WordIndexer {
return tokenIds;
}
-
public class Ids {
private int[] chars;
@@ -162,5 +160,4 @@ public class WordIndexer {
this.word = word;
}
}
-
}
--
To stop receiving notification emails like this one, please contact
joern@apache.org.