You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2018/05/31 09:46:13 UTC

[opennlp-sandbox] branch master updated: Adjust settings to match namefinder.py trainer

This is an automated email from the ASF dual-hosted git repository.

joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


The following commit(s) were added to refs/heads/master by this push:
     new c136c85  Adjust settings to match namefinder.py trainer
c136c85 is described below

commit c136c857087232f269f0f4c7610ab02a4129caa7
Author: Jörn Kottmann <jo...@apache.org>
AuthorDate: Thu May 31 11:45:41 2018 +0200

    Adjust settings to match namefinder.py trainer
---
 .../java/org/apache/opennlp/namefinder/WordIndexer.java | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
index 3954092..738a952 100644
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
+++ b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/WordIndexer.java
@@ -17,8 +17,6 @@
 
 package org.apache.opennlp.namefinder;
 
-import opennlp.tools.util.StringUtil;
-
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.InputStream;
@@ -28,6 +26,8 @@ import java.util.HashMap;
 import java.util.Map;
 import java.util.regex.Pattern;
 
+import opennlp.tools.util.StringUtil;
+
 public class WordIndexer {
 
   private final Map<Character, Integer> char2idx;
@@ -35,11 +35,9 @@ public class WordIndexer {
 
   public static String UNK = "$UNK$";
   public static String NUM = "$NUM$";
-  public static String NONE = "O";
 
-  //private boolean useChars = true;
-  private boolean lowerCase = true;
-  private boolean allowUnk = true;
+  private boolean lowerCase = false;
+  private boolean allowUnk = false;
 
   private Pattern digitPattern = Pattern.compile("\\d+(,\\d+)*(\\.\\d+)?");
 
@@ -114,8 +112,9 @@ public class WordIndexer {
     if (lowerCase) {
       word = StringUtil.toLowerCase(word);
     }
-    if (digitPattern.matcher(word).find())
-      word = NUM;
+
+    // if (digitPattern.matcher(word).find())
+    //  word = NUM;
 
     // 2. get id of word
     Integer wordId;
@@ -140,7 +139,6 @@ public class WordIndexer {
     return tokenIds;
   }
 
-
   public class Ids {
 
     private int[] chars;
@@ -162,5 +160,4 @@ public class WordIndexer {
       this.word = word;
     }
   }
-
 }

-- 
To stop receiving notification emails like this one, please contact
joern@apache.org.