You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@opennlp.apache.org by jo...@apache.org on 2018/10/04 21:29:25 UTC

[opennlp-sandbox] branch master updated: Add Java API for namecat and more Randomize training data, add dropout, add test eval

This is an automated email from the ASF dual-hosted git repository.

joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


The following commit(s) were added to refs/heads/master by this push:
     new 04da946  Add Java API for namecat and more Randomize training data, add dropout, add test eval
04da946 is described below

commit 04da9467964734e9885b9f8a664177e41e9bdee3
Author: Jörn Kottmann <jo...@apache.org>
AuthorDate: Fri Jun 29 15:17:49 2018 +0200

    Add Java API for namecat and more
    Randomize training data, add dropout, add test eval
---
 .../main/java/org/apache/opennlp/ModelUtil.java    |  54 +++++++++++
 .../apache/opennlp/namecat/NameCategorizer.java    | 107 +++++++++++++++++++++
 .../apache/opennlp/namefinder/SequenceTagging.java |  26 +----
 tf-ner-poc/src/main/python/namecat/namecat.py      |  59 ++++++++++--
 4 files changed, 212 insertions(+), 34 deletions(-)

diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java b/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java
new file mode 100644
index 0000000..fa80241
--- /dev/null
+++ b/tf-ner-poc/src/main/java/org/apache/opennlp/ModelUtil.java
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp;
+
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.util.zip.ZipEntry;
+import java.util.zip.ZipInputStream;
+
+public class ModelUtil {
+
+  /**
+   * Unzips a model package into a new temp directory and returns that directory.
+   * The stream is always closed; entries that would land outside the directory are rejected.
+   */
+  public static Path writeModelToTmpDir(InputStream modelIn) throws IOException {
+    Path tmpDir = Files.createTempDirectory("opennlp2");
+    try (ZipInputStream zis = new ZipInputStream(modelIn)) {
+      ZipEntry zipEntry;
+      while ((zipEntry = zis.getNextEntry()) != null) {
+        Path newFile = tmpDir.resolve(zipEntry.getName()).normalize();
+        if (!newFile.startsWith(tmpDir.normalize())) {
+          throw new IOException("Zip entry outside of target dir: " + zipEntry.getName());
+        }
+        if (zipEntry.isDirectory()) {
+          Files.createDirectories(newFile);
+        } else {
+          Files.createDirectories(newFile.getParent()); // zips may omit directory entries
+          Files.copy(zis, newFile);
+          // This is a bit of a hack, but should work fine for now ...
+          newFile.toFile().deleteOnExit();
+        }
+      }
+    }
+    return tmpDir;
+  }
+}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namecat/NameCategorizer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namecat/NameCategorizer.java
new file mode 100644
index 0000000..7ec98b3
--- /dev/null
+++ b/tf-ner-poc/src/main/java/org/apache/opennlp/namecat/NameCategorizer.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.namecat;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+import org.apache.opennlp.ModelUtil;
+import org.tensorflow.SavedModelBundle;
+import org.tensorflow.Session;
+import org.tensorflow.Tensor;
+
+public class NameCategorizer {
+
+  private final Session session;
+  private final Map<Character, Integer> charMap = new HashMap<>();
+  private final Map<Integer, String> labelMap;
+
+  public NameCategorizer(InputStream vocabChars, InputStream labelDict,
+                         InputStream modelZipPackage) throws IOException {
+
+    try (BufferedReader in = new BufferedReader(new InputStreamReader(vocabChars,
+        StandardCharsets.UTF_8))) {
+      in.lines().forEach(ch -> charMap.put(ch.charAt(0), charMap.size()));
+    }
+
+    labelMap = new HashMap<>();
+    try (BufferedReader in = new BufferedReader(new InputStreamReader(labelDict,
+        StandardCharsets.UTF_8))) {
+      in.lines().forEach(label -> labelMap.put(labelMap.size(), label));
+    }
+
+    Path tmpModelPath = ModelUtil.writeModelToTmpDir(modelZipPackage);
+
+    SavedModelBundle model = SavedModelBundle.load(tmpModelPath.toString(), "serve");
+    session = model.session();
+  }
+
+  private static int argmax(float[] x) {
+    if (x == null || x.length == 0) {
+      throw new IllegalArgumentException("Vector x is null or empty");
+    }
+
+    int maxIdx = 0;
+    for (int i = 1; i < x.length; i++) {
+      if (x[maxIdx] < x[i])
+        maxIdx = i;
+    }
+    return maxIdx;
+  }
+
+  public String[] categorize(String[] names) {
+    if (names.length == 0) {
+      return new String[0];
+    }
+
+    int maxLength = Arrays.stream(names).mapToInt(String::length).max().getAsInt();
+
+    int charIds[][] = new int[names.length][maxLength];
+    int nameLengths[] = new int[names.length];
+
+    for (int nameIndex = 0; nameIndex < names.length; nameIndex++) {
+      for (int charIndex = 0; charIndex < names[nameIndex].length(); charIndex++) {
+        charIds[nameIndex][charIndex] = charMap.get(names[nameIndex].charAt(charIndex));
+      }
+      nameLengths[nameIndex] = names[nameIndex].length();
+    }
+
+    try (Tensor<?> dropout = Tensor.create(1f, Float.class);
+         Tensor<?> charTensor = Tensor.create(charIds);
+         Tensor<?> nameLength = Tensor.create(nameLengths)) {
+      List<Tensor<?>> result = session.runner()
+          .feed("dropout_keep_prop", dropout)
+          .feed("char_ids", charTensor)
+          .feed("name_lengths", nameLength)
+          .fetch("norm_probs", 0).run();
+
+      try (Tensor<?> probTensor = result.get(0)) {
+        float[][] probs = probTensor.copyTo(new float[names.length][labelMap.size()]);
+        return Arrays.stream(probs).map(prob -> labelMap.get(argmax(prob))).toArray(String[]::new);
+      }
+    }
+  }
+}
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java
index 5f3d513..2464445 100644
--- a/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java
+++ b/tf-ner-poc/src/main/java/org/apache/opennlp/namefinder/SequenceTagging.java
@@ -20,13 +20,11 @@ package org.apache.opennlp.namefinder;
 import java.io.FileInputStream;
 import java.io.IOException;
 import java.io.InputStream;
-import java.nio.file.Files;
 import java.nio.file.Path;
 import java.util.Arrays;
 import java.util.List;
-import java.util.zip.ZipEntry;
-import java.util.zip.ZipInputStream;
 
+import org.apache.opennlp.ModelUtil;
 import org.tensorflow.SavedModelBundle;
 import org.tensorflow.Session;
 import org.tensorflow.Tensor;
@@ -57,27 +55,7 @@ public class SequenceTagging implements TokenNameFinder, AutoCloseable {
     wordIndexer = new WordIndexer(vocabWords, vocabChars);
     indexTagger = new IndexTagger(vocabTags);
 
-    Path tmpDir = Files.createTempDirectory("opennlp2_namefinder");
-
-    // Unzip the model to a temp directory
-    ZipInputStream zis = new ZipInputStream(modelZipPackage);
-    ZipEntry zipEntry = zis.getNextEntry();
-    while(zipEntry != null){
-      Path newFile = tmpDir.resolve(zipEntry.getName());
-
-      if (zipEntry.isDirectory()) {
-        Files.createDirectories(newFile);
-      }
-      else {
-        Files.copy(zis, newFile);
-        // This is a bit of hack, but should work fine for now ...
-        newFile.toFile().deleteOnExit();
-      }
-
-      zipEntry = zis.getNextEntry();
-    }
-    zis.closeEntry();
-    zis.close();
+    Path tmpDir = ModelUtil.writeModelToTmpDir(modelZipPackage);
 
     model = SavedModelBundle.load(tmpDir.toString(), "serve");
     session = model.session();
diff --git a/tf-ner-poc/src/main/python/namecat/namecat.py b/tf-ner-poc/src/main/python/namecat/namecat.py
index 2205756..c71a7b1 100644
--- a/tf-ner-poc/src/main/python/namecat/namecat.py
+++ b/tf-ner-poc/src/main/python/namecat/namecat.py
@@ -22,6 +22,7 @@ import tensorflow as tf
 import sys
 from math import floor
 import numpy as np
+import random
 
 def load_data(file):
     with open(file) as f:
@@ -35,6 +36,9 @@ def load_data(file):
 
 # create placeholders
 def create_placeholders():
+
+    dropout_keep_prob = tf.placeholder(tf.float32, name="dropout_keep_prop")
+
     # shape is batch_size, and length of name
     char_ids_ph = tf.placeholder(tf.int32, shape=[None, None], name="char_ids")
 
@@ -43,9 +47,10 @@ def create_placeholders():
 
     # shape is batch_size
     y_ph = tf.placeholder(tf.int32, shape=[None], name="y")
-    return char_ids_ph, name_lengths_ph, y_ph
+    return dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph
+
+def create_graph(dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph, nchars, nclasses):
 
-def create_graph(char_ids_ph, name_lengths_ph, y_ph, nchars, nclasses):
 
     dim_char = 100
 
@@ -54,7 +59,9 @@ def create_graph(char_ids_ph, name_lengths_ph, y_ph, nchars, nclasses):
 
     char_embeddings = tf.nn.embedding_lookup(K, char_ids_ph)
 
-    char_hidden_size = 100
+    char_embeddings = tf.nn.dropout(char_embeddings, dropout_keep_prob)
+
+    char_hidden_size = 256
     cell_fw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
     cell_bw = tf.contrib.rnn.LSTMCell(char_hidden_size, state_is_tuple=True)
 
@@ -66,6 +73,8 @@ def create_graph(char_ids_ph, name_lengths_ph, y_ph, nchars, nclasses):
 
     output = tf.concat([output_fw, output_bw], axis=-1)
 
+    output = tf.nn.dropout(output, dropout_keep_prob)
+
     W = tf.get_variable("W", shape=[2*char_hidden_size, nclasses])
     b = tf.get_variable("b", shape=[nclasses])
     logits = tf.nn.xw_plus_b(output, W, b, name="logits")
@@ -76,7 +85,9 @@ def create_graph(char_ids_ph, name_lengths_ph, y_ph, nchars, nclasses):
 
     loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=y_ph)
     mean_loss = tf.reduce_mean(loss)
+
     train_op = tf.train.AdamOptimizer().minimize(loss)
+    #train_op = tf.train.RMSPropOptimizer(learning_rate=0.001).minimize(loss)
 
     return train_op, norm_probs
 
@@ -107,6 +118,12 @@ def mini_batch(label_dict, char_dict, labels, names, batch_size, batch_index):
 
     return label_batch, np.asarray(name_batch), name_length
 
+def write_mapping(tags, output_filename):
+    with open(output_filename, 'w', encoding='utf-8') as f:
+        for i, tag in enumerate(tags):
+            f.write(tag)
+            f.write("\n")
+
 def main():
 
     if len(sys.argv) != 4:
@@ -131,9 +148,12 @@ def main():
 
     char_dict = {k: v for v, k in enumerate(char_set)}
 
-    char_ids_ph, name_lengths_ph, y_ph = create_placeholders()
+    write_mapping(label_dict, "label_dict.txt")
+    write_mapping(char_dict, "char_dict.txt")
 
-    train_op, probs_op = create_graph(char_ids_ph, name_lengths_ph, y_ph, len(char_set), len(label_dict))
+    dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph = create_placeholders()
+
+    train_op, probs_op = create_graph(dropout_keep_prob, char_ids_ph, name_lengths_ph, y_ph, len(char_set), len(label_dict))
 
     sess = tf.Session(config=tf.ConfigProto(allow_soft_placement=True,
                                             log_device_placement=True))
@@ -143,14 +163,18 @@ def main():
         sess.run(init)
 
         batch_size = 20
-        for epoch in range(100):
+        for epoch in range(10):
             print("Epoch " + str(epoch))
             acc_train = []
-            for batch_index in range(floor(len(names_train) / batch_size)):
+
+            batch_indexes = list(range(floor(len(names_train) / batch_size)))
+            random.Random(epoch).shuffle(batch_indexes)
+
+            for batch_index in batch_indexes:
                 label_train_batch, name_train_batch, name_train_length = \
                     mini_batch(label_dict, char_dict, labels_train, names_train, batch_size, batch_index)
 
-                feed_dict = {char_ids_ph: name_train_batch, name_lengths_ph: name_train_length, y_ph: label_train_batch}
+                feed_dict = {dropout_keep_prob: 0.5, char_ids_ph: name_train_batch, name_lengths_ph: name_train_length, y_ph: label_train_batch}
                 _, probs = sess.run([train_op, probs_op], feed_dict)
 
                 acc_train.append((batch_size  - np.sum(np.abs(label_train_batch - np.argmax(probs, axis=1)))) / batch_size)
@@ -162,14 +186,29 @@ def main():
                 label_dev_batch, name_dev_batch, name_dev_length = \
                     mini_batch(label_dict, char_dict, labels_dev, names_dev, batch_size, batch_index)
 
-                feed_dict = {char_ids_ph: name_dev_batch, name_lengths_ph: name_dev_length, y_ph: label_dev_batch}
+                feed_dict = {dropout_keep_prob: 1, char_ids_ph: name_dev_batch, name_lengths_ph: name_dev_length, y_ph: label_dev_batch}
                 probs = sess.run(probs_op, feed_dict)
 
                 acc_dev.append((batch_size  - np.sum(np.abs(label_dev_batch - np.argmax(probs, axis=1)))) / batch_size)
 
             print("Dev acc: " + str(np.mean(acc_dev)))
 
-    # Add code to save the model, and resource files ....
+        #acc_test = []
+        #for batch_index in range(floor(len(names_test) / batch_size)):
+        #    label_test_batch, name_test_batch, name_test_length = \
+        #        mini_batch(label_dict, char_dict, labels_test, names_test, batch_size, batch_index)
+
+        #    feed_dict = {char_ids_ph: name_test_batch, name_lengths_ph: name_test_length, y_ph: label_test_batch}
+        #    probs = sess.run(probs_op, feed_dict)
+
+        #    acc_test.append((batch_size  - np.sum(np.abs(label_test_batch - np.argmax(probs, axis=1)))) / batch_size)
+
+        #print("Test acc: " + str(np.mean(acc_test)))
+
+        saver = tf.train.Saver()
+        builder = tf.saved_model.builder.SavedModelBuilder("./namecat_model" + str(epoch))
+        builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING])
+        builder.save()
 
 if __name__ == "__main__":
     main()