Posted to commits@opennlp.apache.org by jo...@apache.org on 2018/11/21 15:17:45 UTC

[opennlp-sandbox] branch master updated: Add first draft of normalizer Java API

This is an automated email from the ASF dual-hosted git repository.

joern pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/opennlp-sandbox.git


The following commit(s) were added to refs/heads/master by this push:
     new 30067a7  Add first draft of normalizer Java API
30067a7 is described below

commit 30067a7a5b8bd69d376032b338d7986f3ee2116c
Author: Jörn Kottmann <jo...@apache.org>
AuthorDate: Wed Nov 21 16:15:33 2018 +0100

    Add first draft of normalizer Java API
---
 tf-ner-poc/pom.xml                                 |   3 +-
 .../org/apache/opennlp/normalizer/Normalizer.java  | 119 +++++++++++++++++++++
 .../src/main/python/normalizer/normalizer.py       | 103 +++++++++---------
 3 files changed, 175 insertions(+), 50 deletions(-)

diff --git a/tf-ner-poc/pom.xml b/tf-ner-poc/pom.xml
index 45bdce0..8042da9 100644
--- a/tf-ner-poc/pom.xml
+++ b/tf-ner-poc/pom.xml
@@ -9,7 +9,7 @@
     <version>1.0-SNAPSHOT</version>
 
     <properties>
-        <tensorflow.version>1.7.0</tensorflow.version>
+        <tensorflow.version>1.12.0</tensorflow.version>
     </properties>
 
     <dependencies>
@@ -43,6 +43,7 @@
                     <target>1.8</target>
                 </configuration>
             </plugin>
+
         </plugins>
     </build>
 
diff --git a/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java b/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java
new file mode 100644
index 0000000..2ad4809
--- /dev/null
+++ b/tf-ner-poc/src/main/java/org/apache/opennlp/normalizer/Normalizer.java
@@ -0,0 +1,119 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.opennlp.normalizer;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.nio.file.Path;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.stream.Collectors;
+
+import org.apache.opennlp.ModelUtil;
+import org.tensorflow.SavedModelBundle;
+import org.tensorflow.Session;
+import org.tensorflow.Tensor;
+
+public class Normalizer {
+
+  private final Session session;
+  private final Map<Character, Integer> sourceCharMap;
+  private final Map<Integer, Character> targetCharMap;
+
+  Normalizer(InputStream sourceCharMapIn, InputStream targetCharMapIn,
+             InputStream modelZipPackage) throws IOException {
+
+    Path tmpModelPath = ModelUtil.writeModelToTmpDir(modelZipPackage);
+
+    SavedModelBundle model = SavedModelBundle.load(tmpModelPath.toString(), "serve");
+    session = model.session();
+
+    sourceCharMap = loadCharMap(sourceCharMapIn).entrySet()
+        .stream()
+        .collect(Collectors.toMap(Map.Entry::getValue, c -> c.getKey()));
+
+    targetCharMap = loadCharMap(targetCharMapIn);
+  }
+
+  private static Map<Integer, Character> loadCharMap(InputStream in) throws IOException {
+    try(BufferedReader reader = new BufferedReader(
+        new InputStreamReader(in, StandardCharsets.UTF_8))) {
+      Map<Integer, Character> characterMap = new HashMap<>();
+
+      String tag;
+      while ((tag = reader.readLine()) != null) {
+        characterMap.put(characterMap.size(), tag.charAt(0));
+      }
+
+      return Collections.unmodifiableMap(characterMap);
+    }
+  }
+
+  public String[] normalize(String[] texts) {
+
+    // TODO: Batch size is hard coded in the graph, make it dynamic or add padding here
+
+    int textLengths[] = Arrays.stream(texts).mapToInt(String::length).toArray();
+    int maxLength = Arrays.stream(textLengths).max().getAsInt();
+
+    int charIds[][] = new int[texts.length][maxLength];
+
+    for (int textIndex = 0; textIndex < texts.length; textIndex++) {
+      for (int charIndex = 0; charIndex < texts[textIndex].length(); charIndex++) {
+        charIds[textIndex][charIndex] = sourceCharMap.get(texts[textIndex].charAt(charIndex));
+      }
+
+      textLengths[textIndex] = texts[textIndex].length();
+    }
+
+    try (Tensor<?> charTensor = Tensor.create(charIds);
+         Tensor<?> textLength = Tensor.create(textLengths)) {
+
+      List<Tensor<?>> result = session.runner()
+          .feed("encoder_char_ids", charTensor)
+          .feed("encoder_lengths", textLength)
+          .fetch("decode", 0).run();
+
+      try (Tensor<?> translationTensor = result.get(0)) {
+        // TODO: This can't be hard coded ... the normalized form doesn't need to have a static length
+        int[][] translations =
+            translationTensor.copyTo(new int[texts.length][9]); // shape is (20, 9) in eval py code
+
+        List<String> normalizedTexts = new ArrayList<>();
+
+        for (int ti = 0; ti < translations.length; ti++) {
+          StringBuilder normalizedText = new StringBuilder();
+          for (int ci = 0; ci < translations[ti].length; ci++) {
+            normalizedText.append(targetCharMap.get(translations[ti][ci]));
+          }
+
+          normalizedTexts.add(normalizedText.toString());
+        }
+
+        return normalizedTexts.toArray(new String[normalizedTexts.size()]);
+      }
+    }
+  }
+}
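
For orientation, here is a minimal usage sketch of the new API; it is not part of this commit. The file names and the zipped-model packaging are assumptions, and since the constructor is package-private the caller is assumed to sit in the same package.

package org.apache.opennlp.normalizer;

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

// Usage sketch only, not part of this commit. File names are assumptions;
// the third stream is assumed to be a zip of the exported SavedModel.
public class NormalizerUsageSketch {

  public static void main(String[] args) throws IOException {
    try (InputStream sourceChars = new FileInputStream("source_char_dict.txt");
         InputStream targetChars = new FileInputStream("target_char_dict.txt");
         InputStream modelZip = new FileInputStream("normalizer_model.zip")) {

      Normalizer normalizer = new Normalizer(sourceChars, targetChars, modelZip);

      // The batch size is still fixed in the graph (see the TODO above), so in
      // practice the input array is expected to match that size, e.g. 20 texts.
      String[] normalized = normalizer.normalize(new String[] {"Nov 21 2018"});

      for (String text : normalized) {
        System.out.println(text);
      }
    }
  }
}

Note that normalize() looks each input character up in sourceCharMap, so a character that was not present in the training char map would currently trigger a NullPointerException when the returned Integer is unboxed.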
diff --git a/tf-ner-poc/src/main/python/normalizer/normalizer.py b/tf-ner-poc/src/main/python/normalizer/normalizer.py
index 18e9338..86e735e 100644
--- a/tf-ner-poc/src/main/python/normalizer/normalizer.py
+++ b/tf-ner-poc/src/main/python/normalizer/normalizer.py
@@ -161,16 +161,6 @@ def create_graph(mode, batch_size, encoder_nchars, max_target_length, decoder_nc
         gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
         optimize = optimizer.apply_gradients(zip(gradients, v))
 
-        # decoder is here ...
-        #helperE = tf.contrib.seq2seq.GreedyEmbeddingHelper(
-        #    decoder_embedding_weights,
-        #    tf.fill([batch_size], decoder_nchars-2), decoder_nchars-1)
-        #decoderE = tf.contrib.seq2seq.BasicDecoder(
-        #    decoder_cell, helperE, encoder_state,
-        #    output_layer=projection_layer)
-        #outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoderE, maximum_iterations=15, output_time_major=True)
-
-
         return encoder_char_ids_ph, encoder_lengths_ph, decoder_char_ids_ph, decoder_lengths, optimize, train_prediction, outputs
 
     if "EVAL" == mode:
@@ -182,10 +172,11 @@ def create_graph(mode, batch_size, encoder_nchars, max_target_length, decoder_nc
             output_layer=projection_layer)
         outputs, _, _ = tf.contrib.seq2seq.dynamic_decode(decoderE, maximum_iterations=15)
 
-        translations = outputs.sample_id
+
+        translations = tf.identity(outputs.sample_id, name="decode")
 
         # the outputs don't decode anything ...
-        return encoder_char_ids_ph, encoder_lengths_ph, outputs
+        return encoder_char_ids_ph, encoder_lengths_ph, translations
 
 def encode_chars(names):
     char_set = set()
@@ -193,6 +184,12 @@ def encode_chars(names):
         char_set = char_set.union(name)
     return {k: v for v, k in enumerate(char_set)}
 
+# TODO: Deduplicate this, same as in namefinder.py
+def write_mapping(tags, output_filename):
+    with open(output_filename, 'w', encoding='utf-8') as f:
+        for i, tag in enumerate(tags):
+            f.write('{}\n'.format(tag))
+
 def main():
 
     checkpoints_path = "/tmp/model/checkpoints"
@@ -202,12 +199,17 @@ def main():
     source_test, target_test = load_data("date_test.txt")
 
     source_char_dict = encode_chars(source_train + source_dev + source_test)
+
+    write_mapping(source_char_dict, 'source_char_dict.txt')
+
     target_char_dict = encode_chars(target_train + target_dev + target_test)
 
     # TODO: Find better chars for begin and end markers
     target_char_dict['S'] = len(target_char_dict)
     target_char_dict['E'] = len(target_char_dict)
 
+    write_mapping(target_char_dict, 'target_char_dict.txt')
+
     target_dict_rev = {v: k for k, v in target_char_dict.items()}
 
     batch_size = 20
@@ -230,58 +232,61 @@ def main():
             create_graph("EVAL", batch_size, len(source_char_dict), target_max_len, len(target_char_dict))
         eval_saver = tf.train.Saver()
 
-    eval_sess = tf.Session(graph=eval_graph)
+        eval_sess = tf.Session(graph=eval_graph)
 
-
-    for epoch in range(200):
+    for epoch in range(30):
         print("Epoch " + str(epoch))
 
-        for batch_index in range(floor(len(source_train) / batch_size)):
-            if batch_index > 0 and batch_index % 100 == 0:
-                print("batch_index " + str(batch_index))
-
-            target_batch, target_length, source_batch, source_length = \
-                mini_batch(target_char_dict, target_train, source_char_dict, source_train, batch_size, batch_index)
+        with train_graph.as_default():
+            for batch_index in range(floor(len(source_train) / batch_size)):
+                if batch_index > 0 and batch_index % 100 == 0:
+                    print("batch_index " + str(batch_index))
 
-            feed_dict = {t_encoder_lengths_ph: source_length, t_encoder_char_ids_ph: source_batch,
-                         t_decoder_lengths: target_length, t_decoder_char_ids_ph: target_batch}
+                target_batch, target_length, source_batch, source_length = \
+                    mini_batch(target_char_dict, target_train, source_char_dict, source_train, batch_size, batch_index)
 
-            t1, dec1 = train_sess.run([t_adam_optimize, t_dec_out], feed_dict)
-            dec2 = train_sess.run([t_dec_out], feed_dict)
-            tv=1
+                feed_dict = {t_encoder_lengths_ph: source_length, t_encoder_char_ids_ph: source_batch,
+                             t_decoder_lengths: target_length, t_decoder_char_ids_ph: target_batch}
 
-        # Save train model, and restore it into the eval session
-        checkpoint_path = train_saver.save(train_sess, checkpoints_path, global_step=epoch)
-        eval_saver.restore(eval_sess, checkpoint_path)
+                t1, dec1 = train_sess.run([t_adam_optimize, t_dec_out], feed_dict)
+                dec2 = train_sess.run([t_dec_out], feed_dict)
+                tv=1
 
-        count_correct = 0
-        for batch_index in range(floor(len(source_dev) / batch_size)):
-            target_batch, target_length, source_batch, source_length = \
-                mini_batch(target_char_dict, target_dev, source_char_dict, source_dev, batch_size, batch_index)
+            # Save train model, and restore it into the eval session
+            checkpoint_path = train_saver.save(train_sess, checkpoints_path, global_step=epoch)
+            eval_saver.restore(eval_sess, checkpoint_path)
 
-            begin = batch_index
-            end = min(batch_index + batch_size, len(source_dev))
-            target_strings = target_dev[begin:end]
+        with eval_graph.as_default():
+            count_correct = 0
+            for batch_index in range(floor(len(source_dev) / batch_size)):
+                target_batch, target_length, source_batch, source_length = \
+                    mini_batch(target_char_dict, target_dev, source_char_dict, source_dev, batch_size, batch_index)
 
+                begin = batch_index
+                end = min(batch_index + batch_size, len(source_dev))
+                target_strings = target_dev[begin:end]
 
-            feed_dict = {e_encoder_lengths_ph: source_length, e_encoder_char_ids_ph: source_batch}
-            result = eval_sess.run(e_dec_out, feed_dict)
+                feed_dict = {e_encoder_lengths_ph: source_length, e_encoder_char_ids_ph: source_batch}
+                result = eval_sess.run(e_dec_out, feed_dict)
 
+                decoded_dates = []
 
-            decoded_dates = []
+                for coded_date in result:
+                    date = ""
+                    for char_id in coded_date:
+                        if not char_id == len(target_char_dict) - 1:
+                            date = date + (target_dict_rev[char_id])
+                    decoded_dates.append(date)
 
-            for coded_date in result.sample_id:
-                date = ""
-                for char_id in coded_date:
-                    if not char_id == len(target_char_dict) - 1:
-                        date = date + (target_dict_rev[char_id])
-                decoded_dates.append(date)
+                for i in range(len(target_strings)):
+                    if target_strings[i] == decoded_dates[i]:
+                        count_correct = count_correct + 1
 
-            for i in range(len(target_strings)):
-                if target_strings[i] == decoded_dates[i]:
-                    count_correct = count_correct + 1
+            print("Dev: " + str(count_correct / len(target_dev)))
 
-        print("Dev: " + str(count_correct / len(target_dev)))
+            builder = tf.saved_model.builder.SavedModelBuilder("./normalizer_model" + str(epoch))
+            builder.add_meta_graph_and_variables(eval_sess, [tf.saved_model.tag_constants.SERVING])
+            builder.save()
 
 if __name__ == "__main__":
     main()
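
As a companion note, the Python side now writes the two char-map files via write_mapping and exports a plain SavedModel directory per epoch, while the Java constructor receives the model as a single stream and hands it to ModelUtil.writeModelToTmpDir, which suggests the SavedModel directory has to be zipped first. A rough packaging sketch under that assumption (directory name, zip layout and output path are all assumptions, not part of this commit):

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.stream.Collectors;
import java.util.stream.Stream;
import java.util.zip.ZipEntry;
import java.util.zip.ZipOutputStream;

// Sketch: zip the SavedModel directory written by normalizer.py (assumed to be
// ./normalizer_model29 after the last epoch) so it can be passed to the
// Normalizer constructor as one InputStream.
public class PackageNormalizerModelSketch {

  public static void main(String[] args) throws IOException {
    Path modelDir = Paths.get("normalizer_model29");
    Path zipFile = Paths.get("normalizer_model.zip");

    try (ZipOutputStream zipOut = new ZipOutputStream(Files.newOutputStream(zipFile));
         Stream<Path> walk = Files.walk(modelDir)) {

      List<Path> files = walk.filter(Files::isRegularFile).collect(Collectors.toList());

      for (Path file : files) {
        // Entries are stored relative to the SavedModel root so that
        // saved_model.pb and variables/ sit at the top of the archive.
        String entryName = modelDir.relativize(file).toString().replace('\\', '/');
        zipOut.putNextEntry(new ZipEntry(entryName));
        Files.copy(file, zipOut);
        zipOut.closeEntry();
      }
    }
  }
}

The char-map files need no packaging: write_mapping emits one character per line in id order, and Normalizer.loadCharMap reads them back by assigning each character the index of its line.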