You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/11/17 18:25:08 UTC

[3/8] incubator-joshua git commit: added saving of custom grammar

added saving of custom grammar


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/2229d1b9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/2229d1b9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/2229d1b9

Branch: refs/heads/save_custom_grammars
Commit: 2229d1b9f4ed167507d721ff2cfe87ccfadf28a3
Parents: 81baa65
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Nov 16 20:57:53 2016 -0500
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu Nov 17 13:24:07 2016 -0500

----------------------------------------------------------------------
 .../java/org/apache/joshua/decoder/Decoder.java |  9 +++
 .../apache/joshua/decoder/ff/tm/Grammar.java    |  5 ++
 .../tm/hash_based/MemoryBasedBatchGrammar.java  | 73 +++++++++++++++++++-
 .../decoder/ff/tm/packed/PackedGrammar.java     |  5 ++
 .../joshua/decoder/phrase/PhraseTable.java      | 10 +++
 5 files changed, 99 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/Decoder.java b/src/main/java/org/apache/joshua/decoder/Decoder.java
index c15898c..ca2bfaa 100644
--- a/src/main/java/org/apache/joshua/decoder/Decoder.java
+++ b/src/main/java/org/apache/joshua/decoder/Decoder.java
@@ -673,8 +673,17 @@ public class Decoder {
    * @param rule the rule to add
    */
   public void addCustomRule(Rule rule) {
+    if (customPhraseTable == null) {
+      LOG.warn("No custom grammar was found in the config file; can't add rule");
+      LOG.warn("Add the following line to your config and restart Joshua to enable it:");
+      LOG.warn("  tm = phrase -owner custom -maxspan 20 -path /path/to/custom.grammar");
+      LOG.warn("The owner must be 'custom'");
+      return; // nothing to add the rule to; continuing would throw a NullPointerException
+    }
     customPhraseTable.addRule(rule);
     rule.estimateRuleCost(featureFunctions);
+    // write the custom grammar back out after every added rule
+    customPhraseTable.save();
   }
 
   public Grammar getCustomPhraseTable() {

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
index 8f90d1b..27d3862 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
@@ -117,4 +117,9 @@ public interface Grammar {
    * @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule}
    */
   void addRule(Rule rule);
+  
+  /**
+   * Writes the grammar to a permanent location (disk, database, etc.); may throw if unsupported.
+   */
+  void save();
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
index ebfa996..f53affa 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@ -18,7 +18,16 @@
  */
 package org.apache.joshua.decoder.ff.tm.hash_based;
 
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
 import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
@@ -69,15 +78,27 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
   private GrammarReader<Rule> modelReader;
 
   /**
-   * Constructor used by Decoder mostly. Default spanLimit of 20
+   * Constructor used by Decoder mostly.
    * @param owner the associated decoder-wide {@link org.apache.joshua.decoder.ff.tm.OwnerMap}
    * @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
    * @param spanLimit the maximum span of the input grammar rule(s) can be applied to.
    */
   public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration config, int spanLimit) {
+    this(null, owner, config, spanLimit); // null file: no grammar file is associated
+  }
+  
+  /**
+   * Constructor used by Decoder for creating custom grammars. Nothing is read from disk here.
+   * @param file path recorded as the grammar file that {@link #save()} writes to; may be null
+   * @param owner the associated decoder-wide {@link org.apache.joshua.decoder.ff.tm.OwnerMap}
+   * @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
+   * @param spanLimit the maximum span of the input grammar rule(s) can be applied to.
+   */
+  public MemoryBasedBatchGrammar(String file, String owner, JoshuaConfiguration config, int spanLimit) {
     super(owner, config, spanLimit);
+    this.grammarFile = file;
   }
-
+  
   /**
    * Constructor to initialize a GrammarReader (unowned)
    * @param reader the GrammarReader used for storing ASCII line-based grammars on disk.
@@ -99,7 +120,11 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
     this.grammarFile = grammarFile;
 
     // ==== loading grammar
-    this.modelReader = createReader(formatKeyword, grammarFile);
+    try {
+      this.modelReader = createReader(formatKeyword, grammarFile);
+    } catch (IOException e) {
+      LOG.warn("Couldn't load a '{}' type grammar from file '{}'", formatKeyword, grammarFile);
+    }
     if (modelReader != null) {
       for (Rule rule : modelReader)
         if (rule != null) {
@@ -235,6 +260,48 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
   }
 
   /**
+   * Writes every rule in the grammar to {@code grammarFile} as UTF-8 text, one rule per
+   * line in {@code Rule.textFormat()} form, replacing any previous contents of the file.
+   * Saving is best-effort: a missing file name or an I/O failure is logged and the method
+   * returns normally rather than throwing.
+   */
+  @Override
+  public void save() {
+    if (grammarFile == null) {
+      LOG.warn("Can't save grammar: no grammar file was specified");
+      return;
+    }
+    LOG.info("Saving custom grammar to file '{}'", grammarFile);
+
+    // try-with-resources flushes and closes the writer even if a write fails part-way
+    try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
+        new FileOutputStream(grammarFile), StandardCharsets.UTF_8))) {
+      // Breadth-first walk of the trie; the list is read by index so that it can act as
+      // the work queue without the O(n) shifting cost of ArrayList.remove(0).
+      List<Trie> nodes = new ArrayList<Trie>();
+      nodes.add(root);
+      for (int i = 0; i < nodes.size(); i++) {
+        Trie trie = nodes.get(i);
+        if (trie.hasRules()) {
+          for (Rule rule : trie.getRuleCollection().getRules()) {
+            String line = rule.textFormat();
+            LOG.info("  rule: {}", line);
+            out.write(line + "\n");
+          }
+        }
+
+        // the trie is acyclic, so there is no need to track visited nodes
+        if (trie.hasExtensions()) {
+          nodes.addAll(trie.getExtensions());
+        }
+      }
+    } catch (IOException e) {
+      // save() declares no checked exceptions, so report the failure through the log
+      LOG.error("Couldn't save custom grammar to file '{}'", grammarFile, e);
+    }
+  }
+
+  /**
    * Adds a default set of glue rules.
    * 
    * @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
index 914bdd2..c1ee160 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
@@ -1021,6 +1021,11 @@ public class PackedGrammar extends AbstractGrammar {
   public void addRule(Rule rule) {
     throw new RuntimeException("PackedGrammar.addRule(): I can't add rules");
   }
+  
+  @Override
+  public void save() { // always throws: this grammar type does not support being saved
+    throw new RuntimeException("PackedGrammar.save(): I can't be saved");
+  }
 
   /**
    * Read the config file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java b/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java
index de11f5f..74d2a8f 100644
--- a/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java
+++ b/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java
@@ -19,7 +19,9 @@
 package org.apache.joshua.decoder.phrase;
 
 import java.io.File;
+import java.io.FileNotFoundException;
 import java.io.IOException;
+import java.io.UnsupportedEncodingException;
 import java.util.List;
 
 import org.apache.joshua.corpus.Vocabulary;
@@ -117,6 +119,14 @@ public class PhraseTable implements Grammar {
     backend.addRule(rule);
   }
   
+  /**
+   * Saves the grammar to disk. Only supported when the backend is a MemoryBasedBatchGrammar.
+   */
+  @Override
+  public void save() {
+    backend.save(); // pure delegation: any exception thrown by the backend propagates
+  }
+  
   @Override
   public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
     // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now almost