You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/11/17 18:25:08 UTC
[3/8] incubator-joshua git commit: added saving of custom grammar
added saving of custom grammar
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/2229d1b9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/2229d1b9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/2229d1b9
Branch: refs/heads/save_custom_grammars
Commit: 2229d1b9f4ed167507d721ff2cfe87ccfadf28a3
Parents: 81baa65
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Nov 16 20:57:53 2016 -0500
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Thu Nov 17 13:24:07 2016 -0500
----------------------------------------------------------------------
.../java/org/apache/joshua/decoder/Decoder.java | 9 +++
.../apache/joshua/decoder/ff/tm/Grammar.java | 5 ++
.../tm/hash_based/MemoryBasedBatchGrammar.java | 73 +++++++++++++++++++-
.../decoder/ff/tm/packed/PackedGrammar.java | 5 ++
.../joshua/decoder/phrase/PhraseTable.java | 10 +++
5 files changed, 99 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/Decoder.java b/src/main/java/org/apache/joshua/decoder/Decoder.java
index c15898c..ca2bfaa 100644
--- a/src/main/java/org/apache/joshua/decoder/Decoder.java
+++ b/src/main/java/org/apache/joshua/decoder/Decoder.java
@@ -673,8 +673,17 @@ public class Decoder {
* @param rule the rule to add
*/
public void addCustomRule(Rule rule) {
+ if (customPhraseTable == null) {
+ LOG.warn("No custom grammar was found in the config file; can't add rule");
+ LOG.warn("Add the following line to your config and restart Joshua to enable it:");
+ LOG.warn(" tm = phrase -owner custom -maxspan 20 -path /path/to/custom.grammar");
+ LOG.warn("The owner must be 'custom'");
+ return; // no table to add to: falling through would dereference null below
+ }
customPhraseTable.addRule(rule);
rule.estimateRuleCost(featureFunctions);
+
+ customPhraseTable.save();
}
public Grammar getCustomPhraseTable() {
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
index 8f90d1b..27d3862 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
@@ -117,4 +117,9 @@ public interface Grammar {
* @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule}
*/
void addRule(Rule rule);
+
+ /**
+ * Write the grammar out to some permanent location (disk, database, etc); may throw if unsupported.
+ */
+ void save();
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
index ebfa996..f53affa 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@ -18,7 +18,16 @@
*/
package org.apache.joshua.decoder.ff.tm.hash_based;
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileWriter;
import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.UnsupportedEncodingException;
+import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
@@ -69,15 +78,27 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
private GrammarReader<Rule> modelReader;
/**
- * Constructor used by Decoder mostly. Default spanLimit of 20
+ * Constructor used by Decoder mostly; delegates to the file-based constructor with a null file.
* @param owner the associated decoder-wide {@link org.apache.joshua.decoder.ff.tm.OwnerMap}
* @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
* @param spanLimit the maximum span of the input grammar rule(s) can be applied to.
*/
public MemoryBasedBatchGrammar(String owner, JoshuaConfiguration config, int spanLimit) {
+ this(null, owner, config, spanLimit);
+ }
+
+ /**
+ * Constructor used by Decoder for creating custom grammars.
+ *
+ * @param file the file to load the grammar from (NOTE(review): not read or stored in this body; confirm)
+ * @param owner the associated decoder-wide {@link org.apache.joshua.decoder.ff.tm.OwnerMap}
+ * @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
+ * @param spanLimit the maximum span of the input grammar rule(s) can be applied to.
+ */
+ public MemoryBasedBatchGrammar(String file, String owner, JoshuaConfiguration config, int spanLimit) {
super(owner, config, spanLimit);
}
-
+
/**
* Constructor to initialize a GrammarReader (unowned)
* @param reader the GrammarReader used for storing ASCII line-based grammars on disk.
@@ -99,7 +120,11 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
this.grammarFile = grammarFile;
// ==== loading grammar
- this.modelReader = createReader(formatKeyword, grammarFile);
+ try {
+ this.modelReader = createReader(formatKeyword, grammarFile);
+ } catch (IOException e) {
+ LOG.warn("Couldn't load a '{}' type grammar from file '{}'", formatKeyword, grammarFile);
+ }
if (modelReader != null) {
for (Rule rule : modelReader)
if (rule != null) {
@@ -235,6 +260,48 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
}
/**
+ * Saves the grammar to {@code grammarFile} as UTF-8 text, one rule per line. Rules are
+ * visited breadth-first from the trie root. I/O failures are logged, never thrown.
+ */
+ @Override
+ public void save() {
+ if (grammarFile == null) {
+ // Grammars built without a backing file have nowhere to be written.
+ LOG.warn("Can't save grammar: no grammar file is set");
+ return;
+ }
+
+ LOG.info("Saving custom grammar to file '{}'", grammarFile);
+
+ // try-with-resources flushes and closes the writer on every exit path.
+ try (BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
+ new FileOutputStream(grammarFile), StandardCharsets.UTF_8))) {
+ // Worklist of trie nodes; walking it by index avoids the O(n) shifting of remove(0).
+ List<Trie> nodes = new ArrayList<Trie>();
+ nodes.add(root);
+ for (int i = 0; i < nodes.size(); i++) {
+ Trie trie = nodes.get(i);
+ // find all rules at the current node, print them
+ if (trie.hasRules()) {
+ for (Rule rule : trie.getRuleCollection().getRules()) {
+ String text = rule.textFormat();
+ LOG.info(" rule: {}", text);
+ out.write(text);
+ out.write('\n');
+ }
+ }
+
+ // graph is acyclical so we shouldn't have to check for having visited
+ if (trie.hasExtensions()) {
+ nodes.addAll(trie.getExtensions());
+ }
+ }
+ } catch (IOException e) {
+ LOG.error("Couldn't save grammar to file '{}'", grammarFile, e);
+ }
+ }
+
+ /**
* Adds a default set of glue rules.
*
* @param featureFunctions an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
index 914bdd2..c1ee160 100644
--- a/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
+++ b/src/main/java/org/apache/joshua/decoder/ff/tm/packed/PackedGrammar.java
@@ -1021,6 +1021,11 @@ public class PackedGrammar extends AbstractGrammar {
public void addRule(Rule rule) {
throw new RuntimeException("PackedGrammar.addRule(): I can't add rules");
}
+
+ @Override
+ public void save() {
+ throw new RuntimeException("PackedGrammar.save(): I can't be saved"); // unconditional: saving is not implemented here
+ }
/**
* Read the config file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/2229d1b9/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java b/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java
index de11f5f..74d2a8f 100644
--- a/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java
+++ b/src/main/java/org/apache/joshua/decoder/phrase/PhraseTable.java
@@ -19,7 +19,9 @@
package org.apache.joshua.decoder.phrase;
import java.io.File;
+import java.io.FileNotFoundException;
import java.io.IOException;
+import java.io.UnsupportedEncodingException;
import java.util.List;
import org.apache.joshua.corpus.Vocabulary;
@@ -117,6 +119,14 @@ public class PhraseTable implements Grammar {
backend.addRule(rule);
}
+ /**
+ * Saves the grammar to disk by delegating to the backend. Only supported when the backend is a MemoryBasedBatchGrammar.
+ */
+ @Override
+ public void save() {
+ backend.save();
+ }
+
@Override
public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
// TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now almost