You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/27 00:34:23 UTC
[17/32] incubator-joshua git commit: packer now writes out a version,
removed custom backend phrase-based checks
packer now writes out a version, removed custom backend phrase-based checks
- The packed grammar now writes out a version number. The current version is 3. PackedGrammar will throw a runtime exception if no version information is found. However, the format has only changed for phrase-based grammars; any currently packed grammar can be made compatible by adding the line
version = 3
to the packed grammar "config" file.
- Removed checks for which grammar backend is being used for phrase-based decoding. It's now totally generic! Very nice.
- Updated test cases to match.
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/366f4086
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/366f4086
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/366f4086
Branch: refs/heads/JOSHUA-252
Commit: 366f408672e2d29b69a78531b57056649629e978
Parents: 53a0fcf
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 05:47:48 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 05:47:48 2016 -0400
----------------------------------------------------------------------
src/joshua/decoder/ff/tm/GrammarReader.java | 1 -
.../decoder/ff/tm/format/MosesFormatReader.java | 8 ++---
.../decoder/ff/tm/packed/PackedGrammar.java | 33 ++++++++++++++---
src/joshua/decoder/phrase/PhraseTable.java | 13 +++----
src/joshua/decoder/phrase/Stacks.java | 2 +-
src/joshua/tools/GrammarPacker.java | 37 ++++++++++++++------
test/decoder/phrase/decode/rules.packed/config | 1 +
7 files changed, 63 insertions(+), 32 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/ff/tm/GrammarReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/GrammarReader.java b/src/joshua/decoder/ff/tm/GrammarReader.java
index 3b973a2..3432e53 100644
--- a/src/joshua/decoder/ff/tm/GrammarReader.java
+++ b/src/joshua/decoder/ff/tm/GrammarReader.java
@@ -23,7 +23,6 @@ import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
-import joshua.corpus.Vocabulary;
import joshua.decoder.Decoder;
import joshua.util.io.LineReader;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/format/MosesFormatReader.java b/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
index 7e43075..47a3e46 100644
--- a/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
+++ b/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
@@ -44,16 +44,14 @@ import joshua.util.io.LineReader;
public class MosesFormatReader extends HieroFormatReader {
- private int lhs;
-
public MosesFormatReader(String grammarFile) throws IOException {
super(grammarFile);
- this.lhs = Vocabulary.id("[X]");
+ Vocabulary.id("[X]");
}
public MosesFormatReader() {
super();
- this.lhs = Vocabulary.id("[X]");
+ Vocabulary.id("[X]");
}
/**
@@ -77,8 +75,6 @@ public class MosesFormatReader extends HieroFormatReader {
public Rule parseLine(String line) {
String[] fields = line.split(fieldDelimiter);
- int arity = 1;
-
StringBuffer hieroLine = new StringBuffer();
hieroLine.append("[X] ||| [X,1] " + fields[0] + " ||| [X,1] " + fields[1] + " |||");
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
index cc58578..e9f1a5c 100644
--- a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
+++ b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
@@ -91,6 +91,7 @@ import joshua.decoder.ff.tm.Rule;
import joshua.decoder.ff.tm.RuleCollection;
import joshua.decoder.ff.tm.Trie;
import joshua.decoder.ff.tm.hash_based.ExtensionIterator;
+import joshua.util.FormatUtils;
import joshua.util.encoding.EncoderConfiguration;
import joshua.util.encoding.FloatEncoder;
import joshua.util.io.LineReader;
@@ -109,19 +110,22 @@ public class PackedGrammar extends AbstractGrammar {
private final File vocabFile; // store path to vocabulary file
public static final String VOCABULARY_FILENAME = "vocabulary";
-
- // The grammar specification keyword (e.g., "thrax" or "moses")
- private String type;
+
+ // The version number of the earliest supported grammar packer
+ public static final int SUPPORTED_VERSION = 3;
// A rule cache for commonly used tries to avoid excess object allocations
// Testing shows there's up to ~95% hit rate when cache size is 5000 Trie nodes.
private final Cache<Trie, List<Rule>> cached_rules;
+ private String grammarDir;
+
public PackedGrammar(String grammar_dir, int span_limit, String owner, String type,
JoshuaConfiguration joshuaConfiguration) throws FileNotFoundException, IOException {
super(joshuaConfiguration);
+
+ this.grammarDir = grammar_dir;
this.spanLimit = span_limit;
- this.type = type;
// Read the vocabulary.
vocabFile = new File(grammar_dir + File.separator + VOCABULARY_FILENAME);
@@ -566,7 +570,7 @@ public class PackedGrammar extends AbstractGrammar {
System.arraycopy(parent_src, 0, src, 0, parent_src.length);
src[src.length - 1] = symbol;
arity = parent_arity;
- if (Vocabulary.nt(symbol))
+ if (FormatUtils.isNonterminal(symbol))
arity++;
}
@@ -941,11 +945,30 @@ public class PackedGrammar extends AbstractGrammar {
throw new RuntimeException("PackedGrammar.addRule(): I can't add rules");
}
+ /**
+ * Read the config file
+ *
+ * TODO: this should be rewritten using typeconfig.
+ *
+ * @param config
+ * @throws IOException
+ */
private void readConfig(String config) throws IOException {
+ int version = 0;
+
for (String line: new LineReader(config)) {
String[] tokens = line.split(" = ");
if (tokens[0].equals("max-source-len"))
this.maxSourcePhraseLength = Integer.parseInt(tokens[1]);
+ else if (tokens[0].equals("version")) {
+ version = Integer.parseInt(tokens[1]);
+ }
+ }
+
+ if (version != 3) {
+ String message = String.format("The grammar at %s was packed with packer version %d, but the earliest supported version is %d",
+ this.grammarDir, version, SUPPORTED_VERSION);
+ throw new RuntimeException(message);
}
}
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/phrase/PhraseTable.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/phrase/PhraseTable.java b/src/joshua/decoder/phrase/PhraseTable.java
index bcf7135..38b7ef4 100644
--- a/src/joshua/decoder/phrase/PhraseTable.java
+++ b/src/joshua/decoder/phrase/PhraseTable.java
@@ -77,18 +77,14 @@ public class PhraseTable implements Grammar {
}
/**
- * Returns the longest source phrase read. For {@link MemoryBasedBatchGrammar}s, we subtract 1
- * since the grammar includes the nonterminal. For {@link PackedGrammar}s, the value was either
- * in the packed config file (Joshua 6.0.2+) or was passed in via the TM config line.
+ * Returns the longest source phrase read. Because phrases have a dummy nonterminal prepended to
+ * them, we need to subtract 1.
*
* @return
*/
@Override
public int getMaxSourcePhraseLength() {
- if (backend instanceof MemoryBasedBatchGrammar)
- return this.backend.getMaxSourcePhraseLength() - 1;
- else
- return this.backend.getMaxSourcePhraseLength();
+ return this.backend.getMaxSourcePhraseLength() - 1;
}
/**
@@ -100,8 +96,7 @@ public class PhraseTable implements Grammar {
public RuleCollection getPhrases(int[] sourceWords) {
if (sourceWords.length != 0) {
Trie pointer = getTrieRoot();
- if (! (backend instanceof PackedGrammar))
- pointer = pointer.match(Vocabulary.id("[X]"));
+ pointer = pointer.match(Vocabulary.id("[X]"));
int i = 0;
while (pointer != null && i < sourceWords.length)
pointer = pointer.match(sourceWords[i++]);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/phrase/Stacks.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/phrase/Stacks.java b/src/joshua/decoder/phrase/Stacks.java
index eda7d8b..f81ede9 100644
--- a/src/joshua/decoder/phrase/Stacks.java
+++ b/src/joshua/decoder/phrase/Stacks.java
@@ -137,7 +137,7 @@ public class Stacks {
Stack tailStack = stacks.get(from_stack);
if (Decoder.VERBOSE >= 3)
- System.err.println(String.format("\n WORDS %d MAX %d (STACK %d phrase_length %d)", source_words,
+ Decoder.LOG(3, String.format("\n WORDS %d MAX %d (STACK %d phrase_length %d)", source_words,
chart.MaxSourcePhraseLength(), from_stack, phrase_length));
// Iterate over antecedents in this stack.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/tools/GrammarPacker.java
----------------------------------------------------------------------
diff --git a/src/joshua/tools/GrammarPacker.java b/src/joshua/tools/GrammarPacker.java
index df8383b..8c39582 100644
--- a/src/joshua/tools/GrammarPacker.java
+++ b/src/joshua/tools/GrammarPacker.java
@@ -1,4 +1,4 @@
-/*
+/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -28,7 +28,6 @@ import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.nio.ByteBuffer;
-import java.util.Arrays;
import java.util.ArrayList;
import java.util.LinkedList;
import java.util.List;
@@ -50,6 +49,20 @@ public class GrammarPacker {
private static final Logger logger = Logger.getLogger(GrammarPacker.class.getName());
+ /**
+ * The packed grammar version number. Increment this any time you add new features, and update
+ * the documentation.
+ *
+ * Version history:
+ *
+ * - 3 (May 2016). This was the first version that was marked. It removed the special phrase-
+ * table packing that packed phrases without the [X,1] on the source and target sides, which
+ * then required special handling in the decoder to use for phrase-based decoding.
+ *
+ *
+ */
+ public static final int VERSION = 3;
+
// Size limit for slice in bytes.
private static int DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
// Estimated average number of feature entries for one rule.
@@ -66,7 +79,7 @@ public class GrammarPacker {
public String getGrammar() {
return grammar;
}
-
+
public String getOutputDirectory() {
return output;
}
@@ -160,7 +173,7 @@ public class GrammarPacker {
// Explore pass. Learn vocabulary and feature value histograms.
logger.info("Exploring: " + grammar);
-
+
HieroFormatReader grammarReader = getGrammarReader();
explore(grammarReader);
@@ -185,9 +198,10 @@ public class GrammarPacker {
logger.info(String.format("Writing config to '%s'", configFile));
// Write config options
FileWriter config = new FileWriter(configFile);
+ config.write(String.format("version = %d\n", VERSION));
config.write(String.format("max-source-len = %d\n", max_source_len));
config.close();
-
+
// Read previously written encoder configuration to match up to changed
// vocabulary id's.
logger.info("Reading encoding.");
@@ -212,7 +226,7 @@ public class GrammarPacker {
*
* @param grammarFile
* @return
- * @throws IOException
+ * @throws IOException
*/
private HieroFormatReader getGrammarReader() throws IOException {
LineReader reader = new LineReader(grammar);
@@ -224,14 +238,17 @@ public class GrammarPacker {
}
}
+ /**
+ * This first pass over the grammar
+ * @param reader
+ */
private void explore(HieroFormatReader reader) {
- int counter = 0;
+
// We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
// appear in the same order. They are assigned numeric names in order of appearance.
this.types.setLabeled(true);
for (Rule rule: reader) {
- counter++;
max_source_len = Math.max(max_source_len, rule.getFrench().length);
@@ -239,7 +256,7 @@ public class GrammarPacker {
* NOTE: In case of nonterminals, we add both stripped versions ("[X]")
* and "[X,1]" to the vocabulary.
*
- * TODO: MJP May 2016: do we need to add [X,1]? If so, should be done in FormatReaders.
+ * TODO: MJP May 2016: Is it necessary to add [X,1]?
*/
// Add feature names to vocabulary and pass the value through the
@@ -359,7 +376,7 @@ public class GrammarPacker {
for (int f = 0; f < feature_entries.length; ++f) {
String feature_entry = feature_entries[f];
int feature_id;
- float feature_value;
+ float feature_value;
if (feature_entry.contains("=")) {
String[] parts = feature_entry.split("=");
if (parts[0].equals("Alignment"))
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/test/decoder/phrase/decode/rules.packed/config
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/decode/rules.packed/config b/test/decoder/phrase/decode/rules.packed/config
index 9c2b25e..7bdb804 100644
--- a/test/decoder/phrase/decode/rules.packed/config
+++ b/test/decoder/phrase/decode/rules.packed/config
@@ -1 +1,2 @@
+version = 3
max-source-len = 4