You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/27 00:34:23 UTC

[17/32] incubator-joshua git commit: packer now writes out a version, removed custom backend phrase-based checks

packer now writes out a version, removed custom backend phrase-based checks

-  The grammar packer now writes a version number to the packed grammar's config file. The current version is 3. PackedGrammar will throw a runtime exception if no version information is found. However, the format has changed only for phrase-based grammars; any existing packed grammar that is not phrase-based can be made compatible by adding the line

       version = 3

to the packed grammar's "config" file.

- Removed checks for which grammar backend is being used for phrase-based decoding. It's now totally generic! Very nice.

- Updated test cases to match.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/366f4086
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/366f4086
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/366f4086

Branch: refs/heads/JOSHUA-252
Commit: 366f408672e2d29b69a78531b57056649629e978
Parents: 53a0fcf
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 05:47:48 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 05:47:48 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/ff/tm/GrammarReader.java     |  1 -
 .../decoder/ff/tm/format/MosesFormatReader.java |  8 ++---
 .../decoder/ff/tm/packed/PackedGrammar.java     | 33 ++++++++++++++---
 src/joshua/decoder/phrase/PhraseTable.java      | 13 +++----
 src/joshua/decoder/phrase/Stacks.java           |  2 +-
 src/joshua/tools/GrammarPacker.java             | 37 ++++++++++++++------
 test/decoder/phrase/decode/rules.packed/config  |  1 +
 7 files changed, 63 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/ff/tm/GrammarReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/GrammarReader.java b/src/joshua/decoder/ff/tm/GrammarReader.java
index 3b973a2..3432e53 100644
--- a/src/joshua/decoder/ff/tm/GrammarReader.java
+++ b/src/joshua/decoder/ff/tm/GrammarReader.java
@@ -23,7 +23,6 @@ import java.util.Iterator;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
-import joshua.corpus.Vocabulary;
 import joshua.decoder.Decoder;
 import joshua.util.io.LineReader;
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/format/MosesFormatReader.java b/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
index 7e43075..47a3e46 100644
--- a/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
+++ b/src/joshua/decoder/ff/tm/format/MosesFormatReader.java
@@ -44,16 +44,14 @@ import joshua.util.io.LineReader;
 
 public class MosesFormatReader extends HieroFormatReader {
 
-  private int lhs;
-  
   public MosesFormatReader(String grammarFile) throws IOException {
     super(grammarFile);
-    this.lhs = Vocabulary.id("[X]");
+    Vocabulary.id("[X]");
   }
   
   public MosesFormatReader() {
     super();
-    this.lhs = Vocabulary.id("[X]");
+    Vocabulary.id("[X]");
   }
   
   /**
@@ -77,8 +75,6 @@ public class MosesFormatReader extends HieroFormatReader {
   public Rule parseLine(String line) {
     String[] fields = line.split(fieldDelimiter);
     
-    int arity = 1;
-
     StringBuffer hieroLine = new StringBuffer();
     hieroLine.append("[X] ||| [X,1] " + fields[0] + " ||| [X,1] " + fields[1] + " |||");
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
index cc58578..e9f1a5c 100644
--- a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
+++ b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
@@ -91,6 +91,7 @@ import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.ff.tm.RuleCollection;
 import joshua.decoder.ff.tm.Trie;
 import joshua.decoder.ff.tm.hash_based.ExtensionIterator;
+import joshua.util.FormatUtils;
 import joshua.util.encoding.EncoderConfiguration;
 import joshua.util.encoding.FloatEncoder;
 import joshua.util.io.LineReader;
@@ -109,19 +110,22 @@ public class PackedGrammar extends AbstractGrammar {
   private final File vocabFile; // store path to vocabulary file
 
   public static final String VOCABULARY_FILENAME = "vocabulary";
-
-  // The grammar specification keyword (e.g., "thrax" or "moses")
-  private String type;
+  
+  // The version number of the earliest supported grammar packer
+  public static final int SUPPORTED_VERSION = 3;
 
   // A rule cache for commonly used tries to avoid excess object allocations
   // Testing shows there's up to ~95% hit rate when cache size is 5000 Trie nodes.
   private final Cache<Trie, List<Rule>> cached_rules;
 
+  private String grammarDir;
+
   public PackedGrammar(String grammar_dir, int span_limit, String owner, String type,
       JoshuaConfiguration joshuaConfiguration) throws FileNotFoundException, IOException {
     super(joshuaConfiguration);
+
+    this.grammarDir = grammar_dir;
     this.spanLimit = span_limit;
-    this.type = type;
 
     // Read the vocabulary.
     vocabFile = new File(grammar_dir + File.separator + VOCABULARY_FILENAME);
@@ -566,7 +570,7 @@ public class PackedGrammar extends AbstractGrammar {
         System.arraycopy(parent_src, 0, src, 0, parent_src.length);
         src[src.length - 1] = symbol;
         arity = parent_arity;
-        if (Vocabulary.nt(symbol))
+        if (FormatUtils.isNonterminal(symbol))
           arity++;
       }
 
@@ -941,11 +945,30 @@ public class PackedGrammar extends AbstractGrammar {
     throw new RuntimeException("PackedGrammar.addRule(): I can't add rules");
   }
   
+  /** 
+   * Read the config file
+   * 
+   * TODO: this should be rewritten using typeconfig.
+   * 
+   * @param config
+   * @throws IOException
+   */
   private void readConfig(String config) throws IOException {
+    int version = 0;
+    
     for (String line: new LineReader(config)) {
       String[] tokens = line.split(" = ");
       if (tokens[0].equals("max-source-len"))
         this.maxSourcePhraseLength = Integer.parseInt(tokens[1]);
+      else if (tokens[0].equals("version")) {
+        version = Integer.parseInt(tokens[1]);
+      }
+    }
+    
+    if (version != 3) {
+      String message = String.format("The grammar at %s was packed with packer version %d, but the earliest supported version is %d",
+          this.grammarDir, version, SUPPORTED_VERSION);
+      throw new RuntimeException(message);
     }
   }
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/phrase/PhraseTable.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/phrase/PhraseTable.java b/src/joshua/decoder/phrase/PhraseTable.java
index bcf7135..38b7ef4 100644
--- a/src/joshua/decoder/phrase/PhraseTable.java
+++ b/src/joshua/decoder/phrase/PhraseTable.java
@@ -77,18 +77,14 @@ public class PhraseTable implements Grammar {
   }
       
   /**
-   * Returns the longest source phrase read. For {@link MemoryBasedBatchGrammar}s, we subtract 1
-   * since the grammar includes the nonterminal. For {@link PackedGrammar}s, the value was either
-   * in the packed config file (Joshua 6.0.2+) or was passed in via the TM config line.
+   * Returns the longest source phrase read. Because phrases have a dummy nonterminal prepended to
+   * them, we need to subtract 1.
    * 
    * @return
    */
   @Override
   public int getMaxSourcePhraseLength() {
-    if (backend instanceof MemoryBasedBatchGrammar)
-      return this.backend.getMaxSourcePhraseLength() - 1;
-    else
-      return this.backend.getMaxSourcePhraseLength();
+    return this.backend.getMaxSourcePhraseLength() - 1;
   }
 
   /**
@@ -100,8 +96,7 @@ public class PhraseTable implements Grammar {
   public RuleCollection getPhrases(int[] sourceWords) {
     if (sourceWords.length != 0) {
       Trie pointer = getTrieRoot();
-      if (! (backend instanceof PackedGrammar))
-        pointer = pointer.match(Vocabulary.id("[X]"));
+      pointer = pointer.match(Vocabulary.id("[X]"));
       int i = 0;
       while (pointer != null && i < sourceWords.length)
         pointer = pointer.match(sourceWords[i++]);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/decoder/phrase/Stacks.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/phrase/Stacks.java b/src/joshua/decoder/phrase/Stacks.java
index eda7d8b..f81ede9 100644
--- a/src/joshua/decoder/phrase/Stacks.java
+++ b/src/joshua/decoder/phrase/Stacks.java
@@ -137,7 +137,7 @@ public class Stacks {
         Stack tailStack = stacks.get(from_stack);
         
         if (Decoder.VERBOSE >= 3)
-          System.err.println(String.format("\n  WORDS %d MAX %d (STACK %d phrase_length %d)", source_words,
+          Decoder.LOG(3, String.format("\n  WORDS %d MAX %d (STACK %d phrase_length %d)", source_words,
               chart.MaxSourcePhraseLength(), from_stack, phrase_length));
         
         // Iterate over antecedents in this stack.

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/src/joshua/tools/GrammarPacker.java
----------------------------------------------------------------------
diff --git a/src/joshua/tools/GrammarPacker.java b/src/joshua/tools/GrammarPacker.java
index df8383b..8c39582 100644
--- a/src/joshua/tools/GrammarPacker.java
+++ b/src/joshua/tools/GrammarPacker.java
@@ -1,4 +1,4 @@
-/*
+/**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
@@ -28,7 +28,6 @@ import java.io.FileWriter;
 import java.io.IOException;
 import java.io.PrintWriter;
 import java.nio.ByteBuffer;
-import java.util.Arrays;
 import java.util.ArrayList;
 import java.util.LinkedList;
 import java.util.List;
@@ -50,6 +49,20 @@ public class GrammarPacker {
 
   private static final Logger logger = Logger.getLogger(GrammarPacker.class.getName());
 
+  /**
+   * The packed grammar version number. Increment this any time you add new features, and update
+   * the documentation.
+   * 
+   * Version history:
+   * 
+   * - 3 (May 2016). This was the first version that was marked. It removed the special phrase-
+   * table packing that packed phrases without the [X,1] on the source and target sides, which
+   * then required special handling in the decoder to use for phrase-based decoding.
+   * 
+   * 
+   */
+  public static final int VERSION = 3;
+  
   // Size limit for slice in bytes.
   private static int DATA_SIZE_LIMIT = (int) (Integer.MAX_VALUE * 0.8);
   // Estimated average number of feature entries for one rule.
@@ -66,7 +79,7 @@ public class GrammarPacker {
   public String getGrammar() {
     return grammar;
   }
-  
+
   public String getOutputDirectory() {
     return output;
   }
@@ -160,7 +173,7 @@ public class GrammarPacker {
 
     // Explore pass. Learn vocabulary and feature value histograms.
     logger.info("Exploring: " + grammar);
-    
+
     HieroFormatReader grammarReader = getGrammarReader();
     explore(grammarReader);
 
@@ -185,9 +198,10 @@ public class GrammarPacker {
     logger.info(String.format("Writing config to '%s'", configFile));
     // Write config options
     FileWriter config = new FileWriter(configFile);
+    config.write(String.format("version = %d\n", VERSION));
     config.write(String.format("max-source-len = %d\n", max_source_len));
     config.close();
-    
+
     // Read previously written encoder configuration to match up to changed
     // vocabulary id's.
     logger.info("Reading encoding.");
@@ -212,7 +226,7 @@ public class GrammarPacker {
    * 
    * @param grammarFile
    * @return
-   * @throws IOException 
+   * @throws IOException
    */
   private HieroFormatReader getGrammarReader() throws IOException {
     LineReader reader = new LineReader(grammar);
@@ -224,14 +238,17 @@ public class GrammarPacker {
     }
   }
 
+  /**
+   * This first pass over the grammar learns the vocabulary and feature value histograms.
+   * @param reader
+   */
   private void explore(HieroFormatReader reader) {
-    int counter = 0;
+
     // We always assume a labeled grammar. Unlabeled features are assumed to be dense and to always
     // appear in the same order. They are assigned numeric names in order of appearance.
     this.types.setLabeled(true);
 
     for (Rule rule: reader) {
-      counter++;
 
       max_source_len = Math.max(max_source_len, rule.getFrench().length);
 
@@ -239,7 +256,7 @@ public class GrammarPacker {
        * NOTE: In case of nonterminals, we add both stripped versions ("[X]")
        * and "[X,1]" to the vocabulary.
        * 
-       * TODO: MJP May 2016: do we need to add [X,1]? If so, should be done in FormatReaders.
+       * TODO: MJP May 2016: Is it necessary to add [X,1]?
        */
 
       // Add feature names to vocabulary and pass the value through the
@@ -359,7 +376,7 @@ public class GrammarPacker {
       for (int f = 0; f < feature_entries.length; ++f) {
         String feature_entry = feature_entries[f];
         int feature_id;
-        float feature_value; 
+        float feature_value;
         if (feature_entry.contains("=")) {
           String[] parts = feature_entry.split("=");
           if (parts[0].equals("Alignment"))

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/366f4086/test/decoder/phrase/decode/rules.packed/config
----------------------------------------------------------------------
diff --git a/test/decoder/phrase/decode/rules.packed/config b/test/decoder/phrase/decode/rules.packed/config
index 9c2b25e..7bdb804 100644
--- a/test/decoder/phrase/decode/rules.packed/config
+++ b/test/decoder/phrase/decode/rules.packed/config
@@ -1 +1,2 @@
+version = 3
 max-source-len = 4