Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/25 18:34:00 UTC

[1/8] incubator-joshua git commit: removed cruft from Grammar interface (regex grammars, manual rules, writing grammar out)

Repository: incubator-joshua
Updated Branches:
  refs/heads/JOSHUA-273 [created] f4090b0fb


removed cruft from Grammar interface (regex grammars, manual rules, writing grammar out)


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/fe2c4341
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/fe2c4341
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/fe2c4341

Branch: refs/heads/JOSHUA-273
Commit: fe2c43412f0fc373e11ac7216eb70993435e7c41
Parents: 4d73c17
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 06:35:59 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 06:38:39 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/chart_parser/Chart.java      |   6 +-
 src/joshua/decoder/chart_parser/DotChart.java   |  60 +--
 .../chart_parser/ManualConstraintsHandler.java  | 217 -----------
 src/joshua/decoder/ff/tm/Grammar.java           |  23 --
 .../decoder/ff/tm/SentenceFilteredGrammar.java  | 373 -------------------
 .../tm/hash_based/MemoryBasedBatchGrammar.java  |  25 --
 .../decoder/ff/tm/packed/PackedGrammar.java     |   5 -
 src/joshua/decoder/phrase/PhraseTable.java      |  16 -
 8 files changed, 6 insertions(+), 719 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe2c4341/src/joshua/decoder/chart_parser/Chart.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/chart_parser/Chart.java b/src/joshua/decoder/chart_parser/Chart.java
index b9b74de..ba6b3df 100644
--- a/src/joshua/decoder/chart_parser/Chart.java
+++ b/src/joshua/decoder/chart_parser/Chart.java
@@ -150,14 +150,10 @@ public class Chart {
     // each grammar will have a dot chart
     this.dotcharts = new DotChart[this.grammars.length];
     for (int i = 0; i < this.grammars.length; i++)
-      this.dotcharts[i] = new DotChart(this.inputLattice, this.grammars[i], this,
-          this.grammars[i].isRegexpGrammar());
+      this.dotcharts[i] = new DotChart(this.inputLattice, this.grammars[i], this);
 
     // Begin to do initialization work
 
-//    manualConstraintsHandler = new ManualConstraintsHandler(this, grammars[grammars.length - 1],
-//        sentence.constraints());
-
     stateConstraint = null;
     if (sentence.target() != null)
       // stateConstraint = new StateConstraint(sentence.target());

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe2c4341/src/joshua/decoder/chart_parser/DotChart.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/chart_parser/DotChart.java b/src/joshua/decoder/chart_parser/DotChart.java
index b82b68c..796256e 100644
--- a/src/joshua/decoder/chart_parser/DotChart.java
+++ b/src/joshua/decoder/chart_parser/DotChart.java
@@ -19,7 +19,6 @@
 package joshua.decoder.chart_parser;
 
 import java.util.ArrayList;
-import java.util.Arrays;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -91,9 +90,6 @@ class DotChart {
   /* Represents the input sentence being translated. */
   private final Lattice<Token> input;
 
-  /* If enabled, rule terminals are treated as regular expressions. */
-  private final boolean regexpMatching;
-
   // ===============================================================
   // Static fields
   // ===============================================================
@@ -119,7 +115,7 @@ class DotChart {
 
 
 
-  public DotChart(Lattice<Token> input, Grammar grammar, Chart chart, boolean regExpMatching) {
+  public DotChart(Lattice<Token> input, Grammar grammar, Chart chart) {
 
     this.dotChart = chart;
     this.pGrammar = grammar;
@@ -127,7 +123,6 @@ class DotChart {
     this.sentLen = input.size();
 
     this.dotcells = new ChartSpan<DotCell>(sentLen, null);
-    this.regexpMatching = regExpMatching;
 
     seed();
   }
@@ -207,22 +202,10 @@ class DotChart {
           // List<Trie> child_tnodes = ruleMatcher.produceMatchingChildTNodesTerminalevel(dotNode,
           // last_word);
 
-          List<Trie> child_tnodes = null;
-
-          if (this.regexpMatching) {
-            child_tnodes = matchAll(dotNode, last_word);
-          } else {
-            Trie child_node = dotNode.trieNode.match(last_word);
-            child_tnodes = Arrays.asList(child_node);
-          }
-
-          if (!(child_tnodes == null || child_tnodes.isEmpty())) {
-            for (Trie child_tnode : child_tnodes) {
-              if (null != child_tnode) {
-                addDotItem(child_tnode, i, j - 1 + arc_len, dotNode.antSuperNodes, null,
-                    dotNode.srcPath.extend(arc));
-              }
-            }
+          Trie child_node = dotNode.trieNode.match(last_word);
+          if (child_node != null) {
+            addDotItem(child_node, i, j - 1 + arc_len, dotNode.antSuperNodes, null,
+                dotNode.srcPath.extend(arc));
           }
         }
       }
@@ -291,39 +274,6 @@ class DotChart {
     }
   }
 
-  /*
-   * We introduced the ability to have regular expressions in rules for matching against terminals.
-   * For example, you could have the rule
-   * 
-   * <pre> [X] ||| l?s herman?s ||| siblings </pre>
-   * 
-   * When this is enabled for a grammar, we need to test against *all* (positive) outgoing arcs of
-   * the grammar trie node to see if any of them match, and then return the whole set. This is quite
-   * expensive, which is why you should only enable regular expressions for small grammars.
-   */
-
-  private ArrayList<Trie> matchAll(DotNode dotNode, int wordID) {
-    ArrayList<Trie> trieList = new ArrayList<>();
-    HashMap<Integer, ? extends Trie> childrenTbl = dotNode.trieNode.getChildren();
-
-    if (childrenTbl != null && wordID >= 0) {
-      // get all the extensions, map to string, check for *, build regexp
-      for (Map.Entry<Integer, ? extends Trie> entry : childrenTbl.entrySet()) {
-        Integer arcID = entry.getKey();
-        if (arcID == wordID) {
-          trieList.add(entry.getValue());
-        } else {
-          String arcWord = Vocabulary.word(arcID);
-          if (Vocabulary.word(wordID).matches(arcWord)) {
-            trieList.add(entry.getValue());
-          }
-        }
-      }
-    }
-    return trieList;
-  }
-
-
   /**
    * Creates a {@link DotNode} and adds it into the {@link DotChart} at the correct place. These
    * are (possibly incomplete) rule applications. 
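
For readers tracking the behavioral change: the deleted matchAll() above treated every outgoing arc label as a regular expression and tested each one against the input word, whereas the surviving code path is a single exact trie lookup. A standalone sketch of the contrast in plain Java (a generic Map stands in for Joshua's Trie children table; the class and method names here are hypothetical, for illustration only):

    import java.util.ArrayList;
    import java.util.List;
    import java.util.Map;

    class MatchSketch {
      // Exact matching, the only behavior kept by this commit: one hash lookup.
      static <T> T matchExact(Map<String, T> arcs, String word) {
        return arcs.get(word);
      }

      // Regex matching, removed by this commit: every arc label is treated as
      // a pattern and tested against the input word, so the cost grows with
      // the number of outgoing arcs.
      static <T> List<T> matchAll(Map<String, T> arcs, String word) {
        List<T> matches = new ArrayList<>();
        for (Map.Entry<String, T> arc : arcs.entrySet()) {
          if (arc.getKey().equals(word) || word.matches(arc.getKey()))
            matches.add(arc.getValue());
        }
        return matches;
      }
    }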

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe2c4341/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java b/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java
deleted file mode 100644
index baed984..0000000
--- a/src/joshua/decoder/chart_parser/ManualConstraintsHandler.java
+++ /dev/null
@@ -1,217 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.chart_parser;
-
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.List;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.ff.tm.Grammar;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.segment_file.ConstraintRule;
-import joshua.decoder.segment_file.ConstraintSpan;
-
-/**
- * @author Zhifei Li, <zh...@gmail.com>
- */
-
-public class ManualConstraintsHandler {
-
-  // TODO: each span only has one ConstraintSpan
-  // contain spans that have LHS or RHS constraints (they are always hard)
-  private HashMap<String, ConstraintSpan> constraintSpansForFiltering;
-
-  // contain spans that have hard "rule" constraint; key: start_span; value:
-  // end_span
-  private ArrayList<Span> spansWithHardRuleConstraint;
-
-  private Chart chart;
-  private Grammar grammarForConstructManualRule;
-
-  private static final Logger logger = Logger.getLogger(ManualConstraintsHandler.class.getName());
-
-  public ManualConstraintsHandler(Chart chart, Grammar grammarForConstructManualRule,
-      List<ConstraintSpan> constraintSpans) {
-    this.chart = chart;
-    this.grammarForConstructManualRule = grammarForConstructManualRule;
-    initialize(constraintSpans);
-  }
-
-  private void initialize(List<ConstraintSpan> constraintSpans) {
-    /**
-     * Note that manual constraints or OOV handling is not part of seeding
-     * */
-    /**
-     * (1) add manual rule (only allow flat rules) into the chart as constraints (2) add RHS or LHS
-     * constraint into constraintSpansForFiltering (3) add span signature into
-     * setOfSpansWithHardRuleConstraint; if the span contains a hard "RULE" constraint
-     */
-    if (null != constraintSpans) {
-
-      for (ConstraintSpan cSpan : constraintSpans) {
-        if (null != cSpan.rules()) {
-          boolean shouldAdd = false; // contain LHS or RHS constraints?
-          for (ConstraintRule cRule : cSpan.rules()) {
-            /**
-             * Note that LHS and RHS constraints are always hard, while Rule constraint can be soft
-             * or hard
-             **/
-            switch (cRule.type()) {
-              case RULE:
-                // == prepare the feature scores
-                // TODO: this require the input always specify the right number of
-                // features
-                float[] featureScores = new float[cRule.features().length];
-
-                for (int i = 0; i < featureScores.length; i++) {
-                  if (cSpan.isHard()) {
-                    featureScores[i] = 0; // force the feature cost as zero
-                  } else {
-                    featureScores[i] = cRule.features()[i];
-                  }
-                }
-
-                /**
-                 * If the RULE constraint is hard, then we should filter all out all consituents
-                 * (within this span), which are contructed from regular grammar
-                 */
-                if (cSpan.isHard()) {
-                  if (null == this.spansWithHardRuleConstraint) {
-                    this.spansWithHardRuleConstraint = new ArrayList<Span>();
-                  }
-                  this.spansWithHardRuleConstraint.add(new Span(cSpan.start(), cSpan.end()));
-                }
-
-                int arity = 0; // only allow flat rule (i.e. arity=0)
-                Rule rule =
-                    this.grammarForConstructManualRule.constructManualRule(
-                        Vocabulary.id(cRule.lhs()), Vocabulary.addAll(cRule.foreignRhs()),
-                        Vocabulary.addAll(cRule.nativeRhs()), featureScores, arity);
-
-                // add to the chart
-                chart.addAxiom(cSpan.start(), cSpan.end(), rule, new SourcePath());
-                if (logger.isLoggable(Level.INFO))
-                  logger.info("Adding RULE constraint for span " + cSpan.start() + ", "
-                      + cSpan.end() + "; isHard=" + cSpan.isHard() + rule.getLHS());
-                break;
-
-              default:
-                shouldAdd = true;
-            }
-          }
-          if (shouldAdd) {
-            if (logger.isLoggable(Level.INFO))
-              logger.info("Adding LHS or RHS constraint for span " + cSpan.start() + ", "
-                  + cSpan.end());
-            if (null == this.constraintSpansForFiltering) {
-              this.constraintSpansForFiltering = new HashMap<String, ConstraintSpan>();
-            }
-            this.constraintSpansForFiltering.put(getSpanSignature(cSpan.start(), cSpan.end()),
-                cSpan);
-          }
-        }
-      }
-    }
-
-  }
-
-  // ===============================================================
-  // Manual constraint annotation methods and classes
-  // ===============================================================
-
-  /**
-   * if there are any LHS or RHS constraints for a span, then all the applicable grammar rules in
-   * that span will have to pass the filter.
-   */
-  public List<Rule> filterRules(int i, int j, List<Rule> rulesIn) {
-    if (null == this.constraintSpansForFiltering) return rulesIn;
-    ConstraintSpan cSpan = this.constraintSpansForFiltering.get(getSpanSignature(i, j));
-    if (null == cSpan) { // no filtering
-      return rulesIn;
-    } else {
-
-      List<Rule> rulesOut = new ArrayList<Rule>();
-      for (Rule gRule : rulesIn) {
-        // gRule will survive, if any constraint (LHS or RHS) lets it survive
-        for (ConstraintRule cRule : cSpan.rules()) {
-          if (shouldSurvive(cRule, gRule)) {
-            rulesOut.add(gRule);
-            break;
-          }
-        }
-      }
-      return rulesOut;
-    }
-  }
-
-  /**
-   * should we filter out the gRule based on the manually provided constraint cRule
-   */
-  public boolean shouldSurvive(ConstraintRule cRule, Rule gRule) {
-
-    switch (cRule.type()) {
-      case LHS:
-        return (gRule.getLHS() == Vocabulary.id(cRule.lhs()));
-      case RHS:
-        int[] targetWords = Vocabulary.addAll(cRule.nativeRhs());
-
-        if (targetWords.length != gRule.getEnglish().length) return false;
-
-        for (int t = 0; t < targetWords.length; t++) {
-          if (targetWords[t] != gRule.getEnglish()[t]) return false;
-        }
-
-        return true;
-      default: // not surviving
-        return false;
-    }
-  }
-
-  /**
-   * if a span is *within* the coverage of a *hard* rule constraint, then this span will be only
-   * allowed to use the mannual rules
-   */
-  public boolean containHardRuleConstraint(int startSpan, int endSpan) {
-    if (null != this.spansWithHardRuleConstraint) {
-      for (Span span : this.spansWithHardRuleConstraint) {
-        if (startSpan >= span.startPos && endSpan <= span.endPos) return true;
-      }
-    }
-    return false;
-  }
-
-  private String getSpanSignature(int i, int j) {
-    return i + " " + j;
-  }
-
-  private static class Span {
-
-    int startPos;
-    int endPos;
-
-    public Span(int startPos, int endPos) {
-      this.startPos = startPos;
-      this.endPos = endPos;
-    }
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe2c4341/src/joshua/decoder/ff/tm/Grammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/Grammar.java b/src/joshua/decoder/ff/tm/Grammar.java
index a834442..d8d530a 100644
--- a/src/joshua/decoder/ff/tm/Grammar.java
+++ b/src/joshua/decoder/ff/tm/Grammar.java
@@ -91,29 +91,6 @@ public interface Grammar {
   int getNumDenseFeatures();
 
   /**
-   * This is used to construct a manual rule supported from outside the grammar, but the owner
-   * should be the same as the grammar. Rule ID will the same as OOVRuleId, and no lattice cost
-   */
-  @Deprecated
-  Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores, int arity);
-
-  /**
-   * Dump the grammar to disk.
-   * 
-   * @param file
-   */
-  @Deprecated
-  void writeGrammarOnDisk(String file);
-
-  /**
-   * This returns true if the grammar contains rules that are regular expressions, possibly matching
-   * many different inputs.
-   * 
-   * @return true if the grammar's rules may contain regular expressions.
-   */
-  boolean isRegexpGrammar();
-
-  /**
    * Return the grammar's owner.
    */
   int getOwner();

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe2c4341/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java b/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
deleted file mode 100644
index d540727..0000000
--- a/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map.Entry;
-
-import joshua.decoder.ff.tm.hash_based.ExtensionIterator;
-import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * This class implements dynamic sentence-level filtering. This is accomplished with a parallel
- * trie, a subset of the original trie, that only contains trie paths that are reachable from
- * traversals of the current sentence.
- * 
- * @author Matt Post <po...@cs.jhu.edu>
- */
-public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
-  private AbstractGrammar baseGrammar;
-  private SentenceFilteredTrie filteredTrie;
-  private int[] tokens;
-  private Sentence sentence;
-
-  /**
-   * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained
-   * from the base grammar, which contains the complete grammar).
-   * 
-   * @param baseGrammar
-   * @param sentence
-   */
-  SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) {
-    super(baseGrammar.joshuaConfiguration);
-    this.baseGrammar = baseGrammar;
-    this.sentence = sentence;
-    this.tokens = sentence.getWordIDs();
-
-    int origCount = getNumRules(baseGrammar.getTrieRoot());
-    long startTime = System.currentTimeMillis();
-
-    /* Filter the rules; returns non-null object */
-    this.filteredTrie = filter(baseGrammar.getTrieRoot());
-    int filteredCount = getNumRules();
-
-    float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
-
-    System.err.println(String.format(
-        "Sentence-level filtering of sentence %d (%d -> %d rules) in %.3f seconds", sentence.id(),
-        origCount, filteredCount, seconds));
-  }
-
-  @Override
-  public Trie getTrieRoot() {
-    return filteredTrie;
-  }
-
-  /**
-   * This function is poorly named: it doesn't mean whether a rule exists in the grammar for the
-   * current span, but whether the grammar is permitted to apply rules to the current span (a
-   * grammar-level parameter). As such we can just chain to the underlying grammar.
-   */
-  @Override
-  public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
-    return baseGrammar.hasRuleForSpan(startIndex, endIndex, pathLength);
-  }
-
-  @Override
-  public int getNumRules() {
-    return getNumRules(getTrieRoot());
-  }
-
-  /**
-   * A convenience function that counts the number of rules in a grammar's trie.
-   * 
-   * @param node
-   * @return
-   */
-  public int getNumRules(Trie node) {
-    int numRules = 0;
-    if (node != null) {
-      if (node.getRuleCollection() != null)
-        numRules += node.getRuleCollection().getRules().size();
-
-      if (node.getExtensions() != null)
-        for (Trie child : node.getExtensions())
-          numRules += getNumRules(child);
-    }
-
-    return numRules;
-  }
-
-  @Override
-  public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores,
-      int aritity) {
-    // TODO Auto-generated method stub
-    return null;
-  }
-
-  @Override
-  public boolean isRegexpGrammar() {
-    return false;
-  }
-
-  /**
-   * What is the algorithm?
-   * 
-   * Take the first word of the sentence, and start at the root of the trie. There are two things to
-   * consider: (a) word matches and (b) nonterminal matches.
-   * 
-   * For a word match, simply follow that arc along the trie. We create a parallel arc in our
-   * filtered grammar to represent it. Each arc in the filtered trie knows about its
-   * corresponding/underlying node in the unfiltered grammar trie.
-   * 
-   * A nonterminal is always permitted to match. The question then is how much of the input sentence
-   * we imagine it consumed. The answer is that it could have been any amount. So the recursive call
-   * has to be a set of calls, one each to the next trie node with different lengths of the sentence
-   * remaining.
-   * 
-   * A problem occurs when we have multiple sequential nonterminals. For scope-3 grammars, there can
-   * be four sequential nonterminals (in the case when they are grounded by terminals on both ends
-   * of the nonterminal chain). We'd like to avoid looking at all possible ways to split up the
-   * subsequence, because with respect to filtering rules, they are all the same.
-   * 
-   * We accomplish this with the following restriction: for purposes of grammar filtering, only the
-   * first in a sequence of nonterminal traversals can consume more than one word. Each of the
-   * subsequent ones would have to consume just one word. We then just have to record in the
-   * recursive call whether the last traversal was a nonterminal or not.
-   * 
-   * @return the root of the filtered trie
-   */
-  private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) {
-    SentenceFilteredTrie filteredTrieRoot = new SentenceFilteredTrie(unfilteredTrieRoot);
-
-    // System.err.println(String.format("FILTERING TO SENTENCE\n  %s\n",
-    // Vocabulary.getWords(tokens)));
-
-    /*
-     * The root of the trie is where rule applications start, so we simply try all possible
-     * positions in the sentence.
-     */
-    for (int i = 0; i < tokens.length; i++) {
-      filter(i, filteredTrieRoot, false);
-    }
-
-    return filteredTrieRoot;
-  }
-
-  /**
-   * Matches rules against the sentence. Intelligently handles chains of sequential nonterminals.
-   * Marks arcs that are traversable for this sentence.
-   * 
-   * @param i the position in the sentence to start matching
-   * @param trie the trie node to match against
-   * @param lastWasNT true if the match that brought us here was against a nonterminal
-   */
-  private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) {
-    if (i >= tokens.length)
-      return;
-
-    /* Make sure the underlying unfiltered node has children. */
-    Trie unfilteredTrieNode = trieNode.unfilteredTrieNode;
-    if (unfilteredTrieNode.getChildren() == null) {
-      // trieNode.path.retreat();
-      return;
-    }
-
-    /* Match a word */
-    Trie trie = unfilteredTrieNode.match(tokens[i]);
-    if (trie != null) {
-      /*
-       * The current filtered node might already have an arc for this label. If so, retrieve it
-       * (since we still need to follow it); if not, create it.
-       */
-      SentenceFilteredTrie nextFilteredTrie = trieNode.match(tokens[i]);
-      if (nextFilteredTrie == null) {
-        nextFilteredTrie = new SentenceFilteredTrie(trie);
-        trieNode.children.put(tokens[i], nextFilteredTrie);
-      }
-
-      /*
-       * Now continue, trying to match the child node against the next position in the sentence. The
-       * third argument records that this match was not against a nonterminal.
-       */
-      filter(i + 1, nextFilteredTrie, false);
-    }
-
-    /*
-     * Now we attempt to match nonterminals. Any nonterminal is permitted to match any region of the
-     * sentence, up to the maximum span for that grammar. So we enumerate all children of the
-     * current (unfiltered) trie grammar node, looking for nonterminals (items whose label value is
-     * less than 0), then recurse.
-     * 
-     * There is one subtlely. Adjacent nonterminals in a grammar rule can match a span (i, j) in (j
-     * - i - 1) ways, but for purposes of determining whether a rule fits, this is all wasted
-     * effort. To handle this, we allow the first nonterminal in a sequence to record 1, 2, 3, ...
-     * terminals (up to the grammar's span limit, or the rest of the sentence, whichever is
-     * shorter). Subsequent adjacent nonterminals are permitted to consume only a single terminal.
-     */
-    HashMap<Integer, ? extends Trie> children = unfilteredTrieNode.getChildren();
-    if (children != null) {
-      for (int label : children.keySet()) {
-        if (label < 0) {
-          SentenceFilteredTrie nextFilteredTrie = trieNode.match(label);
-          if (nextFilteredTrie == null) {
-            nextFilteredTrie = new SentenceFilteredTrie(unfilteredTrieNode.match(label));
-            trieNode.children.put(label, nextFilteredTrie);
-          }
-
-          /*
-           * Recurse. If the last match was a nonterminal, we can only consume one more token.
-           * 
-           * TODO: This goes too far by looking at the whole sentence; each grammar has a maximum
-           * span limit which should be consulted. What we should be doing is passing the point
-           * where we started matching the current sentence, so we can apply this span limit, which
-           * is easily accessible (baseGrammar.spanLimit).
-           */
-          int maxJ = lastWasNT ? (i + 1) : tokens.length;
-          for (int j = i + 1; j <= maxJ; j++) {
-            filter(j, nextFilteredTrie, true);
-          }
-        }
-      }
-    }
-  }
-
-  /**
-   * Alternate filter that uses regular expressions, walking the grammar trie and matching the
-   * source side of each rule collection against the input sentence. Failed matches are discarded,
-   * and trie nodes extending from that position need not be explored.
-   * 
-   * @return the root of the filtered trie if any rules were retained, otherwise null
-   */
-  @SuppressWarnings("unused")
-  private SentenceFilteredTrie filter_regexp(Trie unfilteredTrie) {
-    SentenceFilteredTrie trie = null;
-
-    /* Case 1: keep the trie node if it has a rule collection that matches the sentence */
-    if (unfilteredTrie.hasRules())
-      if (matchesSentence(unfilteredTrie))
-        trie = new SentenceFilteredTrie(unfilteredTrie);
-      else
-        return null;
-
-    /* Case 2: keep the trie node if it has children who have valid rule collections */
-    if (unfilteredTrie.hasExtensions())
-      for (Entry<Integer, ? extends Trie> arc : unfilteredTrie.getChildren().entrySet()) {
-        Trie unfilteredChildTrie = arc.getValue();
-        SentenceFilteredTrie nextTrie = filter_regexp(unfilteredChildTrie);
-        if (nextTrie != null) {
-          if (trie == null)
-            trie = new SentenceFilteredTrie(unfilteredTrie);
-          trie.children.put(arc.getKey(), nextTrie);
-        }
-      }
-
-    return trie;
-  }
-
-  private boolean matchesSentence(Trie childTrie) {
-    Rule rule = childTrie.getRuleCollection().getRules().get(0);
-    return rule.matches(sentence);
-  }
-
-  /**
-   * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match
-   * the given input sentence.
-   * 
-   * @author Matt Post <po...@cs.jhu.edu>
-   * 
-   */
-  public class SentenceFilteredTrie implements Trie {
-
-    /* The underlying unfiltered trie node. */
-    private Trie unfilteredTrieNode;
-
-    /* The child nodes in the filtered trie. */
-    private HashMap<Integer, SentenceFilteredTrie> children = null;
-
-    /**
-     * Constructor.
-     * 
-     * @param trieRoot
-     * @param source
-     */
-    public SentenceFilteredTrie(Trie unfilteredTrieNode) {
-      this.unfilteredTrieNode = unfilteredTrieNode;
-      this.children = new HashMap<Integer, SentenceFilteredTrie>();
-    }
-
-    @Override
-    public SentenceFilteredTrie match(int wordID) {
-      if (children != null)
-        return children.get(wordID);
-      return null;
-    }
-
-    @Override
-    public boolean hasExtensions() {
-      return children != null;
-    }
-
-    @Override
-    public Collection<SentenceFilteredTrie> getExtensions() {
-      if (children != null)
-        return children.values();
-
-      return null;
-    }
-
-    @Override
-    public HashMap<Integer, SentenceFilteredTrie> getChildren() {
-      return children;
-    }
-
-    @Override
-    public boolean hasRules() {
-      // Chain to the underlying unfiltered node.
-      return unfilteredTrieNode.hasRules();
-    }
-
-    @Override
-    public RuleCollection getRuleCollection() {
-      // Chain to the underlying unfiltered node, since the rule collection just varies by target
-      // side.
-      return unfilteredTrieNode.getRuleCollection();
-    }
-
-    /**
-     * Counts the number of rules.
-     * 
-     * @return the number of rules rooted at this node.
-     */
-    public int getNumRules() {
-      int numRules = 0;
-      if (getTrieRoot() != null)
-        if (getTrieRoot().getRuleCollection() != null)
-          numRules += getTrieRoot().getRuleCollection().getRules().size();
-
-      for (SentenceFilteredTrie node : getExtensions())
-        numRules += node.getNumRules();
-
-      return numRules;
-    }
-
-    @Override
-    public Iterator<Integer> getTerminalExtensionIterator() {
-      return new ExtensionIterator(children, true);
-    }
-
-    @Override
-    public Iterator<Integer> getNonterminalExtensionIterator() {
-      return new ExtensionIterator(children, false);
-    }
-  }
-}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe2c4341/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java b/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
index 6740c26..9bb184e 100644
--- a/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
+++ b/src/joshua/decoder/ff/tm/hash_based/MemoryBasedBatchGrammar.java
@@ -68,9 +68,6 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
 
   private GrammarReader<Rule> modelReader;
 
-  /* Whether the grammar's rules contain regular expressions. */
-  private boolean isRegexpGrammar = false;
-
   // ===============================================================
   // Static Fields
   // ===============================================================
@@ -106,7 +103,6 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
     Vocabulary.id(defaultLHSSymbol);
     this.spanLimit = spanLimit;
     this.grammarFile = grammarFile;
-    this.setRegexpGrammar(formatKeyword.equals("regexp"));
 
     // ==== loading grammar
     this.modelReader = createReader(formatKeyword, grammarFile);
@@ -150,12 +146,6 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
     return this.qtyRulesRead;
   }
 
-  @Override
-  public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords,
-      float[] denseScores, int arity) {
-    return null;
-  }
-
   /**
    * if the span covered by the chart bin is greater than the limit, then return false
    */
@@ -233,21 +223,6 @@ public class MemoryBasedBatchGrammar extends AbstractGrammar {
   }
 
   /**
-   * This returns true if the grammar contains rules that are regular expressions, possibly matching
-   * many different inputs.
-   * 
-   * @return true if the grammar's rules may contain regular expressions.
-   */
-  @Override
-  public boolean isRegexpGrammar() {
-    return this.isRegexpGrammar;
-  }
-
-  public void setRegexpGrammar(boolean value) {
-    this.isRegexpGrammar = value;
-  }
-
-  /***
    * Takes an input word and creates an OOV rule in the current grammar for that word.
    * 
    * @param sourceWord

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe2c4341/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
index e9f1a5c..ff2bf41 100644
--- a/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
+++ b/src/joshua/decoder/ff/tm/packed/PackedGrammar.java
@@ -931,11 +931,6 @@ public class PackedGrammar extends AbstractGrammar {
   }
 
   @Override
-  public boolean isRegexpGrammar() {
-    return false;
-  }
-
-  @Override
   public void addOOVRules(int word, List<FeatureFunction> featureFunctions) {
     throw new RuntimeException("PackedGrammar.addOOVRules(): I can't add OOV rules");
   }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe2c4341/src/joshua/decoder/phrase/PhraseTable.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/phrase/PhraseTable.java b/src/joshua/decoder/phrase/PhraseTable.java
index 38b7ef4..0724361 100644
--- a/src/joshua/decoder/phrase/PhraseTable.java
+++ b/src/joshua/decoder/phrase/PhraseTable.java
@@ -169,22 +169,6 @@ public class PhraseTable implements Grammar {
   }
 
   @Override
-  public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores,
-      int arity) {
-    return backend.constructManualRule(lhs,  sourceWords, targetWords, scores, arity);
-  }
-
-  @Override
-  public void writeGrammarOnDisk(String file) {
-    backend.writeGrammarOnDisk(file);
-  }
-
-  @Override
-  public boolean isRegexpGrammar() {
-    return backend.isRegexpGrammar();
-  }
-
-  @Override
   public int getOwner() {
     return backend.getOwner();
   }


[4/8] incubator-joshua git commit: Large refactor of the Translation output interface

Posted by mj...@apache.org.
Large refactor of the Translation output interface

Translation outputs were a bit of a mess, so a group of us sat down and came up with a plan to fix them. This should be helpful in refining the API.

This commit fixes many of these problems.

-  Instead of returning Translation objects, calls to Decoder.translate() now return HyperGraph objects. As before, a HyperGraph represents the complete (pruned) search space the decoder explored. A HyperGraph can then be operated on by KBestExtractor and by the new TranslationFactory object.
-  KBestExtractor is now an iterator that takes a HyperGraph object and returns DerivationState objects, each representing a single derivation tree.
-  Translation and StructuredTranslation are now combined. Translation is effectively a dummy object with a number of fields of interest that get populated by TranslationFactory, per explicit requests. Each request returns the TranslationFactory object, so you can easily chain calls and then retrieve the Translation object at the end (see the sketch after this list).
-  Neither KBestExtractor nor Translation objects do any printing. This encapsulation is a big improvement over the past. Once built, Translation objects contain only small pieces of data, such as strings, feature vectors, and alignments, which can be safely passed downstream while the HyperGraph is destroyed. Also, all processing and formatting code now lives in one place: the TranslationFactory.
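
For example, here is a minimal sketch of the new call chain, using the constructor and method names that appear in this commit's Decoder.java hunks (the variables sentence, featureFunctions, weights, and config are assumed to be in scope; treat this as illustrative, not a frozen API):

    // Decode to a hypergraph, then extract derivations on demand.
    HyperGraph hg = decoderThread.translate(sentence);
    KBestExtractor extractor =
        new KBestExtractor(sentence, hg, featureFunctions, weights, false, config);

    // The extractor iterates over DerivationState objects in ranked order.
    for (DerivationState derivation : extractor) {
      Translation t = new TranslationFactory(sentence, derivation, config)
          .formattedTranslation(config.outputFormat)  // chained request
          .translation();                             // retrieve the result
      System.out.println(t.getFormattedTranslation());
    }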

Other, unrelated edits:

-  Removed the forest rescoring and OracleExtraction classes. These are useful but not used; they are also hard to read and should therefore be rewritten.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/f2f82c38
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/f2f82c38
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/f2f82c38

Branch: refs/heads/JOSHUA-273
Commit: f2f82c38af9aebd28f9d27f685a2e99767a2e575
Parents: fe88c68
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 11:06:13 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 11:06:13 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/Decoder.java                 | 147 ++--
 src/joshua/decoder/DecoderThread.java           |  21 +-
 src/joshua/decoder/JoshuaConfiguration.java     |  20 -
 src/joshua/decoder/StructuredTranslation.java   | 125 ---
 src/joshua/decoder/Translation.java             | 215 ++---
 src/joshua/decoder/TranslationFactory.java      | 127 +++
 src/joshua/decoder/Translations.java            |  16 +-
 src/joshua/decoder/ff/fragmentlm/Tree.java      |   2 +-
 .../hypergraph/FeatureVectorExtractor.java      |   1 -
 .../decoder/hypergraph/KBestExtractor.java      | 460 ++---------
 .../hypergraph/OutputStringExtractor.java       |   1 -
 .../decoder/hypergraph/ViterbiExtractor.java    |   4 +-
 .../hypergraph/WordAlignmentExtractor.java      |   7 +-
 src/joshua/decoder/io/JSONMessage.java          |  31 +-
 src/joshua/oracle/OracleExtractionHG.java       | 794 -------------------
 src/joshua/oracle/OracleExtractor.java          |  58 --
 16 files changed, 397 insertions(+), 1632 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 22ed8b9..fc4ba89 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -36,6 +36,7 @@ import java.util.concurrent.BlockingQueue;
 
 import com.google.common.base.Strings;
 
+import hep.aida.ref.Test;
 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.FeatureVector;
 import joshua.decoder.JoshuaConfiguration.INPUT_TYPE;
@@ -50,6 +51,9 @@ import joshua.decoder.ff.tm.Trie;
 import joshua.decoder.ff.tm.format.HieroFormatReader;
 import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
 import joshua.decoder.ff.tm.packed.PackedGrammar;
+import joshua.decoder.hypergraph.DerivationState;
+import joshua.decoder.hypergraph.HyperGraph;
+import joshua.decoder.hypergraph.KBestExtractor;
 import joshua.decoder.io.JSONMessage;
 import joshua.decoder.io.TranslationRequestStream;
 import joshua.decoder.phrase.PhraseTable;
@@ -86,10 +90,10 @@ import joshua.util.io.LineReader;
  */
 public class Decoder {
 
-  private final JoshuaConfiguration joshuaConfiguration;
+  private final JoshuaConfiguration config;
 
   public JoshuaConfiguration getJoshuaConfiguration() {
-    return joshuaConfiguration;
+    return config;
   }
 
   /*
@@ -141,10 +145,10 @@ public class Decoder {
    * testing.
    */
   private Decoder(JoshuaConfiguration joshuaConfiguration) {
-    this.joshuaConfiguration = joshuaConfiguration;
+    this.config = joshuaConfiguration;
     this.grammars = new ArrayList<Grammar>();
     this.threadPool = new ArrayBlockingQueue<DecoderThread>(
-        this.joshuaConfiguration.num_parallel_decoders, true);
+        this.config.num_parallel_decoders, true);
     this.customPhraseTable = null;
   }
 
@@ -183,7 +187,7 @@ public class Decoder {
      */
     private OutputStream out;
     
-    RequestParallelizer(TranslationRequestStream request, Translations response, OutputStream out) {
+    RequestParallelizer(TranslationRequestStream request, Translations response) {
       this.request = request;
       this.response = response;
       this.out = out;
@@ -309,7 +313,7 @@ public class Decoder {
         }
 
         // Search for the rule in the trie
-        int nt_i = Vocabulary.id(joshuaConfiguration.default_non_terminal);
+        int nt_i = Vocabulary.id(config.default_non_terminal);
         Trie trie = customPhraseTable.getTrieRoot().match(nt_i);
 
         for (String word: tokens[0].split("\\s+")) {
@@ -414,8 +418,8 @@ public class Decoder {
        * corresponding Translations object, and return the thread to the pool.
        */
       try {
-        Translation translation = decoderThread.translate(this.sentence);
-        translations.record(translation);
+        HyperGraph hg = decoderThread.translate(this.sentence);
+        translations.record(hg);
 
         /*
          * This is crucial! It's what makes the thread available for the next sentence to be
@@ -444,21 +448,27 @@ public class Decoder {
   public void decodeAll(TranslationRequestStream request, OutputStream out) throws IOException {
     Translations translations = new Translations(request);
 
-    /* Start a thread to handle requests on the input stream */
-    new RequestParallelizer(request, translations, out).start();
+    /* Start a thread to handle requests on the input stream. This thread will continually
+     * request individual DecoderThreads from the pool until all of the input segments have been
+     * translated. It returns them *in order* through an iterator interface as they become available.
+     */ 
+    new RequestParallelizer(request, translations).start();
     
     // Create the n-best output stream
     FileWriter nbest_out = null;
-    if (joshuaConfiguration.n_best_file != null)
-      nbest_out = new FileWriter(joshuaConfiguration.n_best_file);
+    if (config.n_best_file != null)
+      nbest_out = new FileWriter(config.n_best_file);
     
     for (;;) {
-      Translation translation = translations.next();
-      if (translation == null)
+      HyperGraph hg = translations.next();
+      if (hg == null)
         break;
 
-      if (joshuaConfiguration.input_type == INPUT_TYPE.json || joshuaConfiguration.server_type == SERVER_TYPE.HTTP) {
-        JSONMessage message = JSONMessage.buildMessage(translation);
+      Sentence sentence = hg.sentence;
+      
+      if (config.input_type == INPUT_TYPE.json || config.server_type == SERVER_TYPE.HTTP) {
+        KBestExtractor extractor = new KBestExtractor(sentence, hg, featureFunctions, weights, false, config);
+        JSONMessage message = JSONMessage.buildMessage(sentence, extractor, config);
         out.write(message.toString().getBytes());
         
       } else {
@@ -469,27 +479,42 @@ public class Decoder {
          * format.
          */
         String text;
-        if (joshuaConfiguration.moses) {
-          text = translation.toString().replaceAll("=", "= ");
-          // Write the complete formatted string to STDOUT
-          if (joshuaConfiguration.n_best_file != null)
-            nbest_out.write(text);
-          
-          // Extract just the translation and output that to STDOUT
-          text = text.substring(0,  text.indexOf('\n'));
-          String[] fields = text.split(" \\|\\|\\| ");
-          text = fields[1] + "\n";
+        if (config.moses) {
+          KBestExtractor extractor = new KBestExtractor(sentence, hg, featureFunctions, weights, false, config);
+          final String mosesFormat = "%i ||| %s ||| %f ||| %c"; 
           
-        } else {
-          text = translation.toString();
+          int k = 1;
+          for (DerivationState derivation: extractor) {
+            if (k > config.topN)
+              break;
+            
+            TranslationFactory factory = new TranslationFactory(sentence, derivation, config);
+            Translation translation = factory.formattedTranslation(mosesFormat).translation();
+            text = translation.getFormattedTranslation().replaceAll("=",  "= ");
+            // Write the complete formatted string to STDOUT
+            if (config.n_best_file != null)
+              nbest_out.write(text + "\n");
+            
+            k++;
+          }
         }
 
-        out.write(text.getBytes());
+        KBestExtractor extractor = new KBestExtractor(sentence, hg, featureFunctions, weights, false, config);
+        DerivationState viterbi = extractor.getViterbiDerivation();
+        Translation best = new TranslationFactory(sentence, viterbi, config)
+            .formattedTranslation(config.outputFormat)
+              .translation();
+        
+        Decoder.LOG(1, String.format("Translation %d: %.3f %s", sentence.id(), best.score(), best.toString()));
+
+        String bestString = best.getFormattedTranslation();
+        out.write(bestString.getBytes());
+        out.write("\n".getBytes());
       }
       out.flush();
     }
     
-    if (joshuaConfiguration.n_best_file != null)
+    if (config.n_best_file != null)
       nbest_out.close();
   }
 
@@ -500,16 +525,16 @@ public class Decoder {
    * @param sentence
    * @return The translated sentence
    */
-  public Translation decode(Sentence sentence) {
+  public HyperGraph decode(Sentence sentence) {
     // Get a thread.
 
     try {
       DecoderThread thread = threadPool.take();
-      Translation translation = thread.translate(sentence);
+      HyperGraph translation = thread.translate(sentence);
       threadPool.put(thread);
-
       return translation;
 
+
     } catch (InterruptedException e) {
       e.printStackTrace();
     }
@@ -614,7 +639,7 @@ public class Decoder {
    * @return the feature in Moses format
    */
   private String mosesize(String feature) {
-    if (joshuaConfiguration.moses) {
+    if (config.moses) {
       if (feature.startsWith("tm_") || feature.startsWith("lm_"))
         return feature.replace("_", "-");
     }
@@ -636,26 +661,26 @@ public class Decoder {
       /* Weights can be listed in a separate file (denoted by parameter "weights-file") or directly
        * in the Joshua config file. Config file values take precedent.
        */
-      this.readWeights(joshuaConfiguration.weights_file);
+      this.readWeights(config.weights_file);
       
       
       /* Add command-line-passed weights to the weights array for processing below */
-      if (!Strings.isNullOrEmpty(joshuaConfiguration.weight_overwrite)) {
-        String[] tokens = joshuaConfiguration.weight_overwrite.split("\\s+");
+      if (!Strings.isNullOrEmpty(config.weight_overwrite)) {
+        String[] tokens = config.weight_overwrite.split("\\s+");
         for (int i = 0; i < tokens.length; i += 2) {
           String feature = tokens[i];
           float value = Float.parseFloat(tokens[i+1]);
           
-          if (joshuaConfiguration.moses)
+          if (config.moses)
             feature = demoses(feature);
           
-          joshuaConfiguration.weights.add(String.format("%s %s", feature, tokens[i+1]));
+          config.weights.add(String.format("%s %s", feature, tokens[i+1]));
           Decoder.LOG(1, String.format("COMMAND LINE WEIGHT: %s -> %.3f", feature, value));
         }
       }
 
       /* Read the weights found in the config file */
-      for (String pairStr: joshuaConfiguration.weights) {
+      for (String pairStr: config.weights) {
         String pair[] = pairStr.split("\\s+");
 
         /* Sanity check for old-style unsupported feature invocations. */
@@ -690,10 +715,10 @@ public class Decoder {
       this.initializeFeatureFunctions();
 
       // This is mostly for compatibility with the Moses tuning script
-      if (joshuaConfiguration.show_weights_and_quit) {
+      if (config.show_weights_and_quit) {
         for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
           String name = DENSE_FEATURE_NAMES.get(i);
-          if (joshuaConfiguration.moses) 
+          if (config.moses) 
             System.out.println(String.format("%s= %.5f", mosesize(name), weights.getDense(i)));
           else
             System.out.println(String.format("%s %.5f", name, weights.getDense(i)));
@@ -702,7 +727,7 @@ public class Decoder {
       }
       
       // Sort the TM grammars (needed to do cube pruning)
-      if (joshuaConfiguration.amortized_sorting) {
+      if (config.amortized_sorting) {
         Decoder.LOG(1, "Grammar sorting happening lazily on-demand.");
       } else {
         long pre_sort_time = System.currentTimeMillis();
@@ -714,9 +739,9 @@ public class Decoder {
       }
 
       // Create the threads
-      for (int i = 0; i < joshuaConfiguration.num_parallel_decoders; i++) {
+      for (int i = 0; i < config.num_parallel_decoders; i++) {
         this.threadPool.put(new DecoderThread(this.grammars, Decoder.weights,
-            this.featureFunctions, joshuaConfiguration));
+            this.featureFunctions, config));
       }
 
     } catch (IOException e) {
@@ -738,13 +763,13 @@ public class Decoder {
    */
   private void initializeTranslationGrammars() throws IOException {
 
-    if (joshuaConfiguration.tms.size() > 0) {
+    if (config.tms.size() > 0) {
 
       // collect packedGrammars to check if they use a shared vocabulary
       final List<PackedGrammar> packed_grammars = new ArrayList<>();
 
       // tm = {thrax/hiero,packed,samt,moses} OWNER LIMIT FILE
-      for (String tmLine : joshuaConfiguration.tms) {
+      for (String tmLine : config.tms) {
 
         String type = tmLine.substring(0,  tmLine.indexOf(' '));
         String[] args = tmLine.substring(tmLine.indexOf(' ')).trim().split("\\s+");
@@ -758,7 +783,7 @@ public class Decoder {
         if (! type.equals("moses") && ! type.equals("phrase")) {
           if (new File(path).isDirectory()) {
             try {
-              PackedGrammar packed_grammar = new PackedGrammar(path, span_limit, owner, type, joshuaConfiguration);
+              PackedGrammar packed_grammar = new PackedGrammar(path, span_limit, owner, type, config);
               packed_grammars.add(packed_grammar);
               grammar = packed_grammar;
             } catch (FileNotFoundException e) {
@@ -769,7 +794,7 @@ public class Decoder {
           } else {
             // thrax, hiero, samt
             grammar = new MemoryBasedBatchGrammar(type, path, owner,
-                joshuaConfiguration.default_non_terminal, span_limit, joshuaConfiguration);
+                config.default_non_terminal, span_limit, config);
           }
           
         } else {
@@ -778,8 +803,8 @@ public class Decoder {
               ? Integer.parseInt(parsedArgs.get("max-source-len"))
               : -1;
 
-          joshuaConfiguration.search_algorithm = "stack";
-          grammar = new PhraseTable(path, owner, type, joshuaConfiguration, maxSourceLen);
+          config.search_algorithm = "stack";
+          grammar = new PhraseTable(path, owner, type, config, maxSourceLen);
         }
 
         this.grammars.add(grammar);
@@ -789,25 +814,25 @@ public class Decoder {
 
     } else {
       Decoder.LOG(1, "* WARNING: no grammars supplied!  Supplying dummy glue grammar.");
-      MemoryBasedBatchGrammar glueGrammar = new MemoryBasedBatchGrammar("glue", joshuaConfiguration);
+      MemoryBasedBatchGrammar glueGrammar = new MemoryBasedBatchGrammar("glue", config);
       glueGrammar.setSpanLimit(-1);
       glueGrammar.addGlueRules(featureFunctions);
       this.grammars.add(glueGrammar);
     }
     
     /* Add the grammar for custom entries */
-    this.customPhraseTable = new PhraseTable(null, "custom", "phrase", joshuaConfiguration, 0);
+    this.customPhraseTable = new PhraseTable(null, "custom", "phrase", config, 0);
     this.grammars.add(this.customPhraseTable);
     
     /* Create an epsilon-deleting grammar */
-    if (joshuaConfiguration.lattice_decoding) {
+    if (config.lattice_decoding) {
       Decoder.LOG(1, "Creating an epsilon-deleting grammar");
-      MemoryBasedBatchGrammar latticeGrammar = new MemoryBasedBatchGrammar("lattice", joshuaConfiguration);
+      MemoryBasedBatchGrammar latticeGrammar = new MemoryBasedBatchGrammar("lattice", config);
       latticeGrammar.setSpanLimit(-1);
       HieroFormatReader reader = new HieroFormatReader();
 
-      String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
-      String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+      String goalNT = FormatUtils.cleanNonTerminal(config.goal_symbol);
+      String defaultNT = FormatUtils.cleanNonTerminal(config.default_non_terminal);
 
       String ruleString = String.format("[%s] ||| [%s,1] <eps> ||| [%s,1] ||| ", goalNT, goalNT, defaultNT,
           goalNT, defaultNT);
@@ -826,7 +851,7 @@ public class Decoder {
       String owner = Vocabulary.word(grammar.getOwner());
       if (! ownersSeen.contains(owner)) {
         this.featureFunctions.add(new PhraseModel(weights, new String[] { "tm", "-owner", owner },
-            joshuaConfiguration, grammar));
+            config, grammar));
         ownersSeen.add(owner);
       }
     }
@@ -882,7 +907,7 @@ public class Decoder {
         Float value = Float.parseFloat(tokens[1]);
         
         // Kludge for compatibility with Moses tuners
-        if (joshuaConfiguration.moses) {
+        if (config.moses) {
           feature = demoses(feature);
         }
 
@@ -925,7 +950,7 @@ public class Decoder {
    */
   private void initializeFeatureFunctions() throws IOException {
 
-    for (String featureLine : joshuaConfiguration.features) {
+    for (String featureLine : config.features) {
       // feature-function = NAME args
       // 1. create new class named NAME, pass it config, weights, and the args
 
@@ -938,7 +963,7 @@ public class Decoder {
         Class<?> clas = getClass(featureName);
         Constructor<?> constructor = clas.getConstructor(FeatureVector.class,
             String[].class, JoshuaConfiguration.class);
-        this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration));
+        this.featureFunctions.add((FeatureFunction) constructor.newInstance(weights, fields, config));
       } catch (Exception e) {
         e.printStackTrace();
         System.err.println("* FATAL: could not find a feature '" + featureName + "'");

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/DecoderThread.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/DecoderThread.java b/src/joshua/decoder/DecoderThread.java
index b1857cd..cf78420 100644
--- a/src/joshua/decoder/DecoderThread.java
+++ b/src/joshua/decoder/DecoderThread.java
@@ -27,6 +27,7 @@ import joshua.decoder.chart_parser.Chart;
 import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.FeatureVector;
 import joshua.decoder.ff.SourceDependentFF;
+import joshua.decoder.ff.lm.StateMinimizingLanguageModel;
 import joshua.decoder.ff.tm.Grammar;
 import joshua.decoder.hypergraph.ForestWalker;
 import joshua.decoder.hypergraph.GrammarBuilderWalkerFunction;
@@ -92,7 +93,7 @@ public class DecoderThread extends Thread {
    * 
    * @param sentence The sentence to be translated.
    */
-  public Translation translate(Sentence sentence) {
+  public HyperGraph translate(Sentence sentence) {
 
     Decoder.LOG(1, "Input " + sentence.id() + ", " + sentence.fullSource());
 
@@ -102,7 +103,7 @@ public class DecoderThread extends Thread {
     // skip blank sentences
     if (sentence.isEmpty()) {
       Decoder.LOG(1, "Translation " + sentence.id() + ": Translation took 0 seconds");
-      return new Translation(sentence, null, featureFunctions, joshuaConfiguration);
+      return null;
     }
     
     long startTime = System.currentTimeMillis();
@@ -148,9 +149,21 @@ public class DecoderThread extends Thread {
     Decoder.LOG(1, String.format("Input %d: Memory used is %.1f MB", sentence.id(), (Runtime
         .getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0));
 
+     /*
+     * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
+     * objects for this sentence.
+     */
+    // TODO: make sure this works here
+    for (FeatureFunction feature : featureFunctions) {
+      if (feature instanceof StateMinimizingLanguageModel) {
+        ((StateMinimizingLanguageModel) feature).destroyPool(sentence.id());
+        break;
+      }
+    }
+    
     /* Return the translation unless we're doing synchronous parsing. */
     if (!joshuaConfiguration.parse || hypergraph == null) {
-      return new Translation(sentence, hypergraph, featureFunctions, joshuaConfiguration);
+      return hypergraph;
     }
 
     /*****************************************************************************************/
@@ -186,7 +199,7 @@ public class DecoderThread extends Thread {
     logger.info(String.format("Memory used after sentence %d is %.1f MB", sentence.id(), (Runtime
         .getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0));
 
-    return new Translation(sentence, englishParse, featureFunctions, joshuaConfiguration); // or do something else
+    return englishParse;
   }
 
   private Grammar getGrammarFromHyperGraph(String goal, HyperGraph hg) {

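The net effect of these DecoderThread changes is that translate() now hands back the raw
HyperGraph (or null for blank input) and leaves all output assembly to the caller. A minimal
caller sketch under that contract ("thread" and "translations" are illustrative names, not
part of this patch):

    HyperGraph hg = thread.translate(sentence);
    if (hg != null) {
      // hand the forest to the output side (see Translations below)
      translations.record(hg);
    } else {
      // blank or failed inputs now come back as null and must be handled by the caller
    }
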
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index c874534..3f20f46 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -209,18 +209,6 @@ public class JoshuaConfiguration {
   public int server_port = 0;
 
   /*
-   * Whether to do forest rescoring. If set to true, the references are expected on STDIN along with
-   * the input sentences in the following format:
-   * 
-   * input sentence ||| ||| reference1 ||| reference2 ...
-   * 
-   * (The second field is reserved for the output sentence for alignment and forced decoding).
-   */
-
-  public boolean rescoreForest = false;
-  public float rescoreForestWeight = 10.0f;
-
-  /*
    * Location of fragment mapping file, which maps flattened SCFG rules to their internal
    * representation.
    */
@@ -564,14 +552,6 @@ public class JoshuaConfiguration {
             server_port = Integer.parseInt(fds[1]);
             logger.info(String.format("    server-port: %d", server_port));
 
-          } else if (parameter.equals(normalize_key("rescore-forest"))) {
-            rescoreForest = true;
-            logger.info(String.format("    rescore-forest: %s", rescoreForest));
-
-          } else if (parameter.equals(normalize_key("rescore-forest-weight"))) {
-            rescoreForestWeight = Float.parseFloat(fds[1]);
-            logger.info(String.format("    rescore-forest-weight: %f", rescoreForestWeight));
-
           } else if (parameter.equals(normalize_key("maxlen"))) {
             // reset the maximum length
             maxlen = Integer.parseInt(fds[1]);

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/StructuredTranslation.java b/src/joshua/decoder/StructuredTranslation.java
deleted file mode 100644
index 7b2185f..0000000
--- a/src/joshua/decoder/StructuredTranslation.java
+++ /dev/null
@@ -1,125 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder;
-
-import static java.util.Arrays.asList;
-import static java.util.Collections.emptyList;
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignmentList;
-import static joshua.util.FormatUtils.removeSentenceMarkers;
-
-import java.util.List;
-import java.util.Map;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.hypergraph.HyperGraph;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * structuredTranslation provides a more structured access to translation
- * results than the Translation class.
- * Members of instances of this class can be used upstream.
- * <br/>
- * TODO:
- * Enable K-Best extraction.
- * 
- * @author fhieber
- */
-public class StructuredTranslation {
-  
-  private final Sentence sourceSentence;
-  private final String translationString;
-  private final List<String> translationTokens;
-  private final float translationScore;
-  private final List<List<Integer>> translationWordAlignments;
-  private final Map<String,Float> translationFeatures;
-  private final float extractionTime;
-  
-  public StructuredTranslation(final Sentence sourceSentence,
-      final HyperGraph hypergraph,
-      final List<FeatureFunction> featureFunctions) {
-    
-      final long startTime = System.currentTimeMillis();
-      
-      this.sourceSentence = sourceSentence;
-      this.translationString = removeSentenceMarkers(getViterbiString(hypergraph));
-      this.translationTokens = extractTranslationTokens();
-      this.translationScore = extractTranslationScore(hypergraph);
-      this.translationFeatures = getViterbiFeatures(hypergraph, featureFunctions, sourceSentence).getMap();
-      this.translationWordAlignments = getViterbiWordAlignmentList(hypergraph);
-      this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
-  }
-  
-  private float extractTranslationScore(final HyperGraph hypergraph) {
-    if (hypergraph == null) {
-      return 0;
-    } else {
-      return hypergraph.goalNode.getScore();
-    }
-  }
-  
-  private List<String> extractTranslationTokens() {
-    if (translationString.isEmpty()) {
-      return emptyList();
-    } else {
-      return asList(translationString.split("\\s+"));
-    }
-  }
-  
-  // Getters to use upstream
-  
-  public Sentence getSourceSentence() {
-    return sourceSentence;
-  }
-
-  public int getSentenceId() {
-    return sourceSentence.id();
-  }
-
-  public String getTranslationString() {
-    return translationString;
-  }
-
-  public List<String> getTranslationTokens() {
-    return translationTokens;
-  }
-
-  public float getTranslationScore() {
-    return translationScore;
-  }
-
-  /**
-   * Returns a list of target to source alignments.
-   */
-  public List<List<Integer>> getTranslationWordAlignments() {
-    return translationWordAlignments;
-  }
-  
-  public Map<String,Float> getTranslationFeatures() {
-    return translationFeatures;
-  }
-  
-  /**
-   * Time taken to build output information from the hypergraph.
-   */
-  public Float getExtractionTime() {
-    return extractionTime;
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/Translation.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Translation.java b/src/joshua/decoder/Translation.java
index 8004d9f..2af065c 100644
--- a/src/joshua/decoder/Translation.java
+++ b/src/joshua/decoder/Translation.java
@@ -18,24 +18,16 @@
  */
 package joshua.decoder;
 
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiFeatures;
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiWordAlignments;
 import static joshua.util.FormatUtils.removeSentenceMarkers;
+import static joshua.util.FormatUtils.unescapeSpecialSymbols;
 
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.StringWriter;
 import java.util.List;
 
-import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.ff.lm.StateMinimizingLanguageModel;
-import joshua.decoder.hypergraph.HyperGraph;
-import joshua.decoder.hypergraph.KBestExtractor;
-import joshua.decoder.io.DeNormalize;
+import joshua.decoder.hypergraph.WordAlignmentState;
 import joshua.decoder.segment_file.Sentence;
 
+
 /**
  * This class represents translated input objects (sentences or lattices). It is aware of the source
  * sentence and id and contains the decoded hypergraph. Translation objects are returned by
@@ -45,158 +37,83 @@ import joshua.decoder.segment_file.Sentence;
  */
 
 public class Translation {
-  private Sentence source;
-
-  /**
-   * This stores the output of the translation so we don't have to hold onto the hypergraph while we
-   * wait for the outputs to be assembled.
-   */
-  private String output = null;
-
-  private StructuredTranslation structuredTranslation = null;
-  
-  public Translation(Sentence source, HyperGraph hypergraph, 
-      List<FeatureFunction> featureFunctions, JoshuaConfiguration joshuaConfiguration) {
-    this.source = source;
+  private final Sentence sourceSentence;
+  private final String rawTranslation;
+  private final String translation;
+  private final float translationScore;
+  private String formattedTranslation;
+  private List<List<Integer>> translationWordAlignments;
+  private float extractionTime;
+  private FeatureVector features;
+  private WordAlignmentState wordAlignment;
+
+  public Translation(final Sentence source, final String output, final float cost) {
+    this.sourceSentence = source;
+    this.rawTranslation = output;
+    this.translationScore = cost;
     
-    if (joshuaConfiguration.use_structured_output) {
-      
-      structuredTranslation = new StructuredTranslation(
-          source, hypergraph, featureFunctions);
-      this.output = structuredTranslation.getTranslationString();
-      
-    } else {
-
-      StringWriter sw = new StringWriter();
-      BufferedWriter out = new BufferedWriter(sw);
-
-      try {
-        if (hypergraph != null) {
-          if (!joshuaConfiguration.hypergraphFilePattern.equals("")) {
-            hypergraph.dump(String.format(joshuaConfiguration.hypergraphFilePattern, source.id()), featureFunctions);
-          }
-
-          long startTime = System.currentTimeMillis();
-
-          // We must put this weight as zero, otherwise we get an error when we try to retrieve it
-          // without checking
-          Decoder.weights.increment("BLEU", 0);
-          
-          if (joshuaConfiguration.topN == 0) {
-            
-            /* construct Viterbi output */
-            final String best = getViterbiString(hypergraph);
-            
-            Decoder.LOG(1, String.format("Translation %d: %.3f %s", source.id(), hypergraph.goalNode.getScore(),
-                best));
-            
-            /*
-             * Setting topN to 0 turns off k-best extraction, in which case we need to parse through
-             * the output-string, with the understanding that we can only substitute variables for the
-             * output string, sentence number, and model score.
-             */
-            String translation = joshuaConfiguration.outputFormat
-                .replace("%s", removeSentenceMarkers(best))
-                .replace("%S", DeNormalize.processSingleLine(best))
-                .replace("%c", String.format("%.3f", hypergraph.goalNode.getScore()))
-                .replace("%i", String.format("%d", source.id()));
-            
-            if (joshuaConfiguration.outputFormat.contains("%a")) {
-              translation = translation.replace("%a", getViterbiWordAlignments(hypergraph));
-            }
-            
-            if (joshuaConfiguration.outputFormat.contains("%f")) {
-              final FeatureVector features = getViterbiFeatures(hypergraph, featureFunctions, source);
-              translation = translation.replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString());
-            }
-            
-            out.write(translation);
-            out.newLine();
-            
-          } else {
-            
-            final KBestExtractor kBestExtractor = new KBestExtractor(
-                source, featureFunctions, Decoder.weights, false, joshuaConfiguration);
-            kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
-
-            if (joshuaConfiguration.rescoreForest) {
-              Decoder.weights.increment("BLEU", joshuaConfiguration.rescoreForestWeight);
-              kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
-
-              Decoder.weights.increment("BLEU", -joshuaConfiguration.rescoreForestWeight);
-              kBestExtractor.lazyKBestExtractOnHG(hypergraph, joshuaConfiguration.topN, out);
-            }
-          }
-
-          float seconds = (float) (System.currentTimeMillis() - startTime) / 1000.0f;
-          Decoder.LOG(1, String.format("Input %d: %d-best extraction took %.3f seconds", id(),
-              joshuaConfiguration.topN, seconds));
-
-      } else {
-        
-        // Failed translations and blank lines get empty formatted outputs
-        // @formatter:off
-        String outputString = joshuaConfiguration.outputFormat
-            .replace("%s", source.source())
-            .replace("%e", "")
-            .replace("%S", "")
-            .replace("%t", "()")
-            .replace("%i", Integer.toString(source.id()))
-            .replace("%f", "")
-            .replace("%c", "0.000");
-        // @formatter:on
-
-        out.write(outputString);
-        out.newLine();
-      }
-
-        out.flush();
-      } catch (IOException e) {
-        e.printStackTrace();
-        System.exit(1);
-      }
-      
-      this.output = sw.toString();
-      
-    }
-
-    /*
-     * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
-     * objects for this sentence.
-     */
-    for (FeatureFunction feature : featureFunctions) {
-      if (feature instanceof StateMinimizingLanguageModel) {
-        ((StateMinimizingLanguageModel) feature).destroyPool(getSourceSentence().id());
-        break;
-      }
-    }
+    this.translation = unescapeSpecialSymbols(removeSentenceMarkers(rawTranslation));
     
+//    final long startTime = System.currentTimeMillis();
+//    this.extractionTime = (System.currentTimeMillis() - startTime) / 1000.0f;
   }
 
   public Sentence getSourceSentence() {
-    return this.source;
+    return this.sourceSentence;
   }
 
+  public float score() {
+    return translationScore;
+  }
+
+  /**
+   * Returns a list of target to source alignments.
+   */
+  public List<List<Integer>> getTranslationWordAlignments() {
+    return translationWordAlignments;
+  }
+  
+  /**
+   * Time taken to build output information from the hypergraph.
+   */
+  public Float getExtractionTime() {
+    return extractionTime;
+  }
+  
   public int id() {
-    return source.id();
+    return sourceSentence.id();
   }
 
   @Override
   public String toString() {
-    return output;
+    return this.translation;
   }
   
-  /**
-   * Returns the StructuredTranslation object
-   * if JoshuaConfiguration.construct_structured_output == True.
-   * @throws RuntimeException if StructuredTranslation object not set.
-   * @return
-   */
-  public StructuredTranslation getStructuredTranslation() {
-    if (structuredTranslation == null) {
-      throw new RuntimeException("No StructuredTranslation object created. You should set JoshuaConfigration.construct_structured_output = true");
-    }
-    return structuredTranslation;
+  public String rawTranslation() {
+    return this.rawTranslation;
+  }
+
+  public void setFormattedTranslation(String formattedTranslation) {
+    this.formattedTranslation = formattedTranslation;
+  }
+  
+  public String getFormattedTranslation() {
+    return this.formattedTranslation;
+  }
+
+  public void setFeatures(FeatureVector features) {
+    this.features = features;
   }
   
+  public FeatureVector getFeatures() {
+    return this.features;
+  }
+
+  public void setWordAlignment(WordAlignmentState wordAlignment) {
+    this.wordAlignment = wordAlignment;
+  }
+
+  public Object getWordAlignment() {
+    return this.wordAlignment;
+  }
 }

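With StructuredTranslation gone, Translation is reduced to a value object: the constructor
stores the raw output string and model score, and everything else is attached later through
setters. A hedged sketch of direct use (all literal values are illustrative):

    Translation t = new Translation(sentence, "<s> hello world </s>", -3.14f);
    t.setFormattedTranslation(String.format("%d ||| %s ||| %.3f", t.id(), t, t.score()));
    // toString() yields the cleaned string (sentence markers removed, symbols unescaped)
    System.out.println(t.getFormattedTranslation());
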
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/TranslationFactory.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/TranslationFactory.java b/src/joshua/decoder/TranslationFactory.java
new file mode 100644
index 0000000..9d1953e
--- /dev/null
+++ b/src/joshua/decoder/TranslationFactory.java
@@ -0,0 +1,127 @@
+package joshua.decoder;
+
+import joshua.decoder.ff.FeatureVector;
+import joshua.decoder.hypergraph.DerivationState;
+import joshua.decoder.hypergraph.KBestExtractor.Side;
+import joshua.decoder.io.DeNormalize;
+import joshua.decoder.segment_file.Sentence;
+import joshua.decoder.segment_file.Token;
+import joshua.util.FormatUtils;
+
+import java.util.List;
+
+public class TranslationFactory {
+
+  private final Sentence sentence;
+  private final JoshuaConfiguration config;
+
+  private DerivationState derivation;
+  private Translation translation;
+
+  public TranslationFactory(Sentence sentence, DerivationState derivation, JoshuaConfiguration config) {
+    this.sentence = sentence;
+    this.derivation = derivation;
+    this.config = config;
+    
+    if (this.derivation != null) {
+      this.translation = new Translation(sentence, derivation.getHypothesis(), derivation.getCost());
+    } else {
+      this.translation = new Translation(sentence, null, 0.0f);
+    }
+  }
+  
+  public Translation translation() {
+    return this.translation;
+  }
+
+  public TranslationFactory formattedTranslation(String format) {
+
+    // TODO: instead of calling replace() a million times, walk through yourself and find the
+    // special characters, and then replace them.  If you do this from the right side the index
+    // replacement should be a lot more efficient than what we're doing here, particularly since
+    // all these arguments get evaluated whether they're used or not
+
+    String output = format
+        .replace("%s", translation.toString())
+        .replace("%e", derivation.getHypothesis(Side.SOURCE))
+        .replace("%S", DeNormalize.processSingleLine(translation.toString()))
+        .replace("%c", String.format("%.3f", translation.score()))
+        .replace("%i", Integer.toString(sentence.id()));
+
+    if (output.contains("%a")) {
+      this.alignments().translation();
+      output = output.replace("%a", translation.getWordAlignment().toString());
+    }
+
+    if (config.outputFormat.contains("%f")) {
+      this.features();
+      final FeatureVector features = translation.getFeatures();
+      output = output.replace("%f", config.moses ? features.mosesString() : features.toString());
+    }
+    
+    if (output.contains("%t")) {
+      // TODO: also store in Translation object
+      output = output.replace("%t", derivation.getTree());
+    }
+
+    /* %d causes a derivation with rules one per line to be output */
+    if (output.contains("%d")) {
+      // TODO: also store in Translation object
+      output = output.replace("%d", derivation.getDerivation());
+    }
+
+    translation.setFormattedTranslation(maybeProjectCase(derivation, output));
+    return this;
+  }
+
+  /** 
+   * Extracts the feature vector from the derivation and stores it on the Translation.
+   * 
+   * @return this factory, for chaining
+   */
+  public TranslationFactory features() {
+    translation.setFeatures(derivation.getFeatures());
+    return this;
+  }
+  
+  public TranslationFactory alignments() {
+    // TODO: write this
+    //    this.translation.setAlignments(getViterbiWordAlignmentList(derivation);
+    translation.setWordAlignment(derivation.getWordAlignment());
+    return this;
+  }
+  
+  /**
+   * If requested, projects source-side lettercase to the target side, and appends the
+   * alignment to the source-side sentence in ||s.
+   * 
+   * @param hypothesis
+   * @param state
+   * @return
+   */
+  private String maybeProjectCase(DerivationState derivation, String hypothesis) {
+    String output = hypothesis;
+
+    if (config.project_case) {
+      String[] tokens = hypothesis.split("\\s+");
+      List<List<Integer>> points = derivation.getWordAlignment().toFinalList();
+      for (int i = 0; i < points.size(); i++) {
+        List<Integer> target = points.get(i);
+        for (int source: target) {
+          Token token = sentence.getTokens().get(source + 1); // skip <s>
+          String annotation = "";
+          if (token != null && token.getAnnotation("lettercase") != null)
+            annotation = token.getAnnotation("lettercase");
+          if (source != 0 && annotation.equals("upper"))
+            tokens[i] = FormatUtils.capitalize(tokens[i]);
+          else if (annotation.equals("all-upper"))
+            tokens[i] = tokens[i].toUpperCase();
+        }
+      }
+
+      output = String.join(" ",  tokens);
+    }
+
+    return output;
+  }
+}

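TranslationFactory is a small builder over a DerivationState: each call computes one piece of
output and stores it on the wrapped Translation. A usage sketch matching the Decoder change
later in this series (sentence, derivation, and config assumed in scope):

    TranslationFactory factory = new TranslationFactory(sentence, derivation, config);
    Translation translation = factory
        .features()                                      // replay feature values
        .formattedTranslation("%i ||| %s ||| %f ||| %c") // Moses-style output string
        .translation();
    System.out.println(translation.getFormattedTranslation());
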
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/Translations.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Translations.java b/src/joshua/decoder/Translations.java
index e6ba9e6..5bb0456 100644
--- a/src/joshua/decoder/Translations.java
+++ b/src/joshua/decoder/Translations.java
@@ -19,6 +19,8 @@
 package joshua.decoder;
 
 import java.util.LinkedList;
+
+import joshua.decoder.hypergraph.HyperGraph;
 import joshua.decoder.io.TranslationRequestStream;
 
 /**
@@ -44,13 +46,13 @@ public class Translations {
   private int currentID = 0;
 
   /* The set of translated sentences. */
-  private LinkedList<Translation> translations = null;
+  private LinkedList<HyperGraph> translations = null;
 
   private boolean spent = false;
 
   public Translations(TranslationRequestStream request) {
     this.request = request;
-    this.translations = new LinkedList<Translation>();
+    this.translations = new LinkedList<HyperGraph>();
   }
 
   /**
@@ -75,21 +77,21 @@ public class Translations {
    * 
    * @param translation
    */
-  public void record(Translation translation) {
+  public void record(HyperGraph hyperGraph) {
     synchronized (this) {
 
       /* Pad the set of translations with nulls to accommodate the new translation. */
-      int offset = translation.id() - currentID;
+      int offset = hyperGraph.sentID() - currentID;
       while (offset >= translations.size())
         translations.add(null);
-      translations.set(offset, translation);
+      translations.set(offset, hyperGraph);
 
       /*
        * If the id of the current translation is at the head of the list (first element), then we
        * have the next Translation to be returned, and we should notify anyone waiting on next(),
        * which will then remove the item and increment the currentID.
        */
-      if (translation.id() == currentID) {
+      if (hyperGraph.sentID() == currentID) {
         this.notify();
       }
     }
@@ -99,7 +101,7 @@ public class Translations {
    * Returns the next Translation, blocking if necessary until it's available, since the next
    * Translation might not have been produced yet.
    */
-  public Translation next() {
+  public HyperGraph next() {
     synchronized (this) {
 
       /*

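Translations remains the synchronization point between decoder threads and the single output
thread; only its element type changes from Translation to HyperGraph. The protocol, sketched
under the assumption (unchanged by this patch) that next() returns null once the request
stream is exhausted:

    // producer side (any DecoderThread), possibly out of order:
    translations.record(hypergraph);   // pads with nulls; notifies when id == currentID

    // consumer side (the output loop), strictly in order:
    HyperGraph next;
    while ((next = translations.next()) != null) {
      // assemble and write output for sentence next.sentID()
    }
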
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/ff/fragmentlm/Tree.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/fragmentlm/Tree.java b/src/joshua/decoder/ff/fragmentlm/Tree.java
index b52ccce..5051c1b 100644
--- a/src/joshua/decoder/ff/fragmentlm/Tree.java
+++ b/src/joshua/decoder/ff/fragmentlm/Tree.java
@@ -26,9 +26,9 @@ import java.util.*;
 import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.fragmentlm.Trees.PennTreeReader;
 import joshua.decoder.ff.tm.Rule;
+import joshua.decoder.hypergraph.DerivationState;
 import joshua.decoder.hypergraph.HGNode;
 import joshua.decoder.hypergraph.HyperEdge;
-import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
 import joshua.util.io.LineReader;
 
 /**

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/hypergraph/FeatureVectorExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/FeatureVectorExtractor.java b/src/joshua/decoder/hypergraph/FeatureVectorExtractor.java
index dbe4f4b..a6e30a0 100644
--- a/src/joshua/decoder/hypergraph/FeatureVectorExtractor.java
+++ b/src/joshua/decoder/hypergraph/FeatureVectorExtractor.java
@@ -24,7 +24,6 @@ import java.util.List;
 
 import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
 import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor;
 import joshua.decoder.segment_file.Sentence;
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index 6dd3207..b8f167c 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -18,9 +18,6 @@
  */
 package joshua.decoder.hypergraph;
 
-import static joshua.util.FormatUtils.unescapeSpecialSymbols;
-import static joshua.util.FormatUtils.removeSentenceMarkers;
-
 import java.io.BufferedWriter;
 import java.io.IOException;
 import java.io.OutputStreamWriter;
@@ -29,18 +26,17 @@ import java.util.Comparator;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.HashSet;
+import java.util.Iterator;
 import java.util.List;
 import java.util.PriorityQueue;
 
 import joshua.corpus.Vocabulary;
-import joshua.decoder.BLEU;
 import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.FeatureVector;
 import joshua.decoder.ff.fragmentlm.Tree;
 import joshua.decoder.ff.state_maintenance.DPState;
 import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.io.DeNormalize;
 import joshua.decoder.segment_file.Sentence;
 import joshua.decoder.segment_file.Token;
 import joshua.util.FormatUtils;
@@ -87,19 +83,22 @@ import joshua.util.FormatUtils;
  * The configuration parameter `top-n` controls how many items are returned. If this is set to 0,
  * k-best extraction should be turned off entirely.
  * 
+ * You can call getViterbiDerivation() essentially for free. But as soon as you call hasNext()
+ * (or next(), e.g., via the iterator), you're going to trigger some relatively expensive
+ * k-best computation.
+ * 
  * @author Zhifei Li, <zh...@gmail.com>
  * @author Matt Post <po...@cs.jhu.edu>
  */
-public class KBestExtractor {
+public class KBestExtractor implements Iterator<DerivationState>, Iterable<DerivationState> { 
   private final JoshuaConfiguration joshuaConfiguration;
-  private final String outputFormat;
   private final HashMap<HGNode, VirtualNode> virtualNodesTable = new HashMap<HGNode, VirtualNode>();
 
   // static final String rootSym = JoshuaConfiguration.goal_symbol;
   static final String rootSym = "ROOT";
   static final int rootID = Vocabulary.id(rootSym);
 
-  private enum Side {
+  public enum Side {
     SOURCE, TARGET
   };
 
@@ -107,55 +106,59 @@ public class KBestExtractor {
   private final boolean extractUniqueNbest;
 
   /* Which side to output (source or target) */
-  private final Side defaultSide;
+  final Side defaultSide;
 
   /* The input sentence */
-  private final Sentence sentence;
+  final Sentence sentence;
 
   /* The weights being used to score the forest */
-  private final FeatureVector weights;
+  final FeatureVector weights;
 
   /* The feature functions */
-  private final List<FeatureFunction> featureFunctions;
-
-  /* BLEU statistics of the references */
-  private BLEU.References references = null;
+  final List<FeatureFunction> featureFunctions;
+  private HyperGraph hyperGraph;
+  private DerivationState nextDerivation = null;
+  private int derivationCounter;
 
   public KBestExtractor(
       Sentence sentence,
+      HyperGraph hyperGraph,
       List<FeatureFunction> featureFunctions,
       FeatureVector weights,
       boolean isMonolingual,
       JoshuaConfiguration joshuaConfiguration) {
 
     this.featureFunctions = featureFunctions;
-
+    this.hyperGraph = hyperGraph;
     this.joshuaConfiguration = joshuaConfiguration;
-    this.outputFormat = this.joshuaConfiguration.outputFormat;
     this.extractUniqueNbest = joshuaConfiguration.use_unique_nbest;
 
     this.weights = weights;
     this.defaultSide = (isMonolingual ? Side.SOURCE : Side.TARGET);
     this.sentence = sentence;
-
-    if (joshuaConfiguration.rescoreForest) {
-      references = new BLEU.References(sentence.references());
-    }
+    
+    // initialize the iterator
+    this.derivationCounter = 0;
+    this.nextDerivation = getViterbiDerivation();
   }
 
   /**
-   * Returns the kth derivation.
+   * Returns the Viterbi derivation. You don't want to use the general k-best extraction code because
+   * (a) the Viterbi derivation is always needed and (b) k-best extraction is slow. So this is basically
+   * a convenience function that bypasses the expensive k-best extraction for a common use case.
    * 
-   * You may need to reset_state() before you call this function for the first time.
-   * 
-   * @param node the node to start at
-   * @param k the kth best derivation (indexed from 1)
-   * @return the derivation object
+   * @return the Viterbi derivation
    */
-  public DerivationState getKthDerivation(HGNode node, int k) {
-    VirtualNode virtualNode = getVirtualNode(node);
-    return virtualNode.lazyKBestExtractOnNode(this, k);
+  public DerivationState getViterbiDerivation() {
+    
+    /* TODO: this is just a short-cut to get this working. Instead of triggering the k-best extraction,
+     * it would be better to have a shortcut function that can construct a {@link DerivationState} object
+     * from the hypergraph directly, which would be a lot cheaper.
+     */
+    hasNext();
+    return this.nextDerivation;
   }
+
   
   /**
    * Compute the string that is output from the decoder, using the "output-format" config file
@@ -163,142 +166,18 @@ public class KBestExtractor {
    * 
    * You may need to reset_state() before you call this function for the first time.
    */
-  public String getKthHyp(HGNode node, int k) {
+  public DerivationState getKthHyp(HGNode node, int k) {
 
-    String outputString = null;
-    
     // Determine the k-best hypotheses at each HGNode
     VirtualNode virtualNode = getVirtualNode(node);
     DerivationState derivationState = virtualNode.lazyKBestExtractOnNode(this, k);
-//    DerivationState derivationState = getKthDerivation(node, k);
-    if (derivationState != null) {
-      // ==== read the kbest from each hgnode and convert to output format
-      String hypothesis = maybeProjectCase(
-                            unescapeSpecialSymbols(
-                              removeSentenceMarkers(
-                                derivationState.getHypothesis())), derivationState);
-      
-      
-      /*
-       * To save space, the decoder only stores the model cost,
-       * no the individual feature values.
-       * If you want to output them, you have to replay them.
-       */
-
-      FeatureVector features = new FeatureVector();
-      if (outputFormat.contains("%f") || outputFormat.contains("%d"))
-        features = derivationState.getFeatures();
-
-      outputString = outputFormat
-          .replace("%k", Integer.toString(k))
-          .replace("%s", hypothesis)
-          .replace("%S", DeNormalize.processSingleLine(hypothesis))
-          // TODO (kellens): Fix the recapitalization here
-          .replace("%i", Integer.toString(sentence.id()))
-          .replace("%f", joshuaConfiguration.moses ? features.mosesString() : features.toString())
-          .replace("%c", String.format("%.3f", derivationState.cost));
-
-      if (outputFormat.contains("%t")) {
-        outputString = outputString.replace("%t", derivationState.getTree());
-      }
 
-      if (outputFormat.contains("%e")) {
-        outputString = outputString.replace("%e", removeSentenceMarkers(derivationState.getHypothesis(Side.SOURCE)));
-      }
-
-      /* %d causes a derivation with rules one per line to be output */
-      if (outputFormat.contains("%d")) {
-        outputString = outputString.replace("%d", derivationState.getDerivation());
-      }
-      
-      /* %a causes output of word level alignments between input and output hypothesis */
-      if (outputFormat.contains("%a")) {
-        outputString = outputString.replace("%a",  derivationState.getWordAlignmentString());
-      }
-      
-    }
-
-    return outputString;
+    return derivationState;
   }
 
   // =========================== end kbestHypergraph
 
   /**
-   * If requested, projects source-side lettercase to target, and appends the alignment from
-   * to the source-side sentence in ||s.
-   * 
-   * @param hypothesis
-   * @param state
-   * @return
-   */
-  private String maybeProjectCase(String hypothesis, DerivationState state) {
-    String output = hypothesis;
-
-    if (joshuaConfiguration.project_case) {
-      String[] tokens = hypothesis.split("\\s+");
-      List<List<Integer>> points = state.getWordAlignment();
-      for (int i = 0; i < points.size(); i++) {
-        List<Integer> target = points.get(i);
-        for (int source: target) {
-          Token token = sentence.getTokens().get(source + 1); // skip <s>
-          String annotation = "";
-          if (token != null && token.getAnnotation("lettercase") != null)
-            annotation = token.getAnnotation("lettercase");
-          if (source != 0 && annotation.equals("upper"))
-            tokens[i] = FormatUtils.capitalize(tokens[i]);
-          else if (annotation.equals("all-upper"))
-            tokens[i] = tokens[i].toUpperCase();
-        }
-      }
-
-      output = String.join(" ",  tokens);
-    }
-
-    return output;
-  }
-
-  /**
-   * Convenience function for k-best extraction that prints to STDOUT.
-   */
-  public void lazyKBestExtractOnHG(HyperGraph hg, int topN) throws IOException {
-    lazyKBestExtractOnHG(hg, topN, new BufferedWriter(new OutputStreamWriter(System.out)));
-  }
-
-  /**
-   * This is the entry point for extracting k-best hypotheses. It computes all of them, writing
-   * the results to the BufferedWriter passed in. If you want intermediate access to the k-best
-   * derivations, you'll want to call getKthHyp() or getKthDerivation() directly.
-   * 
-   * The number of derivations that are looked for is controlled by the `top-n` parameter.
-   * Note that when `top-n` is set to 0, k-best extraction is disabled entirely, and only things 
-   * like the viterbi string and the model score are available to the decoder. Since k-best
-   * extraction involves the recomputation of features to get the component values, turning off
-   * that extraction saves a lot of time when only the 1-best string is desired.
-   * 
-   * @param hg the hypergraph to extract from
-   * @param topN how many to extract
-   * @param out object to write to
-   * @throws IOException
-   */
-  public void lazyKBestExtractOnHG(HyperGraph hg, int topN, BufferedWriter out) throws IOException {
-
-    resetState();
-
-    if (null == hg.goalNode)
-      return;
-
-    for (int k = 1; k <= topN; k++) {
-      String hypStr = getKthHyp(hg.goalNode, k);
-      if (null == hypStr)
-        break;
-
-      out.write(hypStr);
-      out.write("\n");
-      out.flush();
-    }
-  }
-
-  /**
    * This clears the virtualNodesTable, which maintains a list of virtual nodes. This should be
    * called in between forest rescorings.
    */
@@ -313,7 +192,7 @@ public class KBestExtractor {
    * @param hgnode
    * @return the corresponding VirtualNode
    */
-  private VirtualNode getVirtualNode(HGNode hgnode) {
+  VirtualNode getVirtualNode(HGNode hgnode) {
     VirtualNode virtualNode = virtualNodesTable.get(hgnode);
     if (null == virtualNode) {
       virtualNode = new VirtualNode(hgnode);
@@ -330,7 +209,7 @@ public class KBestExtractor {
    * queue of candidates.
    */
 
-  private class VirtualNode {
+  class VirtualNode {
 
     // The node being annotated.
     HGNode node = null;
@@ -452,7 +331,7 @@ public class KBestExtractor {
         newRanks[i] = previousState.ranks[i] + 1;
 
         // Create a new state so we can see if it's new. The cost will be set below if it is.
-        DerivationState nextState = new DerivationState(previousState.parentNode,
+        DerivationState nextState = new DerivationState(KBestExtractor.this, previousState.parentNode,
             previousState.edge, newRanks, 0.0f, previousState.edgePos);
 
         // Don't add the state to the list of candidates if it's already been added.
@@ -469,9 +348,6 @@ public class KBestExtractor {
                 + virtualTailNode.nbests.get(newRanks[i] - 1).getModelCost();
             nextState.setCost(cost);
 
-            if (joshuaConfiguration.rescoreForest)
-              nextState.bleu = nextState.computeBLEU();
-
             candHeap.add(nextState);
             derivationTable.add(nextState);
 
@@ -582,243 +458,12 @@ public class KBestExtractor {
       }
       cost = (float) hyperEdge.getBestDerivationScore();
 
-      DerivationState state = new DerivationState(parentNode, hyperEdge, ranks, cost, edgePos);
-      if (joshuaConfiguration.rescoreForest)
-        state.bleu = state.computeBLEU();
+      DerivationState state = new DerivationState(KBestExtractor.this, parentNode, hyperEdge, ranks, cost, edgePos);
 
       return state;
     }
   };
 
-  /**
-   * A DerivationState describes which path to follow through the hypergraph. For example, it
-   * might say to use the 1-best from the first tail node, the 9th-best from the second tail node,
-   * and so on. This information is represented recursively through a chain of DerivationState
-   * objects. This function follows that chain, extracting the information according to a number
-   * of parameters, and returning results to a string, and also (optionally) accumulating the
-   * feature values into the passed-in FeatureVector.
-   */
-
-  // each DerivationState roughly corresponds to a hypothesis
-  public class DerivationState {
-    /* The edge ("e" in the paper) */
-    public HyperEdge edge;
-
-    /* The edge's parent node */
-    public HGNode parentNode;
-
-    /*
-     * This state's position in its parent node's list of incoming hyperedges (used in signature
-     * calculation)
-     */
-    public int edgePos;
-
-    /*
-     * The rank item to select from each of the incoming tail nodes ("j" in the paper, an ArrayList
-     * of size |e|)
-     */
-    public int[] ranks;
-
-    /*
-     * The cost of the hypothesis, including a weighted BLEU score, if any.
-     */
-    private float cost;
-
-    private float bleu = 0.0f;
-
-    /*
-     * The BLEU sufficient statistics associated with the edge's derivation. Note that this is a
-     * function of the complete derivation headed by the edge, i.e., all the particular
-     * subderivations of edges beneath it. That is why it must be contained in DerivationState
-     * instead of in the HyperEdge itself.
-     */
-    BLEU.Stats stats = null;
-
-    public DerivationState(HGNode pa, HyperEdge e, int[] r, float c, int pos) {
-      parentNode = pa;
-      edge = e;
-      ranks = r;
-      cost = c;
-      edgePos = pos;
-      bleu = 0.0f;
-    }
-
-    /**
-     * Computes a scaled approximate BLEU from the accumulated statistics. We know the number of
-     * words; to compute the effective reference length, we take the real reference length statistic
-     * and scale it by the percentage of the input sentence that is consumed, based on the
-     * assumption that the total number of words in the hypothesis scales linearly with the input
-     * sentence span.
-     * 
-     * @return
-     */
-    public float computeBLEU() {
-      if (stats == null) {
-        float percentage = 1.0f * (parentNode.j - parentNode.i) / (sentence.length());
-        // System.err.println(String.format("computeBLEU: (%d - %d) / %d = %f", parentNode.j,
-        // parentNode.i, sentence.length(), percentage));
-        stats = BLEU.compute(edge, percentage, references);
-
-        if (edge.getTailNodes() != null) {
-          for (int id = 0; id < edge.getTailNodes().size(); id++) {
-            stats.add(getChildDerivationState(edge, id).stats);
-          }
-        }
-      }
-
-      return BLEU.score(stats);
-    }
-
-    public void setCost(float cost2) {
-      this.cost = cost2;
-    }
-
-    /**
-     * Returns the model cost. This is obtained by subtracting off the incorporated BLEU score (if
-     * used).
-     * 
-     * @return
-     */
-    public float getModelCost() {
-      return this.cost;
-    }
-
-    /**
-     * Returns the model cost plus the BLEU score.
-     * 
-     * @return
-     */
-    public float getCost() {
-      return cost - weights.getSparse("BLEU") * bleu;
-    }
-
-    public String toString() {
-      StringBuilder sb = new StringBuilder(String.format("DS[[ %s (%d,%d)/%d ||| ",
-          Vocabulary.word(parentNode.lhs), parentNode.i, parentNode.j, edgePos));
-      sb.append("ranks=[ ");
-      if (ranks != null)
-        for (int i = 0; i < ranks.length; i++)
-          sb.append(ranks[i] + " ");
-      sb.append("] ||| " + String.format("%.5f ]]", cost));
-      return sb.toString();
-    }
-
-    public boolean equals(Object other) {
-      if (other instanceof DerivationState) {
-        DerivationState that = (DerivationState) other;
-        if (edgePos == that.edgePos) {
-          if (ranks != null && that.ranks != null) {
-            if (ranks.length == that.ranks.length) {
-              for (int i = 0; i < ranks.length; i++)
-                if (ranks[i] != that.ranks[i])
-                  return false;
-              return true;
-            }
-          }
-        }
-      }
-
-      return false;
-    }
-
-    /**
-     * DerivationState objects are unique to each VirtualNode, so the unique identifying information
-     * only need contain the edge position and the ranks.
-     */
-    public int hashCode() {
-      int hash = edgePos;
-      if (ranks != null) {
-        for (int i = 0; i < ranks.length; i++)
-          hash = hash * 53 + i;
-      }
-
-      return hash;
-    }
-
-    /**
-     * Visits every state in the derivation in a depth-first order.
-     */
-    private DerivationVisitor visit(DerivationVisitor visitor) {
-      return visit(visitor, 0, 0);
-    }
-
-    private DerivationVisitor visit(DerivationVisitor visitor, int indent, int tailNodeIndex) {
-
-      visitor.before(this, indent, tailNodeIndex);
-
-      final Rule rule = edge.getRule();
-      final List<HGNode> tailNodes = edge.getTailNodes();
-
-      if (rule == null) {
-        getChildDerivationState(edge, 0).visit(visitor, indent + 1, 0);
-      } else {
-        if (tailNodes != null) {
-          for (int index = 0; index < tailNodes.size(); index++) {
-            getChildDerivationState(edge, index).visit(visitor, indent + 1, index);
-          }
-        }
-      }
-
-      visitor.after(this, indent, tailNodeIndex);
-
-      return visitor;
-    }
-
-    private String getWordAlignmentString() {
-      return visit(new WordAlignmentExtractor()).toString();
-    }
-    
-    private List<List<Integer>> getWordAlignment() {
-      WordAlignmentExtractor extractor = new WordAlignmentExtractor();
-      visit(extractor);
-      return extractor.getFinalWordAlignments();
-    }
-
-    private String getTree() {
-      return visit(new TreeExtractor()).toString();
-    }
-    
-    private String getHypothesis() {
-      return getHypothesis(defaultSide);
-    }
-
-    /**
-     * For stack decoding we keep using the old string-based
-     * HypothesisExtractor.
-     * For Hiero, we use a faster, int-based hypothesis extraction
-     * that is correct also for Side.SOURCE cases.
-     */
-    private String getHypothesis(final Side side) {
-      return visit(new OutputStringExtractor(side.equals(Side.SOURCE))).toString();
-    }
-
-    private FeatureVector getFeatures() {
-      final FeatureVectorExtractor extractor = new FeatureVectorExtractor(featureFunctions, sentence);
-      visit(extractor);
-      return extractor.getFeatures();
-    }
-
-    private String getDerivation() {
-      return visit(new DerivationExtractor()).toString();
-    }
-
-    /**
-     * Helper function for navigating the hierarchical list of DerivationState objects. This
-     * function looks up the VirtualNode corresponding to the HGNode pointed to by the edge's
-     * {tailNodeIndex}th tail node.
-     * 
-     * @param edge
-     * @param tailNodeIndex
-     * @return
-     */
-    public DerivationState getChildDerivationState(HyperEdge edge, int tailNodeIndex) {
-      HGNode child = edge.getTailNodes().get(tailNodeIndex);
-      VirtualNode virtualChild = getVirtualNode(child);
-      return virtualChild.nbests.get(ranks[tailNodeIndex] - 1);
-    }
-
-  } // end of Class DerivationState
-
   public static class DerivationStateComparator implements Comparator<DerivationState> {
     // natural order by cost
     public int compare(DerivationState one, DerivationState another) {
@@ -1001,6 +646,31 @@ public class KBestExtractor {
     @Override
     public void after(DerivationState state, int level, int tailNodeIndex) {}
   }
-  
 
+  @Override
+  public Iterator<DerivationState> iterator() {
+    return this;
+  }
+
+  @Override
+  public boolean hasNext() {
+    if (this.nextDerivation != null)
+      return true;
+
+    derivationCounter++;
+    
+    VirtualNode virtualNode = getVirtualNode(hyperGraph.goalNode);
+    this.nextDerivation = virtualNode.lazyKBestExtractOnNode(this, derivationCounter);
+    return this.nextDerivation != null;
+  }
+
+  @Override
+  public DerivationState next() {
+    if (this.hasNext()) {
+      DerivationState returnDerivation = this.nextDerivation;
+      this.nextDerivation = null;
+      return returnDerivation;
+    }
+    return null;
+  }
 }

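The headline change here: KBestExtractor is now itself an Iterator/Iterable over
DerivationState, seeded with the Viterbi derivation, so the first element of any loop is the
1-best. The intended idiom, mirroring the Decoder and JSONMessage call sites in this series:

    KBestExtractor extractor =
        new KBestExtractor(sentence, hypergraph, featureFunctions, weights, false, config);

    int k = 1;
    for (DerivationState derivation : extractor) {
      if (k > config.topN || derivation == null)
        break;
      // turn each derivation into output via TranslationFactory
      k++;
    }
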
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/hypergraph/OutputStringExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/OutputStringExtractor.java b/src/joshua/decoder/hypergraph/OutputStringExtractor.java
index f67a9df..8fc10ce 100644
--- a/src/joshua/decoder/hypergraph/OutputStringExtractor.java
+++ b/src/joshua/decoder/hypergraph/OutputStringExtractor.java
@@ -24,7 +24,6 @@ import static joshua.corpus.Vocabulary.getWords;
 import java.util.Stack;
 
 import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
 import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor;
 import joshua.util.FormatUtils;
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/hypergraph/ViterbiExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiExtractor.java b/src/joshua/decoder/hypergraph/ViterbiExtractor.java
index 31c8dc0..b70446a 100644
--- a/src/joshua/decoder/hypergraph/ViterbiExtractor.java
+++ b/src/joshua/decoder/hypergraph/ViterbiExtractor.java
@@ -109,9 +109,9 @@ public class ViterbiExtractor {
   /**
    * Returns the Viterbi Word Alignments as list of lists (target-side).
    */
-  public static List<List<Integer>> getViterbiWordAlignmentList(final HyperGraph hg) {
+  public static WordAlignmentState getViterbiWordAlignmentList(final HyperGraph hg) {
     if (hg == null)
-      return emptyList();
+      return null;
     
     final WordAlignmentExtractor wordAlignmentWalker = new WordAlignmentExtractor();
     viterbiWalk(hg.goalNode, wordAlignmentWalker);

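Callers of getViterbiWordAlignmentList() now receive a WordAlignmentState (or null) rather
than a list of lists. Recovering the old representation is a one-liner, assuming
toFinalList() behaves as in TranslationFactory above:

    WordAlignmentState alignment = ViterbiExtractor.getViterbiWordAlignmentList(hypergraph);
    List<List<Integer>> points =
        (alignment == null) ? Collections.<List<Integer>>emptyList() : alignment.toFinalList();
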
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
index 837c69f..cb2f059 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
@@ -24,7 +24,6 @@ import java.util.List;
 import java.util.Stack;
 
 import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
 import joshua.decoder.hypergraph.KBestExtractor.DerivationVisitor;
 
 /**
@@ -97,9 +96,9 @@ public class WordAlignmentExtractor implements WalkerFunction, DerivationVisitor
    * Final word alignment without sentence markers
    * or empty list if stack is empty.
    */
-  public List<List<Integer>> getFinalWordAlignments() {
+  public WordAlignmentState getFinalWordAlignments() {
     if (stack.isEmpty()) {
-      return emptyList();
+      return null;
     }
     
     if (stack.size() != 1) {
@@ -108,7 +107,7 @@ public class WordAlignmentExtractor implements WalkerFunction, DerivationVisitor
               "Stack of WordAlignmentExtractor should contain only a single (last) element, but was size %d", stack.size()));
     }
     
-    return stack.peek().toFinalList();
+    return stack.peek();
   }
   
   /**

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/decoder/io/JSONMessage.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/io/JSONMessage.java b/src/joshua/decoder/io/JSONMessage.java
index 2733db4..bf75133 100644
--- a/src/joshua/decoder/io/JSONMessage.java
+++ b/src/joshua/decoder/io/JSONMessage.java
@@ -24,7 +24,12 @@ import java.util.List;
 import com.google.gson.Gson;
 import com.google.gson.GsonBuilder;
 
+import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.Translation;
+import joshua.decoder.TranslationFactory;
+import joshua.decoder.hypergraph.DerivationState;
+import joshua.decoder.hypergraph.KBestExtractor;
+import joshua.decoder.segment_file.Sentence;
 
 public class JSONMessage {
   public Data data = null;
@@ -86,19 +91,25 @@ public class JSONMessage {
     }
   }
 
-  public static JSONMessage buildMessage(Translation translation) {
+  public static JSONMessage buildMessage(Sentence sentence, KBestExtractor extractor, JoshuaConfiguration config) {
     JSONMessage message = new JSONMessage();
-    String[] results = translation.toString().split("\\n");
-    if (results.length > 0) {
-      JSONMessage.TranslationItem item = message.addTranslation(translation.getStructuredTranslation().getTranslationString());
+    
+    final String mosesFormat = "%i ||| %s ||| %f ||| %c"; 
+    
+    int k = 1;
+    for (DerivationState derivation: extractor) {
+      if (k > config.topN)
+        break;
+      
+      TranslationFactory factory = new TranslationFactory(sentence, derivation, config);
+      Translation translation = factory.formattedTranslation(mosesFormat).translation();
 
-      for (String result: results) {
-        String[] tokens = result.split(" \\|\\|\\| ");
-        String rawResult = tokens[1];
-        float score = Float.parseFloat(tokens[3]);
-        item.addHypothesis(rawResult, score);
-      }
+      JSONMessage.TranslationItem item = message.addTranslation(translation.toString());
+      item.addHypothesis(translation.toString(), translation.score());
+      
+      k++;
     }
+
     return message;
   }
   


[7/8] incubator-joshua git commit: bugfix: where to destroy KenLM allocations

Posted by mj...@apache.org.
bugfix: where to destroy KenLM allocations


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/19aadf0e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/19aadf0e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/19aadf0e

Branch: refs/heads/JOSHUA-273
Commit: 19aadf0e240acb09c5b4336068e6368083f26bef
Parents: ebdf643
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 11:47:23 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 11:47:23 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/Decoder.java       | 12 ++++++++++++
 src/joshua/decoder/DecoderThread.java | 12 ------------
 2 files changed, 12 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/19aadf0e/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 414d547..2cc8438 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -510,6 +510,18 @@ public class Decoder {
         out.write("\n".getBytes());
       }
       out.flush();
+    
+      /*
+       * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
+       * objects for this sentence.
+       */
+      // TODO: make sure this works here
+      for (FeatureFunction feature : featureFunctions) {
+        if (feature instanceof StateMinimizingLanguageModel) {
+          ((StateMinimizingLanguageModel) feature).destroyPool(hg.sentence.id());
+          break;
+        }
+      }
     }
     
     if (config.n_best_file != null)

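Moving the loop here presumably ensures the KenLM pool outlives k-best extraction, which
still touches language-model state while assembling output. The pattern in isolation
(featureFunctions and a decoded hypergraph hg assumed in scope):

    for (FeatureFunction feature : featureFunctions) {
      if (feature instanceof StateMinimizingLanguageModel) {
        // at most one KenLM-backed LM holds a per-sentence pool, hence the break
        ((StateMinimizingLanguageModel) feature).destroyPool(hg.sentence.id());
        break;
      }
    }
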
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/19aadf0e/src/joshua/decoder/DecoderThread.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/DecoderThread.java b/src/joshua/decoder/DecoderThread.java
index cf78420..7a04500 100644
--- a/src/joshua/decoder/DecoderThread.java
+++ b/src/joshua/decoder/DecoderThread.java
@@ -148,18 +148,6 @@ public class DecoderThread extends Thread {
     Decoder.LOG(1, String.format("Input %d: Translation took %.3f seconds", sentence.id(), seconds));
     Decoder.LOG(1, String.format("Input %d: Memory used is %.1f MB", sentence.id(), (Runtime
         .getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0));
-
-     /*
-     * KenLM hack. If using KenLMFF, we need to tell KenLM to delete the pool used to create chart
-     * objects for this sentence.
-     */
-    // TODO: make sure this works here
-    for (FeatureFunction feature : featureFunctions) {
-      if (feature instanceof StateMinimizingLanguageModel) {
-        ((StateMinimizingLanguageModel) feature).destroyPool(sentence.id());
-        break;
-      }
-    }
     
     /* Return the translation unless we're doing synchronous parsing. */
     if (!joshuaConfiguration.parse || hypergraph == null) {


[5/8] incubator-joshua git commit: minor cleanup

Posted by mj...@apache.org.
minor cleanup


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3f305f35
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3f305f35
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3f305f35

Branch: refs/heads/JOSHUA-273
Commit: 3f305f359620e80d57b3b86d657a9ec2fe80ec33
Parents: f2f82c3
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 11:29:59 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 11:29:59 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/Decoder.java                        | 10 ++++------
 src/joshua/decoder/JoshuaConfiguration.java            | 13 -------------
 src/joshua/decoder/chart_parser/DotChart.java          |  3 ---
 src/joshua/decoder/hypergraph/KBestExtractor.java      |  5 -----
 src/joshua/decoder/hypergraph/ViterbiExtractor.java    |  2 --
 .../decoder/hypergraph/WordAlignmentExtractor.java     |  3 ---
 6 files changed, 4 insertions(+), 32 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f305f35/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index fc4ba89..414d547 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -182,9 +182,6 @@ public class Decoder {
     /* Where to put translated sentences. */
     private final Translations response;
     
-    /* Sometimes we need to communicate with the client even when we didn't get a new sentence
-     * (e.g., metadata)
-     */
     private OutputStream out;
     
     RequestParallelizer(TranslationRequestStream request, Translations response) {
@@ -481,11 +478,12 @@ public class Decoder {
         String text;
         if (config.moses) {
           KBestExtractor extractor = new KBestExtractor(sentence, hg, featureFunctions, weights, false, config);
+          
           final String mosesFormat = "%i ||| %s ||| %f ||| %c"; 
           
           int k = 1;
           for (DerivationState derivation: extractor) {
-            if (k > config.topN)
+            if (k > config.topN || derivation == null)
               break;
             
             TranslationFactory factory = new TranslationFactory(sentence, derivation, config);
@@ -691,7 +689,7 @@ public class Decoder {
           System.err
               .println("You might be using an old version of the config file that is no longer supported");
           System.err
-              .println("Check joshua-decoder.org or email joshua_support@googlegroups.com for help");
+              .println("Check joshua-decoder.org or email user@joshua.incubator.apache.org for help");
           System.exit(17);
         }
 
@@ -699,7 +697,7 @@ public class Decoder {
       }
 
       Decoder.LOG(1, String.format("Read %d weights (%d of them dense)", weights.size(),
-      DENSE_FEATURE_NAMES.size()));
+          DENSE_FEATURE_NAMES.size()));
 
       // Do this before loading the grammars and the LM.
       this.featureFunctions = new ArrayList<FeatureFunction>();

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f305f35/src/joshua/decoder/JoshuaConfiguration.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/JoshuaConfiguration.java b/src/joshua/decoder/JoshuaConfiguration.java
index 3f20f46..8621382 100644
--- a/src/joshua/decoder/JoshuaConfiguration.java
+++ b/src/joshua/decoder/JoshuaConfiguration.java
@@ -214,14 +214,6 @@ public class JoshuaConfiguration {
    */
   public String fragmentMapFile = null;
 
-  /*
-   * Whether to use soft syntactic constraint decoding /fuzzy matching, which allows that any
-   * nonterminal may be substituted for any other nonterminal (except for OOV and GOAL)
-   */
-  public boolean fuzzy_matching = false;
-
-  public static final String SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME = "fuzzy_matching";
-
   /***
    * Phrase-based decoding parameters.
    */
@@ -568,11 +560,6 @@ public class JoshuaConfiguration {
             // add the feature to the list of features for later processing
             maxlen = Integer.parseInt(fds[1]);
 
-          } else if (parameter
-              .equals(normalize_key(SOFT_SYNTACTIC_CONSTRAINT_DECODING_PROPERTY_NAME))) {
-            fuzzy_matching = Boolean.parseBoolean(fds[1]);
-            logger.finest(String.format(fuzzy_matching + ": %s", fuzzy_matching));
-
           } else if (parameter.equals(normalize_key("fragment-map"))) {
             fragmentMapFile = fds[1];
             Tree.readMapping(fragmentMapFile);
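
For context, the fuzzy_matching option removed above enabled soft syntactic constraint decoding: any nonterminal could substitute for any other (except OOV and GOAL). As a hypothetical illustration (this rule is not from the test suite), with fuzzy matching on, a syntax-labeled rule such as

  [NP] ||| la casa ||| the house ||| 0.5

could fill a slot that a derivation labels [X]; with the option removed, nonterminal labels must match exactly.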

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f305f35/src/joshua/decoder/chart_parser/DotChart.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/chart_parser/DotChart.java b/src/joshua/decoder/chart_parser/DotChart.java
index 796256e..b1a16da 100644
--- a/src/joshua/decoder/chart_parser/DotChart.java
+++ b/src/joshua/decoder/chart_parser/DotChart.java
@@ -19,13 +19,10 @@
 package joshua.decoder.chart_parser;
 
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
-import java.util.Map;
 import java.util.logging.Level;
 import java.util.logging.Logger;
 
-import joshua.corpus.Vocabulary;
 import joshua.decoder.ff.tm.Grammar;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.ff.tm.RuleCollection;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f305f35/src/joshua/decoder/hypergraph/KBestExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/KBestExtractor.java b/src/joshua/decoder/hypergraph/KBestExtractor.java
index b8f167c..4d40639 100644
--- a/src/joshua/decoder/hypergraph/KBestExtractor.java
+++ b/src/joshua/decoder/hypergraph/KBestExtractor.java
@@ -18,9 +18,6 @@
  */
 package joshua.decoder.hypergraph;
 
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
 import java.util.Arrays;
 import java.util.Comparator;
 import java.util.ArrayList;
@@ -38,8 +35,6 @@ import joshua.decoder.ff.fragmentlm.Tree;
 import joshua.decoder.ff.state_maintenance.DPState;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.segment_file.Sentence;
-import joshua.decoder.segment_file.Token;
-import joshua.util.FormatUtils;
 
 /**
  * This class implements lazy k-best extraction on a hyper-graph.

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f305f35/src/joshua/decoder/hypergraph/ViterbiExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/ViterbiExtractor.java b/src/joshua/decoder/hypergraph/ViterbiExtractor.java
index b70446a..8a76240 100644
--- a/src/joshua/decoder/hypergraph/ViterbiExtractor.java
+++ b/src/joshua/decoder/hypergraph/ViterbiExtractor.java
@@ -18,8 +18,6 @@
  */
 package joshua.decoder.hypergraph;
 
-import static java.util.Collections.emptyList;
-
 import java.util.ArrayList;
 import java.util.List;
 

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3f305f35/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
index cb2f059..15af246 100644
--- a/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
+++ b/src/joshua/decoder/hypergraph/WordAlignmentExtractor.java
@@ -18,9 +18,6 @@
  */
 package joshua.decoder.hypergraph;
 
-import static java.util.Collections.emptyList;
-
-import java.util.List;
 import java.util.Stack;
 
 import joshua.decoder.ff.tm.Rule;


[6/8] incubator-joshua git commit: removed rescoring test case

Posted by mj...@apache.org.
removed rescoring test case


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/ebdf643b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/ebdf643b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/ebdf643b

Branch: refs/heads/JOSHUA-273
Commit: ebdf643be2fa3bc51b9d33562b9be1e00eb625fe
Parents: 3f305f3
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 11:44:44 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 11:44:44 2016 -0400

----------------------------------------------------------------------
 test/decoder/rescoring/glue-grammar  |   3 ---
 test/decoder/rescoring/grammar.gz    | Bin 177 -> 0 bytes
 test/decoder/rescoring/input.txt     |   2 --
 test/decoder/rescoring/joshua.config |  31 ------------------------------
 test/decoder/rescoring/output.gold   |  12 ------------
 test/decoder/rescoring/test.sh       |  30 -----------------------------
 6 files changed, 78 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ebdf643b/test/decoder/rescoring/glue-grammar
----------------------------------------------------------------------
diff --git a/test/decoder/rescoring/glue-grammar b/test/decoder/rescoring/glue-grammar
deleted file mode 100644
index 6a1162f..0000000
--- a/test/decoder/rescoring/glue-grammar
+++ /dev/null
@@ -1,3 +0,0 @@
-[GOAL] ||| <s> ||| <s> ||| 0
-[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
-[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ebdf643b/test/decoder/rescoring/grammar.gz
----------------------------------------------------------------------
diff --git a/test/decoder/rescoring/grammar.gz b/test/decoder/rescoring/grammar.gz
deleted file mode 100644
index 6708c0d..0000000
Binary files a/test/decoder/rescoring/grammar.gz and /dev/null differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ebdf643b/test/decoder/rescoring/input.txt
----------------------------------------------------------------------
diff --git a/test/decoder/rescoring/input.txt b/test/decoder/rescoring/input.txt
deleted file mode 100644
index 5562a01..0000000
--- a/test/decoder/rescoring/input.txt
+++ /dev/null
@@ -1,2 +0,0 @@
-el nino tomo la cucaracha |||  ||| the boy ate the cockroach
-el nino tomo la cucaracha |||  ||| the big storm swarmed the coast ||| the big storm only swarmed the coast

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ebdf643b/test/decoder/rescoring/joshua.config
----------------------------------------------------------------------
diff --git a/test/decoder/rescoring/joshua.config b/test/decoder/rescoring/joshua.config
deleted file mode 100644
index 0e4a277..0000000
--- a/test/decoder/rescoring/joshua.config
+++ /dev/null
@@ -1,31 +0,0 @@
-rescore-forest = true
-rescore-forest-weight = 100
-
-lm = kenlm 5 false false 100 ../constrained/lm.gz
-
-tm = thrax pt 12 grammar.gz
-tm = thrax glue -1 glue-grammar
-
-mark-oovs = true
-
-default-non-terminal = X
-goalSymbol = GOAL
-
-#pruning config
-pop-limit = 100
-
-#output-format = %i %c %s
-
-#nbest config
-use_unique_nbest = true
-top_n = 2
-
-feature-function = WordPenalty
-feature-function = OOVPenalty
-
-lm_0 1.2373676802179452
-
-tm_pt_0 1
-tm_glue_0 1
-WordPenalty -1
-OOVPenalty 1.0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ebdf643b/test/decoder/rescoring/output.gold
----------------------------------------------------------------------
diff --git a/test/decoder/rescoring/output.gold b/test/decoder/rescoring/output.gold
deleted file mode 100644
index 5d6600d..0000000
--- a/test/decoder/rescoring/output.gold
+++ /dev/null
@@ -1,12 +0,0 @@
-0 ||| the boy ate the cockroach ||| tm_pt_0=-6.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -19.240
-0 ||| the boy eated the cockroach ||| tm_pt_0=-11.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -24.240
-0 ||| the boy eated the cockroach ||| tm_pt_0=-11.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -24.240
-0 ||| the kid eated the cockroach ||| tm_pt_0=-15.000 tm_glue_0=5.000 lm_0=-20.053 WordPenalty=-3.040 OOVPenalty=0.000 ||| -31.773
-0 ||| the boy ate the cockroach ||| tm_pt_0=-6.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -19.240
-0 ||| the boy eated the cockroach ||| tm_pt_0=-11.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -24.240
-1 ||| the boy ate the cockroach ||| tm_pt_0=-6.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -19.240
-1 ||| the boy eated the cockroach ||| tm_pt_0=-11.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -24.240
-1 ||| the boy ate the cockroach ||| tm_pt_0=-6.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -19.240
-1 ||| the boy eated the cockroach ||| tm_pt_0=-11.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -24.240
-1 ||| the boy ate the cockroach ||| tm_pt_0=-6.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -19.240
-1 ||| the boy eated the cockroach ||| tm_pt_0=-11.000 tm_glue_0=5.000 lm_0=-17.198 WordPenalty=-3.040 OOVPenalty=0.000 ||| -24.240

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ebdf643b/test/decoder/rescoring/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/rescoring/test.sh b/test/decoder/rescoring/test.sh
deleted file mode 100755
index 58f2d2d..0000000
--- a/test/decoder/rescoring/test.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-set -u
-
-cat input.txt | $JOSHUA/bin/joshua-decoder -m 1g -threads 1 -c joshua.config > output 2> log
-
-# Compare
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
-	rm -f diff log output 
-	exit 0
-else
-	exit 1
-fi


[8/8] incubator-joshua git commit: Added k-best extraction (logic is duplicated)

Posted by mj...@apache.org.
Added k-best extraction (logic is duplicated)


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/f4090b0f
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/f4090b0f
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/f4090b0f

Branch: refs/heads/JOSHUA-273
Commit: f4090b0fb419b186d6c4aa395b046417763de0e4
Parents: 19aadf0
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 11:49:16 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 11:49:16 2016 -0400

----------------------------------------------------------------------
 src/joshua/decoder/Decoder.java | 23 +++++++++++++++--------
 1 file changed, 15 insertions(+), 8 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f4090b0f/src/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/Decoder.java b/src/joshua/decoder/Decoder.java
index 2cc8438..1a48110 100644
--- a/src/joshua/decoder/Decoder.java
+++ b/src/joshua/decoder/Decoder.java
@@ -45,6 +45,7 @@ import joshua.decoder.ff.FeatureFunction;
 import joshua.decoder.ff.PhraseModel;
 import joshua.decoder.ff.StatefulFF;
 import joshua.decoder.ff.lm.LanguageModelFF;
+import joshua.decoder.ff.lm.StateMinimizingLanguageModel;
 import joshua.decoder.ff.tm.Grammar;
 import joshua.decoder.ff.tm.Rule;
 import joshua.decoder.ff.tm.Trie;
@@ -498,16 +499,22 @@ public class Decoder {
         }
 
         KBestExtractor extractor = new KBestExtractor(sentence, hg, featureFunctions, weights, false, config);
-        DerivationState viterbi = extractor.getViterbiDerivation();
-        Translation best = new TranslationFactory(sentence, viterbi, config)
-            .formattedTranslation(config.outputFormat)
+        int k = 1;
+        for (DerivationState derivation: extractor) {
+          if (k > config.topN || derivation == null)
+            break;
+
+          Translation t = new TranslationFactory(sentence, derivation, config)
+              .formattedTranslation(config.outputFormat)
               .translation();
-        
-        Decoder.LOG(1, String.format("Translation %d: %.3f %s", sentence.id(), best.score(), best.toString()));
+          
+          if (k == 1)
+            Decoder.LOG(1, String.format("Translation %d: %.3f %s", sentence.id(), t.score(), t.toString()));
 
-        String bestString = best.getFormattedTranslation();
-        out.write(bestString.getBytes());
-        out.write("\n".getBytes());
+          String bestString = t.getFormattedTranslation();
+          out.write(bestString.getBytes());
+          out.write("\n".getBytes());
+        }
       }
       out.flush();
     


[2/8] incubator-joshua git commit: Removed regexp grammar tests

Posted by mj...@apache.org.
Removed regexp grammar tests


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/fe88c686
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/fe88c686
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/fe88c686

Branch: refs/heads/JOSHUA-273
Commit: fe88c686e082b190dc95e634956cc4679982b3bc
Parents: fe2c434
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 07:26:26 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 07:26:26 2016 -0400

----------------------------------------------------------------------
 .../regexp-grammar-both-rule-types/.gitignore   |  2 --
 .../regexp-grammar-both-rule-types/README       | 16 -----------
 .../regexp-grammar-both-rule-types/config       |  9 ------
 .../regexp-grammar-both-rule-types/glue-grammar |  3 --
 .../regexp-grammar-both-rule-types/input        |  5 ----
 .../regexp-grammar-both-rule-types/output.gold  | 12 --------
 .../regexp-grammar                              | 12 --------
 .../regexp-grammar-both-rule-types/test.sh      | 29 --------------------
 .../regexp-grammar-both-rule-types/weights      |  4 ---
 test/decoder/regexp-grammar/.gitignore          |  2 --
 test/decoder/regexp-grammar/README              | 10 -------
 test/decoder/regexp-grammar/config              | 11 --------
 test/decoder/regexp-grammar/glue-grammar        |  3 --
 test/decoder/regexp-grammar/input               |  4 ---
 test/decoder/regexp-grammar/output.gold         |  4 ---
 test/decoder/regexp-grammar/regexp-grammar      |  6 ----
 test/decoder/regexp-grammar/test.sh             | 29 --------------------
 test/decoder/regexp-grammar/weights             |  5 ----
 18 files changed, 166 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar-both-rule-types/.gitignore
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar-both-rule-types/.gitignore b/test/decoder/regexp-grammar-both-rule-types/.gitignore
deleted file mode 100644
index d937c7f..0000000
--- a/test/decoder/regexp-grammar-both-rule-types/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-diff
-output

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar-both-rule-types/README
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar-both-rule-types/README b/test/decoder/regexp-grammar-both-rule-types/README
deleted file mode 100644
index 226fa64..0000000
--- a/test/decoder/regexp-grammar-both-rule-types/README
+++ /dev/null
@@ -1,16 +0,0 @@
-This tests the case where something matched *both* a regex and a non-regex
-rule (or two regexes), but the (correct) regex rule wasn't winning. It should
-be the case, if the code is right, that if you change the order of the rules in
-your grammar, you still get the same output translations.
-
-This test tests the use of regular expressions in the grammar.  This is an
-experimental feature with an inefficient implementation in the decoder, but
-there are a number of things that could be done to make it more efficient if
-the technique proves useful.
-
-To enable it, you set the Joshua parameter
-
-  regexp-grammar = OWNER
-
-where OWNER is the owner of one or more grammars whose rules might be interpreted as regular
-expressions.

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar-both-rule-types/config
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar-both-rule-types/config b/test/decoder/regexp-grammar-both-rule-types/config
deleted file mode 100644
index 0fb4c0c..0000000
--- a/test/decoder/regexp-grammar-both-rule-types/config
+++ /dev/null
@@ -1,9 +0,0 @@
-tm = regexp regexp 10 ./regexp-grammar
-tm = thrax glue -1 ./glue-grammar
-mark-oovs = true
-goal-symbol = GOAL
-top-n = 3
-
-weights-file = weights
-
-feature-function = OOVPenalty

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar-both-rule-types/glue-grammar
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar-both-rule-types/glue-grammar b/test/decoder/regexp-grammar-both-rule-types/glue-grammar
deleted file mode 100644
index 6a1162f..0000000
--- a/test/decoder/regexp-grammar-both-rule-types/glue-grammar
+++ /dev/null
@@ -1,3 +0,0 @@
-[GOAL] ||| <s> ||| <s> ||| 0
-[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
-[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar-both-rule-types/input
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar-both-rule-types/input b/test/decoder/regexp-grammar-both-rule-types/input
deleted file mode 100644
index 5531876..0000000
--- a/test/decoder/regexp-grammar-both-rule-types/input
+++ /dev/null
@@ -1,5 +0,0 @@
-chica linda
-chicos lindos
-chicos lind?s
-1928371028
-192837102

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar-both-rule-types/output.gold
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar-both-rule-types/output.gold b/test/decoder/regexp-grammar-both-rule-types/output.gold
deleted file mode 100644
index c8edb86..0000000
--- a/test/decoder/regexp-grammar-both-rule-types/output.gold
+++ /dev/null
@@ -1,12 +0,0 @@
-0 ||| girl feminine-singular-pretty ||| tm_regexp_0=-2.000 tm_regexp_1=0.000 tm_glue_0=2.000 OOVPenalty=0.000 ||| -4.000
-0 ||| girl feminine-pretty ||| tm_regexp_0=-2.000 tm_regexp_1=-1.000 tm_glue_0=2.000 OOVPenalty=0.000 ||| -5.000
-0 ||| girl generic-pretty ||| tm_regexp_0=-2.000 tm_regexp_1=-2.000 tm_glue_0=2.000 OOVPenalty=0.000 ||| -6.000
-1 ||| boys masculine-pretty ||| tm_regexp_0=-2.000 tm_regexp_1=-1.000 tm_glue_0=2.000 OOVPenalty=0.000 ||| -5.000
-1 ||| boys generic-pretty ||| tm_regexp_0=-2.000 tm_regexp_1=-2.000 tm_glue_0=2.000 OOVPenalty=0.000 ||| -6.000
-1 ||| boys lindos_OOV ||| tm_regexp_0=-1.000 tm_regexp_1=0.000 tm_glue_0=2.000 OOVPenalty=-100.000 ||| -103.000
-2 ||| boys generic-pretty ||| tm_regexp_0=-2.000 tm_regexp_1=-2.000 tm_glue_0=2.000 OOVPenalty=0.000 ||| -6.000
-2 ||| boys lind?s_OOV ||| tm_regexp_0=-1.000 tm_regexp_1=0.000 tm_glue_0=2.000 OOVPenalty=-100.000 ||| -103.000
-2 ||| chicos_OOV generic-pretty ||| tm_regexp_0=-1.000 tm_regexp_1=-2.000 tm_glue_0=2.000 OOVPenalty=-100.000 ||| -105.000
-3 ||| really big number ||| tm_regexp_0=-1.000 tm_regexp_1=-1.000 tm_glue_0=1.000 OOVPenalty=0.000 ||| -3.000
-3 ||| 1928371028_OOV ||| tm_regexp_0=0.000 tm_regexp_1=0.000 tm_glue_0=1.000 OOVPenalty=-100.000 ||| -101.000
-4 ||| 192837102_OOV ||| tm_regexp_0=0.000 tm_regexp_1=0.000 tm_glue_0=1.000 OOVPenalty=-100.000 ||| -101.000

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar-both-rule-types/regexp-grammar
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar-both-rule-types/regexp-grammar b/test/decoder/regexp-grammar-both-rule-types/regexp-grammar
deleted file mode 100644
index c93dc80..0000000
--- a/test/decoder/regexp-grammar-both-rule-types/regexp-grammar
+++ /dev/null
@@ -1,12 +0,0 @@
-[X] ||| blah linda ||| feminine-singular-pretty blah ||| 1 0
-[X] ||| \d{10,} ||| really big number ||| 1 1
-[X] ||| lindo.* ||| masculine-pretty ||| 1 1
-[X] ||| linda.* ||| feminine-pretty ||| 1 1
-[X] ||| lind.* ||| generic-pretty ||| 1 2
-[X] ||| lindo ||| masculine-singular-pretty ||| 1 0
-[X] ||| linda ||| feminine-singular-pretty ||| 1 0
-[X] ||| chico ||| boy ||| 1 0
-[X] ||| chicos ||| boys ||| 1 0
-[X] ||| chica ||| girl ||| 1 0
-[X] ||| chicas ||| girls ||| 1 0
-[X] ||| grande ||| great ||| 1 0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar-both-rule-types/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar-both-rule-types/test.sh b/test/decoder/regexp-grammar-both-rule-types/test.sh
deleted file mode 100755
index d4b6436..0000000
--- a/test/decoder/regexp-grammar-both-rule-types/test.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-set -u
-
-cat input | $JOSHUA/bin/joshua-decoder -m 1g -c config > output 2> log
-
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
-    rm -f output log diff
-    exit 0
-else
-    exit 1
-fi

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar-both-rule-types/weights
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar-both-rule-types/weights b/test/decoder/regexp-grammar-both-rule-types/weights
deleted file mode 100644
index a998939..0000000
--- a/test/decoder/regexp-grammar-both-rule-types/weights
+++ /dev/null
@@ -1,4 +0,0 @@
-tm_regexp_0 1
-tm_regexp_1 1
-tm_glue_0 -1
-OOVPenalty 1

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar/.gitignore
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar/.gitignore b/test/decoder/regexp-grammar/.gitignore
deleted file mode 100644
index d937c7f..0000000
--- a/test/decoder/regexp-grammar/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-diff
-output

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar/README
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar/README b/test/decoder/regexp-grammar/README
deleted file mode 100644
index df81a67..0000000
--- a/test/decoder/regexp-grammar/README
+++ /dev/null
@@ -1,10 +0,0 @@
-This test tests the use of regular expressions in the grammar.  This is an experimental feature with
-an inefficient implementation in the decoder, but there are a number of things that could be done to
-make it more efficient if the technique proves useful.
-
-To enable it, you set the Joshua parameter
-
-  regexp-grammar = OWNER
-
-where OWNER is the owner of one or more grammars whose rules might be interpreted as regular
-expressions.

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar/config
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar/config b/test/decoder/regexp-grammar/config
deleted file mode 100644
index 526dba0..0000000
--- a/test/decoder/regexp-grammar/config
+++ /dev/null
@@ -1,11 +0,0 @@
-tm = regexp regexp 10 ./regexp-grammar
-tm = thrax glue -1 ./glue-grammar
-mark-oovs = true
-goal-symbol = GOAL
-regexp-grammar = regexp
-
-weights-file = weights
-
-feature-function = OOVPenalty
-
-

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar/glue-grammar
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar/glue-grammar b/test/decoder/regexp-grammar/glue-grammar
deleted file mode 100644
index 6a1162f..0000000
--- a/test/decoder/regexp-grammar/glue-grammar
+++ /dev/null
@@ -1,3 +0,0 @@
-[GOAL] ||| <s> ||| <s> ||| 0
-[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
-[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar/input
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar/input b/test/decoder/regexp-grammar/input
deleted file mode 100644
index 8cdf0f8..0000000
--- a/test/decoder/regexp-grammar/input
+++ /dev/null
@@ -1,4 +0,0 @@
-chica linda
-chico lindo
-1928371028
-192837102

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar/output.gold
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar/output.gold b/test/decoder/regexp-grammar/output.gold
deleted file mode 100644
index 49c5ea4..0000000
--- a/test/decoder/regexp-grammar/output.gold
+++ /dev/null
@@ -1,4 +0,0 @@
-0 ||| girl pretty ||| tm_regexp_0=-2.000 tm_regexp_1=-1.000 tm_glue_0=2.000 OOVPenalty=0.000 ||| -5.000
-1 ||| boy pretty ||| tm_regexp_0=-2.000 tm_regexp_1=-1.000 tm_glue_0=2.000 OOVPenalty=0.000 ||| -5.000
-2 ||| really big number ||| tm_regexp_0=-1.000 tm_regexp_1=0.000 tm_glue_0=1.000 OOVPenalty=0.000 ||| -2.000
-3 ||| 192837102_OOV ||| tm_regexp_0=0.000 tm_regexp_1=0.000 tm_glue_0=1.000 OOVPenalty=-100.000 ||| -101.000

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar/regexp-grammar
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar/regexp-grammar b/test/decoder/regexp-grammar/regexp-grammar
deleted file mode 100644
index 6f6c57c..0000000
--- a/test/decoder/regexp-grammar/regexp-grammar
+++ /dev/null
@@ -1,6 +0,0 @@
-[X] ||| lind.* ||| pretty ||| 1 1
-[X] ||| lindo ||| [boy version of pretty] ||| 10 0 
-[X] ||| linda ||| [girl version of pretty] ||| 10 0 
-[X] ||| chico ||| boy ||| 1 0
-[X] ||| chica ||| girl ||| 1 0
-[X] ||| \d{10,} ||| really big number ||| 1 0

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar/test.sh
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar/test.sh b/test/decoder/regexp-grammar/test.sh
deleted file mode 100755
index 3235bd4..0000000
--- a/test/decoder/regexp-grammar/test.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/bin/bash
-#
-# Licensed to the Apache Software Foundation (ASF) under one or more
-# contributor license agreements.  See the NOTICE file distributed with
-# this work for additional information regarding copyright ownership.
-# The ASF licenses this file to You under the Apache License, Version 2.0
-# (the "License"); you may not use this file except in compliance with
-# the License.  You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-#
-set -u
-
-cat input | $JOSHUA/bin/joshua-decoder -c config > output 2> log
-
-diff -u output output.gold > diff
-
-if [ $? -eq 0 ]; then
-  rm -rf output log diff
-	exit 0
-else
-	exit 1
-fi

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fe88c686/test/decoder/regexp-grammar/weights
----------------------------------------------------------------------
diff --git a/test/decoder/regexp-grammar/weights b/test/decoder/regexp-grammar/weights
deleted file mode 100644
index 4782753..0000000
--- a/test/decoder/regexp-grammar/weights
+++ /dev/null
@@ -1,5 +0,0 @@
-tm_regexp_0 1
-tm_regexp_1 1
-tm_glue_0 -1
-
-OOVPenalty 1


[3/8] incubator-joshua git commit: Large refactor of the Translation output interface

Posted by mj...@apache.org.
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/oracle/OracleExtractionHG.java
----------------------------------------------------------------------
diff --git a/src/joshua/oracle/OracleExtractionHG.java b/src/joshua/oracle/OracleExtractionHG.java
deleted file mode 100644
index 8a688e3..0000000
--- a/src/joshua/oracle/OracleExtractionHG.java
+++ /dev/null
@@ -1,794 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.oracle;
-
-import static joshua.decoder.hypergraph.ViterbiExtractor.getViterbiString;
-import static joshua.util.FormatUtils.removeSentenceMarkers;
-
-import java.io.BufferedWriter;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.HashMap;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.Support;
-import joshua.decoder.Decoder;
-import joshua.decoder.hypergraph.HGNode;
-import joshua.decoder.hypergraph.HyperEdge;
-import joshua.decoder.hypergraph.HyperGraph;
-import joshua.decoder.hypergraph.KBestExtractor;
-import joshua.util.FileUtility;
-import joshua.util.FormatUtils;
-import joshua.util.io.LineReader;
-
-/**
- * Approximated BLEU: (1) does not consider the clipping effect; (2) in the dynamic programming, does
- * not maintain different states for different hypothesis lengths; (3) the brevity penalty is based on
- * the average reference length; (4) uses sentence-level BLEU instead of document-level BLEU.
- * 
- * @author Zhifei Li, <zh...@gmail.com> (Johns Hopkins University)
- */
-public class OracleExtractionHG extends SplitHg {
-  static String BACKOFF_LEFT_LM_STATE_SYM = "<lzfbo>";
-  public int BACKOFF_LEFT_LM_STATE_SYM_ID;// used for equivalent state
-
-  static String NULL_LEFT_LM_STATE_SYM = "<lzflnull>";
-  public int NULL_LEFT_LM_STATE_SYM_ID;// used for equivalent state
-
-  static String NULL_RIGHT_LM_STATE_SYM = "<lzfrnull>";
-  public int NULL_RIGHT_LM_STATE_SYM_ID;// used for equivalent state
-
-  // int[] ref_sentence;//reference string (not tree)
-  protected int src_sent_len = 0;
-  protected int ref_sent_len = 0;
-  protected int g_lm_order = 4; // only used to decide whether to get the LM state by this class or
-                                // not in compute_state
-  static protected boolean do_local_ngram_clip = false;
-  static protected boolean maitain_length_state = false;
-  static protected int g_bleu_order = 4;
-
-  static boolean using_left_equiv_state = true;
-  static boolean using_right_equiv_state = true;
-
-  // TODO Add generics to hash tables in this class
-  HashMap<String, Boolean> tbl_suffix = new HashMap<String, Boolean>();
-  HashMap<String, Boolean> tbl_prefix = new HashMap<String, Boolean>();
-  static PrefixGrammar grammar_prefix = new PrefixGrammar();// TODO
-  static PrefixGrammar grammar_suffix = new PrefixGrammar();// TODO
-
-  // key: item; value: best_deduction, best_bleu, best_len, # of n-gram match where n is in [1,4]
-  protected HashMap<String, Integer> tbl_ref_ngrams = new HashMap<String, Integer>();
-
-  static boolean always_maintain_seperate_lm_state = true; // if true: the virtual item maintains its
-                                                           // own lm state regardless of whether
-                                                           // lm_order >= g_bleu_order
-
-  int lm_feat_id = 0; // the baseline LM feature id
-
-  /**
-   * Constructs a new object capable of extracting a tree from a hypergraph that most closely
-   * matches a provided oracle sentence.
-   * <p>
-   * It seems that the symbol table here should only need to represent monolingual terminals, plus
-   * nonterminals.
-   * 
-   * @param lm_feat_id_
-   */
-  public OracleExtractionHG(int lm_feat_id_) {
-    this.lm_feat_id = lm_feat_id_;
-    this.BACKOFF_LEFT_LM_STATE_SYM_ID = Vocabulary.id(BACKOFF_LEFT_LM_STATE_SYM);
-    this.NULL_LEFT_LM_STATE_SYM_ID = Vocabulary.id(NULL_RIGHT_LM_STATE_SYM);
-    this.NULL_RIGHT_LM_STATE_SYM_ID = Vocabulary.id(NULL_RIGHT_LM_STATE_SYM);
-  }
-
-  /*
-   * for 919 sent, time_on_reading: 148797 time_on_orc_extract: 580286
-   */
-  @SuppressWarnings({ "unused" })
-  public static void main(String[] args) throws IOException {
-    JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration();
-    /*
-     * String f_hypergraphs="C:\\Users\\zli\\Documents\\mt03.src.txt.ss.nbest.hg.items"; String
-     * f_rule_tbl="C:\\Users\\zli\\Documents\\mt03.src.txt.ss.nbest.hg.rules"; String
-     * f_ref_files="C:\\Users\\zli\\Documents\\mt03.ref.txt.1"; String f_orc_out
-     * ="C:\\Users\\zli\\Documents\\mt03.orc.txt";
-     */
-    if (6 != args.length) {
-      System.out
-          .println("Usage: java Decoder f_hypergraphs f_rule_tbl f_ref_files f_orc_out lm_order orc_extract_nbest");
-      System.out.println("num of args is " + args.length);
-      for (int i = 0; i < args.length; i++) {
-        System.out.println("arg is: " + args[i]);
-      }
-      System.exit(1);
-    }
-    // String f_hypergraphs = args[0].trim();
-    // String f_rule_tbl = args[1].trim();
-    String f_ref_files = args[2].trim();
-    String f_orc_out = args[3].trim();
-    int lm_order = Integer.parseInt(args[4].trim());
-    boolean orc_extract_nbest = Boolean.valueOf(args[5].trim()); // oracle extraction from nbest or
-                                                                 // hg
-
-    // ??????????????????????????????????????
-    int baseline_lm_feat_id = 0;
-    // ??????????????????????????????????????
-
-    KBestExtractor kbest_extractor = null;
-    int topN = 300;// TODO
-    joshuaConfiguration.use_unique_nbest = true;
-    joshuaConfiguration.include_align_index = false;
-    boolean do_ngram_clip_nbest = true; // TODO
-    if (orc_extract_nbest) {
-      System.out.println("oracle extraction from nbest list");
-
-      kbest_extractor = new KBestExtractor(null, null, Decoder.weights, false, joshuaConfiguration);
-    }
-
-    BufferedWriter orc_out = FileUtility.getWriteFileStream(f_orc_out);
-
-    long start_time0 = System.currentTimeMillis();
-    long time_on_reading = 0;
-    long time_on_orc_extract = 0;
-    // DiskHyperGraph dhg_read = new DiskHyperGraph(baseline_lm_feat_id, true, null);
-
-    // dhg_read.initRead(f_hypergraphs, f_rule_tbl, null);
-
-    OracleExtractionHG orc_extractor = new OracleExtractionHG(baseline_lm_feat_id);
-    long start_time = System.currentTimeMillis();
-    int sent_id = 0;
-    for (String ref_sent: new LineReader(f_ref_files)) {
-      System.out.println("############Process sentence " + sent_id);
-      start_time = System.currentTimeMillis();
-      sent_id++;
-      // if(sent_id>10)break;
-
-      // HyperGraph hg = dhg_read.readHyperGraph();
-      HyperGraph hg = null;
-      if (hg == null)
-        continue;
-
-      // System.out.println("read disk hyp: " + (System.currentTimeMillis()-start_time));
-      time_on_reading += System.currentTimeMillis() - start_time;
-      start_time = System.currentTimeMillis();
-
-      String orc_sent = null;
-      double orc_bleu = 0;
-      if (orc_extract_nbest) {
-        Object[] res = orc_extractor.oracle_extract_nbest(kbest_extractor, hg, topN,
-            do_ngram_clip_nbest, ref_sent);
-        orc_sent = (String) res[0];
-        orc_bleu = (Double) res[1];
-      } else {
-        HyperGraph hg_oracle = orc_extractor.oracle_extract_hg(hg, hg.sentLen(), lm_order, ref_sent);
-        orc_sent = removeSentenceMarkers(getViterbiString(hg_oracle));
-        orc_bleu = orc_extractor.get_best_goal_cost(hg, orc_extractor.g_tbl_split_virtual_items);
-
-        time_on_orc_extract += System.currentTimeMillis() - start_time;
-        System.out.println("num_virtual_items: " + orc_extractor.g_num_virtual_items
-            + " num_virtual_dts: " + orc_extractor.g_num_virtual_deductions);
-        // System.out.println("oracle extract: " + (System.currentTimeMillis()-start_time));
-      }
-
-      orc_out.write(orc_sent + "\n");
-      System.out.println("orc bleu is " + orc_bleu);
-    }
-    orc_out.close();
-
-    System.out.println("time_on_reading: " + time_on_reading);
-    System.out.println("time_on_orc_extract: " + time_on_orc_extract);
-    System.out.println("total running time: " + (System.currentTimeMillis() - start_time0));
-  }
-
-  // find the oracle hypothesis in the nbest list
-  public Object[] oracle_extract_nbest(KBestExtractor kbest_extractor, HyperGraph hg, int n,
-      boolean do_ngram_clip, String ref_sent) {
-    if (hg.goalNode == null)
-      return null;
-    kbest_extractor.resetState();
-    int next_n = 0;
-    double orc_bleu = -1;
-    String orc_sent = null;
-    while (true) {
-      String hyp_sent = kbest_extractor.getKthHyp(hg.goalNode, ++next_n);// ?????????
-      if (hyp_sent == null || next_n > n)
-        break;
-      double t_bleu = compute_sentence_bleu(ref_sent, hyp_sent, do_ngram_clip, 4);
-      if (t_bleu > orc_bleu) {
-        orc_bleu = t_bleu;
-        orc_sent = hyp_sent;
-      }
-    }
-    System.out.println("Oracle sent: " + orc_sent);
-    System.out.println("Oracle bleu: " + orc_bleu);
-    Object[] res = new Object[2];
-    res[0] = orc_sent;
-    res[1] = orc_bleu;
-    return res;
-  }
-
-  public HyperGraph oracle_extract_hg(HyperGraph hg, int src_sent_len_in, int lm_order,
-      String ref_sent_str) {
-    int[] ref_sent = Vocabulary.addAll(ref_sent_str);
-    g_lm_order = lm_order;
-    src_sent_len = src_sent_len_in;
-    ref_sent_len = ref_sent.length;
-
-    tbl_ref_ngrams.clear();
-    get_ngrams(tbl_ref_ngrams, g_bleu_order, ref_sent, false);
-    if (using_left_equiv_state || using_right_equiv_state) {
-      tbl_prefix.clear();
-      tbl_suffix.clear();
-      setup_prefix_suffix_tbl(ref_sent, g_bleu_order, tbl_prefix, tbl_suffix);
-      setup_prefix_suffix_grammar(ref_sent, g_bleu_order, grammar_prefix, grammar_suffix);// TODO
-    }
-    split_hg(hg);
-
-    // System.out.println("best bleu is " + get_best_goal_cost( hg, g_tbl_split_virtual_items));
-    return get_1best_tree_hg(hg, g_tbl_split_virtual_items);
-  }
-
-  /*
-   * This procedure (1) identifies all possible matches and (2) adds a new deduction for each match
-   */
-  protected void process_one_combination_axiom(HGNode parent_item,
-      HashMap<String, VirtualItem> virtual_item_sigs, HyperEdge cur_dt) {
-    if (null == cur_dt.getRule()) {
-      throw new RuntimeException("error null rule in axiom");
-    }
-    double avg_ref_len = (parent_item.j - parent_item.i >= src_sent_len) ? ref_sent_len
-        : (parent_item.j - parent_item.i) * ref_sent_len * 1.0 / src_sent_len;// avg len?
-    double bleu_score[] = new double[1];
-    DPStateOracle dps = compute_state(parent_item, cur_dt, null, tbl_ref_ngrams,
-        do_local_ngram_clip, g_lm_order, avg_ref_len, bleu_score, tbl_suffix, tbl_prefix);
-    VirtualDeduction t_dt = new VirtualDeduction(cur_dt, null, -bleu_score[0]);// cost: -best_bleu
-    g_num_virtual_deductions++;
-    add_deduction(parent_item, virtual_item_sigs, t_dt, dps, true);
-  }
-
-  /*
-   * This procedure (1) creates a new deduction (based on cur_dt and ant_virtual_item) and (2) checks
-   * whether an existing item can contain this deduction (based on virtual_item_sigs, a hashmap
-   * specific to a parent_item): (2.1) if yes, it adds the deduction; (2.2) otherwise it (2.2.1)
-   * creates a new item and (2.2.2) adds the item to virtual_item_sigs
-   */
-  protected void process_one_combination_nonaxiom(HGNode parent_item,
-      HashMap<String, VirtualItem> virtual_item_sigs, HyperEdge cur_dt,
-      ArrayList<VirtualItem> l_ant_virtual_item) {
-    if (null == l_ant_virtual_item) {
-      throw new RuntimeException("wrong call in process_one_combination_nonaxiom");
-    }
-    double avg_ref_len = (parent_item.j - parent_item.i >= src_sent_len) ? ref_sent_len
-        : (parent_item.j - parent_item.i) * ref_sent_len * 1.0 / src_sent_len;// avg len?
-    double bleu_score[] = new double[1];
-    DPStateOracle dps = compute_state(parent_item, cur_dt, l_ant_virtual_item, tbl_ref_ngrams,
-        do_local_ngram_clip, g_lm_order, avg_ref_len, bleu_score, tbl_suffix, tbl_prefix);
-    VirtualDeduction t_dt = new VirtualDeduction(cur_dt, l_ant_virtual_item, -bleu_score[0]);// cost:
-                                                                                             // -best_bleu
-    g_num_virtual_deductions++;
-    add_deduction(parent_item, virtual_item_sigs, t_dt, dps, true);
-  }
-
-  // DPState maintain all the state information at an item that is required during dynamic
-  // programming
-  protected static class DPStateOracle extends DPState {
-    int best_len; // this may not be used in the signature
-    int[] ngram_matches;
-    int[] left_lm_state;
-    int[] right_lm_state;
-
-    public DPStateOracle(int blen, int[] matches, int[] left, int[] right) {
-      best_len = blen;
-      ngram_matches = matches;
-      left_lm_state = left;
-      right_lm_state = right;
-    }
-
-    protected String get_signature() {
-      StringBuffer res = new StringBuffer();
-      if (maitain_length_state) {
-        res.append(best_len);
-        res.append(' ');
-      }
-      if (null != left_lm_state) { // goal-item have null state
-        for (int i = 0; i < left_lm_state.length; i++) {
-          res.append(left_lm_state[i]);
-          res.append(' ');
-        }
-      }
-      res.append("lzf ");
-
-      if (null != right_lm_state) { // goal-item have null state
-        for (int i = 0; i < right_lm_state.length; i++) {
-          res.append(right_lm_state[i]);
-          res.append(' ');
-        }
-      }
-      // if(left_lm_state==null || right_lm_state==null)System.out.println("sig is: " +
-      // res.toString());
-      return res.toString();
-    }
-
-    protected void print() {
-      StringBuffer res = new StringBuffer();
-      res.append("DPstate: best_len: ");
-      res.append(best_len);
-      for (int i = 0; i < ngram_matches.length; i++) {
-        res.append("; ngram: ");
-        res.append(ngram_matches[i]);
-      }
-      System.out.println(res.toString());
-    }
-  }
-
-  // ########################## common functions #####################
-  // based on tbl_oracle_states, tbl_ref_ngrams, and dt, get the state
-  // get the new state: STATE_BEST_DEDUCT STATE_BEST_BLEU STATE_BEST_LEN NGRAM_MATCH_COUNTS
-  protected DPStateOracle compute_state(HGNode parent_item, HyperEdge dt,
-      ArrayList<VirtualItem> l_ant_virtual_item, HashMap<String, Integer> tbl_ref_ngrams,
-      boolean do_local_ngram_clip, int lm_order, double ref_len, double[] bleu_score,
-      HashMap<String, Boolean> tbl_suffix, HashMap<String, Boolean> tbl_prefix) {
-    // ##### deductions under the "goal item" do not have a rule
-    if (null == dt.getRule()) {
-      if (l_ant_virtual_item.size() != 1) {
-        throw new RuntimeException("error deduction under goal item have more than one item");
-      }
-      bleu_score[0] = -l_ant_virtual_item.get(0).best_virtual_deduction.best_cost;
-      return new DPStateOracle(0, null, null, null); // no DPState at all
-    }
-
-    // ################## deductions *not* under "goal item"
-    HashMap<String, Integer> new_ngram_counts = new HashMap<String, Integer>();// new ngrams created
-                                                                               // due to the
-                                                                               // combination
-    HashMap<String, Integer> old_ngram_counts = new HashMap<String, Integer>();// the ngram that has
-                                                                               // already been
-                                                                               // computed
-    int total_hyp_len = 0;
-    int[] num_ngram_match = new int[g_bleu_order];
-    int[] en_words = dt.getRule().getEnglish();
-
-    // #### calculate new and old ngram counts, and len
-
-    ArrayList<Integer> words = new ArrayList<Integer>();
-
-    // used for compute left- and right- lm state
-    ArrayList<Integer> left_state_sequence = null;
-    // used for compute left- and right- lm state
-    ArrayList<Integer> right_state_sequence = null;
-
-    int correct_lm_order = lm_order;
-    if (always_maintain_seperate_lm_state || lm_order < g_bleu_order) {
-      left_state_sequence = new ArrayList<Integer>();
-      right_state_sequence = new ArrayList<Integer>();
-      correct_lm_order = g_bleu_order; // if lm_order is smaller than g_bleu_order, we will get the
-                                       // lm state by ourselves
-    }
-
-    // #### get left_state_sequence, right_state_sequence, total_hyp_len, num_ngram_match
-    for (int c = 0; c < en_words.length; c++) {
-      int c_id = en_words[c];
-      if (FormatUtils.isNonterminal(c_id)) {
-        int index = -(c_id + 1);
-        DPStateOracle ant_state = (DPStateOracle) l_ant_virtual_item.get(index).dp_state;
-        total_hyp_len += ant_state.best_len;
-        for (int t = 0; t < g_bleu_order; t++) {
-          num_ngram_match[t] += ant_state.ngram_matches[t];
-        }
-        int[] l_context = ant_state.left_lm_state;
-        int[] r_context = ant_state.right_lm_state;
-        for (int t : l_context) { // always have l_context
-          words.add(t);
-          if (null != left_state_sequence && left_state_sequence.size() < g_bleu_order - 1) {
-            left_state_sequence.add(t);
-          }
-        }
-        get_ngrams(old_ngram_counts, g_bleu_order, l_context, true);
-        if (r_context.length >= correct_lm_order - 1) { // the right and left are NOT overlapping
-          get_ngrams(new_ngram_counts, g_bleu_order, words, true);
-          get_ngrams(old_ngram_counts, g_bleu_order, r_context, true);
-          words.clear();// start a new chunk
-          if (null != right_state_sequence) {
-            right_state_sequence.clear();
-          }
-          for (int t : r_context) {
-            words.add(t);
-          }
-        }
-        if (null != right_state_sequence) {
-          for (int t : r_context) {
-            right_state_sequence.add(t);
-          }
-        }
-      } else {
-        words.add(c_id);
-        total_hyp_len += 1;
-        if (null != left_state_sequence && left_state_sequence.size() < g_bleu_order - 1) {
-          left_state_sequence.add(c_id);
-        }
-        if (null != right_state_sequence) {
-          right_state_sequence.add(c_id);
-        }
-      }
-    }
-    get_ngrams(new_ngram_counts, g_bleu_order, words, true);
-
-    // ####now deduct ngram counts
-    for (String ngram : new_ngram_counts.keySet()) {
-      if (tbl_ref_ngrams.containsKey(ngram)) {
-        int final_count = (Integer) new_ngram_counts.get(ngram);
-        if (old_ngram_counts.containsKey(ngram)) {
-          final_count -= (Integer) old_ngram_counts.get(ngram);
-          // BUG: Whoa, is that an actual hard-coded ID in there? :)
-          if (final_count < 0) {
-            throw new RuntimeException("negative count for ngram: " + Vocabulary.word(11844)
-                + "; new: " + new_ngram_counts.get(ngram) + "; old: " + old_ngram_counts.get(ngram));
-          }
-        }
-        if (final_count > 0) { // TODO: not correct/global ngram clip
-          if (do_local_ngram_clip) {
-            // BUG: use joshua.util.Regex.spaces.split(...)
-            num_ngram_match[ngram.split("\\s+").length - 1] += Support.findMin(final_count,
-                (Integer) tbl_ref_ngrams.get(ngram));
-          } else {
-            // BUG: use joshua.util.Regex.spaces.split(...)
-            num_ngram_match[ngram.split("\\s+").length - 1] += final_count; // do not do any clipping
-          }
-        }
-      }
-    }
-
-    // ####now calculate the BLEU score and state
-    int[] left_lm_state = null;
-    int[] right_lm_state = null;
-    left_lm_state = get_left_equiv_state(left_state_sequence, tbl_suffix);
-    right_lm_state = get_right_equiv_state(right_state_sequence, tbl_prefix);
-
-    // debug
-    // System.out.println("lm_order is " + lm_order);
-    // compare_two_int_arrays(left_lm_state,
-    // (int[])parent_item.tbl_states.get(Symbol.LM_L_STATE_SYM_ID));
-    // compare_two_int_arrays(right_lm_state,
-    // (int[])parent_item.tbl_states.get(Symbol.LM_R_STATE_SYM_ID));
-    // end
-
-    bleu_score[0] = compute_bleu(total_hyp_len, ref_len, num_ngram_match, g_bleu_order);
-    // System.out.println("blue score is " + bleu_score[0]);
-    return new DPStateOracle(total_hyp_len, num_ngram_match, left_lm_state, right_lm_state);
-  }
-
-  private int[] get_left_equiv_state(ArrayList<Integer> left_state_sequence,
-      HashMap<String, Boolean> tbl_suffix) {
-    int l_size = (left_state_sequence.size() < g_bleu_order - 1) ? left_state_sequence.size()
-        : (g_bleu_order - 1);
-    int[] left_lm_state = new int[l_size];
-    if (!using_left_equiv_state || l_size < g_bleu_order - 1) { // regular
-      for (int i = 0; i < l_size; i++) {
-        left_lm_state[i] = left_state_sequence.get(i);
-      }
-    } else {
-      for (int i = l_size - 1; i >= 0; i--) { // right to left
-        if (is_a_suffix_in_tbl(left_state_sequence, 0, i, tbl_suffix)) {
-          // if(is_a_suffix_in_grammar(left_state_sequence, 0, i, grammar_suffix)){
-          for (int j = i; j >= 0; j--) {
-            left_lm_state[j] = left_state_sequence.get(j);
-          }
-          break;
-        } else {
-          left_lm_state[i] = this.NULL_LEFT_LM_STATE_SYM_ID;
-        }
-      }
-      // System.out.println("origi left:" + Symbol.get_string(left_state_sequence) + "; equiv left:"
-      // + Symbol.get_string(left_lm_state));
-    }
-    return left_lm_state;
-  }
-
-  private boolean is_a_suffix_in_tbl(ArrayList<Integer> left_state_sequence, int start_pos,
-      int end_pos, HashMap<String, Boolean> tbl_suffix) {
-    if ((Integer) left_state_sequence.get(end_pos) == this.NULL_LEFT_LM_STATE_SYM_ID) {
-      return false;
-    }
-    StringBuffer suffix = new StringBuffer();
-    for (int i = end_pos; i >= start_pos; i--) { // right-most first
-      suffix.append(left_state_sequence.get(i));
-      if (i > start_pos)
-        suffix.append(' ');
-    }
-    return (Boolean) tbl_suffix.containsKey(suffix.toString());
-  }
-
-  private int[] get_right_equiv_state(ArrayList<Integer> right_state_sequence,
-      HashMap<String, Boolean> tbl_prefix) {
-    int r_size = (right_state_sequence.size() < g_bleu_order - 1) ? right_state_sequence.size()
-        : (g_bleu_order - 1);
-    int[] right_lm_state = new int[r_size];
-    if (!using_right_equiv_state || r_size < g_bleu_order - 1) { // regular
-      for (int i = 0; i < r_size; i++) {
-        right_lm_state[i] = (Integer) right_state_sequence.get(right_state_sequence.size() - r_size
-            + i);
-      }
-    } else {
-      for (int i = 0; i < r_size; i++) { // left to right
-        if (is_a_prefix_in_tbl(right_state_sequence, right_state_sequence.size() - r_size + i,
-            right_state_sequence.size() - 1, tbl_prefix)) {
-          // if(is_a_prefix_in_grammar(right_state_sequence, right_state_sequence.size()-r_size+i,
-          // right_state_sequence.size()-1, grammar_prefix)){
-          for (int j = i; j < r_size; j++) {
-            right_lm_state[j] = (Integer) right_state_sequence.get(right_state_sequence.size()
-                - r_size + j);
-          }
-          break;
-        } else {
-          right_lm_state[i] = this.NULL_RIGHT_LM_STATE_SYM_ID;
-        }
-      }
-      // System.out.println("origi right:" + Symbol.get_string(right_state_sequence)+
-      // "; equiv right:" + Symbol.get_string(right_lm_state));
-    }
-    return right_lm_state;
-  }
-
-  private boolean is_a_prefix_in_tbl(ArrayList<Integer> right_state_sequence, int start_pos,
-      int end_pos, HashMap<String, Boolean> tbl_prefix) {
-    if (right_state_sequence.get(start_pos) == this.NULL_RIGHT_LM_STATE_SYM_ID) {
-      return false;
-    }
-    StringBuffer prefix = new StringBuffer();
-    for (int i = start_pos; i <= end_pos; i++) {
-      prefix.append(right_state_sequence.get(i));
-      if (i < end_pos)
-        prefix.append(' ');
-    }
-    return tbl_prefix.containsKey(prefix.toString());
-  }
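
The two *_equiv_state methods above implement the same idea from opposite ends: scan the state sequence, and once a suffix (left side) or prefix (right side) of it is known to extend to some reference n-gram, keep that span verbatim and replace everything outside it with the NULL placeholder id, so that states differing only in non-matching words collapse into a single equivalence class. A worked trace of the left-side case (the ids are made up; assume g_bleu_order = 5, and the suffix table holds only "12" and "12 11", stored right-most word first):

    left_state_sequence = [11, 12, 13, 14]
    i = 3: "14 13 12 11" not in the table -> left_lm_state[3] = NULL
    i = 2: "13 12 11"    not in the table -> left_lm_state[2] = NULL
    i = 1: "12 11"       in the table     -> copy positions 0..1 and stop
    result: [11, 12, NULL, NULL]
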
-
-  public static void compare_two_int_arrays(int[] a, int[] b) {
-    if (a.length != b.length) {
-      throw new RuntimeException("two arrays do not have same size");
-    }
-    for (int i = 0; i < a.length; i++) {
-      if (a[i] != b[i]) {
-        throw new RuntimeException("elements in two arrays are not same");
-      }
-    }
-  }
-
-  // sentence-level BLEU = bp * prec, where prec = exp(sum over n of (1/bleu_order) * log(prec[n]))
-  public static double compute_bleu(int hyp_len, double ref_len, int[] num_ngram_match,
-      int bleu_order) {
-    if (hyp_len <= 0 || ref_len <= 0) {
-      throw new RuntimeException("ref or hyp is zero len");
-    }
-    double res = 0;
-    double wt = 1.0 / bleu_order;
-    double prec = 0;
-    double smooth_factor = 1.0;
-    for (int t = 0; t < bleu_order && t < hyp_len; t++) {
-      if (num_ngram_match[t] > 0) {
-        prec += wt * Math.log(num_ngram_match[t] * 1.0 / (hyp_len - t));
-      } else {
-        smooth_factor *= 0.5; // TODO: revisit this ad hoc smoothing scheme
-        prec += wt * Math.log(smooth_factor / (hyp_len - t));
-      }
-    }
-    double bp = (hyp_len >= ref_len) ? 1.0 : Math.exp(1 - ref_len / hyp_len);
-    res = bp * Math.exp(prec);
-    // System.out.println("hyp_len: " + hyp_len + "; ref_len:" + ref_len + "prec: " + Math.exp(prec)
-    // + "; bp: " + bp + "; bleu: " + res);
-    return res;
-  }
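
The compute_bleu method above implements smoothed sentence-level BLEU: a brevity penalty times the geometric mean of the n-gram precisions, with the smoothing mass halved for every order that has no matches. A minimal self-contained sketch of the same formula (the method name is ours, not part of Joshua's API):

    // Smoothed sentence-BLEU: bp * exp(sum of (1/order) * log precision).
    // numNgramMatch[t] holds the clipped (t+1)-gram matches; hypLen - t is
    // the number of (t+1)-grams in the hypothesis.
    public static double sentenceBleu(int hypLen, double refLen,
        int[] numNgramMatch, int bleuOrder) {
      double wt = 1.0 / bleuOrder;
      double logPrec = 0.0;
      double smooth = 1.0;
      for (int t = 0; t < bleuOrder && t < hypLen; t++) {
        if (numNgramMatch[t] > 0) {
          logPrec += wt * Math.log((double) numNgramMatch[t] / (hypLen - t));
        } else {
          smooth *= 0.5; // halve the smoothing mass for each empty order
          logPrec += wt * Math.log(smooth / (hypLen - t));
        }
      }
      double bp = (hypLen >= refLen) ? 1.0 : Math.exp(1 - refLen / hypLen);
      return bp * Math.exp(logPrec);
    }

For a hypothesis identical to its four-word reference, sentenceBleu(4, 4.0, new int[] {4, 3, 2, 1}, 4) returns 1.0: every precision is exactly 1 and the brevity penalty does not fire.
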
-
-  // accumulate ngram counts into tbl
-  public void get_ngrams(HashMap<String, Integer> tbl, int order, int[] wrds,
-      boolean ignore_null_equiv_symbol) {
-    for (int i = 0; i < wrds.length; i++) {
-      for (int j = 0; j < order && j + i < wrds.length; j++) { // ngram: [i,i+j]
-        boolean contain_null = false;
-        StringBuffer ngram = new StringBuffer();
-        for (int k = i; k <= i + j; k++) {
-          if (wrds[k] == this.NULL_LEFT_LM_STATE_SYM_ID
-              || wrds[k] == this.NULL_RIGHT_LM_STATE_SYM_ID) {
-            contain_null = true;
-            if (ignore_null_equiv_symbol)
-              break;
-          }
-          ngram.append(wrds[k]);
-          if (k < i + j)
-            ngram.append(' ');
-        }
-        if (ignore_null_equiv_symbol && contain_null)
-          continue; // skip this ngram
-        String ngram_str = ngram.toString();
-        if (tbl.containsKey(ngram_str)) {
-          tbl.put(ngram_str, (Integer) tbl.get(ngram_str) + 1);
-        } else {
-          tbl.put(ngram_str, 1);
-        }
-      }
-    }
-  }
-
-  /** accumulate ngram counts into tbl. */
-  public void get_ngrams(HashMap<String, Integer> tbl, int order, ArrayList<Integer> wrds,
-      boolean ignore_null_equiv_symbol) {
-    for (int i = 0; i < wrds.size(); i++) {
-      // ngram: [i,i+j]
-      for (int j = 0; j < order && j + i < wrds.size(); j++) {
-        boolean contain_null = false;
-        StringBuffer ngram = new StringBuffer();
-        for (int k = i; k <= i + j; k++) {
-          int t_wrd = (Integer) wrds.get(k);
-          if (t_wrd == this.NULL_LEFT_LM_STATE_SYM_ID || t_wrd == this.NULL_RIGHT_LM_STATE_SYM_ID) {
-            contain_null = true;
-            if (ignore_null_equiv_symbol)
-              break;
-          }
-          ngram.append(t_wrd);
-          if (k < i + j)
-            ngram.append(' ');
-        }
-        // skip this ngram
-        if (ignore_null_equiv_symbol && contain_null)
-          continue;
-
-        String ngram_str = ngram.toString();
-        if (tbl.containsKey(ngram_str)) {
-          tbl.put(ngram_str, (Integer) tbl.get(ngram_str) + 1);
-        } else {
-          tbl.put(ngram_str, 1);
-        }
-      }
-    }
-  }
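
Both get_ngrams overloads walk every start position i and every length j+1 up to the order, using space-joined symbol ids as map keys. A compact stand-in that extends the key incrementally instead of rebuilding the buffer for each (i, j) pair (names are ours, and it assumes java.util imports and Java 8):

    // Count all n-grams of length 1..order over a token-id sequence.
    static Map<String, Integer> countNgrams(int[] words, int order) {
      Map<String, Integer> counts = new HashMap<>();
      for (int i = 0; i < words.length; i++) {
        StringBuilder ngram = new StringBuilder();
        for (int j = 0; j < order && i + j < words.length; j++) {
          if (j > 0) ngram.append(' ');
          ngram.append(words[i + j]);                   // extend by one token
          counts.merge(ngram.toString(), 1, Integer::sum);
        }
      }
      return counts;
    }
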
-
-  // do_ngram_clip: consider global n-gram clip
-  public double compute_sentence_bleu(String ref_sent, String hyp_sent, boolean do_ngram_clip,
-      int bleu_order) {
-    int[] numeric_ref_sent = Vocabulary.addAll(ref_sent);
-    int[] numeric_hyp_sent = Vocabulary.addAll(hyp_sent);
-    return compute_sentence_bleu(numeric_ref_sent, numeric_hyp_sent, do_ngram_clip, bleu_order);
-  }
-
-  public double compute_sentence_bleu(int[] ref_sent, int[] hyp_sent, boolean do_ngram_clip,
-      int bleu_order) {
-    double res_bleu = 0;
-    int order = bleu_order; // size the n-gram tables to the requested BLEU order
-    HashMap<String, Integer> ref_ngram_tbl = new HashMap<String, Integer>();
-    get_ngrams(ref_ngram_tbl, order, ref_sent, false);
-    HashMap<String, Integer> hyp_ngram_tbl = new HashMap<String, Integer>();
-    get_ngrams(hyp_ngram_tbl, order, hyp_sent, false);
-
-    int[] num_ngram_match = new int[order];
-    for (String ngram : hyp_ngram_tbl.keySet()) {
-      if (ref_ngram_tbl.containsKey(ngram)) {
-        if (do_ngram_clip) {
-          num_ngram_match[joshua.util.Regex.spaces.split(ngram).length - 1] += Support.findMin(
-              (Integer) ref_ngram_tbl.get(ngram), (Integer) hyp_ngram_tbl.get(ngram)); // n-gram clip
-        } else {
-          num_ngram_match[joshua.util.Regex.spaces.split(ngram).length - 1] +=
-              (Integer) hyp_ngram_tbl.get(ngram); // no n-gram count clipping
-        }
-      }
-    }
-    res_bleu = compute_bleu(hyp_sent.length, ref_sent.length, num_ngram_match, bleu_order);
-    // System.out.println("hyp_len: " + hyp_sent.length + "; ref_len:" + ref_sent.length +
-    // "; bleu: " + res_bleu +" num_ngram_matches: " + num_ngram_match[0] + " " +num_ngram_match[1]+
-    // " " + num_ngram_match[2] + " " +num_ngram_match[3]);
-
-    return res_bleu;
-  }
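
A hypothetical call site for this method, assuming (as the wrapper deleted below suggests) that it lives in OracleExtractionHG; the sentences are made up:

    OracleExtractionHG scorer = new OracleExtractionHG(0);
    double bleu = scorer.compute_sentence_bleu(
        "the cat sat on the mat",  // reference
        "the cat sat on a mat",    // hypothesis
        true,                      // clip n-gram counts against the reference
        4);                        // BLEU order
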
-
-  // #### equivalent lm stuff ############
-  public static void setup_prefix_suffix_tbl(int[] wrds, int order,
-      HashMap<String, Boolean> prefix_tbl, HashMap<String, Boolean> suffix_tbl) {
-    for (int i = 0; i < wrds.length; i++) {
-      for (int j = 0; j < order && j + i < wrds.length; j++) { // ngram: [i,i+j]
-        StringBuffer ngram = new StringBuffer();
-        // ### prefix
-        for (int k = i; k < i + j; k++) { // all ngrams [i,i+j-1]
-          ngram.append(wrds[k]);
-          prefix_tbl.put(ngram.toString(), true);
-          ngram.append(' ');
-        }
-        // ### suffix: right-most wrd first
-        ngram = new StringBuffer();
-        for (int k = i + j; k > i; k--) { // all ngrams [i+1,i+j]: reverse order
-          ngram.append(wrds[k]);
-          suffix_tbl.put(ngram.toString(), true);// stored in reverse order
-          ngram.append(' ');
-        }
-      }
-    }
-  }
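
To make the table layout concrete, here is a hypothetical trace of setup_prefix_suffix_tbl (the word ids are made up):

    int[] wrds = {7, 8, 9};
    HashMap<String, Boolean> prefixes = new HashMap<>();
    HashMap<String, Boolean> suffixes = new HashMap<>();
    setup_prefix_suffix_tbl(wrds, 3, prefixes, suffixes);
    // prefixes: "7", "7 8", "8"   (proper prefixes of each n-gram)
    // suffixes: "8", "9", "9 8"   (stored right-most word first)
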
-
-  // #### equivalent lm stuff ############
-  public static void setup_prefix_suffix_grammar(int[] wrds, int order, PrefixGrammar prefix_gr,
-      PrefixGrammar suffix_gr) {
-    for (int i = 0; i < wrds.length; i++) {
-      for (int j = 0; j < order && j + i < wrds.length; j++) { // ngram: [i,i+j]
-        // ### prefix
-        prefix_gr.add_ngram(wrds, i, i + j - 1);// ngram: [i,i+j-1]
-
-        // ### suffix: right-most wrd first
-        int[] reverse_wrds = new int[j];
-        for (int k = i + j, t = 0; k > i; k--) { // all ngrams [i+1,i+j]: reverse order
-          reverse_wrds[t++] = wrds[k];
-        }
-        suffix_gr.add_ngram(reverse_wrds, 0, j - 1);
-      }
-    }
-  }
-
-  /*
-   * A backoff node is a hashtable. It may include: (1) probabilities for the next words, (2)
-   * pointers to next-layer backoff nodes (hashtables), (3) the backoff weight for this node, and
-   * (4) a suffix/prefix flag indicating that some n-gram starts from this suffix.
-   */
-  private static class PrefixGrammar {
-
-    private static class PrefixGrammarNode extends HashMap<Integer, PrefixGrammarNode> {
-      private static final long serialVersionUID = 1L;
-    }
-
-    PrefixGrammarNode root = new PrefixGrammarNode();
-
-    // add prefix information
-    public void add_ngram(int[] wrds, int start_pos, int end_pos) {
-      // ### identify the position, inserting trie nodes as necessary
-      PrefixGrammarNode pos = root;
-      for (int k = start_pos; k <= end_pos; k++) {
-        int cur_sym_id = wrds[k];
-        PrefixGrammarNode next_layer = pos.get(cur_sym_id);
-
-        if (null != next_layer) {
-          pos = next_layer;
-        } else {
-          // next layer node
-          PrefixGrammarNode tmp = new PrefixGrammarNode();
-          pos.put(cur_sym_id, tmp);
-          pos = tmp;
-        }
-      }
-    }
-    
-    @SuppressWarnings("unused")
-    public boolean contain_ngram(ArrayList<Integer> wrds, int start_pos, int end_pos) {
-      if (end_pos < start_pos)
-        return false;
-      PrefixGrammarNode pos = root;
-      for (int k = start_pos; k <= end_pos; k++) {
-        int cur_sym_id = wrds.get(k);
-        PrefixGrammarNode next_layer = pos.get(cur_sym_id);
-        if (next_layer != null) {
-          pos = next_layer;
-        } else {
-          return false;
-        }
-      }
-      return true;
-    }
-  }
-}
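
PrefixGrammar is a bare trie over symbol ids. For readers who want to experiment with the idea outside Joshua, here is a self-contained equivalent (our own naming; it mirrors contain_ngram's behavior of rejecting empty ranges, and assumes Java 8):

    import java.util.HashMap;
    import java.util.Map;

    // Minimal trie over int sequences, in the spirit of PrefixGrammar.
    class IntTrie {
      private final Map<Integer, IntTrie> children = new HashMap<>();

      // Insert the inclusive range ids[from..to] as a path from the root.
      void add(int[] ids, int from, int to) {
        IntTrie node = this;
        for (int k = from; k <= to; k++)
          node = node.children.computeIfAbsent(ids[k], x -> new IntTrie());
      }

      // True iff ids[from..to] traces an existing path from the root.
      boolean contains(int[] ids, int from, int to) {
        if (to < from) return false;
        IntTrie node = this;
        for (int k = from; k <= to; k++) {
          node = node.children.get(ids[k]);
          if (node == null) return false;
        }
        return true;
      }
    }

    // Usage:
    //   IntTrie trie = new IntTrie();
    //   trie.add(new int[] {7, 8, 9}, 0, 2);
    //   trie.contains(new int[] {7, 8}, 0, 1);  // true: a stored prefix
    //   trie.contains(new int[] {8, 9}, 0, 1);  // false: paths start at the root
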

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f2f82c38/src/joshua/oracle/OracleExtractor.java
----------------------------------------------------------------------
diff --git a/src/joshua/oracle/OracleExtractor.java b/src/joshua/oracle/OracleExtractor.java
deleted file mode 100644
index d4a0019..0000000
--- a/src/joshua/oracle/OracleExtractor.java
+++ /dev/null
@@ -1,58 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements.  See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership.  The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License.  You may obtain a copy of the License at
- *
- *  http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied.  See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.oracle;
-
-import joshua.decoder.hypergraph.HyperGraph;
-
-/**
- * Convenience wrapper class for oracle extraction code.
- * 
- * @author Lane Schwartz
- */
-public class OracleExtractor {
-
-  private final OracleExtractionHG extractor;
-
-  /**
-   * Constructs an object capable of extracting an oracle hypergraph.
-   */
-  public OracleExtractor() {
-
-    int baselineLanguageModelFeatureID = 0;
-    this.extractor = new OracleExtractionHG(baselineLanguageModelFeatureID);
-
-  }
-
-  /**
-   * Extract a hypergraph that represents the translation from the original shared forest hypergraph
-   * that is closest to the reference translation.
-   * 
-   * @param forest Original hypergraph representing a shared forest.
-   * @param lmOrder N-gram order of the language model.
-   * @param reference Reference sentence.
-   * @return Hypergraph closest to the reference.
-   */
-  public HyperGraph getOracle(HyperGraph forest, int lmOrder, String reference) {
-    if (reference != null)
-      return extractor.oracle_extract_hg(forest, forest.sentLen(), lmOrder, reference);
-
-    return null;
-  }
-
-}
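
For context, a hypothetical call site for the wrapper deleted above (the variable names and reference string are ours):

    OracleExtractor extractor = new OracleExtractor();
    HyperGraph closest = extractor.getOracle(forest, 5, "the cat sat on the mat");
    if (closest != null) {
      // walk the oracle hypergraph, e.g. to report the best reachable BLEU
    }
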