You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/17 10:32:15 UTC
[34/56] [partial] incubator-joshua git commit: maven multi-module layout 1st commit: moving files into joshua-core

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stack.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stack.java b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stack.java
new file mode 100644
index 0000000..d0ae2da
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stack.java
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.phrase;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Set;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.ComputeNodeResult;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Organizes all hypotheses containing the same number of source words. 
+ *
+ */
+public class Stack extends ArrayList<Hypothesis> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(Stack.class);
+
+  private static final long serialVersionUID = 7885252799032416068L;
+
+  private HashMap<Coverage, ArrayList<Hypothesis>> coverages;
+  
+  private Sentence sentence;
+  private List<FeatureFunction> featureFunctions;
+  private JoshuaConfiguration config;
+
+  /* The list of states we've already visited. */
+  private HashSet<Candidate> visitedStates;
+  
+  /* A list of candidates sorted for consideration for entry to the chart (for cube pruning) */
+  private PriorityQueue<Candidate> candidates;
+  
+  /* Short-circuits adding a cube-prune state more than once */
+  private HashMap<Hypothesis, Hypothesis> deduper;
+  
+  /**
+   * Create a new stack. Stacks are organized one for each number of source words that are covered.
+   * 
+   * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+   * @param sentence input for a {@link org.apache.joshua.lattice.Lattice}
+   * @param config populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   */
+  public Stack(List<FeatureFunction> featureFunctions, Sentence sentence, JoshuaConfiguration config) {
+    this.featureFunctions = featureFunctions;
+    this.sentence = sentence;
+    this.config = config;
+    
+    this.candidates = new PriorityQueue<Candidate>(1, new CandidateComparator());
+    this.coverages = new HashMap<Coverage, ArrayList<Hypothesis>>();
+    this.visitedStates = new HashSet<Candidate>();
+    this.deduper = new HashMap<Hypothesis,Hypothesis>();
+  }
+
+  /**
+   * A Stack is an ArrayList; here, we intercept the add so we can maintain a list of the items
+   * stored under each distinct coverage vector
+   * @param hyp a {@link org.apache.joshua.decoder.phrase.Hypothesis} to add to the {@link org.apache.joshua.decoder.phrase.Stack}
+   * @return true if the {@link org.apache.joshua.decoder.phrase.Hypothesis} is appended to the list
+   */
+  @Override
+  public boolean add(Hypothesis hyp) {
+    
+    if (! coverages.containsKey((hyp.getCoverage())))
+      coverages.put(hyp.getCoverage(), new ArrayList<Hypothesis>()); 
+    coverages.get(hyp.getCoverage()).add(hyp);
+    
+    return super.add(hyp);
+  }
+  
+  /**
+   * Intercept calls to remove() so that we can reduce the coverage vector
+   */
+  @Override
+  public boolean remove(Object obj) {
+    boolean found = super.remove(obj);
+    if (found) {
+      Hypothesis item = (Hypothesis) obj;
+      Coverage cov = item.getCoverage();
+      assert coverages.get(cov).remove(obj);
+      if (coverages.get(cov).size() == 0)
+        coverages.remove(cov);
+    }
+    return found;
+  }
+  
+  /** 
+   * Returns the set of coverages contained in this stack. This is used to iterate over them
+   * in the main decoding loop in Stacks.java.
+   * @return a {@link java.util.Set} of {@link org.apache.joshua.decoder.phrase.Coverage}'s
+   */
+  public Set<Coverage> getCoverages() {
+    return coverages.keySet();
+  }
+  
+  /**
+   * Get all items with the same coverage vector.
+   * 
+   * @param cov the {@link org.apache.joshua.decoder.phrase.Coverage} vector to get
+   * @return an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.phrase.Hypothesis}'
+   */
+  public ArrayList<Hypothesis> get(Coverage cov) {
+    ArrayList<Hypothesis> list = coverages.get(cov);
+    Collections.sort(list);
+    return list;
+  }
+  
+  /**
+   * Receives a partially-initialized translation candidate and places it on the
+   * priority queue after scoring it with all of the feature functions. In this
+   * respect it is like {@link org.apache.joshua.decoder.chart_parser.CubePruneState} (it could make use of that class with
+   * a little generalization of spans / coverage).
+   * 
+   * This function is also used to (fairly concisely) implement constrained decoding. Before
+   * adding a candidate, we ensure that the sequence of English words match the sentence. If not,
+   * the code extends the dot in the cube-pruning chart to the next phrase, since that one might
+   * be a match.
+   * @param cand a partially-initialized translation {@link org.apache.joshua.decoder.phrase.Candidate}
+   */
+  public void addCandidate(Candidate cand) {
+    if (visitedStates.contains(cand))
+      return;
+    
+    visitedStates.add(cand);
+
+    // Constrained decoding
+    if (sentence.target() != null) {
+      String oldWords = cand.getHypothesis().bestHyperedge.getRule().getEnglishWords().replace("[X,1] ",  "");
+      String newWords = cand.getRule().getEnglishWords().replace("[X,1] ",  "");
+          
+      // If the string is not found in the target sentence, explore the cube neighbors
+      if (sentence.fullTarget().indexOf(oldWords + " " + newWords) == -1) {
+        Candidate next = cand.extendPhrase();
+        if (next != null)
+          addCandidate(next); 
+        return;
+      }
+    }
+
+    // TODO: sourcepath
+    ComputeNodeResult result = new ComputeNodeResult(this.featureFunctions, cand.getRule(),
+        cand.getTailNodes(), -1, cand.getSpan().end, null, this.sentence);
+    cand.setResult(result);
+    
+    candidates.add(cand);
+  }
+  
+  /**
+   * Cube pruning. Repeatedly pop the top candidate, creating a new hyperedge from it, adding it to
+   * the k-best list, and then extending the list of candidates with extensions of the current
+   * candidate.
+   */
+  public void search() {
+    int to_pop = config.pop_limit;
+    
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("Stack::search(): pop: {} size: {}", to_pop, candidates.size());
+      for (Candidate c: candidates)
+        LOG.debug("{}", c);
+    }
+    while (to_pop > 0 && !candidates.isEmpty()) {
+      Candidate got = candidates.poll();
+      if (got != null) {
+        addHypothesis(got);
+        --to_pop;
+        
+        for (Candidate c : got.extend())
+          if (c != null) {
+            addCandidate(c);
+          }
+      }
+    }
+  }
+
+  /**
+   * Adds a popped candidate to the chart / main stack. This is a candidate we have decided to
+   * keep around.
+   * @param complete a completely-initialized translation {@link org.apache.joshua.decoder.phrase.Candidate}
+   * 
+   */
+  public void addHypothesis(Candidate complete) {
+    Hypothesis added = new Hypothesis(complete);
+
+    String taskName;
+    if (deduper.containsKey(added)) {
+      taskName = "recombining hypothesis";
+      Hypothesis existing = deduper.get(added);
+      existing.absorb(added);
+    } else {
+      taskName = "creating new hypothesis";
+      add(added);
+      deduper.put(added, added);
+    }
+
+    if (LOG.isDebugEnabled()) {
+      LOG.debug("{} from ( ... {} )", taskName, complete.getHypothesis().getRule().getEnglishWords());
+      LOG.debug("        base score {}", complete.getResult().getBaseCost());
+      LOG.debug("        covering {}-{}", complete.getSpan().start - 1, complete.getSpan().end - 2);
+      LOG.debug("        translated as: {}", complete.getRule().getEnglishWords());
+      LOG.debug("        score {} + future cost {} = {}",
+          complete.getResult().getTransitionCost(), complete.getFutureEstimate(),
+          complete.getResult().getTransitionCost() + complete.getFutureEstimate());
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java
new file mode 100644
index 0000000..8c092ec
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.phrase;
+
+/***
+ * Entry point for phrase-based decoding, analogous to {@link Chart} for the CKY algorithm. This
+ * class organizes all the stacks used for decoding, and is responsible for building them. Stack
+ * construction is stack-centric: that is, we loop over the number of source words in increasing sizes;
+ * at each step of this iteration, we break the search between smaller stack sizes and source-side
+ * phrase sizes.
+ * 
+ * The end result of decoding is a {@link Hypergraph} with the same format as hierarchical decoding.
+ * Phrases are treating as left-branching rules, and the span information (i,j) is overloaded so
+ * that i means nothing and j represents the index of the last-translated source word in each
+ * hypothesis. This means that most hypergraph code can work without modification. The algorithm 
+ * ensures that the coverage vector is consistent but the resulting hypergraph may not be projective,
+ * which is different from the CKY algorithm, which does produce projective derivations. 
+ * 
+ * TODO Lattice decoding is not yet supported (March 2015).
+ */
+
+import static org.apache.joshua.decoder.ff.tm.OwnerMap.UNKNOWN_OWNER;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.corpus.Span;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.ComputeNodeResult;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.hypergraph.HyperEdge;
+import org.apache.joshua.decoder.hypergraph.HyperGraph;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Stacks {
+
+  private static final Logger LOG = LoggerFactory.getLogger(Stacks.class);
+
+  // The list of stacks, grouped according to number of source words covered
+  private List<Stack> stacks;
+
+  // The end state
+  private Hypothesis end;
+  
+  List<FeatureFunction> featureFunctions;
+
+  private Sentence sentence;
+
+  private JoshuaConfiguration config;
+
+  /* Contains all the phrase tables */
+  private PhraseChart chart;
+  
+  /**
+   * Entry point. Initialize everything. Create pass-through (OOV) phrase table and glue phrase
+   * table (with start-of-sentence and end-of-sentence rules).
+   * 
+   * @param sentence input to {@link org.apache.joshua.lattice.Lattice}
+   * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+   * @param grammars an array of {@link org.apache.joshua.decoder.ff.tm.Grammar}'s
+   * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   */
+  public Stacks(Sentence sentence, List<FeatureFunction> featureFunctions, Grammar[] grammars, 
+      JoshuaConfiguration config) {
+
+    this.sentence = sentence;
+    this.featureFunctions = featureFunctions;
+    this.config = config;
+    
+    int num_phrase_tables = 0;
+    for (int i = 0; i < grammars.length; i++)
+      if (grammars[i] instanceof PhraseTable)
+        ++num_phrase_tables;
+    
+    PhraseTable[] phraseTables = new PhraseTable[num_phrase_tables + 2];
+    for (int i = 0, j = 0; i < grammars.length; i++)
+      if (grammars[i] instanceof PhraseTable)
+        phraseTables[j++] = (PhraseTable) grammars[i];
+    
+    phraseTables[phraseTables.length - 2] = new PhraseTable(UNKNOWN_OWNER, config);
+    phraseTables[phraseTables.length - 2].addRule(Hypothesis.END_RULE);
+    
+    phraseTables[phraseTables.length - 1] = new PhraseTable("oov", config);
+    AbstractGrammar.addOOVRules(phraseTables[phraseTables.length - 1], sentence.getLattice(), featureFunctions, config.true_oovs_only);
+    
+    this.chart = new PhraseChart(phraseTables, featureFunctions, sentence, config.num_translation_options);
+  }
+  
+  
+  /**
+   * The main algorithm. Returns a hypergraph representing the search space.
+   * 
+   * @return a {@link org.apache.joshua.decoder.hypergraph.HyperGraph} representing the search space
+   */
+  public HyperGraph search() {
+    
+    long startTime = System.currentTimeMillis();
+    
+    Future future = new Future(chart);
+    stacks = new ArrayList<Stack>();
+    
+    // <s> counts as the first word. Pushing null lets us count from one.
+    stacks.add(null);
+
+    // Initialize root hypothesis with <s> context and future cost for everything.
+    ComputeNodeResult result = new ComputeNodeResult(this.featureFunctions, Hypothesis.BEGIN_RULE,
+        null, -1, 1, null, this.sentence);
+    Stack firstStack = new Stack(featureFunctions, sentence, config);
+    firstStack.add(new Hypothesis(result.getDPStates(), future.Full()));
+    stacks.add(firstStack);
+    
+    // Decode with increasing numbers of source words. 
+    for (int source_words = 2; source_words <= sentence.length(); ++source_words) {
+      Stack targetStack = new Stack(featureFunctions, sentence, config);
+      stacks.add(targetStack);
+
+      // Iterate over stacks to continue from.
+      for (int phrase_length = 1; phrase_length <= Math.min(source_words - 1, chart.MaxSourcePhraseLength());
+          phrase_length++) {
+        int from_stack = source_words - phrase_length;
+        Stack tailStack = stacks.get(from_stack);
+
+        LOG.debug("WORDS {} MAX {} (STACK {} phrase_length {})", source_words,
+            chart.MaxSourcePhraseLength(), from_stack, phrase_length);
+        
+        // Iterate over antecedents in this stack.
+        for (Coverage coverage: tailStack.getCoverages()) {
+          ArrayList<Hypothesis> hypotheses = tailStack.get(coverage); 
+          
+          // the index of the starting point of the first possible phrase
+          int begin = coverage.firstZero();
+          
+          // the absolute position of the ending spot of the last possible phrase
+          int last_end = Math.min(coverage.firstZero() + config.reordering_limit, chart.SentenceLength());
+          int last_begin = (last_end > phrase_length) ? (last_end - phrase_length) : 0;
+
+          for (begin = coverage.firstZero(); begin <= last_begin; begin++) {
+            if (!coverage.compatible(begin, begin + phrase_length) ||
+                ! permissible(coverage, begin, begin + phrase_length)) {
+              continue;
+            }
+
+            Span span = new Span(begin, begin + phrase_length);
+
+            // Don't append </s> until the end
+            if (begin == sentence.length() - 1 && source_words != sentence.length()) 
+              continue;            
+
+            TargetPhrases phrases = chart.getRange(begin, begin + phrase_length);
+            if (phrases == null)
+              continue;
+
+
+            LOG.debug("Applying {} target phrases over [{}, {}]",
+                phrases.size(), begin, begin + phrase_length);
+            
+            // TODO: could also compute some number of features here (e.g., non-LM ones)
+            // float score_delta = context.GetScorer().transition(ant, phrases, begin, begin + phrase_length);
+            
+            // Future costs: remove span to be filled.
+            float future_delta = future.Change(coverage, begin, begin + phrase_length);
+            
+            /* This associates with each span a set of hypotheses that can be extended by
+             * phrases from that span. The hypotheses are wrapped in HypoState objects, which
+             * augment the hypothesis score with a future cost.
+             */
+            Candidate cand = new Candidate(hypotheses, phrases, span, future_delta);
+            targetStack.addCandidate(cand);
+          }
+        }
+      }
+
+      /* At this point, every vertex contains a list of all existing hypotheses that the target
+       * phrases in that vertex could extend. Now we need to create the search object, which
+       * implements cube pruning. There are up to O(n^2) cubes, n the size of the current stack,
+       * one cube each over each span of the input. Each "cube" has two dimensions: one representing
+       * the target phrases over the span, and one representing all of these incoming hypotheses.
+       * We seed the chart with the best item in each cube, and then repeatedly pop and extend.
+       */
+      
+//      System.err.println(String.format("\nBuilding cube-pruning chart for %d words", source_words));
+
+      targetStack.search();
+    }
+    
+    LOG.info("Input {}: Search took {} seconds", sentence.id(),
+        (System.currentTimeMillis() - startTime) / 1000.0f);
+    
+    return createGoalNode();
+  }
+    
+  /**
+   * Enforces reordering constraints. Our version of Moses' ReorderingConstraint::Check() and
+   * SearchCubePruning::CheckDistortion(). 
+   * 
+   * @param coverage
+   * @param begin
+   * @param i
+   * @return
+   */
+  private boolean permissible(Coverage coverage, int begin, int end) {
+    int firstZero = coverage.firstZero();
+
+    if (config.reordering_limit < 0)
+      return true;
+    
+    /* We can always start with the first zero since it doesn't create a reordering gap
+     */
+    if (begin == firstZero)
+      return true;
+
+    /* If a gap is created by applying this phrase, make sure that you can reach the first
+     * zero later on without violating the distortion constraint.
+     */
+    if (end - firstZero > config.reordering_limit) {
+      return false;
+    }
+    
+    return true;
+  }
+
+
+  /**
+   * Searches through the goal stack, calling the final transition function on each node, and then returning
+   * the best item. Usually the final transition code doesn't add anything, because all features
+   * have already computed everything they need to. The standard exception is language models that
+   * have not yet computed their prefix probabilities (which is not the case with KenLM, the default).
+   * 
+   * @return
+   */
+  private HyperGraph createGoalNode() {
+    Stack lastStack = stacks.get(sentence.length());
+    
+    for (Hypothesis hyp: lastStack) {
+      float score = hyp.getScore();
+      List<HGNode> tailNodes = new ArrayList<HGNode>();
+      tailNodes.add(hyp);
+      
+      float finalTransitionScore = ComputeNodeResult.computeFinalCost(featureFunctions, tailNodes, 0, sentence.length(), null, sentence);
+
+      if (null == this.end)
+        this.end = new Hypothesis(null, score + finalTransitionScore, hyp, sentence.length(), null);
+
+      HyperEdge edge = new HyperEdge(null, score + finalTransitionScore, finalTransitionScore, tailNodes, null);
+      end.addHyperedgeInNode(edge);
+    }
+    
+    return new HyperGraph(end, -1, -1, this.sentence);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java
new file mode 100644
index 0000000..05a4b0a
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.phrase;
+
+import java.util.ArrayList;	
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.tm.Rule;
+
+/**
+ * Represents a sorted collection of target-side phrases. Typically, these are phrases
+ * generated from the same source word sequence. The list of options is reduced to the number
+ * of translation options.
+ * 
+ * @author Matt Post
+ */
+
+public class TargetPhrases extends ArrayList<Rule> {
+
+  private static final long serialVersionUID = 1L;
+
+  public TargetPhrases() {
+    super();
+  }
+  
+  /**
+   * Initialize with a collection of rules.
+   * 
+   * @param list a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.tm.Rule}'s
+   */
+  public TargetPhrases(List<Rule> list) {
+    super();
+    
+    for (Rule rule: list) {
+      add(rule);
+    }
+  }
+  
+  /**
+   * Score the rules and sort them. Scoring is necessary because rules are only scored if they
+   * are used, in an effort to make reading in rules more efficient. This is starting to create
+   * some trouble and should probably be reworked.
+   * @param features a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+   * @param weights a populated {@link org.apache.joshua.decoder.ff.FeatureVector}
+   * @param num_options the number of options
+   */
+  public void finish(List<FeatureFunction> features, FeatureVector weights, int num_options) {
+    for (Rule rule: this) { 
+      rule.estimateRuleCost(features);
+//      System.err.println("TargetPhrases:finish(): " + rule);
+    }
+    Collections.sort(this, Rule.EstimatedCostComparator);
+    
+    if (this.size() > num_options)
+      this.removeRange(num_options, this.size());
+    
+//    System.err.println("TargetPhrases::finish()");
+//    for (Rule rule: this) 
+//      System.err.println("  " + rule);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java
new file mode 100644
index 0000000..5146e2c
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import javax.swing.text.Segment;
+
+/**
+ * This interface is for an individual (partial) item to seed the chart with. All rules should be
+ * flat (no hierarchical nonterminals).
+ * <p>
+ * The {@link Segment}, {@link ConstraintSpan}, and {@link ConstraintRule} interfaces are for
+ * defining an interchange format between a SegmentFileParser and the Chart class. These interfaces
+ * <b>should not</b> be used internally by the Chart. The objects returned by a
+ * SegmentFileParser will not be optimal for use during decoding. The Chart should convert each of
+ * these objects into its own internal representation during construction. That is the contract
+ * described by these interfaces.
+ * 
+ * @see org.apache.joshua.decoder.segment_file.ConstraintRule.Type
+ * 
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
+public interface ConstraintRule {
+
+  /**
+   * <p>There are three types of ConstraintRule. The RULE type returns non-null values for all methods.
+   * The LHS type provides a (non-null) value for the lhs method, but returns null for everything
+   * else. And the RHS type provides a (non-null) value for nativeRhs and foreignRhs but returns
+   * null for the lhs and features.</p>
+   * <p>
+   * The interpretation of a RULE is that it adds a new rule to the grammar which only applies to
+   * the associated span. If the associated span is hard, then the set of rules for that span will
+   * override the regular grammar.</p>
+   * <p>
+   * The intepretation of a LHS is that it provides a hard constraint that the associated span be
+   * treated as the nonterminal for that span, thus filtering the regular grammar.</p>
+   * <p>
+   * The interpretation of a RHS is that it provides a hard constraint to filter the regular grammar
+   * such that only rules generating the desired translation can be used.</p>
+   */
+  public enum Type {
+    RULE, LHS, RHS
+  };
+
+  /** 
+   * Return the type of this ConstraintRule.
+   * @return the {@link org.apache.joshua.decoder.segment_file.ConstraintRule.Type}
+   */
+  Type type();
+
+
+  /**
+   * Return the left hand side of the constraint rule. If this is null, then this object is
+   * specifying a translation for the span, but that translation may be derived from any
+   * nonterminal. The nonterminal here must be one used by the regular grammar.
+   * @return the left hand side of the constraint rule
+   */
+  String lhs();
+
+
+  /**
+   * Return the native right hand side of the constraint rule. If this is null, then the regular
+   * grammar will be used to fill in the derivation from the lhs.
+   * @return the native right hand side of the constraint rule
+   */
+  String nativeRhs();
+
+
+  /**
+   * Return the foreign right hand side of the constraint rule. This must be consistent with the
+   * sentence for the associated span, and is provided as a convenience method.
+   * @return the foreign right hand side of the constraint rule
+   */
+  String foreignRhs();
+
+
+  /**
+   * Return the grammar feature values for the RULE. The length of this array must be the same as
+   * for the regular grammar. We cannot enforce this requirement, but the
+   * {@link org.apache.joshua.decoder.chart_parser.Chart} must throw an error if there is a mismatch.
+   * @return an array of floating feature values for the RULE 
+   */
+  float[] features();
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java
new file mode 100644
index 0000000..9863fa6
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import java.util.List;
+
+import javax.swing.text.Segment;
+
+/**
+ * This interface represents a collection of constraints for a given span in the associated segment.
+ * Intuitively, each constraint corresponds to one or more items in the chart for parsing, except
+ * that we pre-seed the chart with these items before beginning the parsing algorithm. Some
+ * constraints can be "hard", in which case the regular grammar is not consulted for these spans. It
+ * is an error to have hard constraints for overlapping spans.
+ * <p>
+ * Indices for the span boundaries mark the transitions between words. Thus, the 0 index occurs
+ * before the first word, the 1 index occurs between the first and second words, 2 is between the
+ * second and third, etc. Consequently, it is an error for the end index to be equal to or less than
+ * the start index. It is also an error to have negative indices or to have indices larger than the
+ * count of words in the segment. Clients may assume that no <code>ConstraintSpan</code> objects are
+ * constructed which violate these laws.
+ * <p>
+ * The {@link Segment}, {@link ConstraintSpan}, and {@link ConstraintRule} interfaces are for
+ * defining an interchange format between a SegmentFileParser and the Chart class. These interfaces
+ * <b>should not</b> be used internally by the Chart. The objects returned by a
+ * SegmentFileParser will not be optimal for use during decoding. The Chart should convert each of
+ * these objects into its own internal representation during construction. That is the contract
+ * described by these interfaces.
+ * 
+ * @author wren ng thornton wren@users.sourceforge.net
+ */
+public interface ConstraintSpan {
+
+  /**
+   * Return the starting index of the span covered by this constraint.
+   * @return the starting index of the span covered by this constraint
+   */
+  int start();
+
+  /**
+   * Return the ending index of the span covered by this constraint. Clients may assume
+   * <code>this.end() &gt;= 1 + this.start()</code>.
+   * @return the ending index of the span covered by this constraint
+   */
+  int end();
+
+  /**
+   * Return whether this is a hard constraint which should override the grammar. This value only
+   * really matters for sets of <code>RULE</code> type constraints.
+   * @return true if a hard constraint exists which should override the grammar
+   */
+  boolean isHard();
+
+  /**
+   * Return a collection of the "rules" for this constraint span.
+   * <p>
+   * This return type is suboptimal for some SegmentFileParsers. It should be an
+   * {@link java.util.Iterator} instead in order to reduce the coupling between this class and
+   * Chart. See the note above about the fact that this interface should not be used internally by
+   * the Chart class because it will not be performant.
+   * @return a {@link java.util.List} of {@link org.apache.joshua.decoder.segment_file.ConstraintRule}'s
+   */
+  List<ConstraintRule> rules();
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java
new file mode 100644
index 0000000..b9b1896
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+
+public class ParseTreeInput extends Sentence {
+
+  public ParseTreeInput(String input, int id, JoshuaConfiguration joshuaConfiguration) {
+    super(input, id,joshuaConfiguration);
+  }
+
+  // looks_like_parse_tree = sentence.sentence().matches("^\\(+[A-Z]+ .*");
+
+  // private SyntaxTree syntax_tree;
+
+  // ParseTreeInput() {
+  // SyntaxTree syntax_tree = new ArraySyntaxTree(sentence.sentence(), Vocabulary);
+  // }
+
+  // public int[] int_sentence() {
+  // return syntax_tree.getTerminals();
+  // }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java
new file mode 100644
index 0000000..a97718e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.corpus.syntax.ArraySyntaxTree;
+import org.apache.joshua.corpus.syntax.SyntaxTree;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+
+public class ParsedSentence extends Sentence {
+
+  private SyntaxTree syntaxTree = null;
+
+  public ParsedSentence(String input, int id,JoshuaConfiguration joshuaConfiguration) {
+    super(input, id, joshuaConfiguration);
+  }
+
+  public int[] getWordIDs() {
+    int[] terminals = syntaxTree().getTerminals();
+    int[] annotated = new int[terminals.length + 2];
+    System.arraycopy(terminals, 0, annotated, 1, terminals.length);
+    annotated[0] = Vocabulary.id(Vocabulary.START_SYM);
+    annotated[annotated.length - 1] = Vocabulary.id(Vocabulary.STOP_SYM);
+    return annotated;
+  }
+
+  public SyntaxTree syntaxTree() {
+    if (syntaxTree == null)
+      syntaxTree = new ArraySyntaxTree(this.source());
+    return syntaxTree;
+  }
+
+  public static boolean matches(String input) {
+    return input.matches("^\\(+[A-Z]+ .*");
+  }
+
+  public String fullSource() {
+    return Vocabulary.getWords(this.getWordIDs());
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
new file mode 100644
index 0000000..e323ef6
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
@@ -0,0 +1,450 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import static org.apache.joshua.util.FormatUtils.addSentenceMarkers;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.StringTokenizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.lattice.Arc;
+import org.apache.joshua.lattice.Lattice;
+import org.apache.joshua.lattice.Node;
+import org.apache.joshua.util.ChartSpan;
+import org.apache.joshua.util.Regex;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class represents lattice input. The lattice is contained on a single line and is represented
+ * in PLF (Python Lattice Format), e.g.,
+ * 
+ * <pre>
+ * ((('ein',0.1,1),('dieses',0.2,1),('haus',0.4,2),),(('haus',0.8,1),),)
+ * </pre>
+ * 
+ * @author Matt Post post@cs.jhu.edu
+ */
+
+public class Sentence {
+
+  private static final Logger LOG = LoggerFactory.getLogger(Sentence.class);
+
+  /* The sentence number. */
+  public int id = -1;
+
+  /*
+   * The source and target sides of the input sentence. Target sides are present when doing
+   * alignment or forced decoding.
+   */
+  protected String source = null;
+  protected String fullSource = null;
+  
+  protected String target = null;
+  protected String fullTarget = null;
+  protected String[] references = null;
+
+  /* Lattice representation of the source sentence. */
+  protected Lattice<Token> sourceLattice = null;
+
+  /* List of constraints */
+  private final List<ConstraintSpan> constraints;
+  
+  public JoshuaConfiguration config = null;
+
+  /**
+   * Constructor. Receives a string representing the input sentence. This string may be a
+   * string-encoded lattice or a plain text string for decoding.
+   * 
+   * @param inputString representing the input sentence
+   * @param id ID to associate with the input string
+   * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   */
+  public Sentence(String inputString, int id, JoshuaConfiguration joshuaConfiguration) {
+  
+    inputString = Regex.spaces.replaceAll(inputString, " ").trim();
+    
+    config = joshuaConfiguration;
+    
+    this.constraints = new LinkedList<ConstraintSpan>();
+
+    // Check if the sentence has SGML markings denoting the
+    // sentence ID; if so, override the id passed in to the
+    // constructor
+    Matcher start = SEG_START.matcher(inputString);
+    if (start.find()) {
+      source = SEG_END.matcher(start.replaceFirst("")).replaceFirst("");
+      String idstr = start.group(1);
+      this.id = Integer.parseInt(idstr);
+
+    } else {
+      if (inputString.indexOf(" ||| ") != -1) {
+        /* Target-side given; used for parsing and forced decoding */
+        String[] pieces = inputString.split("\\s?\\|{3}\\s?");
+        source = pieces[0];
+        target = pieces[1];
+        if (target.equals(""))
+          target = null;
+        if (pieces.length > 2) {
+          references = new String[pieces.length - 2];
+          System.arraycopy(pieces, 2, references, 0, pieces.length - 2);
+        }
+        this.id = id;
+
+      } else {
+        /* Regular ol' input sentence */
+        source = inputString;
+        this.id = id;
+      }
+    }
+    
+    // Only trim strings
+    if (! (joshuaConfiguration.lattice_decoding && source.startsWith("(((")))
+      adjustForLength(joshuaConfiguration.maxlen);
+  }
+
+  /**
+   * Indicates whether the underlying lattice is a linear chain, i.e., a sentence.
+   * 
+   * @return true if this is a linear chain, false otherwise
+   */
+  public boolean isLinearChain() {
+    return ! this.getLattice().hasMoreThanOnePath();
+  }
+
+  // Matches the opening and closing <seg> tags, e.g.,
+  // <seg id="72">this is a test input sentence</seg>.
+  protected static final Pattern SEG_START = Pattern
+      .compile("^\\s*<seg\\s+id=\"?(\\d+)\"?[^>]*>\\s*");
+  protected static final Pattern SEG_END = Pattern.compile("\\s*</seg\\s*>\\s*$");
+
+  /**
+   * Returns the length of the sentence. For lattices, the length is the shortest path through the
+   * lattice. The length includes the &lt;s&gt; and &lt;/s&gt; sentence markers. 
+   * 
+   * @return number of input tokens + 2 (for start and end of sentence markers)
+   */
+  public int length() {
+    return this.getLattice().getShortestDistance();
+  }
+
+  /**
+   * Returns the annotations for a specific word (specified by an index) in the 
+   * sentence
+   * 
+   * @param index The location of the word in the sentence
+   * @param key The annotation identity
+   * @return The annotations associated with this word
+   */
+  public String getAnnotation(int index, String key) {
+    return getTokens().get(index).getAnnotation(key);
+  }
+
+  /**
+   * This function computes the intersection of \sigma^+ (where \sigma is the terminal vocabulary)
+   * with all character-level segmentations of each OOV in the input sentence.
+   * 
+   * The idea is to break apart noun compounds in languages like German (such as the word "golfloch"
+   * = "golf" (golf) + "loch" (hole)), allowing them to be translated.
+   * 
+   * @param grammars a list of grammars to consult to find in- and out-of-vocabulary items
+   */
+  public void segmentOOVs(Grammar[] grammars) {
+    Lattice<Token> oldLattice = this.getLattice();
+
+    /* Build a list of terminals across all grammars */
+    HashSet<Integer> vocabulary = new HashSet<Integer>();
+    for (Grammar grammar : grammars) {
+      Iterator<Integer> iterator = grammar.getTrieRoot().getTerminalExtensionIterator();
+      while (iterator.hasNext())
+        vocabulary.add(iterator.next());
+    }
+
+    List<Node<Token>> oldNodes = oldLattice.getNodes();
+
+    /* Find all the subwords that appear in the vocabulary, and create the lattice */
+    for (int nodeid = oldNodes.size() - 3; nodeid >= 1; nodeid -= 1) {
+      if (oldNodes.get(nodeid).getOutgoingArcs().size() == 1) {
+        Arc<Token> arc = oldNodes.get(nodeid).getOutgoingArcs().get(0);
+        String word = Vocabulary.word(arc.getLabel().getWord());
+        if (!vocabulary.contains(arc.getLabel())) {
+          // System.err.println(String.format("REPL: '%s'", word));
+          List<Arc<Token>> savedArcs = oldNodes.get(nodeid).getOutgoingArcs();
+
+          char[] chars = word.toCharArray();
+          ChartSpan<Boolean> wordChart = new ChartSpan<Boolean>(chars.length + 1, false);
+          ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>(chars.length + 1);
+          nodes.add(oldNodes.get(nodeid));
+          for (int i = 1; i < chars.length; i++)
+            nodes.add(new Node<Token>(i));
+          nodes.add(oldNodes.get(nodeid + 1));
+          for (int width = 1; width <= chars.length; width++) {
+            for (int i = 0; i <= chars.length - width; i++) {
+              int j = i + width;
+              if (width != chars.length) {
+                Token token = new Token(word.substring(i, j), config);
+                if (vocabulary.contains(id)) {
+                  nodes.get(i).addArc(nodes.get(j), 0.0f, token);
+                  wordChart.set(i, j, true);
+                  //                    System.err.println(String.format("  FOUND '%s' at (%d,%d)", word.substring(i, j),
+                  //                        i, j));
+                }
+              }
+
+              for (int k = i + 1; k < j; k++) {
+                if (wordChart.get(i, k) && wordChart.get(k, j)) {
+                  wordChart.set(i, j, true);
+                  //                    System.err.println(String.format("    PATH FROM %d-%d-%d", i, k, j));
+                }
+              }
+            }
+          }
+
+          /* If there's a path from beginning to end */
+          if (wordChart.get(0, chars.length)) {
+            // Remove nodes not part of a complete path
+            HashSet<Node<Token>> deletedNodes = new HashSet<Node<Token>>();
+            for (int k = 1; k < nodes.size() - 1; k++)
+              if (!(wordChart.get(0, k) && wordChart.get(k, chars.length)))
+                nodes.set(k, null);
+
+            int delIndex = 1;
+            while (delIndex < nodes.size())
+              if (nodes.get(delIndex) == null) {
+                deletedNodes.add(nodes.get(delIndex));
+                nodes.remove(delIndex);
+              } else
+                delIndex++;
+
+            for (Node<Token> node : nodes) {
+              int arcno = 0;
+              while (arcno != node.getOutgoingArcs().size()) {
+                Arc<Token> delArc = node.getOutgoingArcs().get(arcno);
+                if (deletedNodes.contains(delArc.getHead()))
+                  node.getOutgoingArcs().remove(arcno);
+                else {
+                  arcno++;
+                  //                    System.err.println("           ARC: " + Vocabulary.word(delArc.getLabel()));
+                }
+              }
+            }
+
+            // Insert into the main lattice
+            this.getLattice().insert(nodeid, nodeid + 1, nodes);
+          } else {
+            nodes.get(0).setOutgoingArcs(savedArcs);
+          }
+        }
+      }
+    }
+  }
+
+  /**
+   * If the input sentence is too long (not counting the &lt;s&gt; and &lt;/s&gt; tokens), it is truncated to
+   * the maximum length, specified with the "maxlen" parameter.
+   * 
+   * Note that this code assumes the underlying representation is a sentence, and not a lattice. Its
+   * behavior is undefined for lattices.
+   * 
+   * @param length int representing the length to truncate the sentence to
+   */
+  protected void adjustForLength(int length) {
+    int size = this.getLattice().size() - 2; // subtract off the start- and end-of-sentence tokens
+
+    if (size > length) {
+      LOG.warn("sentence {} too long {}, truncating to length {}", id(), size, length);
+
+      // Replace the input sentence (and target) -- use the raw string, not source()
+      String[] tokens = source.split("\\s+");
+      source = tokens[0];
+      for (int i = 1; i < length; i++)
+        source += " " + tokens[i];
+      sourceLattice = null;
+      if (target != null) {
+        target = "";
+      }
+    }
+  }
+
+  public boolean isEmpty() {
+    return source.matches("^\\s*$");
+  }
+
+  public int id() {
+    return id;
+  }
+
+  /**
+   * Returns the raw source-side input string.
+   * @return the raw source-side input string
+   */
+  public String rawSource() {
+    return source;
+  }
+  
+  /**
+   * Returns the source-side string with annotations --- if any --- stripped off.
+   * 
+   * @return  the source-side string with annotations --- if any --- stripped off
+   */
+  public String source() {
+    StringBuilder str = new StringBuilder();
+    int[] ids = getWordIDs();
+    for (int i = 1; i < ids.length - 1; i++) {
+      str.append(Vocabulary.word(ids[i])).append(" ");
+    }
+    return str.toString().trim();
+  }
+
+  /**
+   * Returns a sentence with the start and stop symbols added to the 
+   * beginning and the end of the sentence respectively
+   * 
+   * @return String The input sentence with start and stop symbols
+   */
+  public String fullSource() {
+    if (fullSource == null) {
+      fullSource = addSentenceMarkers(source());
+    }
+    return fullSource;  
+  }
+
+  /**
+   * If a target side was supplied with the sentence, this will be non-null. This is used when doing
+   * synchronous parsing or constrained decoding. The input format is:
+   * 
+   * Bill quiere ir a casa ||| Bill wants to go home
+   * 
+   * If the parameter parse=true is set, parsing will be triggered, otherwise constrained decoding.
+   * 
+   * @return target side of sentence translation
+   */
+  public String target() {
+    return target;
+  }
+
+  public String fullTarget() {
+    if (fullTarget == null) {
+      fullTarget = addSentenceMarkers(target());
+    }
+    return fullTarget; 
+  }
+
+  public String source(int i, int j) {
+    StringTokenizer st = new StringTokenizer(fullSource());
+    int index = 0;
+    StringBuilder substring = new StringBuilder();
+    while (st.hasMoreTokens()) {
+      String token = st.nextToken();
+      if (index >= j)
+        break;
+      if (index >= i)
+        substring.append(token).append(" ");
+      index++;
+    }
+    return substring.toString().trim();
+  }
+
+  public String[] references() {
+    return references;
+  }
+
+  /**
+   * Returns the sequence of tokens comprising the sentence. This assumes you've done the checking
+   * to makes sure the input string (the source side) isn't a PLF waiting to be parsed.
+   * 
+   * @return a {@link java.util.List} of {@link org.apache.joshua.decoder.segment_file.Token}'s comprising the sentence
+   */
+  public List<Token> getTokens() {
+    assert isLinearChain();
+    List<Token> tokens = new ArrayList<Token>();
+    for (Node<Token> node: getLattice().getNodes())
+      if (node != null && node.getOutgoingArcs().size() > 0) 
+        tokens.add(node.getOutgoingArcs().get(0).getLabel());
+    return tokens;
+  }
+  
+  /**
+   * Returns the sequence of word IDs comprising the input sentence. Assumes this is not a general
+   * lattice, but a linear chain.
+   * @return an int[] comprising all word ID's
+   */
+  public int[] getWordIDs() {
+    List<Token> tokens = getTokens();
+    int[] ids = new int[tokens.size()];
+    for (int i = 0; i < tokens.size(); i++)
+      ids[i] = tokens.get(i).getWord();
+    return ids;
+  }
+  
+  /**
+   * Returns the sequence of word ids comprising the sentence. Assumes this is a sentence and
+   * not a lattice.
+   *  
+   * @return the sequence of word ids comprising the sentence
+   */
+  public Lattice<String> stringLattice() {
+    assert isLinearChain();
+    return Lattice.createStringLatticeFromString(source(), config);
+  }
+
+  public List<ConstraintSpan> constraints() {
+    return constraints;
+  }
+
+  public Lattice<Token> getLattice() {
+    if (this.sourceLattice == null) {
+      if (config.lattice_decoding && rawSource().startsWith("(((")) {
+        if (config.search_algorithm.equals("stack")) {
+          throw new RuntimeException("* FATAL: lattice decoding currently not supported for stack-based search algorithm.");
+        }
+        this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource(), config);
+      } else
+        this.sourceLattice = Lattice.createTokenLatticeFromString(String.format("%s %s %s", Vocabulary.START_SYM,
+            rawSource(), Vocabulary.STOP_SYM), config);
+    }
+    return this.sourceLattice;
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder(source());
+    if (target() != null) {
+      sb.append(" ||| " + target());
+    }
+    return sb.toString();
+  }
+
+  public boolean hasPath(int begin, int end) {
+    return getLattice().distance(begin, end) != -1;
+  }
+
+  public Node<Token> getNode(int i) {
+    return getLattice().getNode(i);
+  }
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Token.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
new file mode 100644
index 0000000..b84826d
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import static org.apache.joshua.util.FormatUtils.escapeSpecialSymbols;
+
+import java.util.HashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.util.FormatUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Stores the identity of a word and its annotations in a sentence.
+
+ * @author "Gaurav Kumar"
+ * @author Matt Post
+ */
+public class Token {
+
+  private static final Logger LOG = LoggerFactory.getLogger(Token.class);
+
+  // The token without the annotations
+  private String token; 
+  private int tokenID;
+
+  private HashMap<String,String> annotations = null;
+  private JoshuaConfiguration joshuaConfiguration;
+
+  /**
+   * <p>Constructor : Creates a Token object from a raw word
+   * Extracts and assigns an annotation when available.
+   * Any word can be marked with annotations, which are arbitrary semicolon-delimited
+   * key[=value] pairs (the value is optional) listed in brackets after a word, e.g.,</p>
+   * <pre>
+   *    Je[ref=Samuel;PRO] voudrais[FUT;COND]
+   * </pre>
+   * 
+   * <p>This will create a dictionary annotation on the word of the following form for "Je"</p>
+   * 
+   * <pre>
+   *   ref -&gt; Samuel
+   *   PRO -&gt; PRO
+   * </pre>
+   * 
+   * <p>and the following for "voudrais":</p>
+   * 
+   * <pre>
+   *   FUT  -&gt; FUT
+   *   COND -&gt; COND
+   * </pre>
+   * 
+   * @param rawWord A word with annotation information (possibly)
+   * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   *  
+   */
+  public Token(String rawWord, JoshuaConfiguration config) {
+    
+    this.joshuaConfiguration = config;
+    
+    annotations = new HashMap<String,String>();
+    
+    // Matches a word with an annotation
+    // Check guidelines in constructor description
+    Pattern pattern = Pattern.compile("(\\S+)\\[(\\S+)\\]");
+    Matcher tag = pattern.matcher(rawWord);
+    if (tag.find()) {
+      // Annotation match found
+      token = tag.group(1);
+      String tagStr = tag.group(2);
+
+      for (String annotation: tagStr.split(";")) {
+        int where = annotation.indexOf("=");
+        if (where != -1) {
+          annotations.put(annotation.substring(0, where), annotation.substring(where + 1));
+        } else {
+          annotations.put(annotation, annotation);
+        }
+      }
+    } else {
+      // No match found, which implies that this token does not have any annotations 
+      token = rawWord;
+    }
+
+    // Mask strings that cause problems for the decoder. This has to be done *after* parsing for
+    // annotations.
+    token = escapeSpecialSymbols(token);
+
+    if (joshuaConfiguration != null && joshuaConfiguration.lowercase) {
+      if (FormatUtils.ISALLUPPERCASE(token))
+        annotations.put("lettercase", "all-upper");
+      else if (Character.isUpperCase(token.charAt(0)))
+        annotations.put("lettercase",  "upper");
+      else
+        annotations.put("lettercase",  "lower");
+      
+      LOG.info("TOKEN: {} -> {} ({})", token, token.toLowerCase(), annotations.get("lettercase"));
+      token = token.toLowerCase(); 
+    }
+    
+    tokenID = Vocabulary.id(token);
+  }
+  
+  /**
+   * Returns the word ID (vocab ID) for this token
+   * 
+   * @return int A word ID
+   */
+  public int getWord() {
+    return tokenID;
+  }
+
+  /**
+   * Returns the string associated with this token
+   * @return String A word
+   */
+  public String getWordIdentity() {
+    return token;
+  }
+  
+  public String toString() {
+    return token;
+  }
+
+  /**
+   * Returns the annotationID (vocab ID)
+   * associated with this token
+   * @param key A type ID
+   * @return the annotationID (vocab ID)
+   */
+  public String getAnnotation(String key) {
+    if (annotations.containsKey(key)) {
+      return annotations.get(key);
+    }
+    
+    return null;
+  }
+}
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java
new file mode 100644
index 0000000..a615030
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/** 
+ * Provides common interfaces for parsing segment files 
+ * (aka test corpora to be translated). In order to support 
+ * constraint annotations, we provide a general API for 
+ * use by JoshuaDecoder and Chart.
+ */
+package org.apache.joshua.decoder.segment_file;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/lattice/Arc.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/lattice/Arc.java b/joshua-core/src/main/java/org/apache/joshua/lattice/Arc.java
new file mode 100644
index 0000000..5d056ab
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/lattice/Arc.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.lattice;
+
+/**
+ * An arc in a directed graph.
+ * 
+ * @author Lane Schwartz
+ * @since 2008-07-08
+ * 
+ * @param Label Type of label associated with an arc.
+ */
+public class Arc<Label> {
+
+  /**
+   * Weight of this arc.
+   */
+  private float cost;
+
+  /**
+   * Node where this arc ends. 
+   */
+  private Node<Label> head;
+
+  /**
+   * Node where this arc begins.
+   */
+  private Node<Label> tail;
+
+  /**
+   * Label associated with this arc.
+   */
+  private Label label;
+  
+  /**
+   * Creates an arc with the specified head, tail, cost, and label.
+   * 
+   * @param head The node where this arc begins.
+   * @param tail The node where this arc ends.
+   * @param cost The cost of this arc.
+   * @param label The label associated with this arc.
+   */
+  public Arc(Node<Label> tail, Node<Label> head, float cost, Label label) {
+    this.tail = tail;
+    this.head = head;
+    this.cost = cost;
+    this.label = label;
+  }
+
+  /**
+   * Gets the cost of this arc.
+   * 
+   * @return The cost of this arc.
+   */
+  public float getCost() {
+    return cost;
+  }
+
+  /**
+   * Gets the tail of this arc (the node where this arc begins).
+   * 
+   * @return The tail of this arc.
+   */
+  public Node<Label> getTail() {
+    return tail;
+  }
+
+  /**
+   * Gets the head of this arc (the node where this arc ends).
+   * 
+   * @return The head of this arc.
+   */
+  public Node<Label> getHead() {
+    return head;
+  }
+
+  /**
+   * Gets the label associated with this arc.
+   * 
+   * @return The label associated with this arc.
+   */
+  public Label getLabel() {
+    return label;
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder s = new StringBuilder();
+
+    s.append(label.toString());
+    s.append("  :  ");
+    s.append(tail.toString());
+    s.append(" ==> ");
+    s.append(head.toString());
+    s.append("  :  ");
+    s.append(cost);
+
+    return s.toString();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/lattice/Lattice.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/lattice/Lattice.java b/joshua-core/src/main/java/org/apache/joshua/lattice/Lattice.java
new file mode 100644
index 0000000..2332159
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/lattice/Lattice.java
@@ -0,0 +1,587 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.lattice;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.segment_file.Token;
+import org.apache.joshua.util.ChartSpan;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A lattice representation of a directed graph.
+ *
+ * @author Lane Schwartz
+ * @author Matt Post post@cs.jhu.edu
+ * @since 2008-07-08
+ *
+ */
+public class Lattice<Value> implements Iterable<Node<Value>> {
+
+  private static final Logger LOG = LoggerFactory.getLogger(Lattice.class);
+
+  /**
+   * True if there is more than one path through the lattice.
+   */
+  private boolean latticeHasAmbiguity;
+
+  /**
+   * Costs of the best path between each pair of nodes in the lattice.
+   */
+  private ChartSpan<Integer> distances = null;
+
+  /**
+   * List of all nodes in the lattice. Nodes are assumed to be in topological order.
+   */
+  private List<Node<Value>> nodes;
+
+
+  JoshuaConfiguration config = null;
+
+  /**
+   * Constructs a new lattice from an existing list of (connected) nodes.
+   * <p>
+   * The list of nodes must already be in topological order. If the list is not in topological
+   * order, the behavior of the lattice is not defined.
+   *
+   * @param nodes A list of nodes which must be in topological order.
+   * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   */
+  public Lattice(List<Node<Value>> nodes, JoshuaConfiguration config) {
+    this.nodes = nodes;
+    //    this.distances = calculateAllPairsShortestPath();
+    this.latticeHasAmbiguity = true;
+  }
+
+  public Lattice(List<Node<Value>> nodes, boolean isAmbiguous, JoshuaConfiguration config) {
+    // Node<Value> sink = new Node<Value>(nodes.size());
+    // nodes.add(sink);
+    this.nodes = nodes;
+    //    this.distances = calculateAllPairsShortestPath();
+    this.latticeHasAmbiguity = isAmbiguous;
+  }
+
+  /**
+   * Instantiates a lattice from a linear chain of values, i.e., a sentence.
+   *
+   * @param linearChain a sequence of Value objects
+   * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   */
+  public Lattice(Value[] linearChain, JoshuaConfiguration config) {
+    this.latticeHasAmbiguity = false;
+    this.nodes = new ArrayList<Node<Value>>();
+
+    Node<Value> previous = new Node<Value>(0);
+    nodes.add(previous);
+
+    int i = 1;
+
+    for (Value value : linearChain) {
+
+      Node<Value> current = new Node<Value>(i);
+      float cost = 0.0f;
+      // if (i > 4) cost = (float)i/1.53432f;
+      previous.addArc(current, cost, value);
+
+      nodes.add(current);
+
+      previous = current;
+      i++;
+    }
+
+    //    this.distances = calculateAllPairsShortestPath();
+  }
+
+  public final boolean hasMoreThanOnePath() {
+    return latticeHasAmbiguity;
+  }
+
+  /**
+   * Computes the shortest distance between two nodes, which is used (perhaps among other places) in
+   * computing which rules can apply over which spans of the input
+   *
+   * @param arc an {@link org.apache.joshua.lattice.Arc} of values
+   * @return the shortest distance between two nodes
+   */
+  public int distance(Arc<Value> arc) {
+    return this.getShortestPath(arc.getTail().getNumber(), arc.getHead().getNumber());
+  }
+
+  public int distance(int i, int j) {
+    return this.getShortestPath(i, j);
+  }
+
+  /**
+   * Convenience method to get a lattice from a linear sequence of {@link Token} objects.
+   *
+   * @param source input string from which to create a {@link org.apache.joshua.lattice.Lattice}
+   * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   * @return Lattice representation of the linear chain.
+   */
+  public static Lattice<Token> createTokenLatticeFromString(String source, JoshuaConfiguration config) {
+    String[] tokens = source.split("\\s+");
+    Token[] integerSentence = new Token[tokens.length];
+    for (int i = 0; i < tokens.length; i++) {
+      integerSentence[i] = new Token(tokens[i], config);
+    }
+
+    return new Lattice<Token>(integerSentence, config);
+  }
+
+  public static Lattice<Token> createTokenLatticeFromPLF(String data, JoshuaConfiguration config) {
+    ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>();
+
+    // This matches a sequence of tuples, which describe arcs leaving this node
+    Pattern nodePattern = Pattern.compile("(.+?)\\(\\s*(\\(.+?\\),\\s*)\\s*\\)(.*)");
+
+    /*
+     * This matches a comma-delimited, parenthesized tuple of a (a) single-quoted word (b) a number,
+     * optionally in scientific notation (c) an offset (how many states to jump ahead)
+     */
+    Pattern arcPattern = Pattern
+        .compile("\\s*\\('(.+?)',\\s*(-?\\d+\\.?\\d*?(?:[eE]-?\\d+)?),\\s*(\\d+)\\),\\s*(.*)");
+
+    Matcher nodeMatcher = nodePattern.matcher(data);
+
+    boolean latticeIsAmbiguous = false;
+
+    int nodeID = 0;
+    Node<Token> startNode = new Node<Token>(nodeID);
+    nodes.add(startNode);
+
+    while (nodeMatcher.matches()) {
+
+      String nodeData = nodeMatcher.group(2);
+      String remainingData = nodeMatcher.group(3);
+
+      nodeID++;
+
+      Node<Token> currentNode = null;
+      if (nodeID < nodes.size() && nodes.get(nodeID) != null) {
+        currentNode = nodes.get(nodeID);
+      } else {
+        currentNode = new Node<Token>(nodeID);
+        while (nodeID > nodes.size())
+          nodes.add(new Node<Token>(nodes.size()));
+        nodes.add(currentNode);
+      }
+
+      Matcher arcMatcher = arcPattern.matcher(nodeData);
+      int numArcs = 0;
+      if (!arcMatcher.matches()) {
+        throw new RuntimeException("Parse error!");
+      }
+      while (arcMatcher.matches()) {
+        numArcs++;
+        String arcLabel = arcMatcher.group(1);
+        float arcWeight = Float.parseFloat(arcMatcher.group(2));
+        int destinationNodeID = nodeID + Integer.parseInt(arcMatcher.group(3));
+
+        Node<Token> destinationNode;
+        if (destinationNodeID < nodes.size() && nodes.get(destinationNodeID) != null) {
+          destinationNode = nodes.get(destinationNodeID);
+        } else {
+          destinationNode = new Node<Token>(destinationNodeID);
+          while (destinationNodeID > nodes.size())
+            nodes.add(new Node<Token>(nodes.size()));
+          nodes.add(destinationNode);
+        }
+
+        String remainingArcs = arcMatcher.group(4);
+
+        Token arcToken = new Token(arcLabel, config);
+        currentNode.addArc(destinationNode, arcWeight, arcToken);
+
+        arcMatcher = arcPattern.matcher(remainingArcs);
+      }
+      if (numArcs > 1)
+        latticeIsAmbiguous = true;
+
+      nodeMatcher = nodePattern.matcher(remainingData);
+    }
+
+    /* Add <s> to the start of the lattice. */
+    if (nodes.size() > 1 && nodes.get(1) != null) {
+      Node<Token> firstNode = nodes.get(1);
+      startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM, config));
+    }
+
+    /* Add </s> as a final state, connect it to the previous end-state */
+    nodeID = nodes.get(nodes.size()-1).getNumber() + 1;
+    Node<Token> endNode = new Node<Token>(nodeID);
+    nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM, config));
+    nodes.add(endNode);
+
+    return new Lattice<Token>(nodes, latticeIsAmbiguous, config);
+  }
+
+  /**
+   * Constructs a lattice from a given string representation.
+   *
+   * @param data String representation of a lattice.
+   * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   * @return A lattice that corresponds to the given string.
+   */
+  public static Lattice<String> createStringLatticeFromString(String data, JoshuaConfiguration config) {
+
+    Map<Integer, Node<String>> nodes = new HashMap<Integer, Node<String>>();
+
+    Pattern nodePattern = Pattern.compile("(.+?)\\((\\(.+?\\),)\\)(.*)");
+    Pattern arcPattern = Pattern.compile("\\('(.+?)',(\\d+.\\d+),(\\d+)\\),(.*)");
+
+    Matcher nodeMatcher = nodePattern.matcher(data);
+
+    int nodeID = -1;
+
+    while (nodeMatcher.matches()) {
+
+      String nodeData = nodeMatcher.group(2);
+      String remainingData = nodeMatcher.group(3);
+
+      nodeID++;
+
+      Node<String> currentNode;
+      if (nodes.containsKey(nodeID)) {
+        currentNode = nodes.get(nodeID);
+      } else {
+        currentNode = new Node<String>(nodeID);
+        nodes.put(nodeID, currentNode);
+      }
+
+      LOG.debug("Node : {}", nodeID);
+
+      Matcher arcMatcher = arcPattern.matcher(nodeData);
+
+      while (arcMatcher.matches()) {
+        String arcLabel = arcMatcher.group(1);
+        float arcWeight = Float.valueOf(arcMatcher.group(2));
+        int destinationNodeID = nodeID + Integer.parseInt(arcMatcher.group(3));
+
+        Node<String> destinationNode;
+        if (nodes.containsKey(destinationNodeID)) {
+          destinationNode = nodes.get(destinationNodeID);
+        } else {
+          destinationNode = new Node<String>(destinationNodeID);
+          nodes.put(destinationNodeID, destinationNode);
+        }
+
+        String remainingArcs = arcMatcher.group(4);
+
+        LOG.debug("\t{} {} {}", arcLabel, arcWeight, destinationNodeID);
+
+        currentNode.addArc(destinationNode, arcWeight, arcLabel);
+
+        arcMatcher = arcPattern.matcher(remainingArcs);
+      }
+
+      nodeMatcher = nodePattern.matcher(remainingData);
+    }
+
+    List<Node<String>> nodeList = new ArrayList<Node<String>>(nodes.values());
+    Collections.sort(nodeList, new NodeIdentifierComparator());
+
+    LOG.debug("Nodelist={}", nodeList);
+
+    return new Lattice<String>(nodeList, config);
+  }
+
+  /**
+   * Gets the cost of the shortest path between two nodes.
+   *
+   * @param from ID of the starting node.
+   * @param to ID of the ending node.
+   * @return The cost of the shortest path between the two nodes.
+   */
+  public int getShortestPath(int from, int to) {
+    // System.err.println(String.format("DISTANCE(%d,%d) = %f", from, to, costs[from][to]));
+    if (distances == null)
+      this.distances = calculateAllPairsShortestPath();
+
+    return distances.get(from, to);
+  }
+
+  /**
+   * Gets the shortest distance through the lattice.
+   * @return int representing the shortest distance through the lattice
+   */
+  public int getShortestDistance() {
+    if (distances == null)
+      distances = calculateAllPairsShortestPath();
+    return distances.get(0, nodes.size()-1);
+  }
+
+  /**
+   * Gets the node with a specified integer identifier. If the identifier is negative, we count
+   * backwards from the end of the array, Perl-style (-1 is the last element, -2 the penultimate,
+   * etc).
+   *
+   * @param index Integer identifier for a node.
+   * @return The node with the specified integer identifier
+   */
+  public Node<Value> getNode(int index) {
+    if (index >= 0)
+      return nodes.get(index);
+    else
+      return nodes.get(size() + index);
+  }
+
+  public List<Node<Value>> getNodes() {
+    return nodes;
+  }
+
+  /**
+   * Returns an iterator over the nodes in this lattice.
+   *
+   * @return An iterator over the nodes in this lattice.
+   */
+  public Iterator<Node<Value>> iterator() {
+    return nodes.iterator();
+  }
+
+  /**
+   * Returns the number of nodes in this lattice.
+   *
+   * @return The number of nodes in this lattice.
+   */
+  public int size() {
+    return nodes.size();
+  }
+
+  /**
+   * Calculate the all-pairs shortest path for all pairs of nodes.
+   * <p>
+   * Note: This method assumes no backward arcs. If there are backward arcs, the returned shortest
+   * path costs for that node may not be accurate.
+   *
+   * @param nodes A list of nodes which must be in topological order.
+   * @return The all-pairs shortest path for all pairs of nodes.
+   */
+  private ChartSpan<Integer> calculateAllPairsShortestPath() {
+
+    ChartSpan<Integer> distance = new ChartSpan<Integer>(nodes.size() - 1, Integer.MAX_VALUE);
+    distance.setDiagonal(0);
+
+    /* Mark reachability between immediate neighbors */
+    for (Node<Value> tail : nodes) {
+      for (Arc<Value> arc : tail.getOutgoingArcs()) {
+        Node<Value> head = arc.getHead();
+        distance.set(tail.id(), head.id(), 1);
+      }
+    }
+
+    int size = nodes.size();
+
+    for (int width = 2; width <= size; width++) {
+      for (int i = 0; i < size - width; i++) {
+        int j = i + width;
+        for (int k = i + 1; k < j; k++) {
+          distance.set(i, j, Math.min(distance.get(i, j), distance.get(i, k) + distance.get(k, j)));
+        }
+      }
+    }
+
+    return distance;
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder s = new StringBuilder();
+
+    for (Node<Value> start : this) {
+      for (Arc<Value> arc : start.getOutgoingArcs()) {
+        s.append(arc.toString());
+        s.append('\n');
+      }
+    }
+
+    return s.toString();
+  }
+
+  public static void main(String[] args) {
+
+    List<Node<String>> nodes = new ArrayList<Node<String>>();
+    for (int i = 0; i < 4; i++) {
+      nodes.add(new Node<String>(i));
+    }
+
+    nodes.get(0).addArc(nodes.get(1), 1.0f, "x");
+    nodes.get(1).addArc(nodes.get(2), 1.0f, "y");
+    nodes.get(0).addArc(nodes.get(2), 1.5f, "a");
+    nodes.get(2).addArc(nodes.get(3), 3.0f, "b");
+    nodes.get(2).addArc(nodes.get(3), 5.0f, "c");
+
+    Lattice<String> graph = new Lattice<String>(nodes, null);
+
+    System.out.println("Shortest path from 0 to 3: " + graph.getShortestPath(0, 3));
+  }
+
+  /**
+   * Replaced the arc from node i to j with the supplied lattice. This is used to do OOV
+   * segmentation of words in a lattice.
+   *
+   * @param i start node of arc
+   * @param j end node of arc
+   * @param newNodes new nodes used within the replacement operation
+   */
+  public void insert(int i, int j, List<Node<Value>> newNodes) {
+
+    nodes.get(i).setOutgoingArcs(newNodes.get(0).getOutgoingArcs());
+
+    newNodes.remove(0);
+    nodes.remove(j);
+    Collections.reverse(newNodes);
+
+    for (Node<Value> node: newNodes)
+      nodes.add(j, node);
+
+    this.latticeHasAmbiguity = false;
+    for (int x = 0; x < nodes.size(); x++) {
+      nodes.get(x).setID(x);
+      this.latticeHasAmbiguity |= (nodes.get(x).getOutgoingArcs().size() > 1);
+    }
+
+    this.distances = null;
+  }
+
+  /**
+   * Topologically sorts the nodes and reassigns their numbers. Assumes that the first node is the
+   * source, but otherwise assumes nothing about the input.
+   *
+   * Probably correct, but untested.
+   */
+  @SuppressWarnings("unused")
+  private void topologicalSort() {
+    HashMap<Node<Value>, List<Arc<Value>>> outgraph = new HashMap<Node<Value>, List<Arc<Value>>>();
+    HashMap<Node<Value>, List<Arc<Value>>> ingraph = new HashMap<Node<Value>, List<Arc<Value>>>();
+    for (Node<Value> node: nodes) {
+      ArrayList<Arc<Value>> arcs = new ArrayList<Arc<Value>>();
+      for (Arc<Value> arc: node.getOutgoingArcs()) {
+        arcs.add(arc);
+
+        if (! ingraph.containsKey(arc.getHead()))
+          ingraph.put(arc.getHead(), new ArrayList<Arc<Value>>());
+        ingraph.get(arc.getHead()).add(arc);
+
+        outgraph.put(node, arcs);
+      }
+    }
+
+    ArrayList<Node<Value>> sortedNodes = new ArrayList<Node<Value>>();
+    Stack<Node<Value>> stack = new Stack<Node<Value>>();
+    stack.push(nodes.get(0));
+
+    while (! stack.empty()) {
+      Node<Value> node = stack.pop();
+      sortedNodes.add(node);
+      for (Arc<Value> arc: outgraph.get(node)) {
+        outgraph.get(node).remove(arc);
+        ingraph.get(arc.getHead()).remove(arc);
+
+        if (ingraph.get(arc.getHead()).size() == 0)
+          sortedNodes.add(arc.getHead());
+      }
+    }
+
+    int id = 0;
+    for (Node<Value> node : sortedNodes)
+      node.setID(id++);
+
+    this.nodes = sortedNodes;
+  }
+
+  /**
+   * Constructs a lattice from a given string representation. 
+   *
+   * @param data String representation of a lattice. 
+   * @return A lattice that corresponds to the given string. 
+   */
+  public static Lattice<String> createFromString(String data) {
+
+    Map<Integer,Node<String>> nodes = new HashMap<Integer,Node<String>>();
+
+    Pattern nodePattern = Pattern.compile("(.+?)\\((\\(.+?\\),)\\)(.*)");
+    Pattern arcPattern = Pattern.compile("\\('(.+?)',(\\d+.\\d+),(\\d+)\\),(.*)");
+
+    Matcher nodeMatcher = nodePattern.matcher(data);
+
+    int nodeID = -1;
+
+    while (nodeMatcher.matches()) {
+
+      String nodeData = nodeMatcher.group(2);
+      String remainingData = nodeMatcher.group(3);
+
+      nodeID++;
+
+      Node<String> currentNode;
+      if (nodes.containsKey(nodeID)) {
+        currentNode = nodes.get(nodeID);
+      } else {
+        currentNode = new Node<String>(nodeID);
+        nodes.put(nodeID, currentNode);
+      }
+
+      LOG.debug("Node : {}", nodeID);
+
+      Matcher arcMatcher = arcPattern.matcher(nodeData);
+
+      while (arcMatcher.matches()) {
+        String arcLabel = arcMatcher.group(1);
+        double arcWeight = Double.valueOf(arcMatcher.group(2));
+        int destinationNodeID = nodeID + Integer.valueOf(arcMatcher.group(3));
+
+        Node<String> destinationNode;
+        if (nodes.containsKey(destinationNodeID)) {
+          destinationNode = nodes.get(destinationNodeID);
+        } else {
+          destinationNode = new Node<String>(destinationNodeID);
+          nodes.put(destinationNodeID, destinationNode);
+        }
+
+        String remainingArcs = arcMatcher.group(4);
+
+        LOG.debug("\t {} {} {}", arcLabel,  arcWeight, destinationNodeID);
+
+        currentNode.addArc(destinationNode, (float) arcWeight, arcLabel);
+
+        arcMatcher = arcPattern.matcher(remainingArcs);
+      }
+
+      nodeMatcher = nodePattern.matcher(remainingData);
+    }
+
+    List<Node<String>> nodeList = new ArrayList<Node<String>>(nodes.values());
+    Collections.sort(nodeList, new NodeIdentifierComparator());
+
+    LOG.debug("Nodelist={}", nodeList);
+
+    return new Lattice<String>(nodeList, new JoshuaConfiguration());
+  }
+}