You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/17 10:32:15 UTC
[34/56] [partial] incubator-joshua git commit: maven multi-module
layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stack.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stack.java b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stack.java
new file mode 100644
index 0000000..d0ae2da
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stack.java
@@ -0,0 +1,229 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.phrase;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.PriorityQueue;
+import java.util.Set;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.ComputeNodeResult;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Organizes all hypotheses containing the same number of source words.
+ *
+ */
+public class Stack extends ArrayList<Hypothesis> {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Stack.class);
+
+ private static final long serialVersionUID = 7885252799032416068L;
+
+ private HashMap<Coverage, ArrayList<Hypothesis>> coverages;
+
+ private Sentence sentence;
+ private List<FeatureFunction> featureFunctions;
+ private JoshuaConfiguration config;
+
+ /* The list of states we've already visited. */
+ private HashSet<Candidate> visitedStates;
+
+ /* A list of candidates sorted for consideration for entry to the chart (for cube pruning) */
+ private PriorityQueue<Candidate> candidates;
+
+ /* Short-circuits adding a cube-prune state more than once */
+ private HashMap<Hypothesis, Hypothesis> deduper;
+
+ /**
+ * Create a new stack. Stacks are organized one for each number of source words that are covered.
+ *
+ * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ * @param sentence input for a {@link org.apache.joshua.lattice.Lattice}
+ * @param config populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ */
+ public Stack(List<FeatureFunction> featureFunctions, Sentence sentence, JoshuaConfiguration config) {
+ this.featureFunctions = featureFunctions;
+ this.sentence = sentence;
+ this.config = config;
+
+ this.candidates = new PriorityQueue<Candidate>(1, new CandidateComparator());
+ this.coverages = new HashMap<Coverage, ArrayList<Hypothesis>>();
+ this.visitedStates = new HashSet<Candidate>();
+ this.deduper = new HashMap<Hypothesis,Hypothesis>();
+ }
+
+ /**
+ * A Stack is an ArrayList; here, we intercept the add so we can maintain a list of the items
+ * stored under each distinct coverage vector
+ * @param hyp a {@link org.apache.joshua.decoder.phrase.Hypothesis} to add to the {@link org.apache.joshua.decoder.phrase.Stack}
+ * @return true if the {@link org.apache.joshua.decoder.phrase.Hypothesis} is appended to the list
+ */
+ @Override
+ public boolean add(Hypothesis hyp) {
+
+ if (! coverages.containsKey((hyp.getCoverage())))
+ coverages.put(hyp.getCoverage(), new ArrayList<Hypothesis>());
+ coverages.get(hyp.getCoverage()).add(hyp);
+
+ return super.add(hyp);
+ }
+
+ /**
+ * Intercept calls to remove() so that we can reduce the coverage vector
+ */
+ @Override
+ public boolean remove(Object obj) {
+ boolean found = super.remove(obj);
+ if (found) {
+ Hypothesis item = (Hypothesis) obj;
+ Coverage cov = item.getCoverage();
+ assert coverages.get(cov).remove(obj);
+ if (coverages.get(cov).size() == 0)
+ coverages.remove(cov);
+ }
+ return found;
+ }
+
+ /**
+ * Returns the set of coverages contained in this stack. This is used to iterate over them
+ * in the main decoding loop in Stacks.java.
+ * @return a {@link java.util.Set} of {@link org.apache.joshua.decoder.phrase.Coverage}'s
+ */
+ public Set<Coverage> getCoverages() {
+ return coverages.keySet();
+ }
+
+ /**
+ * Get all items with the same coverage vector.
+ *
+ * @param cov the {@link org.apache.joshua.decoder.phrase.Coverage} vector to get
+ * @return an {@link java.util.ArrayList} of {@link org.apache.joshua.decoder.phrase.Hypothesis}'
+ */
+ public ArrayList<Hypothesis> get(Coverage cov) {
+ ArrayList<Hypothesis> list = coverages.get(cov);
+ Collections.sort(list);
+ return list;
+ }
+
+ /**
+ * Receives a partially-initialized translation candidate and places it on the
+ * priority queue after scoring it with all of the feature functions. In this
+ * respect it is like {@link org.apache.joshua.decoder.chart_parser.CubePruneState} (it could make use of that class with
+ * a little generalization of spans / coverage).
+ *
+ * This function is also used to (fairly concisely) implement constrained decoding. Before
+ * adding a candidate, we ensure that the sequence of English words match the sentence. If not,
+ * the code extends the dot in the cube-pruning chart to the next phrase, since that one might
+ * be a match.
+ * @param cand a partially-initialized translation {@link org.apache.joshua.decoder.phrase.Candidate}
+ */
+ public void addCandidate(Candidate cand) {
+ if (visitedStates.contains(cand))
+ return;
+
+ visitedStates.add(cand);
+
+ // Constrained decoding
+ if (sentence.target() != null) {
+ String oldWords = cand.getHypothesis().bestHyperedge.getRule().getEnglishWords().replace("[X,1] ", "");
+ String newWords = cand.getRule().getEnglishWords().replace("[X,1] ", "");
+
+ // If the string is not found in the target sentence, explore the cube neighbors
+ if (sentence.fullTarget().indexOf(oldWords + " " + newWords) == -1) {
+ Candidate next = cand.extendPhrase();
+ if (next != null)
+ addCandidate(next);
+ return;
+ }
+ }
+
+ // TODO: sourcepath
+ ComputeNodeResult result = new ComputeNodeResult(this.featureFunctions, cand.getRule(),
+ cand.getTailNodes(), -1, cand.getSpan().end, null, this.sentence);
+ cand.setResult(result);
+
+ candidates.add(cand);
+ }
+
+ /**
+ * Cube pruning. Repeatedly pop the top candidate, creating a new hyperedge from it, adding it to
+ * the k-best list, and then extending the list of candidates with extensions of the current
+ * candidate.
+ */
+ public void search() {
+ int to_pop = config.pop_limit;
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("Stack::search(): pop: {} size: {}", to_pop, candidates.size());
+ for (Candidate c: candidates)
+ LOG.debug("{}", c);
+ }
+ while (to_pop > 0 && !candidates.isEmpty()) {
+ Candidate got = candidates.poll();
+ if (got != null) {
+ addHypothesis(got);
+ --to_pop;
+
+ for (Candidate c : got.extend())
+ if (c != null) {
+ addCandidate(c);
+ }
+ }
+ }
+ }
+
+ /**
+ * Adds a popped candidate to the chart / main stack. This is a candidate we have decided to
+ * keep around.
+ * @param complete a completely-initialized translation {@link org.apache.joshua.decoder.phrase.Candidate}
+ *
+ */
+ public void addHypothesis(Candidate complete) {
+ Hypothesis added = new Hypothesis(complete);
+
+ String taskName;
+ if (deduper.containsKey(added)) {
+ taskName = "recombining hypothesis";
+ Hypothesis existing = deduper.get(added);
+ existing.absorb(added);
+ } else {
+ taskName = "creating new hypothesis";
+ add(added);
+ deduper.put(added, added);
+ }
+
+ if (LOG.isDebugEnabled()) {
+ LOG.debug("{} from ( ... {} )", taskName, complete.getHypothesis().getRule().getEnglishWords());
+ LOG.debug(" base score {}", complete.getResult().getBaseCost());
+ LOG.debug(" covering {}-{}", complete.getSpan().start - 1, complete.getSpan().end - 2);
+ LOG.debug(" translated as: {}", complete.getRule().getEnglishWords());
+ LOG.debug(" score {} + future cost {} = {}",
+ complete.getResult().getTransitionCost(), complete.getFutureEstimate(),
+ complete.getResult().getTransitionCost() + complete.getFutureEstimate());
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java
new file mode 100644
index 0000000..8c092ec
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/Stacks.java
@@ -0,0 +1,271 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.phrase;
+
+/***
+ * Entry point for phrase-based decoding, analogous to {@link Chart} for the CKY algorithm. This
+ * class organizes all the stacks used for decoding, and is responsible for building them. Stack
+ * construction is stack-centric: that is, we loop over the number of source words in increasing sizes;
+ * at each step of this iteration, we break the search between smaller stack sizes and source-side
+ * phrase sizes.
+ *
+ * The end result of decoding is a {@link Hypergraph} with the same format as hierarchical decoding.
+ * Phrases are treating as left-branching rules, and the span information (i,j) is overloaded so
+ * that i means nothing and j represents the index of the last-translated source word in each
+ * hypothesis. This means that most hypergraph code can work without modification. The algorithm
+ * ensures that the coverage vector is consistent but the resulting hypergraph may not be projective,
+ * which is different from the CKY algorithm, which does produce projective derivations.
+ *
+ * TODO Lattice decoding is not yet supported (March 2015).
+ */
+
+import static org.apache.joshua.decoder.ff.tm.OwnerMap.UNKNOWN_OWNER;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.corpus.Span;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.ComputeNodeResult;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.hypergraph.HyperEdge;
+import org.apache.joshua.decoder.hypergraph.HyperGraph;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class Stacks {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Stacks.class);
+
+ // The list of stacks, grouped according to number of source words covered
+ private List<Stack> stacks;
+
+ // The end state
+ private Hypothesis end;
+
+ List<FeatureFunction> featureFunctions;
+
+ private Sentence sentence;
+
+ private JoshuaConfiguration config;
+
+ /* Contains all the phrase tables */
+ private PhraseChart chart;
+
+ /**
+ * Entry point. Initialize everything. Create pass-through (OOV) phrase table and glue phrase
+ * table (with start-of-sentence and end-of-sentence rules).
+ *
+ * @param sentence input to {@link org.apache.joshua.lattice.Lattice}
+ * @param featureFunctions {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ * @param grammars an array of {@link org.apache.joshua.decoder.ff.tm.Grammar}'s
+ * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ */
+ public Stacks(Sentence sentence, List<FeatureFunction> featureFunctions, Grammar[] grammars,
+ JoshuaConfiguration config) {
+
+ this.sentence = sentence;
+ this.featureFunctions = featureFunctions;
+ this.config = config;
+
+ int num_phrase_tables = 0;
+ for (int i = 0; i < grammars.length; i++)
+ if (grammars[i] instanceof PhraseTable)
+ ++num_phrase_tables;
+
+ PhraseTable[] phraseTables = new PhraseTable[num_phrase_tables + 2];
+ for (int i = 0, j = 0; i < grammars.length; i++)
+ if (grammars[i] instanceof PhraseTable)
+ phraseTables[j++] = (PhraseTable) grammars[i];
+
+ phraseTables[phraseTables.length - 2] = new PhraseTable(UNKNOWN_OWNER, config);
+ phraseTables[phraseTables.length - 2].addRule(Hypothesis.END_RULE);
+
+ phraseTables[phraseTables.length - 1] = new PhraseTable("oov", config);
+ AbstractGrammar.addOOVRules(phraseTables[phraseTables.length - 1], sentence.getLattice(), featureFunctions, config.true_oovs_only);
+
+ this.chart = new PhraseChart(phraseTables, featureFunctions, sentence, config.num_translation_options);
+ }
+
+
+ /**
+ * The main algorithm. Returns a hypergraph representing the search space.
+ *
+ * @return a {@link org.apache.joshua.decoder.hypergraph.HyperGraph} representing the search space
+ */
+ public HyperGraph search() {
+
+ long startTime = System.currentTimeMillis();
+
+ Future future = new Future(chart);
+ stacks = new ArrayList<Stack>();
+
+ // <s> counts as the first word. Pushing null lets us count from one.
+ stacks.add(null);
+
+ // Initialize root hypothesis with <s> context and future cost for everything.
+ ComputeNodeResult result = new ComputeNodeResult(this.featureFunctions, Hypothesis.BEGIN_RULE,
+ null, -1, 1, null, this.sentence);
+ Stack firstStack = new Stack(featureFunctions, sentence, config);
+ firstStack.add(new Hypothesis(result.getDPStates(), future.Full()));
+ stacks.add(firstStack);
+
+ // Decode with increasing numbers of source words.
+ for (int source_words = 2; source_words <= sentence.length(); ++source_words) {
+ Stack targetStack = new Stack(featureFunctions, sentence, config);
+ stacks.add(targetStack);
+
+ // Iterate over stacks to continue from.
+ for (int phrase_length = 1; phrase_length <= Math.min(source_words - 1, chart.MaxSourcePhraseLength());
+ phrase_length++) {
+ int from_stack = source_words - phrase_length;
+ Stack tailStack = stacks.get(from_stack);
+
+ LOG.debug("WORDS {} MAX {} (STACK {} phrase_length {})", source_words,
+ chart.MaxSourcePhraseLength(), from_stack, phrase_length);
+
+ // Iterate over antecedents in this stack.
+ for (Coverage coverage: tailStack.getCoverages()) {
+ ArrayList<Hypothesis> hypotheses = tailStack.get(coverage);
+
+ // the index of the starting point of the first possible phrase
+ int begin = coverage.firstZero();
+
+ // the absolute position of the ending spot of the last possible phrase
+ int last_end = Math.min(coverage.firstZero() + config.reordering_limit, chart.SentenceLength());
+ int last_begin = (last_end > phrase_length) ? (last_end - phrase_length) : 0;
+
+ for (begin = coverage.firstZero(); begin <= last_begin; begin++) {
+ if (!coverage.compatible(begin, begin + phrase_length) ||
+ ! permissible(coverage, begin, begin + phrase_length)) {
+ continue;
+ }
+
+ Span span = new Span(begin, begin + phrase_length);
+
+ // Don't append </s> until the end
+ if (begin == sentence.length() - 1 && source_words != sentence.length())
+ continue;
+
+ TargetPhrases phrases = chart.getRange(begin, begin + phrase_length);
+ if (phrases == null)
+ continue;
+
+
+ LOG.debug("Applying {} target phrases over [{}, {}]",
+ phrases.size(), begin, begin + phrase_length);
+
+ // TODO: could also compute some number of features here (e.g., non-LM ones)
+ // float score_delta = context.GetScorer().transition(ant, phrases, begin, begin + phrase_length);
+
+ // Future costs: remove span to be filled.
+ float future_delta = future.Change(coverage, begin, begin + phrase_length);
+
+ /* This associates with each span a set of hypotheses that can be extended by
+ * phrases from that span. The hypotheses are wrapped in HypoState objects, which
+ * augment the hypothesis score with a future cost.
+ */
+ Candidate cand = new Candidate(hypotheses, phrases, span, future_delta);
+ targetStack.addCandidate(cand);
+ }
+ }
+ }
+
+ /* At this point, every vertex contains a list of all existing hypotheses that the target
+ * phrases in that vertex could extend. Now we need to create the search object, which
+ * implements cube pruning. There are up to O(n^2) cubes, n the size of the current stack,
+ * one cube each over each span of the input. Each "cube" has two dimensions: one representing
+ * the target phrases over the span, and one representing all of these incoming hypotheses.
+ * We seed the chart with the best item in each cube, and then repeatedly pop and extend.
+ */
+
+// System.err.println(String.format("\nBuilding cube-pruning chart for %d words", source_words));
+
+ targetStack.search();
+ }
+
+ LOG.info("Input {}: Search took {} seconds", sentence.id(),
+ (System.currentTimeMillis() - startTime) / 1000.0f);
+
+ return createGoalNode();
+ }
+
+ /**
+ * Enforces reordering constraints. Our version of Moses' ReorderingConstraint::Check() and
+ * SearchCubePruning::CheckDistortion().
+ *
+ * @param coverage
+ * @param begin
+ * @param i
+ * @return
+ */
+ private boolean permissible(Coverage coverage, int begin, int end) {
+ int firstZero = coverage.firstZero();
+
+ if (config.reordering_limit < 0)
+ return true;
+
+ /* We can always start with the first zero since it doesn't create a reordering gap
+ */
+ if (begin == firstZero)
+ return true;
+
+ /* If a gap is created by applying this phrase, make sure that you can reach the first
+ * zero later on without violating the distortion constraint.
+ */
+ if (end - firstZero > config.reordering_limit) {
+ return false;
+ }
+
+ return true;
+ }
+
+
+ /**
+ * Searches through the goal stack, calling the final transition function on each node, and then returning
+ * the best item. Usually the final transition code doesn't add anything, because all features
+ * have already computed everything they need to. The standard exception is language models that
+ * have not yet computed their prefix probabilities (which is not the case with KenLM, the default).
+ *
+ * @return
+ */
+ private HyperGraph createGoalNode() {
+ Stack lastStack = stacks.get(sentence.length());
+
+ for (Hypothesis hyp: lastStack) {
+ float score = hyp.getScore();
+ List<HGNode> tailNodes = new ArrayList<HGNode>();
+ tailNodes.add(hyp);
+
+ float finalTransitionScore = ComputeNodeResult.computeFinalCost(featureFunctions, tailNodes, 0, sentence.length(), null, sentence);
+
+ if (null == this.end)
+ this.end = new Hypothesis(null, score + finalTransitionScore, hyp, sentence.length(), null);
+
+ HyperEdge edge = new HyperEdge(null, score + finalTransitionScore, finalTransitionScore, tailNodes, null);
+ end.addHyperedgeInNode(edge);
+ }
+
+ return new HyperGraph(end, -1, -1, this.sentence);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java
new file mode 100644
index 0000000..05a4b0a
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/phrase/TargetPhrases.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.phrase;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.tm.Rule;
+
+/**
+ * Represents a sorted collection of target-side phrases. Typically, these are phrases
+ * generated from the same source word sequence. The list of options is reduced to the number
+ * of translation options.
+ *
+ * @author Matt Post
+ */
+
+public class TargetPhrases extends ArrayList<Rule> {
+
+ private static final long serialVersionUID = 1L;
+
+ public TargetPhrases() {
+ super();
+ }
+
+ /**
+ * Initialize with a collection of rules.
+ *
+ * @param list a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.tm.Rule}'s
+ */
+ public TargetPhrases(List<Rule> list) {
+ super();
+
+ for (Rule rule: list) {
+ add(rule);
+ }
+ }
+
+ /**
+ * Score the rules and sort them. Scoring is necessary because rules are only scored if they
+ * are used, in an effort to make reading in rules more efficient. This is starting to create
+ * some trouble and should probably be reworked.
+ * @param features a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ * @param weights a populated {@link org.apache.joshua.decoder.ff.FeatureVector}
+ * @param num_options the number of options
+ */
+ public void finish(List<FeatureFunction> features, FeatureVector weights, int num_options) {
+ for (Rule rule: this) {
+ rule.estimateRuleCost(features);
+// System.err.println("TargetPhrases:finish(): " + rule);
+ }
+ Collections.sort(this, Rule.EstimatedCostComparator);
+
+ if (this.size() > num_options)
+ this.removeRange(num_options, this.size());
+
+// System.err.println("TargetPhrases::finish()");
+// for (Rule rule: this)
+// System.err.println(" " + rule);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java
new file mode 100644
index 0000000..5146e2c
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintRule.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import javax.swing.text.Segment;
+
+/**
+ * This interface is for an individual (partial) item to seed the chart with. All rules should be
+ * flat (no hierarchical nonterminals).
+ * <p>
+ * The {@link Segment}, {@link ConstraintSpan}, and {@link ConstraintRule} interfaces are for
+ * defining an interchange format between a SegmentFileParser and the Chart class. These interfaces
+ * <b>should not</b> be used internally by the Chart. The objects returned by a
+ * SegmentFileParser will not be optimal for use during decoding. The Chart should convert each of
+ * these objects into its own internal representation during construction. That is the contract
+ * described by these interfaces.
+ *
+ * @see org.apache.joshua.decoder.segment_file.ConstraintRule.Type
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
+ */
+public interface ConstraintRule {
+
+ /**
+ * <p>There are three types of ConstraintRule. The RULE type returns non-null values for all methods.
+ * The LHS type provides a (non-null) value for the lhs method, but returns null for everything
+ * else. And the RHS type provides a (non-null) value for nativeRhs and foreignRhs but returns
+ * null for the lhs and features.</p>
+ * <p>
+ * The interpretation of a RULE is that it adds a new rule to the grammar which only applies to
+ * the associated span. If the associated span is hard, then the set of rules for that span will
+ * override the regular grammar.</p>
+ * <p>
+ * The intepretation of a LHS is that it provides a hard constraint that the associated span be
+ * treated as the nonterminal for that span, thus filtering the regular grammar.</p>
+ * <p>
+ * The interpretation of a RHS is that it provides a hard constraint to filter the regular grammar
+ * such that only rules generating the desired translation can be used.</p>
+ */
+ public enum Type {
+ RULE, LHS, RHS
+ };
+
+ /**
+ * Return the type of this ConstraintRule.
+ * @return the {@link org.apache.joshua.decoder.segment_file.ConstraintRule.Type}
+ */
+ Type type();
+
+
+ /**
+ * Return the left hand side of the constraint rule. If this is null, then this object is
+ * specifying a translation for the span, but that translation may be derived from any
+ * nonterminal. The nonterminal here must be one used by the regular grammar.
+ * @return the left hand side of the constraint rule
+ */
+ String lhs();
+
+
+ /**
+ * Return the native right hand side of the constraint rule. If this is null, then the regular
+ * grammar will be used to fill in the derivation from the lhs.
+ * @return the native right hand side of the constraint rule
+ */
+ String nativeRhs();
+
+
+ /**
+ * Return the foreign right hand side of the constraint rule. This must be consistent with the
+ * sentence for the associated span, and is provided as a convenience method.
+ * @return the foreign right hand side of the constraint rule
+ */
+ String foreignRhs();
+
+
+ /**
+ * Return the grammar feature values for the RULE. The length of this array must be the same as
+ * for the regular grammar. We cannot enforce this requirement, but the
+ * {@link org.apache.joshua.decoder.chart_parser.Chart} must throw an error if there is a mismatch.
+ * @return an array of floating feature values for the RULE
+ */
+ float[] features();
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java
new file mode 100644
index 0000000..9863fa6
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ConstraintSpan.java
@@ -0,0 +1,80 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import java.util.List;
+
+import javax.swing.text.Segment;
+
+/**
+ * This interface represents a collection of constraints for a given span in the associated segment.
+ * Intuitively, each constraint corresponds to one or more items in the chart for parsing, except
+ * that we pre-seed the chart with these items before beginning the parsing algorithm. Some
+ * constraints can be "hard", in which case the regular grammar is not consulted for these spans. It
+ * is an error to have hard constraints for overlapping spans.
+ * <p>
+ * Indices for the span boundaries mark the transitions between words. Thus, the 0 index occurs
+ * before the first word, the 1 index occurs between the first and second words, 2 is between the
+ * second and third, etc. Consequently, it is an error for the end index to be equal to or less than
+ * the start index. It is also an error to have negative indices or to have indices larger than the
+ * count of words in the segment. Clients may assume that no <code>ConstraintSpan</code> objects are
+ * constructed which violate these laws.
+ * <p>
+ * The {@link Segment}, {@link ConstraintSpan}, and {@link ConstraintRule} interfaces are for
+ * defining an interchange format between a SegmentFileParser and the Chart class. These interfaces
+ * <b>should not</b> be used internally by the Chart. The objects returned by a
+ * SegmentFileParser will not be optimal for use during decoding. The Chart should convert each of
+ * these objects into its own internal representation during construction. That is the contract
+ * described by these interfaces.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ */
+public interface ConstraintSpan {
+
+ /**
+ * Return the starting index of the span covered by this constraint.
+ * @return the starting index of the span covered by this constraint
+ */
+ int start();
+
+ /**
+ * Return the ending index of the span covered by this constraint. Clients may assume
+ * <code>this.end() >= 1 + this.start()</code>.
+ * @return the ending index of the span covered by this constraint
+ */
+ int end();
+
+ /**
+ * Return whether this is a hard constraint which should override the grammar. This value only
+ * really matters for sets of <code>RULE</code> type constraints.
+ * @return true if a hard constraint exists which should override the grammar
+ */
+ boolean isHard();
+
+ /**
+ * Return a collection of the "rules" for this constraint span.
+ * <p>
+ * This return type is suboptimal for some SegmentFileParsers. It should be an
+ * {@link java.util.Iterator} instead in order to reduce the coupling between this class and
+ * Chart. See the note above about the fact that this interface should not be used internally by
+ * the Chart class because it will not be performant.
+ * @return a {@link java.util.List} of {@link org.apache.joshua.decoder.segment_file.ConstraintRule}'s
+ */
+ List<ConstraintRule> rules();
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java
new file mode 100644
index 0000000..b9b1896
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParseTreeInput.java
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+
+public class ParseTreeInput extends Sentence {
+
+ public ParseTreeInput(String input, int id, JoshuaConfiguration joshuaConfiguration) {
+ super(input, id,joshuaConfiguration);
+ }
+
+ // looks_like_parse_tree = sentence.sentence().matches("^\\(+[A-Z]+ .*");
+
+ // private SyntaxTree syntax_tree;
+
+ // ParseTreeInput() {
+ // SyntaxTree syntax_tree = new ArraySyntaxTree(sentence.sentence(), Vocabulary);
+ // }
+
+ // public int[] int_sentence() {
+ // return syntax_tree.getTerminals();
+ // }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java
new file mode 100644
index 0000000..a97718e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/ParsedSentence.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.corpus.syntax.ArraySyntaxTree;
+import org.apache.joshua.corpus.syntax.SyntaxTree;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+
+public class ParsedSentence extends Sentence {
+
+ private SyntaxTree syntaxTree = null;
+
+ public ParsedSentence(String input, int id,JoshuaConfiguration joshuaConfiguration) {
+ super(input, id, joshuaConfiguration);
+ }
+
+ public int[] getWordIDs() {
+ int[] terminals = syntaxTree().getTerminals();
+ int[] annotated = new int[terminals.length + 2];
+ System.arraycopy(terminals, 0, annotated, 1, terminals.length);
+ annotated[0] = Vocabulary.id(Vocabulary.START_SYM);
+ annotated[annotated.length - 1] = Vocabulary.id(Vocabulary.STOP_SYM);
+ return annotated;
+ }
+
+ public SyntaxTree syntaxTree() {
+ if (syntaxTree == null)
+ syntaxTree = new ArraySyntaxTree(this.source());
+ return syntaxTree;
+ }
+
+ public static boolean matches(String input) {
+ return input.matches("^\\(+[A-Z]+ .*");
+ }
+
+ public String fullSource() {
+ return Vocabulary.getWords(this.getWordIDs());
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
new file mode 100644
index 0000000..e323ef6
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Sentence.java
@@ -0,0 +1,450 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import static org.apache.joshua.util.FormatUtils.addSentenceMarkers;
+
+import java.util.ArrayList;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.StringTokenizer;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.lattice.Arc;
+import org.apache.joshua.lattice.Lattice;
+import org.apache.joshua.lattice.Node;
+import org.apache.joshua.util.ChartSpan;
+import org.apache.joshua.util.Regex;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class represents lattice input. The lattice is contained on a single line and is represented
+ * in PLF (Python Lattice Format), e.g.,
+ *
+ * <pre>
+ * ((('ein',0.1,1),('dieses',0.2,1),('haus',0.4,2),),(('haus',0.8,1),),)
+ * </pre>
+ *
+ * @author Matt Post post@cs.jhu.edu
+ */
+
+public class Sentence {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Sentence.class);
+
+ /* The sentence number. */
+ public int id = -1;
+
+ /*
+ * The source and target sides of the input sentence. Target sides are present when doing
+ * alignment or forced decoding.
+ */
+ protected String source = null;
+ protected String fullSource = null;
+
+ protected String target = null;
+ protected String fullTarget = null;
+ protected String[] references = null;
+
+ /* Lattice representation of the source sentence. */
+ protected Lattice<Token> sourceLattice = null;
+
+ /* List of constraints */
+ private final List<ConstraintSpan> constraints;
+
+ public JoshuaConfiguration config = null;
+
+ /**
+ * Constructor. Receives a string representing the input sentence. This string may be a
+ * string-encoded lattice or a plain text string for decoding.
+ *
+ * @param inputString representing the input sentence
+ * @param id ID to associate with the input string
+ * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ */
+ public Sentence(String inputString, int id, JoshuaConfiguration joshuaConfiguration) {
+
+ inputString = Regex.spaces.replaceAll(inputString, " ").trim();
+
+ config = joshuaConfiguration;
+
+ this.constraints = new LinkedList<ConstraintSpan>();
+
+ // Check if the sentence has SGML markings denoting the
+ // sentence ID; if so, override the id passed in to the
+ // constructor
+ Matcher start = SEG_START.matcher(inputString);
+ if (start.find()) {
+ source = SEG_END.matcher(start.replaceFirst("")).replaceFirst("");
+ String idstr = start.group(1);
+ this.id = Integer.parseInt(idstr);
+
+ } else {
+ if (inputString.indexOf(" ||| ") != -1) {
+ /* Target-side given; used for parsing and forced decoding */
+ String[] pieces = inputString.split("\\s?\\|{3}\\s?");
+ source = pieces[0];
+ target = pieces[1];
+ if (target.equals(""))
+ target = null;
+ if (pieces.length > 2) {
+ references = new String[pieces.length - 2];
+ System.arraycopy(pieces, 2, references, 0, pieces.length - 2);
+ }
+ this.id = id;
+
+ } else {
+ /* Regular ol' input sentence */
+ source = inputString;
+ this.id = id;
+ }
+ }
+
+ // Only trim strings
+ if (! (joshuaConfiguration.lattice_decoding && source.startsWith("(((")))
+ adjustForLength(joshuaConfiguration.maxlen);
+ }
+
+ /**
+ * Indicates whether the underlying lattice is a linear chain, i.e., a sentence.
+ *
+ * @return true if this is a linear chain, false otherwise
+ */
+ public boolean isLinearChain() {
+ return ! this.getLattice().hasMoreThanOnePath();
+ }
+
+ // Matches the opening and closing <seg> tags, e.g.,
+ // <seg id="72">this is a test input sentence</seg>.
+ protected static final Pattern SEG_START = Pattern
+ .compile("^\\s*<seg\\s+id=\"?(\\d+)\"?[^>]*>\\s*");
+ protected static final Pattern SEG_END = Pattern.compile("\\s*</seg\\s*>\\s*$");
+
+ /**
+ * Returns the length of the sentence. For lattices, the length is the shortest path through the
+ * lattice. The length includes the <s> and </s> sentence markers.
+ *
+ * @return number of input tokens + 2 (for start and end of sentence markers)
+ */
+ public int length() {
+ return this.getLattice().getShortestDistance();
+ }
+
+ /**
+ * Returns the annotations for a specific word (specified by an index) in the
+ * sentence
+ *
+ * @param index The location of the word in the sentence
+ * @param key The annotation identity
+ * @return The annotations associated with this word
+ */
+ public String getAnnotation(int index, String key) {
+ return getTokens().get(index).getAnnotation(key);
+ }
+
+ /**
+ * This function computes the intersection of \sigma^+ (where \sigma is the terminal vocabulary)
+ * with all character-level segmentations of each OOV in the input sentence.
+ *
+ * The idea is to break apart noun compounds in languages like German (such as the word "golfloch"
+ * = "golf" (golf) + "loch" (hole)), allowing them to be translated.
+ *
+ * @param grammars a list of grammars to consult to find in- and out-of-vocabulary items
+ */
+ public void segmentOOVs(Grammar[] grammars) {
+ Lattice<Token> oldLattice = this.getLattice();
+
+ /* Build a list of terminals across all grammars */
+ HashSet<Integer> vocabulary = new HashSet<Integer>();
+ for (Grammar grammar : grammars) {
+ Iterator<Integer> iterator = grammar.getTrieRoot().getTerminalExtensionIterator();
+ while (iterator.hasNext())
+ vocabulary.add(iterator.next());
+ }
+
+ List<Node<Token>> oldNodes = oldLattice.getNodes();
+
+ /* Find all the subwords that appear in the vocabulary, and create the lattice */
+ for (int nodeid = oldNodes.size() - 3; nodeid >= 1; nodeid -= 1) {
+ if (oldNodes.get(nodeid).getOutgoingArcs().size() == 1) {
+ Arc<Token> arc = oldNodes.get(nodeid).getOutgoingArcs().get(0);
+ String word = Vocabulary.word(arc.getLabel().getWord());
+ if (!vocabulary.contains(arc.getLabel())) {
+ // System.err.println(String.format("REPL: '%s'", word));
+ List<Arc<Token>> savedArcs = oldNodes.get(nodeid).getOutgoingArcs();
+
+ char[] chars = word.toCharArray();
+ ChartSpan<Boolean> wordChart = new ChartSpan<Boolean>(chars.length + 1, false);
+ ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>(chars.length + 1);
+ nodes.add(oldNodes.get(nodeid));
+ for (int i = 1; i < chars.length; i++)
+ nodes.add(new Node<Token>(i));
+ nodes.add(oldNodes.get(nodeid + 1));
+ for (int width = 1; width <= chars.length; width++) {
+ for (int i = 0; i <= chars.length - width; i++) {
+ int j = i + width;
+ if (width != chars.length) {
+ Token token = new Token(word.substring(i, j), config);
+ if (vocabulary.contains(id)) {
+ nodes.get(i).addArc(nodes.get(j), 0.0f, token);
+ wordChart.set(i, j, true);
+ // System.err.println(String.format(" FOUND '%s' at (%d,%d)", word.substring(i, j),
+ // i, j));
+ }
+ }
+
+ for (int k = i + 1; k < j; k++) {
+ if (wordChart.get(i, k) && wordChart.get(k, j)) {
+ wordChart.set(i, j, true);
+ // System.err.println(String.format(" PATH FROM %d-%d-%d", i, k, j));
+ }
+ }
+ }
+ }
+
+ /* If there's a path from beginning to end */
+ if (wordChart.get(0, chars.length)) {
+ // Remove nodes not part of a complete path
+ HashSet<Node<Token>> deletedNodes = new HashSet<Node<Token>>();
+ for (int k = 1; k < nodes.size() - 1; k++)
+ if (!(wordChart.get(0, k) && wordChart.get(k, chars.length)))
+ nodes.set(k, null);
+
+ int delIndex = 1;
+ while (delIndex < nodes.size())
+ if (nodes.get(delIndex) == null) {
+ deletedNodes.add(nodes.get(delIndex));
+ nodes.remove(delIndex);
+ } else
+ delIndex++;
+
+ for (Node<Token> node : nodes) {
+ int arcno = 0;
+ while (arcno != node.getOutgoingArcs().size()) {
+ Arc<Token> delArc = node.getOutgoingArcs().get(arcno);
+ if (deletedNodes.contains(delArc.getHead()))
+ node.getOutgoingArcs().remove(arcno);
+ else {
+ arcno++;
+ // System.err.println(" ARC: " + Vocabulary.word(delArc.getLabel()));
+ }
+ }
+ }
+
+ // Insert into the main lattice
+ this.getLattice().insert(nodeid, nodeid + 1, nodes);
+ } else {
+ nodes.get(0).setOutgoingArcs(savedArcs);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * If the input sentence is too long (not counting the <s> and </s> tokens), it is truncated to
+ * the maximum length, specified with the "maxlen" parameter.
+ *
+ * Note that this code assumes the underlying representation is a sentence, and not a lattice. Its
+ * behavior is undefined for lattices.
+ *
+ * @param length int representing the length to truncate the sentence to
+ */
+ protected void adjustForLength(int length) {
+ int size = this.getLattice().size() - 2; // subtract off the start- and end-of-sentence tokens
+
+ if (size > length) {
+ LOG.warn("sentence {} too long {}, truncating to length {}", id(), size, length);
+
+ // Replace the input sentence (and target) -- use the raw string, not source()
+ String[] tokens = source.split("\\s+");
+ source = tokens[0];
+ for (int i = 1; i < length; i++)
+ source += " " + tokens[i];
+ sourceLattice = null;
+ if (target != null) {
+ target = "";
+ }
+ }
+ }
+
+ public boolean isEmpty() {
+ return source.matches("^\\s*$");
+ }
+
+ public int id() {
+ return id;
+ }
+
+ /**
+ * Returns the raw source-side input string.
+ * @return the raw source-side input string
+ */
+ public String rawSource() {
+ return source;
+ }
+
+ /**
+ * Returns the source-side string with annotations --- if any --- stripped off.
+ *
+ * @return the source-side string with annotations --- if any --- stripped off
+ */
+ public String source() {
+ StringBuilder str = new StringBuilder();
+ int[] ids = getWordIDs();
+ for (int i = 1; i < ids.length - 1; i++) {
+ str.append(Vocabulary.word(ids[i])).append(" ");
+ }
+ return str.toString().trim();
+ }
+
+ /**
+ * Returns a sentence with the start and stop symbols added to the
+ * beginning and the end of the sentence respectively
+ *
+ * @return String The input sentence with start and stop symbols
+ */
+ public String fullSource() {
+ if (fullSource == null) {
+ fullSource = addSentenceMarkers(source());
+ }
+ return fullSource;
+ }
+
+ /**
+ * If a target side was supplied with the sentence, this will be non-null. This is used when doing
+ * synchronous parsing or constrained decoding. The input format is:
+ *
+ * Bill quiere ir a casa ||| Bill wants to go home
+ *
+ * If the parameter parse=true is set, parsing will be triggered, otherwise constrained decoding.
+ *
+ * @return target side of sentence translation
+ */
+ public String target() {
+ return target;
+ }
+
+ public String fullTarget() {
+ if (fullTarget == null) {
+ fullTarget = addSentenceMarkers(target());
+ }
+ return fullTarget;
+ }
+
+ public String source(int i, int j) {
+ StringTokenizer st = new StringTokenizer(fullSource());
+ int index = 0;
+ StringBuilder substring = new StringBuilder();
+ while (st.hasMoreTokens()) {
+ String token = st.nextToken();
+ if (index >= j)
+ break;
+ if (index >= i)
+ substring.append(token).append(" ");
+ index++;
+ }
+ return substring.toString().trim();
+ }
+
+ public String[] references() {
+ return references;
+ }
+
+ /**
+ * Returns the sequence of tokens comprising the sentence. This assumes you've done the checking
+ * to makes sure the input string (the source side) isn't a PLF waiting to be parsed.
+ *
+ * @return a {@link java.util.List} of {@link org.apache.joshua.decoder.segment_file.Token}'s comprising the sentence
+ */
+ public List<Token> getTokens() {
+ assert isLinearChain();
+ List<Token> tokens = new ArrayList<Token>();
+ for (Node<Token> node: getLattice().getNodes())
+ if (node != null && node.getOutgoingArcs().size() > 0)
+ tokens.add(node.getOutgoingArcs().get(0).getLabel());
+ return tokens;
+ }
+
+ /**
+ * Returns the sequence of word IDs comprising the input sentence. Assumes this is not a general
+ * lattice, but a linear chain.
+ * @return an int[] comprising all word ID's
+ */
+ public int[] getWordIDs() {
+ List<Token> tokens = getTokens();
+ int[] ids = new int[tokens.size()];
+ for (int i = 0; i < tokens.size(); i++)
+ ids[i] = tokens.get(i).getWord();
+ return ids;
+ }
+
+ /**
+ * Returns the sequence of word ids comprising the sentence. Assumes this is a sentence and
+ * not a lattice.
+ *
+ * @return the sequence of word ids comprising the sentence
+ */
+ public Lattice<String> stringLattice() {
+ assert isLinearChain();
+ return Lattice.createStringLatticeFromString(source(), config);
+ }
+
+ public List<ConstraintSpan> constraints() {
+ return constraints;
+ }
+
+ public Lattice<Token> getLattice() {
+ if (this.sourceLattice == null) {
+ if (config.lattice_decoding && rawSource().startsWith("(((")) {
+ if (config.search_algorithm.equals("stack")) {
+ throw new RuntimeException("* FATAL: lattice decoding currently not supported for stack-based search algorithm.");
+ }
+ this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource(), config);
+ } else
+ this.sourceLattice = Lattice.createTokenLatticeFromString(String.format("%s %s %s", Vocabulary.START_SYM,
+ rawSource(), Vocabulary.STOP_SYM), config);
+ }
+ return this.sourceLattice;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder sb = new StringBuilder(source());
+ if (target() != null) {
+ sb.append(" ||| " + target());
+ }
+ return sb.toString();
+ }
+
+ public boolean hasPath(int begin, int end) {
+ return getLattice().distance(begin, end) != -1;
+ }
+
+ public Node<Token> getNode(int i) {
+ return getLattice().getNode(i);
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Token.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
new file mode 100644
index 0000000..b84826d
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/Token.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.segment_file;
+
+import static org.apache.joshua.util.FormatUtils.escapeSpecialSymbols;
+
+import java.util.HashMap;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.util.FormatUtils;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Stores the identity of a word and its annotations in a sentence.
+
+ * @author "Gaurav Kumar"
+ * @author Matt Post
+ */
+public class Token {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Token.class);
+
+ // The token without the annotations
+ private String token;
+ private int tokenID;
+
+ private HashMap<String,String> annotations = null;
+ private JoshuaConfiguration joshuaConfiguration;
+
+ /**
+ * <p>Constructor : Creates a Token object from a raw word
+ * Extracts and assigns an annotation when available.
+ * Any word can be marked with annotations, which are arbitrary semicolon-delimited
+ * key[=value] pairs (the value is optional) listed in brackets after a word, e.g.,</p>
+ * <pre>
+ * Je[ref=Samuel;PRO] voudrais[FUT;COND]
+ * </pre>
+ *
+ * <p>This will create a dictionary annotation on the word of the following form for "Je"</p>
+ *
+ * <pre>
+ * ref -> Samuel
+ * PRO -> PRO
+ * </pre>
+ *
+ * <p>and the following for "voudrais":</p>
+ *
+ * <pre>
+ * FUT -> FUT
+ * COND -> COND
+ * </pre>
+ *
+ * @param rawWord A word with annotation information (possibly)
+ * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ *
+ */
+ public Token(String rawWord, JoshuaConfiguration config) {
+
+ this.joshuaConfiguration = config;
+
+ annotations = new HashMap<String,String>();
+
+ // Matches a word with an annotation
+ // Check guidelines in constructor description
+ Pattern pattern = Pattern.compile("(\\S+)\\[(\\S+)\\]");
+ Matcher tag = pattern.matcher(rawWord);
+ if (tag.find()) {
+ // Annotation match found
+ token = tag.group(1);
+ String tagStr = tag.group(2);
+
+ for (String annotation: tagStr.split(";")) {
+ int where = annotation.indexOf("=");
+ if (where != -1) {
+ annotations.put(annotation.substring(0, where), annotation.substring(where + 1));
+ } else {
+ annotations.put(annotation, annotation);
+ }
+ }
+ } else {
+ // No match found, which implies that this token does not have any annotations
+ token = rawWord;
+ }
+
+ // Mask strings that cause problems for the decoder. This has to be done *after* parsing for
+ // annotations.
+ token = escapeSpecialSymbols(token);
+
+ if (joshuaConfiguration != null && joshuaConfiguration.lowercase) {
+ if (FormatUtils.ISALLUPPERCASE(token))
+ annotations.put("lettercase", "all-upper");
+ else if (Character.isUpperCase(token.charAt(0)))
+ annotations.put("lettercase", "upper");
+ else
+ annotations.put("lettercase", "lower");
+
+ LOG.info("TOKEN: {} -> {} ({})", token, token.toLowerCase(), annotations.get("lettercase"));
+ token = token.toLowerCase();
+ }
+
+ tokenID = Vocabulary.id(token);
+ }
+
+ /**
+ * Returns the word ID (vocab ID) for this token
+ *
+ * @return int A word ID
+ */
+ public int getWord() {
+ return tokenID;
+ }
+
+ /**
+ * Returns the string associated with this token
+ * @return String A word
+ */
+ public String getWordIdentity() {
+ return token;
+ }
+
+ public String toString() {
+ return token;
+ }
+
+ /**
+ * Returns the annotationID (vocab ID)
+ * associated with this token
+ * @param key A type ID
+ * @return the annotationID (vocab ID)
+ */
+ public String getAnnotation(String key) {
+ if (annotations.containsKey(key)) {
+ return annotations.get(key);
+ }
+
+ return null;
+ }
+}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java
new file mode 100644
index 0000000..a615030
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/segment_file/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * Provides common interfaces for parsing segment files
+ * (aka test corpora to be translated). In order to support
+ * constraint annotations, we provide a general API for
+ * use by JoshuaDecoder and Chart.
+ */
+package org.apache.joshua.decoder.segment_file;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/lattice/Arc.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/lattice/Arc.java b/joshua-core/src/main/java/org/apache/joshua/lattice/Arc.java
new file mode 100644
index 0000000..5d056ab
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/lattice/Arc.java
@@ -0,0 +1,117 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.lattice;
+
+/**
+ * An arc in a directed graph.
+ *
+ * @author Lane Schwartz
+ * @since 2008-07-08
+ *
+ * @param Label Type of label associated with an arc.
+ */
+public class Arc<Label> {
+
+ /**
+ * Weight of this arc.
+ */
+ private float cost;
+
+ /**
+ * Node where this arc ends.
+ */
+ private Node<Label> head;
+
+ /**
+ * Node where this arc begins.
+ */
+ private Node<Label> tail;
+
+ /**
+ * Label associated with this arc.
+ */
+ private Label label;
+
+ /**
+ * Creates an arc with the specified head, tail, cost, and label.
+ *
+ * @param head The node where this arc begins.
+ * @param tail The node where this arc ends.
+ * @param cost The cost of this arc.
+ * @param label The label associated with this arc.
+ */
+ public Arc(Node<Label> tail, Node<Label> head, float cost, Label label) {
+ this.tail = tail;
+ this.head = head;
+ this.cost = cost;
+ this.label = label;
+ }
+
+ /**
+ * Gets the cost of this arc.
+ *
+ * @return The cost of this arc.
+ */
+ public float getCost() {
+ return cost;
+ }
+
+ /**
+ * Gets the tail of this arc (the node where this arc begins).
+ *
+ * @return The tail of this arc.
+ */
+ public Node<Label> getTail() {
+ return tail;
+ }
+
+ /**
+ * Gets the head of this arc (the node where this arc ends).
+ *
+ * @return The head of this arc.
+ */
+ public Node<Label> getHead() {
+ return head;
+ }
+
+ /**
+ * Gets the label associated with this arc.
+ *
+ * @return The label associated with this arc.
+ */
+ public Label getLabel() {
+ return label;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder s = new StringBuilder();
+
+ s.append(label.toString());
+ s.append(" : ");
+ s.append(tail.toString());
+ s.append(" ==> ");
+ s.append(head.toString());
+ s.append(" : ");
+ s.append(cost);
+
+ return s.toString();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/lattice/Lattice.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/lattice/Lattice.java b/joshua-core/src/main/java/org/apache/joshua/lattice/Lattice.java
new file mode 100644
index 0000000..2332159
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/lattice/Lattice.java
@@ -0,0 +1,587 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.lattice;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Stack;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.segment_file.Token;
+import org.apache.joshua.util.ChartSpan;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * A lattice representation of a directed graph.
+ *
+ * @author Lane Schwartz
+ * @author Matt Post post@cs.jhu.edu
+ * @since 2008-07-08
+ *
+ */
+public class Lattice<Value> implements Iterable<Node<Value>> {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Lattice.class);
+
+ /**
+ * True if there is more than one path through the lattice.
+ */
+ private boolean latticeHasAmbiguity;
+
+ /**
+ * Costs of the best path between each pair of nodes in the lattice.
+ */
+ private ChartSpan<Integer> distances = null;
+
+ /**
+ * List of all nodes in the lattice. Nodes are assumed to be in topological order.
+ */
+ private List<Node<Value>> nodes;
+
+
+ JoshuaConfiguration config = null;
+
+ /**
+ * Constructs a new lattice from an existing list of (connected) nodes.
+ * <p>
+ * The list of nodes must already be in topological order. If the list is not in topological
+ * order, the behavior of the lattice is not defined.
+ *
+ * @param nodes A list of nodes which must be in topological order.
+ * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ */
+ public Lattice(List<Node<Value>> nodes, JoshuaConfiguration config) {
+ this.nodes = nodes;
+ // this.distances = calculateAllPairsShortestPath();
+ this.latticeHasAmbiguity = true;
+ }
+
+ public Lattice(List<Node<Value>> nodes, boolean isAmbiguous, JoshuaConfiguration config) {
+ // Node<Value> sink = new Node<Value>(nodes.size());
+ // nodes.add(sink);
+ this.nodes = nodes;
+ // this.distances = calculateAllPairsShortestPath();
+ this.latticeHasAmbiguity = isAmbiguous;
+ }
+
+ /**
+ * Instantiates a lattice from a linear chain of values, i.e., a sentence.
+ *
+ * @param linearChain a sequence of Value objects
+ * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ */
+ public Lattice(Value[] linearChain, JoshuaConfiguration config) {
+ this.latticeHasAmbiguity = false;
+ this.nodes = new ArrayList<Node<Value>>();
+
+ Node<Value> previous = new Node<Value>(0);
+ nodes.add(previous);
+
+ int i = 1;
+
+ for (Value value : linearChain) {
+
+ Node<Value> current = new Node<Value>(i);
+ float cost = 0.0f;
+ // if (i > 4) cost = (float)i/1.53432f;
+ previous.addArc(current, cost, value);
+
+ nodes.add(current);
+
+ previous = current;
+ i++;
+ }
+
+ // this.distances = calculateAllPairsShortestPath();
+ }
+
+ public final boolean hasMoreThanOnePath() {
+ return latticeHasAmbiguity;
+ }
+
+ /**
+ * Computes the shortest distance between two nodes, which is used (perhaps among other places) in
+ * computing which rules can apply over which spans of the input
+ *
+ * @param arc an {@link org.apache.joshua.lattice.Arc} of values
+ * @return the shortest distance between two nodes
+ */
+ public int distance(Arc<Value> arc) {
+ return this.getShortestPath(arc.getTail().getNumber(), arc.getHead().getNumber());
+ }
+
+ public int distance(int i, int j) {
+ return this.getShortestPath(i, j);
+ }
+
+ /**
+ * Convenience method to get a lattice from a linear sequence of {@link Token} objects.
+ *
+ * @param source input string from which to create a {@link org.apache.joshua.lattice.Lattice}
+ * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ * @return Lattice representation of the linear chain.
+ */
+ public static Lattice<Token> createTokenLatticeFromString(String source, JoshuaConfiguration config) {
+ String[] tokens = source.split("\\s+");
+ Token[] integerSentence = new Token[tokens.length];
+ for (int i = 0; i < tokens.length; i++) {
+ integerSentence[i] = new Token(tokens[i], config);
+ }
+
+ return new Lattice<Token>(integerSentence, config);
+ }
+
+ public static Lattice<Token> createTokenLatticeFromPLF(String data, JoshuaConfiguration config) {
+ ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>();
+
+ // This matches a sequence of tuples, which describe arcs leaving this node
+ Pattern nodePattern = Pattern.compile("(.+?)\\(\\s*(\\(.+?\\),\\s*)\\s*\\)(.*)");
+
+ /*
+ * This matches a comma-delimited, parenthesized tuple of a (a) single-quoted word (b) a number,
+ * optionally in scientific notation (c) an offset (how many states to jump ahead)
+ */
+ Pattern arcPattern = Pattern
+ .compile("\\s*\\('(.+?)',\\s*(-?\\d+\\.?\\d*?(?:[eE]-?\\d+)?),\\s*(\\d+)\\),\\s*(.*)");
+
+ Matcher nodeMatcher = nodePattern.matcher(data);
+
+ boolean latticeIsAmbiguous = false;
+
+ int nodeID = 0;
+ Node<Token> startNode = new Node<Token>(nodeID);
+ nodes.add(startNode);
+
+ while (nodeMatcher.matches()) {
+
+ String nodeData = nodeMatcher.group(2);
+ String remainingData = nodeMatcher.group(3);
+
+ nodeID++;
+
+ Node<Token> currentNode = null;
+ if (nodeID < nodes.size() && nodes.get(nodeID) != null) {
+ currentNode = nodes.get(nodeID);
+ } else {
+ currentNode = new Node<Token>(nodeID);
+ while (nodeID > nodes.size())
+ nodes.add(new Node<Token>(nodes.size()));
+ nodes.add(currentNode);
+ }
+
+ Matcher arcMatcher = arcPattern.matcher(nodeData);
+ int numArcs = 0;
+ if (!arcMatcher.matches()) {
+ throw new RuntimeException("Parse error!");
+ }
+ while (arcMatcher.matches()) {
+ numArcs++;
+ String arcLabel = arcMatcher.group(1);
+ float arcWeight = Float.parseFloat(arcMatcher.group(2));
+ int destinationNodeID = nodeID + Integer.parseInt(arcMatcher.group(3));
+
+ Node<Token> destinationNode;
+ if (destinationNodeID < nodes.size() && nodes.get(destinationNodeID) != null) {
+ destinationNode = nodes.get(destinationNodeID);
+ } else {
+ destinationNode = new Node<Token>(destinationNodeID);
+ while (destinationNodeID > nodes.size())
+ nodes.add(new Node<Token>(nodes.size()));
+ nodes.add(destinationNode);
+ }
+
+ String remainingArcs = arcMatcher.group(4);
+
+ Token arcToken = new Token(arcLabel, config);
+ currentNode.addArc(destinationNode, arcWeight, arcToken);
+
+ arcMatcher = arcPattern.matcher(remainingArcs);
+ }
+ if (numArcs > 1)
+ latticeIsAmbiguous = true;
+
+ nodeMatcher = nodePattern.matcher(remainingData);
+ }
+
+ /* Add <s> to the start of the lattice. */
+ if (nodes.size() > 1 && nodes.get(1) != null) {
+ Node<Token> firstNode = nodes.get(1);
+ startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM, config));
+ }
+
+ /* Add </s> as a final state, connect it to the previous end-state */
+ nodeID = nodes.get(nodes.size()-1).getNumber() + 1;
+ Node<Token> endNode = new Node<Token>(nodeID);
+ nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM, config));
+ nodes.add(endNode);
+
+ return new Lattice<Token>(nodes, latticeIsAmbiguous, config);
+ }
+
+ /**
+ * Constructs a lattice from a given string representation.
+ *
+ * @param data String representation of a lattice.
+ * @param config a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ * @return A lattice that corresponds to the given string.
+ */
+ public static Lattice<String> createStringLatticeFromString(String data, JoshuaConfiguration config) {
+
+ Map<Integer, Node<String>> nodes = new HashMap<Integer, Node<String>>();
+
+ Pattern nodePattern = Pattern.compile("(.+?)\\((\\(.+?\\),)\\)(.*)");
+ Pattern arcPattern = Pattern.compile("\\('(.+?)',(\\d+.\\d+),(\\d+)\\),(.*)");
+
+ Matcher nodeMatcher = nodePattern.matcher(data);
+
+ int nodeID = -1;
+
+ while (nodeMatcher.matches()) {
+
+ String nodeData = nodeMatcher.group(2);
+ String remainingData = nodeMatcher.group(3);
+
+ nodeID++;
+
+ Node<String> currentNode;
+ if (nodes.containsKey(nodeID)) {
+ currentNode = nodes.get(nodeID);
+ } else {
+ currentNode = new Node<String>(nodeID);
+ nodes.put(nodeID, currentNode);
+ }
+
+ LOG.debug("Node : {}", nodeID);
+
+ Matcher arcMatcher = arcPattern.matcher(nodeData);
+
+ while (arcMatcher.matches()) {
+ String arcLabel = arcMatcher.group(1);
+ float arcWeight = Float.valueOf(arcMatcher.group(2));
+ int destinationNodeID = nodeID + Integer.parseInt(arcMatcher.group(3));
+
+ Node<String> destinationNode;
+ if (nodes.containsKey(destinationNodeID)) {
+ destinationNode = nodes.get(destinationNodeID);
+ } else {
+ destinationNode = new Node<String>(destinationNodeID);
+ nodes.put(destinationNodeID, destinationNode);
+ }
+
+ String remainingArcs = arcMatcher.group(4);
+
+ LOG.debug("\t{} {} {}", arcLabel, arcWeight, destinationNodeID);
+
+ currentNode.addArc(destinationNode, arcWeight, arcLabel);
+
+ arcMatcher = arcPattern.matcher(remainingArcs);
+ }
+
+ nodeMatcher = nodePattern.matcher(remainingData);
+ }
+
+ List<Node<String>> nodeList = new ArrayList<Node<String>>(nodes.values());
+ Collections.sort(nodeList, new NodeIdentifierComparator());
+
+ LOG.debug("Nodelist={}", nodeList);
+
+ return new Lattice<String>(nodeList, config);
+ }
+
+ /**
+ * Gets the cost of the shortest path between two nodes.
+ *
+ * @param from ID of the starting node.
+ * @param to ID of the ending node.
+ * @return The cost of the shortest path between the two nodes.
+ */
+ public int getShortestPath(int from, int to) {
+ // System.err.println(String.format("DISTANCE(%d,%d) = %f", from, to, costs[from][to]));
+ if (distances == null)
+ this.distances = calculateAllPairsShortestPath();
+
+ return distances.get(from, to);
+ }
+
+ /**
+ * Gets the shortest distance through the lattice.
+ * @return int representing the shortest distance through the lattice
+ */
+ public int getShortestDistance() {
+ if (distances == null)
+ distances = calculateAllPairsShortestPath();
+ return distances.get(0, nodes.size()-1);
+ }
+
+ /**
+ * Gets the node with a specified integer identifier. If the identifier is negative, we count
+ * backwards from the end of the array, Perl-style (-1 is the last element, -2 the penultimate,
+ * etc).
+ *
+ * @param index Integer identifier for a node.
+ * @return The node with the specified integer identifier
+ */
+ public Node<Value> getNode(int index) {
+ if (index >= 0)
+ return nodes.get(index);
+ else
+ return nodes.get(size() + index);
+ }
+
+ public List<Node<Value>> getNodes() {
+ return nodes;
+ }
+
+ /**
+ * Returns an iterator over the nodes in this lattice.
+ *
+ * @return An iterator over the nodes in this lattice.
+ */
+ public Iterator<Node<Value>> iterator() {
+ return nodes.iterator();
+ }
+
+ /**
+ * Returns the number of nodes in this lattice.
+ *
+ * @return The number of nodes in this lattice.
+ */
+ public int size() {
+ return nodes.size();
+ }
+
+ /**
+ * Calculate the all-pairs shortest path for all pairs of nodes.
+ * <p>
+ * Note: This method assumes no backward arcs. If there are backward arcs, the returned shortest
+ * path costs for that node may not be accurate.
+ *
+ * @param nodes A list of nodes which must be in topological order.
+ * @return The all-pairs shortest path for all pairs of nodes.
+ */
+ private ChartSpan<Integer> calculateAllPairsShortestPath() {
+
+ ChartSpan<Integer> distance = new ChartSpan<Integer>(nodes.size() - 1, Integer.MAX_VALUE);
+ distance.setDiagonal(0);
+
+ /* Mark reachability between immediate neighbors */
+ for (Node<Value> tail : nodes) {
+ for (Arc<Value> arc : tail.getOutgoingArcs()) {
+ Node<Value> head = arc.getHead();
+ distance.set(tail.id(), head.id(), 1);
+ }
+ }
+
+ int size = nodes.size();
+
+ for (int width = 2; width <= size; width++) {
+ for (int i = 0; i < size - width; i++) {
+ int j = i + width;
+ for (int k = i + 1; k < j; k++) {
+ distance.set(i, j, Math.min(distance.get(i, j), distance.get(i, k) + distance.get(k, j)));
+ }
+ }
+ }
+
+ return distance;
+ }
+
+ @Override
+ public String toString() {
+ StringBuilder s = new StringBuilder();
+
+ for (Node<Value> start : this) {
+ for (Arc<Value> arc : start.getOutgoingArcs()) {
+ s.append(arc.toString());
+ s.append('\n');
+ }
+ }
+
+ return s.toString();
+ }
+
+ public static void main(String[] args) {
+
+ List<Node<String>> nodes = new ArrayList<Node<String>>();
+ for (int i = 0; i < 4; i++) {
+ nodes.add(new Node<String>(i));
+ }
+
+ nodes.get(0).addArc(nodes.get(1), 1.0f, "x");
+ nodes.get(1).addArc(nodes.get(2), 1.0f, "y");
+ nodes.get(0).addArc(nodes.get(2), 1.5f, "a");
+ nodes.get(2).addArc(nodes.get(3), 3.0f, "b");
+ nodes.get(2).addArc(nodes.get(3), 5.0f, "c");
+
+ Lattice<String> graph = new Lattice<String>(nodes, null);
+
+ System.out.println("Shortest path from 0 to 3: " + graph.getShortestPath(0, 3));
+ }
+
+ /**
+ * Replaced the arc from node i to j with the supplied lattice. This is used to do OOV
+ * segmentation of words in a lattice.
+ *
+ * @param i start node of arc
+ * @param j end node of arc
+ * @param newNodes new nodes used within the replacement operation
+ */
+ public void insert(int i, int j, List<Node<Value>> newNodes) {
+
+ nodes.get(i).setOutgoingArcs(newNodes.get(0).getOutgoingArcs());
+
+ newNodes.remove(0);
+ nodes.remove(j);
+ Collections.reverse(newNodes);
+
+ for (Node<Value> node: newNodes)
+ nodes.add(j, node);
+
+ this.latticeHasAmbiguity = false;
+ for (int x = 0; x < nodes.size(); x++) {
+ nodes.get(x).setID(x);
+ this.latticeHasAmbiguity |= (nodes.get(x).getOutgoingArcs().size() > 1);
+ }
+
+ this.distances = null;
+ }
+
+ /**
+ * Topologically sorts the nodes and reassigns their numbers. Assumes that the first node is the
+ * source, but otherwise assumes nothing about the input.
+ *
+ * Probably correct, but untested.
+ */
+ @SuppressWarnings("unused")
+ private void topologicalSort() {
+ HashMap<Node<Value>, List<Arc<Value>>> outgraph = new HashMap<Node<Value>, List<Arc<Value>>>();
+ HashMap<Node<Value>, List<Arc<Value>>> ingraph = new HashMap<Node<Value>, List<Arc<Value>>>();
+ for (Node<Value> node: nodes) {
+ ArrayList<Arc<Value>> arcs = new ArrayList<Arc<Value>>();
+ for (Arc<Value> arc: node.getOutgoingArcs()) {
+ arcs.add(arc);
+
+ if (! ingraph.containsKey(arc.getHead()))
+ ingraph.put(arc.getHead(), new ArrayList<Arc<Value>>());
+ ingraph.get(arc.getHead()).add(arc);
+
+ outgraph.put(node, arcs);
+ }
+ }
+
+ ArrayList<Node<Value>> sortedNodes = new ArrayList<Node<Value>>();
+ Stack<Node<Value>> stack = new Stack<Node<Value>>();
+ stack.push(nodes.get(0));
+
+ while (! stack.empty()) {
+ Node<Value> node = stack.pop();
+ sortedNodes.add(node);
+ for (Arc<Value> arc: outgraph.get(node)) {
+ outgraph.get(node).remove(arc);
+ ingraph.get(arc.getHead()).remove(arc);
+
+ if (ingraph.get(arc.getHead()).size() == 0)
+ sortedNodes.add(arc.getHead());
+ }
+ }
+
+ int id = 0;
+ for (Node<Value> node : sortedNodes)
+ node.setID(id++);
+
+ this.nodes = sortedNodes;
+ }
+
+ /**
+ * Constructs a lattice from a given string representation.
+ *
+ * @param data String representation of a lattice.
+ * @return A lattice that corresponds to the given string.
+ */
+ public static Lattice<String> createFromString(String data) {
+
+ Map<Integer,Node<String>> nodes = new HashMap<Integer,Node<String>>();
+
+ Pattern nodePattern = Pattern.compile("(.+?)\\((\\(.+?\\),)\\)(.*)");
+ Pattern arcPattern = Pattern.compile("\\('(.+?)',(\\d+.\\d+),(\\d+)\\),(.*)");
+
+ Matcher nodeMatcher = nodePattern.matcher(data);
+
+ int nodeID = -1;
+
+ while (nodeMatcher.matches()) {
+
+ String nodeData = nodeMatcher.group(2);
+ String remainingData = nodeMatcher.group(3);
+
+ nodeID++;
+
+ Node<String> currentNode;
+ if (nodes.containsKey(nodeID)) {
+ currentNode = nodes.get(nodeID);
+ } else {
+ currentNode = new Node<String>(nodeID);
+ nodes.put(nodeID, currentNode);
+ }
+
+ LOG.debug("Node : {}", nodeID);
+
+ Matcher arcMatcher = arcPattern.matcher(nodeData);
+
+ while (arcMatcher.matches()) {
+ String arcLabel = arcMatcher.group(1);
+ double arcWeight = Double.valueOf(arcMatcher.group(2));
+ int destinationNodeID = nodeID + Integer.valueOf(arcMatcher.group(3));
+
+ Node<String> destinationNode;
+ if (nodes.containsKey(destinationNodeID)) {
+ destinationNode = nodes.get(destinationNodeID);
+ } else {
+ destinationNode = new Node<String>(destinationNodeID);
+ nodes.put(destinationNodeID, destinationNode);
+ }
+
+ String remainingArcs = arcMatcher.group(4);
+
+ LOG.debug("\t {} {} {}", arcLabel, arcWeight, destinationNodeID);
+
+ currentNode.addArc(destinationNode, (float) arcWeight, arcLabel);
+
+ arcMatcher = arcPattern.matcher(remainingArcs);
+ }
+
+ nodeMatcher = nodePattern.matcher(remainingData);
+ }
+
+ List<Node<String>> nodeList = new ArrayList<Node<String>>(nodes.values());
+ Collections.sort(nodeList, new NodeIdentifierComparator());
+
+ LOG.debug("Nodelist={}", nodeList);
+
+ return new Lattice<String>(nodeList, new JoshuaConfiguration());
+ }
+}