Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:26:54 UTC
[38/66] [partial] incubator-joshua git commit: JOSHUA-252 Make it
possible to use Maven to build Joshua
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/phrase/PhraseTable.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/phrase/PhraseTable.java b/src/joshua/decoder/phrase/PhraseTable.java
deleted file mode 100644
index bcf7135..0000000
--- a/src/joshua/decoder/phrase/PhraseTable.java
+++ /dev/null
@@ -1,201 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.phrase;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.List;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.ff.tm.Grammar;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.ff.tm.RuleCollection;
-import joshua.decoder.ff.tm.Trie;
-import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
-import joshua.decoder.ff.tm.packed.PackedGrammar;
-
-/**
- * Represents a phrase table, and is implemented as a wrapper around either a {@link PackedGrammar}
- * or a {@link MemoryBasedBatchGrammar}.
- *
- * TODO: this should all be implemented as a two-level trie (source trie and target trie).
- */
-public class PhraseTable implements Grammar {
-
- private JoshuaConfiguration config;
- private Grammar backend;
-
- /**
- * Construct the phrase table with a number of defaults. For example, we only use a single
- * nonterminal, and there is no span limit.
- *
- * @param grammarFile
- * @param owner
- * @param config
- * @throws IOException
- */
- public PhraseTable(String grammarFile, String owner, String type, JoshuaConfiguration config, int maxSource)
- throws IOException {
- this.config = config;
- int spanLimit = 0;
-
- if (grammarFile != null && new File(grammarFile).isDirectory()) {
- this.backend = new PackedGrammar(grammarFile, spanLimit, owner, type, config);
- if (this.backend.getMaxSourcePhraseLength() == -1) {
- System.err.println("FATAL: Using a packed grammar for a phrase table backend requires that you");
- System.err.println(" packed the grammar with Joshua 6.0.2 or greater");
- System.exit(-1);
- }
-
- } else {
- this.backend = new MemoryBasedBatchGrammar(type, grammarFile, owner, "[X]", spanLimit, config);
- }
- }
-
- public PhraseTable(String owner, JoshuaConfiguration config) {
- this.config = config;
-
- this.backend = new MemoryBasedBatchGrammar(owner, config);
- }
-
- /**
- * Returns the longest source phrase read. For {@link MemoryBasedBatchGrammar}s, we subtract 1
- * since the grammar includes the nonterminal. For {@link PackedGrammar}s, the value was either
- * in the packed config file (Joshua 6.0.2+) or was passed in via the TM config line.
- *
- * @return
- */
- @Override
- public int getMaxSourcePhraseLength() {
- if (backend instanceof MemoryBasedBatchGrammar)
- return this.backend.getMaxSourcePhraseLength() - 1;
- else
- return this.backend.getMaxSourcePhraseLength();
- }
-
- /**
- * Collect the set of target-side phrases associated with a source phrase.
- *
- * @param sourceWords the sequence of source words
- * @return the rules
- */
- public RuleCollection getPhrases(int[] sourceWords) {
- if (sourceWords.length != 0) {
- Trie pointer = getTrieRoot();
- if (! (backend instanceof PackedGrammar))
- pointer = pointer.match(Vocabulary.id("[X]"));
- int i = 0;
- while (pointer != null && i < sourceWords.length)
- pointer = pointer.match(sourceWords[i++]);
-
- if (pointer != null && pointer.hasRules()) {
- return pointer.getRuleCollection();
- }
- }
-
- return null;
- }
-
- /**
- * Adds a rule to the grammar. Only supported when the backend is a MemoryBasedBatchGrammar.
- *
- * @param rule the rule to add
- */
- public void addRule(Rule rule) {
- ((MemoryBasedBatchGrammar)backend).addRule(rule);
- }
-
- @Override
- public void addOOVRules(int sourceWord, List<FeatureFunction> featureFunctions) {
- // TODO: _OOV shouldn't be outright added, since the word might not be OOV for the LM (but now almost
- // certainly is)
- int targetWord = config.mark_oovs
- ? Vocabulary.id(Vocabulary.word(sourceWord) + "_OOV")
- : sourceWord;
-
- int nt_i = Vocabulary.id("[X]");
- Rule oovRule = new Rule(nt_i, new int[] { nt_i, sourceWord },
- new int[] { -1, targetWord }, "", 1, null);
- addRule(oovRule);
- oovRule.estimateRuleCost(featureFunctions);
-
-// String ruleString = String.format("[X] ||| [X,1] %s ||| [X,1] %s",
-// Vocabulary.word(sourceWord), Vocabulary.word(targetWord));
-// BilingualRule oovRule = new HieroFormatReader().parseLine(ruleString);
-// oovRule.setOwner(Vocabulary.id("oov"));
-// addRule(oovRule);
-// oovRule.estimateRuleCost(featureFunctions);
- }
-
- @Override
- public Trie getTrieRoot() {
- return backend.getTrieRoot();
- }
-
- @Override
- public void sortGrammar(List<FeatureFunction> models) {
- backend.sortGrammar(models);
- }
-
- @Override
- public boolean isSorted() {
- return backend.isSorted();
- }
-
- /**
- * This should never be called.
- */
- @Override
- public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
- return true;
- }
-
- @Override
- public int getNumRules() {
- return backend.getNumRules();
- }
-
- @Override
- public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores,
- int arity) {
- return backend.constructManualRule(lhs, sourceWords, targetWords, scores, arity);
- }
-
- @Override
- public void writeGrammarOnDisk(String file) {
- backend.writeGrammarOnDisk(file);
- }
-
- @Override
- public boolean isRegexpGrammar() {
- return backend.isRegexpGrammar();
- }
-
- @Override
- public int getOwner() {
- return backend.getOwner();
- }
-
- @Override
- public int getNumDenseFeatures() {
- return backend.getNumDenseFeatures();
- }
-}
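
The getPhrases() method above walks the grammar trie one source word at a time and returns the rule collection found at the final node, if any. Below is a minimal, self-contained sketch of that lookup style; the class and its map-backed trie are hypothetical stand-ins, with plain Strings in place of Joshua's Rule and RuleCollection types, not code from the decoder.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class TrieLookupSketch {

  /* One trie node: children keyed by source word ID, plus the phrases stored at this node. */
  static class Node {
    Map<Integer, Node> children = new HashMap<>();
    List<String> targetPhrases = new ArrayList<>();  // stand-in for a RuleCollection
  }

  private final Node root = new Node();

  /* Store a target phrase under a sequence of source word IDs. */
  void add(int[] sourceWords, String targetPhrase) {
    Node node = root;
    for (int word : sourceWords)
      node = node.children.computeIfAbsent(word, k -> new Node());
    node.targetPhrases.add(targetPhrase);
  }

  /* Walk the trie one source word at a time, the way getPhrases() does above. */
  List<String> getPhrases(int[] sourceWords) {
    Node pointer = root;
    int i = 0;
    while (pointer != null && i < sourceWords.length)
      pointer = pointer.children.get(sourceWords[i++]);
    return (pointer != null && !pointer.targetPhrases.isEmpty()) ? pointer.targetPhrases : null;
  }

  public static void main(String[] args) {
    TrieLookupSketch table = new TrieLookupSketch();
    table.add(new int[] { 7, 12 }, "the house");
    System.out.println(table.getPhrases(new int[] { 7, 12 }));  // [the house]
    System.out.println(table.getPhrases(new int[] { 7, 99 }));  // null
  }
}
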
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/phrase/Stack.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/phrase/Stack.java b/src/joshua/decoder/phrase/Stack.java
deleted file mode 100644
index 88b529a..0000000
--- a/src/joshua/decoder/phrase/Stack.java
+++ /dev/null
@@ -1,234 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.phrase;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.List;
-import java.util.PriorityQueue;
-import java.util.Set;
-
-import joshua.decoder.Decoder;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.chart_parser.ComputeNodeResult;
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * Organizes all hypotheses containing the same number of source words.
- *
- */
-public class Stack extends ArrayList<Hypothesis> {
-
- private static final long serialVersionUID = 7885252799032416068L;
-
- private HashMap<Coverage, ArrayList<Hypothesis>> coverages;
-
- private Sentence sentence;
- private List<FeatureFunction> featureFunctions;
- private JoshuaConfiguration config;
-
- /* The list of states we've already visited. */
- private HashSet<Candidate> visitedStates;
-
- /* A list of candidates sorted for consideration for entry to the chart (for cube pruning) */
- private PriorityQueue<Candidate> candidates;
-
- /* Short-circuits adding a cube-prune state more than once */
- private HashMap<Hypothesis, Hypothesis> deduper;
-
- /**
- * Create a new stack. Stacks are organized one for each number of source words that are covered.
- *
- * @param featureFunctions
- * @param sentence
- * @param config
- */
- public Stack(List<FeatureFunction> featureFunctions, Sentence sentence, JoshuaConfiguration config) {
- this.featureFunctions = featureFunctions;
- this.sentence = sentence;
- this.config = config;
-
- this.candidates = new PriorityQueue<Candidate>(1, new CandidateComparator());
- this.coverages = new HashMap<Coverage, ArrayList<Hypothesis>>();
- this.visitedStates = new HashSet<Candidate>();
- this.deduper = new HashMap<Hypothesis,Hypothesis>();
- }
-
- /**
- * A Stack is an ArrayList; here, we intercept the add so we can maintain a list of the items
- * stored under each distinct coverage vector
- */
- @Override
- public boolean add(Hypothesis hyp) {
-
- if (! coverages.containsKey((hyp.getCoverage())))
- coverages.put(hyp.getCoverage(), new ArrayList<Hypothesis>());
- coverages.get(hyp.getCoverage()).add(hyp);
-
- return super.add(hyp);
- }
-
- /**
- * Intercept calls to remove() so that we can reduce the coverage vector
- */
- @Override
- public boolean remove(Object obj) {
- boolean found = super.remove(obj);
- if (found) {
- Hypothesis item = (Hypothesis) obj;
- Coverage cov = item.getCoverage();
- // Do the removal outside the assert so it still happens when assertions are disabled.
- boolean removed = coverages.get(cov).remove(obj);
- assert removed;
- if (coverages.get(cov).size() == 0)
- coverages.remove(cov);
- }
- return found;
- }
-
- /**
- * Returns the set of coverages contained in this stack. This is used to iterate over them
- * in the main decoding loop in Stacks.java.
- */
- public Set<Coverage> getCoverages() {
- return coverages.keySet();
- }
-
- /**
- * Get all items with the same coverage vector.
- *
- * @param cov
- * @return
- */
- public ArrayList<Hypothesis> get(Coverage cov) {
- ArrayList<Hypothesis> list = coverages.get(cov);
- Collections.sort(list);
- return list;
- }
-
- /**
- * Receives a partially-initialized translation candidate and places it on the
- * priority queue after scoring it with all of the feature functions. In this
- * respect it is like {@link CubePruneState} (it could make use of that class with
- * a little generalization of spans / coverage).
- *
- * This function is also used to (fairly concisely) implement constrained decoding. Before
- * adding a candidate, we ensure that the sequence of English words matches the sentence. If not,
- * the code extends the dot in the cube-pruning chart to the next phrase, since that one might
- * be a match.
- *
- * @param cand
- */
- public void addCandidate(Candidate cand) {
- if (visitedStates.contains(cand))
- return;
-
- visitedStates.add(cand);
-
- // Constrained decoding
- if (sentence.target() != null) {
- String oldWords = cand.getHypothesis().bestHyperedge.getRule().getEnglishWords().replace("[X,1] ", "");
- String newWords = cand.getRule().getEnglishWords().replace("[X,1] ", "");
-
- // If the string is not found in the target sentence, explore the cube neighbors
- if (sentence.fullTarget().indexOf(oldWords + " " + newWords) == -1) {
- Candidate next = cand.extendPhrase();
- if (next != null)
- addCandidate(next);
- return;
- }
- }
-
- // TODO: sourcepath
- ComputeNodeResult result = new ComputeNodeResult(this.featureFunctions, cand.getRule(),
- cand.getTailNodes(), -1, cand.getSpan().end, null, this.sentence);
- cand.setResult(result);
-
- candidates.add(cand);
- }
-
- /**
- * Cube pruning. Repeatedly pop the top candidate, creating a new hyperedge from it, adding it to
- * the k-best list, and then extending the list of candidates with extensions of the current
- * candidate.
- *
- * @param context
- * @param output
- */
- public void search() {
- int to_pop = config.pop_limit;
-
- if (Decoder.VERBOSE >= 3) {
- System.err.println("Stack::search(): pop: " + to_pop + " size: " + candidates.size());
- for (Candidate c: candidates)
- System.err.println(" " + c);
- }
- while (to_pop > 0 && !candidates.isEmpty()) {
- Candidate got = candidates.poll();
- if (got != null) {
- addHypothesis(got);
- --to_pop;
-
- for (Candidate c : got.extend())
- if (c != null) {
- addCandidate(c);
- }
- }
- }
- }
-
- /**
- * Adds a popped candidate to the chart / main stack. This is a candidate we have decided to
- * keep around.
- *
- */
- public void addHypothesis(Candidate complete) {
- Hypothesis added = new Hypothesis(complete);
-
- if (deduper.containsKey(added)) {
- Hypothesis existing = deduper.get(added);
- existing.absorb(added);
-
- if (Decoder.VERBOSE >= 3) {
- System.err.println(String.format("recombining hypothesis from ( ... %s )", complete.getHypothesis().getRule().getEnglishWords()));
- System.err.println(String.format(" base score %.3f", complete.getResult().getBaseCost()));
- System.err.println(String.format(" covering %d-%d", complete.getSpan().start - 1, complete.getSpan().end - 2));
- System.err.println(String.format(" translated as: %s", complete.getRule().getEnglishWords()));
- System.err.println(String.format(" score %.3f + future cost %.3f = %.3f",
- complete.getResult().getTransitionCost(), complete.getFutureEstimate(),
- complete.getResult().getTransitionCost() + complete.getFutureEstimate()));
- }
-
- } else {
- add(added);
- deduper.put(added, added);
-
- if (Decoder.VERBOSE >= 3) {
- System.err.println(String.format("creating new hypothesis from ( ... %s )", complete.getHypothesis().getRule().getEnglishWords()));
- System.err.println(String.format(" base score %.3f", complete.getResult().getBaseCost()));
- System.err.println(String.format(" covering %d-%d", complete.getSpan().start - 1, complete.getSpan().end - 2));
- System.err.println(String.format(" translated as: %s", complete.getRule().getEnglishWords()));
- System.err.println(String.format(" score %.3f + future cost %.3f = %.3f",
- complete.getResult().getTransitionCost(), complete.getFutureEstimate(),
- complete.getResult().getTransitionCost() + complete.getFutureEstimate()));
- }
- }
- }
-}
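
Two ideas in the Stack class above are worth isolating: hypotheses are grouped under their coverage vector (the coverages map), and cube pruning repeatedly polls a best-first priority queue of candidates until a pop limit is reached (search()). The sketch below illustrates both with standard-library types only; Cand, its score, and the string coverage keys are made-up stand-ins for Joshua's Candidate and Coverage.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.PriorityQueue;

public class StackSketch {

  /* Made-up candidate: a coverage key plus a score (higher is better in this sketch). */
  record Cand(String coverage, double score) {}

  public static void main(String[] args) {
    // Group candidates under their coverage key, as Stack.add() does with its coverages map.
    Map<String, List<Cand>> byCoverage = new HashMap<>();
    // Best-first queue of candidates, as in Stack.addCandidate() / Stack.search().
    PriorityQueue<Cand> candidates =
        new PriorityQueue<>(Comparator.comparingDouble((Cand c) -> -c.score()));

    for (Cand c : List.of(new Cand("110", 0.4), new Cand("101", 0.9), new Cand("110", 0.7))) {
      byCoverage.computeIfAbsent(c.coverage(), k -> new ArrayList<>()).add(c);
      candidates.add(c);
    }

    int toPop = 2;  // plays the role of config.pop_limit
    while (toPop-- > 0 && !candidates.isEmpty())
      System.out.println("popped " + candidates.poll());

    System.out.println("hypotheses sharing coverage 110: " + byCoverage.get("110").size());
  }
}
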
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/phrase/Stacks.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/phrase/Stacks.java b/src/joshua/decoder/phrase/Stacks.java
deleted file mode 100644
index eda7d8b..0000000
--- a/src/joshua/decoder/phrase/Stacks.java
+++ /dev/null
@@ -1,266 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.phrase;
-
-/***
- * Entry point for phrase-based decoding, analogous to {@link Chart} for the CKY algorithm. This
- * class organizes all the stacks used for decoding, and is responsible for building them. Stack
- * construction is stack-centric: that is, we loop over the number of covered source words in
- * increasing order; at each step of this iteration, the search is divided between hypotheses from
- * smaller stacks and source-side phrases of the complementary length.
- *
- * The end result of decoding is a {@link Hypergraph} with the same format as hierarchical decoding.
- * Phrases are treated as left-branching rules, and the span information (i,j) is overloaded so
- * that i means nothing and j represents the index of the last-translated source word in each
- * hypothesis. This means that most hypergraph code can work without modification. The algorithm
- * ensures that the coverage vector is consistent, but the resulting hypergraph may not be projective,
- * unlike the CKY algorithm, which does produce projective derivations.
- *
- * Lattice decoding is not yet supported (March 2015).
- */
-
-import java.util.ArrayList;
-import java.util.List;
-
-import joshua.corpus.Span;
-import joshua.decoder.Decoder;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.chart_parser.ComputeNodeResult;
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.ff.tm.AbstractGrammar;
-import joshua.decoder.ff.tm.Grammar;
-import joshua.decoder.hypergraph.HGNode;
-import joshua.decoder.hypergraph.HyperEdge;
-import joshua.decoder.hypergraph.HyperGraph;
-import joshua.decoder.segment_file.Sentence;
-
-public class Stacks {
-
- // The list of stacks, grouped according to number of source words covered
- private List<Stack> stacks;
-
- // The end state
- private Hypothesis end;
-
- List<FeatureFunction> featureFunctions;
-
- private Sentence sentence;
-
- private JoshuaConfiguration config;
-
- /* Contains all the phrase tables */
- private PhraseChart chart;
-
- /**
- * Entry point. Initialize everything. Create pass-through (OOV) phrase table and glue phrase
- * table (with start-of-sentence and end-of-sentence rules).
- *
- * @param sentence
- * @param featureFunctions
- * @param grammars
- * @param config
- */
- public Stacks(Sentence sentence, List<FeatureFunction> featureFunctions, Grammar[] grammars,
- JoshuaConfiguration config) {
-
- this.sentence = sentence;
- this.featureFunctions = featureFunctions;
- this.config = config;
-
- int num_phrase_tables = 0;
- for (int i = 0; i < grammars.length; i++)
- if (grammars[i] instanceof PhraseTable)
- ++num_phrase_tables;
-
- PhraseTable[] phraseTables = new PhraseTable[num_phrase_tables + 2];
- for (int i = 0, j = 0; i < grammars.length; i++)
- if (grammars[i] instanceof PhraseTable)
- phraseTables[j++] = (PhraseTable) grammars[i];
-
- phraseTables[phraseTables.length - 2] = new PhraseTable("null", config);
- phraseTables[phraseTables.length - 2].addRule(Hypothesis.END_RULE);
-
- phraseTables[phraseTables.length - 1] = new PhraseTable("oov", config);
- AbstractGrammar.addOOVRules(phraseTables[phraseTables.length - 1], sentence.getLattice(), featureFunctions, config.true_oovs_only);
-
- this.chart = new PhraseChart(phraseTables, featureFunctions, sentence, config.num_translation_options);
- }
-
-
- /**
- * The main algorithm. Returns a hypergraph representing the search space.
- *
- * @return
- */
- public HyperGraph search() {
-
- long startTime = System.currentTimeMillis();
-
- Future future = new Future(chart);
- stacks = new ArrayList<Stack>();
-
- // <s> counts as the first word. Pushing null lets us count from one.
- stacks.add(null);
-
- // Initialize root hypothesis with <s> context and future cost for everything.
- ComputeNodeResult result = new ComputeNodeResult(this.featureFunctions, Hypothesis.BEGIN_RULE,
- null, -1, 1, null, this.sentence);
- Stack firstStack = new Stack(featureFunctions, sentence, config);
- firstStack.add(new Hypothesis(result.getDPStates(), future.Full()));
- stacks.add(firstStack);
-
- // Decode with increasing numbers of source words.
- for (int source_words = 2; source_words <= sentence.length(); ++source_words) {
- Stack targetStack = new Stack(featureFunctions, sentence, config);
- stacks.add(targetStack);
-
- // Iterate over stacks to continue from.
- for (int phrase_length = 1; phrase_length <= Math.min(source_words - 1, chart.MaxSourcePhraseLength());
- phrase_length++) {
- int from_stack = source_words - phrase_length;
- Stack tailStack = stacks.get(from_stack);
-
- if (Decoder.VERBOSE >= 3)
- System.err.println(String.format("\n WORDS %d MAX %d (STACK %d phrase_length %d)", source_words,
- chart.MaxSourcePhraseLength(), from_stack, phrase_length));
-
- // Iterate over antecedents in this stack.
- for (Coverage coverage: tailStack.getCoverages()) {
- ArrayList<Hypothesis> hypotheses = tailStack.get(coverage);
-
- // the index of the starting point of the first possible phrase
- int begin = coverage.firstZero();
-
- // the absolute position of the ending spot of the last possible phrase
- int last_end = Math.min(coverage.firstZero() + config.reordering_limit, chart.SentenceLength());
- int last_begin = (last_end > phrase_length) ? (last_end - phrase_length) : 0;
-
- for (begin = coverage.firstZero(); begin <= last_begin; begin++) {
- if (!coverage.compatible(begin, begin + phrase_length) ||
- ! permissible(coverage, begin, begin + phrase_length)) {
- continue;
- }
-
- Span span = new Span(begin, begin + phrase_length);
-
- // Don't append </s> until the end
- if (begin == sentence.length() - 1 && source_words != sentence.length())
- continue;
-
- TargetPhrases phrases = chart.getRange(begin, begin + phrase_length);
- if (phrases == null)
- continue;
-
- if (Decoder.VERBOSE >= 3)
- System.err.println(String.format(" Applying %d target phrases over [%d,%d]", phrases.size(), begin, begin + phrase_length));
-
- // TODO: could also compute some number of features here (e.g., non-LM ones)
- // float score_delta = context.GetScorer().transition(ant, phrases, begin, begin + phrase_length);
-
- // Future costs: remove span to be filled.
- float future_delta = future.Change(coverage, begin, begin + phrase_length);
-
- /* This associates with each span a set of hypotheses that can be extended by
- * phrases from that span. The hypotheses are wrapped in HypoState objects, which
- * augment the hypothesis score with a future cost.
- */
- Candidate cand = new Candidate(hypotheses, phrases, span, future_delta);
- targetStack.addCandidate(cand);
- }
- }
- }
-
- /* At this point, every vertex contains a list of all existing hypotheses that the target
- * phrases in that vertex could extend. Now we need to create the search object, which
- * implements cube pruning. There are up to O(n^2) cubes, where n is the size of the current stack,
- * one cube over each span of the input. Each "cube" has two dimensions: one representing
- * the target phrases over the span, and one representing all of these incoming hypotheses.
- * We seed the chart with the best item in each cube, and then repeatedly pop and extend.
- */
-
-// System.err.println(String.format("\nBuilding cube-pruning chart for %d words", source_words));
-
- targetStack.search();
- }
-
- Decoder.LOG(1, String.format("Input %d: Search took %.3f seconds", sentence.id(),
- (System.currentTimeMillis() - startTime) / 1000.0f));
-
- return createGoalNode();
- }
-
- /**
- * Enforces reordering constraints. Our version of Moses' ReorderingConstraint::Check() and
- * SearchCubePruning::CheckDistortion().
- *
- * @param coverage
- * @param begin
- * @param end
- * @return
- */
- private boolean permissible(Coverage coverage, int begin, int end) {
- int firstZero = coverage.firstZero();
-
- if (config.reordering_limit < 0)
- return true;
-
- /* We can always start with the first zero since it doesn't create a reordering gap
- */
- if (begin == firstZero)
- return true;
-
- /* If a gap is created by applying this phrase, make sure that you can reach the first
- * zero later on without violating the distortion constraint.
- */
- if (end - firstZero > config.reordering_limit) {
- return false;
- }
-
- return true;
- }
-
-
- /**
- * Searches through the goal stack, calling the final transition function on each node, and then returning
- * the best item. Usually the final transition code doesn't add anything, because all features
- * have already computed everything they need to. The standard exception is language models that
- * have not yet computed their prefix probabilities (which is not the case with KenLM, the default).
- *
- * @return
- */
- private HyperGraph createGoalNode() {
- Stack lastStack = stacks.get(sentence.length());
-
- for (Hypothesis hyp: lastStack) {
- float score = hyp.getScore();
- List<HGNode> tailNodes = new ArrayList<HGNode>();
- tailNodes.add(hyp);
-
- float finalTransitionScore = ComputeNodeResult.computeFinalCost(featureFunctions, tailNodes, 0, sentence.length(), null, sentence);
-
- if (null == this.end)
- this.end = new Hypothesis(null, score + finalTransitionScore, hyp, sentence.length(), null);
-
- HyperEdge edge = new HyperEdge(null, score + finalTransitionScore, finalTransitionScore, tailNodes, null);
- end.addHyperedgeInNode(edge);
- }
-
- return new HyperGraph(end, -1, -1, this.sentence);
- }
-}
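
The loop structure in Stacks.search() above is the heart of the stack-centric schedule: the stack for source_words covered words is built by pairing each smaller tail stack with source phrases of the complementary length. The following sketch simply prints that schedule for an illustrative sentence length and maximum source phrase length; both values are made up.

public class StackScheduleSketch {
  public static void main(String[] args) {
    int sentenceLength = 5;          // illustrative number of source positions
    int maxSourcePhraseLength = 3;   // plays the role of chart.MaxSourcePhraseLength()

    for (int sourceWords = 2; sourceWords <= sentenceLength; ++sourceWords)
      for (int phraseLength = 1;
           phraseLength <= Math.min(sourceWords - 1, maxSourcePhraseLength); phraseLength++) {
        int fromStack = sourceWords - phraseLength;
        System.out.printf("stack %d <- tail stack %d + source phrase of length %d%n",
            sourceWords, fromStack, phraseLength);
      }
  }
}
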
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/phrase/TargetPhrases.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/phrase/TargetPhrases.java b/src/joshua/decoder/phrase/TargetPhrases.java
deleted file mode 100644
index 83b69d0..0000000
--- a/src/joshua/decoder/phrase/TargetPhrases.java
+++ /dev/null
@@ -1,77 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.phrase;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.ff.tm.Rule;
-
-/**
- * Represents a sorted collection of target-side phrases. Typically, these are phrases
- * generated from the same source word sequence. The list is truncated to the configured number
- * of translation options (num_translation_options).
- *
- * @author Matt Post
- */
-
-public class TargetPhrases extends ArrayList<Rule> {
-
- private static final long serialVersionUID = 1L;
-
- public TargetPhrases() {
- super();
- }
-
- /**
- * Initialize with a collection of rules.
- *
- * @param list
- */
- public TargetPhrases(List<Rule> list) {
- super();
-
- for (Rule rule: list) {
- add(rule);
- }
- }
-
- /**
- * Score the rules and sort them. Scoring is necessary because rules are only scored if they
- * are used, in an effort to make reading in rules more efficient. This is starting to create
- * some trouble and should probably be reworked.
- */
- public void finish(List<FeatureFunction> features, FeatureVector weights, int num_options) {
- for (Rule rule: this) {
- rule.estimateRuleCost(features);
-// System.err.println("TargetPhrases:finish(): " + rule);
- }
- Collections.sort(this, Rule.EstimatedCostComparator);
-
- if (this.size() > num_options)
- this.removeRange(num_options, this.size());
-
-// System.err.println("TargetPhrases::finish()");
-// for (Rule rule: this)
-// System.err.println(" " + rule);
- }
-}
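
TargetPhrases.finish() above scores the rules lazily, sorts them, and truncates the list to the configured number of translation options. Here is a small sketch of the sort-and-truncate step with made-up (phrase, cost) pairs; the ordering is shown as lowest-cost-first purely for illustration, whereas the real ordering is whatever Rule.EstimatedCostComparator defines.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class TranslationOptionsSketch {

  /* Made-up translation option: a target phrase and an estimated cost. */
  record Option(String phrase, float cost) {}

  public static void main(String[] args) {
    List<Option> options = new ArrayList<>(List.of(
        new Option("house", 1.2f), new Option("home", 0.7f), new Option("building", 2.5f)));

    // Sort best-first (lowest cost first in this sketch) ...
    options.sort(Comparator.comparingDouble(Option::cost));

    // ... then keep only the top numOptions entries, the effect of removeRange() above.
    int numOptions = 2;  // plays the role of config.num_translation_options
    if (options.size() > numOptions)
      options.subList(numOptions, options.size()).clear();

    System.out.println(options);  // the two cheapest options remain
  }
}
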
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/segment_file/ConstraintRule.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/ConstraintRule.java b/src/joshua/decoder/segment_file/ConstraintRule.java
deleted file mode 100644
index 9968640..0000000
--- a/src/joshua/decoder/segment_file/ConstraintRule.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.segment_file;
-
-import javax.swing.text.Segment;
-
-
-/**
- * This interface is for an individual (partial) item to seed the chart with. All rules should be
- * flat (no hierarchical nonterminals).
- * <p>
- * The {@link Segment}, {@link ConstraintSpan}, and {@link ConstraintRule} interfaces are for
- * defining an interchange format between a SegmentFileParser and the Chart class. These interfaces
- * <emph>should not</emph> be used internally by the Chart. The objects returned by a
- * SegmentFileParser will not be optimal for use during decoding. The Chart should convert each of
- * these objects into its own internal representation during construction. That is the contract
- * described by these interfaces.
- *
- * @see Type
- *
- * @author wren ng thornton <wr...@users.sourceforge.net>
- * @version $LastChangedDate: 2009-03-26 15:06:57 -0400 (Thu, 26 Mar 2009) $
- */
-public interface ConstraintRule {
-
- /**
- * There are three types of ConstraintRule. The RULE type returns non-null values for all methods.
- * The LHS type provides a (non-null) value for the lhs method, but returns null for everything
- * else. And the RHS type provides a (non-null) value for nativeRhs and foreignRhs but returns
- * null for the lhs and features.
- * <p>
- * The interpretation of a RULE is that it adds a new rule to the grammar which only applies to
- * the associated span. If the associated span is hard, then the set of rules for that span will
- * override the regular grammar.
- * <p>
- * The interpretation of an LHS is that it provides a hard constraint that the associated span be
- * treated as the nonterminal for that span, thus filtering the regular grammar.
- * <p>
- * The interpretation of a RHS is that it provides a hard constraint to filter the regular grammar
- * such that only rules generating the desired translation can be used.
- */
- public enum Type {
- RULE, LHS, RHS
- };
-
- /** Return the type of this ConstraintRule. */
- Type type();
-
-
- /**
- * Return the left hand side of the constraint rule. If this is null, then this object is
- * specifying a translation for the span, but that translation may be derived from any
- * nonterminal. The nonterminal here must be one used by the regular grammar.
- */
- String lhs();
-
-
- /**
- * Return the native right hand side of the constraint rule. If this is null, then the regular
- * grammar will be used to fill in the derivation from the lhs.
- */
- String nativeRhs();
-
-
- /**
- * Return the foreign right hand side of the constraint rule. This must be consistent with the
- * sentence for the associated span, and is provided as a convenience method.
- */
- String foreignRhs();
-
-
- /**
- * Return the grammar feature values for the RULE. The length of this array must be the same as
- * for the regular grammar. We cannot enforce this requirement, but the
- * {@link joshua.decoder.chart_parser.Chart} must throw an error if there is a mismatch.
- */
- float[] features();
-}
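
The Type enum above encodes a null-field convention: a RULE constraint fills every accessor, an LHS constraint fills only lhs(), and an RHS constraint fills only the right-hand sides. The sketch below shows that convention with a hypothetical value class; it is not part of Joshua, and the sample labels and feature values are invented.

public class ConstraintRuleSketch {

  enum Type { RULE, LHS, RHS }

  /* Hypothetical value class following the null-field convention of ConstraintRule. */
  record SimpleConstraint(Type type, String lhs, String nativeRhs, String foreignRhs, float[] features) {}

  public static void main(String[] args) {
    SimpleConstraint rule =
        new SimpleConstraint(Type.RULE, "[NP]", "the house", "das haus", new float[] { 0.5f });
    SimpleConstraint lhsOnly = new SimpleConstraint(Type.LHS, "[NP]", null, null, null);
    SimpleConstraint rhsOnly = new SimpleConstraint(Type.RHS, null, "the house", "das haus", null);

    System.out.println(rule.type() + " / " + lhsOnly.type() + " / " + rhsOnly.type());
  }
}
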
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/segment_file/ConstraintSpan.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/ConstraintSpan.java b/src/joshua/decoder/segment_file/ConstraintSpan.java
deleted file mode 100644
index c8087bd..0000000
--- a/src/joshua/decoder/segment_file/ConstraintSpan.java
+++ /dev/null
@@ -1,76 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.segment_file;
-
-import java.util.List;
-
-import javax.swing.text.Segment;
-
-/**
- * This interface represents a collection of constraints for a given span in the associated segment.
- * Intuitively, each constraint corresponds to one or more items in the chart for parsing, except
- * that we pre-seed the chart with these items before beginning the parsing algorithm. Some
- * constraints can be "hard", in which case the regular grammar is not consulted for these spans. It
- * is an error to have hard constraints for overlapping spans.
- * <p>
- * Indices for the span boundaries mark the transitions between words. Thus, the 0 index occurs
- * before the first word, the 1 index occurs between the first and second words, 2 is between the
- * second and third, etc. Consequently, it is an error for the end index to be equal to or less than
- * the start index. It is also an error to have negative indices or to have indices larger than the
- * count of words in the segment. Clients may assume that no <code>ConstraintSpan</code> objects are
- * constructed which violate these laws.
- * <p>
- * The {@link Segment}, {@link ConstraintSpan}, and {@link ConstraintRule} interfaces are for
- * defining an interchange format between a SegmentFileParser and the Chart class. These interfaces
- * <emph>should not</emph> be used internally by the Chart. The objects returned by a
- * SegmentFileParser will not be optimal for use during decoding. The Chart should convert each of
- * these objects into its own internal representation during construction. That is the contract
- * described by these interfaces.
- *
- * @author wren ng thornton <wr...@users.sourceforge.net>
- */
-public interface ConstraintSpan {
-
- /**
- * Return the starting index of the span covered by this constraint.
- */
- int start();
-
- /**
- * Return the ending index of the span covered by this constraint. Clients may assume
- * <code>this.end() >= 1 + this.start()</code>.
- */
- int end();
-
- /**
- * Return whether this is a hard constraint which should override the grammar. This value only
- * really matters for sets of <code>RULE</code> type constraints.
- */
- boolean isHard();
-
- /**
- * Return a collection of the "rules" for this constraint span.
- * <p>
- * This return type is suboptimal for some SegmentFileParsers. It should be an
- * {@link java.util.Iterator} instead in order to reduce the coupling between this class and
- * Chart. See the note above about the fact that this interface should not be used internally by
- * the Chart class because it will not be performant.
- */
- List<ConstraintRule> rules();
-}
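
The indexing convention described above places span boundaries between words: index 0 sits before the first word, so a span (start, end) with start < end covers words start through end - 1. A tiny self-contained illustration of that convention; the example words are made up.

import java.util.Arrays;

public class SpanIndexSketch {

  /* A span (start, end) covers the words at positions start .. end - 1. */
  static String span(String[] words, int start, int end) {
    return String.join(" ", Arrays.copyOfRange(words, start, end));
  }

  public static void main(String[] args) {
    String[] words = { "this", "is", "a", "test" };
    System.out.println(span(words, 0, 2));  // "this is"
    System.out.println(span(words, 1, 4));  // "is a test"
  }
}
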
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/segment_file/ParseTreeInput.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/ParseTreeInput.java b/src/joshua/decoder/segment_file/ParseTreeInput.java
deleted file mode 100644
index 5feb051..0000000
--- a/src/joshua/decoder/segment_file/ParseTreeInput.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.segment_file;
-
-import joshua.decoder.JoshuaConfiguration;
-
-public class ParseTreeInput extends Sentence {
-
- public ParseTreeInput(String input, int id, JoshuaConfiguration joshuaConfiguration) {
- super(input, id,joshuaConfiguration);
- }
-
- // looks_like_parse_tree = sentence.sentence().matches("^\\(+[A-Z]+ .*");
-
- // private SyntaxTree syntax_tree;
-
- // ParseTreeInput() {
- // SyntaxTree syntax_tree = new ArraySyntaxTree(sentence.sentence(), Vocabulary);
- // }
-
- // public int[] int_sentence() {
- // return syntax_tree.getTerminals();
- // }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/segment_file/ParsedSentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/ParsedSentence.java b/src/joshua/decoder/segment_file/ParsedSentence.java
deleted file mode 100644
index 9273b96..0000000
--- a/src/joshua/decoder/segment_file/ParsedSentence.java
+++ /dev/null
@@ -1,56 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.segment_file;
-
-import joshua.corpus.Vocabulary;
-import joshua.corpus.syntax.ArraySyntaxTree;
-import joshua.corpus.syntax.SyntaxTree;
-import joshua.decoder.JoshuaConfiguration;
-
-public class ParsedSentence extends Sentence {
-
- private SyntaxTree syntaxTree = null;
-
- public ParsedSentence(String input, int id,JoshuaConfiguration joshuaConfiguration) {
- super(input, id, joshuaConfiguration);
- }
-
- public int[] getWordIDs() {
- int[] terminals = syntaxTree().getTerminals();
- int[] annotated = new int[terminals.length + 2];
- System.arraycopy(terminals, 0, annotated, 1, terminals.length);
- annotated[0] = Vocabulary.id(Vocabulary.START_SYM);
- annotated[annotated.length - 1] = Vocabulary.id(Vocabulary.STOP_SYM);
- return annotated;
- }
-
- public SyntaxTree syntaxTree() {
- if (syntaxTree == null)
- syntaxTree = new ArraySyntaxTree(this.source());
- return syntaxTree;
- }
-
- public static boolean matches(String input) {
- return input.matches("^\\(+[A-Z]+ .*");
- }
-
- public String fullSource() {
- return Vocabulary.getWords(this.getWordIDs());
- }
-}
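
ParsedSentence.getWordIDs() above copies the parse tree's terminal IDs into a new array with one extra slot at each end for the start and stop symbols. A standalone sketch of that wrapping step, with invented IDs standing in for Vocabulary.START_SYM and Vocabulary.STOP_SYM:

import java.util.Arrays;

public class WrapWithMarkersSketch {
  public static void main(String[] args) {
    int startId = 1, stopId = 2;      // invented IDs for <s> and </s>
    int[] terminals = { 10, 11, 12 }; // invented terminal word IDs

    // Copy into an array with one extra slot at each end, as getWordIDs() does above.
    int[] annotated = new int[terminals.length + 2];
    System.arraycopy(terminals, 0, annotated, 1, terminals.length);
    annotated[0] = startId;
    annotated[annotated.length - 1] = stopId;

    System.out.println(Arrays.toString(annotated));  // [1, 10, 11, 12, 2]
  }
}
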
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/segment_file/Sentence.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Sentence.java b/src/joshua/decoder/segment_file/Sentence.java
deleted file mode 100644
index 588850b..0000000
--- a/src/joshua/decoder/segment_file/Sentence.java
+++ /dev/null
@@ -1,440 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.segment_file;
-
-import static joshua.util.FormatUtils.addSentenceMarkers;
-import static joshua.util.FormatUtils.escapeSpecialSymbols;
-
-import java.util.ArrayList;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.StringTokenizer;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.Decoder;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.ff.tm.Grammar;
-import joshua.lattice.Arc;
-import joshua.lattice.Lattice;
-import joshua.lattice.Node;
-import joshua.util.ChartSpan;
-import joshua.util.Regex;
-
-/**
- * This class represents lattice input. The lattice is contained on a single line and is represented
- * in PLF (Python Lattice Format), e.g.,
- *
- * ((('ein',0.1,1),('dieses',0.2,1),('haus',0.4,2),),(('haus',0.8,1),),)
- *
- * @author Matt Post <po...@cs.jhu.edu>
- */
-
-public class Sentence {
-
- /* The sentence number. */
- public int id = -1;
-
- /*
- * The source and target sides of the input sentence. Target sides are present when doing
- * alignment or forced decoding.
- */
- protected String source = null;
- protected String fullSource = null;
-
- protected String target = null;
- protected String fullTarget = null;
- protected String[] references = null;
-
- /* Lattice representation of the source sentence. */
- protected Lattice<Token> sourceLattice = null;
-
- /* List of constraints */
- private final List<ConstraintSpan> constraints;
-
- private JoshuaConfiguration config = null;
-
- /**
- * Constructor. Receives a string representing the input sentence. This string may be a
- * string-encoded lattice or a plain text string for decoding.
- *
- * @param inputString
- * @param id
- */
- public Sentence(String inputString, int id, JoshuaConfiguration joshuaConfiguration) {
-
- inputString = Regex.spaces.replaceAll(inputString, " ").trim();
-
- config = joshuaConfiguration;
-
- this.constraints = new LinkedList<ConstraintSpan>();
-
- // Check if the sentence has SGML markings denoting the
- // sentence ID; if so, override the id passed in to the
- // constructor
- Matcher start = SEG_START.matcher(inputString);
- if (start.find()) {
- source = SEG_END.matcher(start.replaceFirst("")).replaceFirst("");
- String idstr = start.group(1);
- this.id = Integer.parseInt(idstr);
- } else {
- if (inputString.indexOf(" ||| ") != -1) {
- String[] pieces = inputString.split("\\s?\\|{3}\\s?");
- source = pieces[0];
- target = pieces[1];
- if (target.equals(""))
- target = null;
- if (pieces.length > 2) {
- references = new String[pieces.length - 2];
- System.arraycopy(pieces, 2, references, 0, pieces.length - 2);
- }
- } else {
- source = inputString;
- }
- this.id = id;
- }
-
- // Only trim strings
- if (! (joshuaConfiguration.lattice_decoding && source.startsWith("(((")))
- adjustForLength(joshuaConfiguration.maxlen);
- }
-
- /**
- * Indicates whether the underlying lattice is a linear chain, i.e., a sentence.
- *
- * @return true if this is a linear chain, false otherwise
- */
- public boolean isLinearChain() {
- return ! this.getLattice().hasMoreThanOnePath();
- }
-
- // Matches the opening and closing <seg> tags, e.g.,
- // <seg id="72">this is a test input sentence</seg>.
- protected static final Pattern SEG_START = Pattern
- .compile("^\\s*<seg\\s+id=\"?(\\d+)\"?[^>]*>\\s*");
- protected static final Pattern SEG_END = Pattern.compile("\\s*</seg\\s*>\\s*$");
-
- /**
- * Returns the length of the sentence. For lattices, the length is the shortest path through the
- * lattice. The length includes the <s> and </s> sentence markers.
- *
- * @return number of input tokens + 2 (for start and end of sentence markers)
- */
- public int length() {
- return this.getLattice().getShortestDistance();
- }
-
- /**
- * Returns the annotations for a specific word (specified by an index) in the
- * sentence
- *
- * @param index The location of the word in the sentence
- * @param key The annotation identity
- * @return The annotations associated with this word
- */
- public String getAnnotation(int index, String key) {
- return getTokens().get(index).getAnnotation(key);
- }
-
- /**
- * This function computes the intersection of \sigma^+ (where \sigma is the terminal vocabulary)
- * with all character-level segmentations of each OOV in the input sentence.
- *
- * The idea is to break apart noun compounds in languages like German (such as the word "golfloch"
- * = "golf" (golf) + "loch" (hole)), allowing them to be translated.
- *
- * @param grammars a list of grammars to consult to find in- and out-of-vocabulary items
- */
- public void segmentOOVs(Grammar[] grammars) {
- Lattice<Token> oldLattice = this.getLattice();
-
- /* Build a list of terminals across all grammars */
- HashSet<Integer> vocabulary = new HashSet<Integer>();
- for (Grammar grammar : grammars) {
- Iterator<Integer> iterator = grammar.getTrieRoot().getTerminalExtensionIterator();
- while (iterator.hasNext())
- vocabulary.add(iterator.next());
- }
-
- List<Node<Token>> oldNodes = oldLattice.getNodes();
-
- /* Find all the subwords that appear in the vocabulary, and create the lattice */
- for (int nodeid = oldNodes.size() - 3; nodeid >= 1; nodeid -= 1) {
- if (oldNodes.get(nodeid).getOutgoingArcs().size() == 1) {
- Arc<Token> arc = oldNodes.get(nodeid).getOutgoingArcs().get(0);
- String word = Vocabulary.word(arc.getLabel().getWord());
- if (!vocabulary.contains(arc.getLabel().getWord())) {
- // System.err.println(String.format("REPL: '%s'", word));
- List<Arc<Token>> savedArcs = oldNodes.get(nodeid).getOutgoingArcs();
-
- char[] chars = word.toCharArray();
- ChartSpan<Boolean> wordChart = new ChartSpan<Boolean>(chars.length + 1, false);
- ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>(chars.length + 1);
- nodes.add(oldNodes.get(nodeid));
- for (int i = 1; i < chars.length; i++)
- nodes.add(new Node<Token>(i));
- nodes.add(oldNodes.get(nodeid + 1));
- for (int width = 1; width <= chars.length; width++) {
- for (int i = 0; i <= chars.length - width; i++) {
- int j = i + width;
- if (width != chars.length) {
- Token token = new Token(word.substring(i, j), config);
- if (vocabulary.contains(token.getWord())) {
- nodes.get(i).addArc(nodes.get(j), 0.0f, token);
- wordChart.set(i, j, true);
- // System.err.println(String.format(" FOUND '%s' at (%d,%d)", word.substring(i, j),
- // i, j));
- }
- }
-
- for (int k = i + 1; k < j; k++) {
- if (wordChart.get(i, k) && wordChart.get(k, j)) {
- wordChart.set(i, j, true);
- // System.err.println(String.format(" PATH FROM %d-%d-%d", i, k, j));
- }
- }
- }
- }
-
- /* If there's a path from beginning to end */
- if (wordChart.get(0, chars.length)) {
- // Remove nodes not part of a complete path
- HashSet<Node<Token>> deletedNodes = new HashSet<Node<Token>>();
- for (int k = 1; k < nodes.size() - 1; k++)
- if (!(wordChart.get(0, k) && wordChart.get(k, chars.length)))
- nodes.set(k, null);
-
- int delIndex = 1;
- while (delIndex < nodes.size())
- if (nodes.get(delIndex) == null) {
- deletedNodes.add(nodes.get(delIndex));
- nodes.remove(delIndex);
- } else
- delIndex++;
-
- for (Node<Token> node : nodes) {
- int arcno = 0;
- while (arcno != node.getOutgoingArcs().size()) {
- Arc<Token> delArc = node.getOutgoingArcs().get(arcno);
- if (deletedNodes.contains(delArc.getHead()))
- node.getOutgoingArcs().remove(arcno);
- else {
- arcno++;
- // System.err.println(" ARC: " + Vocabulary.word(delArc.getLabel()));
- }
- }
- }
-
- // Insert into the main lattice
- this.getLattice().insert(nodeid, nodeid + 1, nodes);
- } else {
- nodes.get(0).setOutgoingArcs(savedArcs);
- }
- }
- }
- }
- }
-
- /**
- * If the input sentence is too long (not counting the <s> and </s> tokens), it is truncated to
- * the maximum length, specified with the "maxlen" parameter.
- *
- * Note that this code assumes the underlying representation is a sentence, and not a lattice. Its
- * behavior is undefined for lattices.
- *
- * @param length
- */
- protected void adjustForLength(int length) {
- int size = this.getLattice().size() - 2; // subtract off the start- and end-of-sentence tokens
-
- if (size > length) {
- Decoder.LOG(1, String.format("* WARNING: sentence %d too long (%d), truncating to length %d",
- id(), size, length));
-
- // Replace the input sentence (and target) -- use the raw string, not source()
- String[] tokens = source.split("\\s+");
- source = tokens[0];
- for (int i = 1; i < length; i++)
- source += " " + tokens[i];
- sourceLattice = null;
- if (target != null) {
- target = "";
- }
- }
- }
-
- public boolean isEmpty() {
- return source.matches("^\\s*$");
- }
-
- public int id() {
- return id;
- }
-
- /**
- * Returns the raw source-side input string.
- */
- public String rawSource() {
- return source;
- }
-
- /**
- * Returns the source-side string with annotations --- if any --- stripped off.
- *
- * @return
- */
- public String source() {
- StringBuilder str = new StringBuilder();
- int[] ids = getWordIDs();
- for (int i = 1; i < ids.length - 1; i++) {
- str.append(Vocabulary.word(ids[i])).append(" ");
- }
- return str.toString().trim();
- }
-
- /**
- * Returns a sentence with the start and stop symbols added to the
- * beginning and the end of the sentence respectively
- *
- * @return String The input sentence with start and stop symbols
- */
- public String fullSource() {
- if (fullSource == null) {
- fullSource = addSentenceMarkers(source());
- }
- return fullSource;
- }
-
- /**
- * If a target side was supplied with the sentence, this will be non-null. This is used when doing
- * synchronous parsing or constrained decoding. The input format is:
- *
- * Bill quiere ir a casa ||| Bill wants to go home
- *
- * If the parameter parse=true is set, parsing will be triggered, otherwise constrained decoding.
- *
- * @return
- */
- public String target() {
- return target;
- }
-
- public String fullTarget() {
- if (fullTarget == null) {
- fullTarget = addSentenceMarkers(target());
- }
- return fullTarget;
- }
-
- public String source(int i, int j) {
- StringTokenizer st = new StringTokenizer(fullSource());
- int index = 0;
- StringBuilder substring = new StringBuilder();
- while (st.hasMoreTokens()) {
- String token = st.nextToken();
- if (index >= j)
- break;
- if (index >= i)
- substring.append(token).append(" ");
- index++;
- }
- return substring.toString().trim();
- }
-
- public String[] references() {
- return references;
- }
-
- /**
- * Returns the sequence of tokens comprising the sentence. This assumes you've done the checking
- * to make sure the input string (the source side) isn't a PLF waiting to be parsed.
- *
- * @return
- */
- public List<Token> getTokens() {
- assert isLinearChain();
- List<Token> tokens = new ArrayList<Token>();
- for (Node<Token> node: getLattice().getNodes())
- if (node != null && node.getOutgoingArcs().size() > 0)
- tokens.add(node.getOutgoingArcs().get(0).getLabel());
- return tokens;
- }
-
- /**
- * Returns the sequence of word IDs comprising the input sentence. Assumes this is not a general
- * lattice, but a linear chain.
- */
- public int[] getWordIDs() {
- List<Token> tokens = getTokens();
- int[] ids = new int[tokens.size()];
- for (int i = 0; i < tokens.size(); i++)
- ids[i] = tokens.get(i).getWord();
- return ids;
- }
-
- /**
- * Returns a lattice over the strings of the source sentence. Assumes this is a sentence and
- * not a lattice.
- *
- * @return
- */
- public Lattice<String> stringLattice() {
- assert isLinearChain();
- return Lattice.createStringLatticeFromString(source(), config);
- }
-
- public List<ConstraintSpan> constraints() {
- return constraints;
- }
-
- public Lattice<Token> getLattice() {
- if (this.sourceLattice == null) {
- if (config.lattice_decoding && rawSource().startsWith("(((")) {
- if (config.search_algorithm.equals("stack")) {
- System.err.println("* FATAL: lattice decoding currently not supported for stack-based search algorithm.");
- System.exit(12);
- }
- this.sourceLattice = Lattice.createTokenLatticeFromPLF(rawSource(), config);
- } else
- this.sourceLattice = Lattice.createTokenLatticeFromString(String.format("%s %s %s", Vocabulary.START_SYM,
- rawSource(), Vocabulary.STOP_SYM), config);
- }
- return this.sourceLattice;
- }
-
- @Override
- public String toString() {
- StringBuilder sb = new StringBuilder(source());
- if (target() != null) {
- sb.append(" ||| " + target());
- }
- return sb.toString();
- }
-
- public boolean hasPath(int begin, int end) {
- return getLattice().distance(begin, end) != -1;
- }
-
- public Node<Token> getNode(int i) {
- return getLattice().getNode(i);
- }
-}
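
The Sentence constructor above accepts either a <seg id="..."> ... </seg> wrapper (which overrides the sentence id) or a plain line, and splits off an optional target side and references on " ||| ". The sketch below reuses the two patterns and the split regex from the class to show that parsing in isolation; the sample input is made up and no other Joshua code is involved.

import java.util.Arrays;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SentenceParseSketch {

  // Same patterns as SEG_START / SEG_END in the class above.
  static final Pattern SEG_START = Pattern.compile("^\\s*<seg\\s+id=\"?(\\d+)\"?[^>]*>\\s*");
  static final Pattern SEG_END = Pattern.compile("\\s*</seg\\s*>\\s*$");

  public static void main(String[] args) {
    String input = "<seg id=\"72\">Bill quiere ir a casa ||| Bill wants to go home</seg>";

    // Strip the optional <seg id="..."> wrapper and pick up the sentence id.
    Matcher start = SEG_START.matcher(input);
    int id = -1;
    if (start.find()) {
      id = Integer.parseInt(start.group(1));
      input = SEG_END.matcher(start.replaceFirst("")).replaceFirst("");
    }

    // Split source ||| target ||| reference(s), as the constructor does.
    String[] pieces = input.split("\\s?\\|{3}\\s?");
    String source = pieces[0];
    String target = pieces.length > 1 ? pieces[1] : null;
    String[] references = pieces.length > 2 ? Arrays.copyOfRange(pieces, 2, pieces.length) : null;

    System.out.println("id=" + id + "  source=" + source + "  target=" + target
        + "  refs=" + Arrays.toString(references));
  }
}
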
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/segment_file/Token.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/Token.java b/src/joshua/decoder/segment_file/Token.java
deleted file mode 100644
index bddfd68..0000000
--- a/src/joshua/decoder/segment_file/Token.java
+++ /dev/null
@@ -1,147 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.segment_file;
-
-import static joshua.util.FormatUtils.escapeSpecialSymbols;
-
-import java.util.HashMap;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.Decoder;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.util.FormatUtils;
-
-/**
- * Stores the identity of a word and its annotations in a sentence.
-
- * @author "Gaurav Kumar"
- * @author Matt Post
- */
-public class Token {
- // The token without the annotations
- private String token;
- private int tokenID;
-
- private HashMap<String,String> annotations = null;
- private JoshuaConfiguration joshuaConfiguration;
-
- /**
- * Constructor: Creates a Token object from a raw word.
- * Extracts and assigns annotations when available.
- * Any word can be marked with annotations, which are arbitrary semicolon-delimited
- * key[=value] pairs (the value is optional) listed in brackets after a word, e.g.,
- *
- * Je[ref=Samuel;PRO] voudrais[FUT;COND] ...
- *
- * This will create a dictionary annotation on the word of the following form for "Je"
- *
- * ref -> Samuel
- * PRO -> PRO
- *
- * and the following for "voudrais":
- *
- * FUT -> FUT
- * COND -> COND
- *
- * @param rawWord A word, possibly carrying bracketed annotation information
- * @param config The Joshua configuration (consulted for lowercasing)
- */
- public Token(String rawWord, JoshuaConfiguration config) {
-
- this.joshuaConfiguration = config;
-
- annotations = new HashMap<String,String>();
-
- // Matches a word with an annotation
- // Check guidelines in constructor description
- Pattern pattern = Pattern.compile("(\\S+)\\[(\\S+)\\]");
- Matcher tag = pattern.matcher(rawWord);
- if (tag.find()) {
- // Annotation match found
- token = tag.group(1);
- String tagStr = tag.group(2);
-
- for (String annotation: tagStr.split(";")) {
- int where = annotation.indexOf("=");
- if (where != -1) {
- annotations.put(annotation.substring(0, where), annotation.substring(where + 1));
- } else {
- annotations.put(annotation, annotation);
- }
- }
- } else {
- // No match found, which implies that this token does not have any annotations
- token = rawWord;
- }
-
- // Mask strings that cause problems for the decoder. This has to be done *after* parsing for
- // annotations.
- token = escapeSpecialSymbols(token);
-
- if (joshuaConfiguration != null && joshuaConfiguration.lowercase) {
- if (FormatUtils.ISALLUPPERCASE(token))
- annotations.put("lettercase", "all-upper");
- else if (Character.isUpperCase(token.charAt(0)))
- annotations.put("lettercase", "upper");
- else
- annotations.put("lettercase", "lower");
-
- Decoder.LOG(2, String.format("TOKEN: %s -> %s (%s)", token, token.toLowerCase(), annotations.get("lettercase")));
- token = token.toLowerCase();
- }
-
- tokenID = Vocabulary.id(token);
- }
-
- /**
- * Returns the word ID (vocab ID) for this token
- *
- * @return int A word ID
- */
- public int getWord() {
- return tokenID;
- }
-
- /**
- * Returns the string associated with this token
- * @return String A word
- */
- public String getWordIdentity() {
- return token;
- }
-
- public String toString() {
- return token;
- }
-
- /**
- * Returns the value of the annotation stored under the given key.
- *
- * @param key the annotation key
- * @return String the annotation value, or null if the key is not present
- */
- public String getAnnotation(String key) {
- if (annotations.containsKey(key)) {
- return annotations.get(key);
- }
-
- return null;
- }
-}
\ No newline at end of file
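
The annotation syntax described in the Token constructor above is easiest to see with a concrete call. A small usage sketch, again assuming a default JoshuaConfiguration with lowercasing disabled:

import joshua.decoder.JoshuaConfiguration;
import joshua.decoder.segment_file.Token;

public class TokenAnnotationSketch {
  public static void main(String[] args) {
    // Assumption: defaults leave config.lowercase == false.
    JoshuaConfiguration config = new JoshuaConfiguration();

    Token token = new Token("Je[ref=Samuel;PRO]", config);

    System.out.println(token.getWordIdentity());    // "Je" (annotations stripped)
    System.out.println(token.getAnnotation("ref")); // "Samuel" (key=value pair)
    System.out.println(token.getAnnotation("PRO")); // "PRO" (bare key maps to itself)
    System.out.println(token.getAnnotation("FUT")); // null (not present on this word)
    System.out.println(token.getWord());            // integer Vocabulary id of "Je"
  }
}
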
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/segment_file/package.html
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/segment_file/package.html b/src/joshua/decoder/segment_file/package.html
deleted file mode 100644
index 8f06ebc..0000000
--- a/src/joshua/decoder/segment_file/package.html
+++ /dev/null
@@ -1,17 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
-<html>
-<head></head>
-<body bgcolor="white">
-
-<!--
-##### THIS IS THE TEMPLATE FOR THE PACKAGE DOC COMMENTS. #####
-##### TYPE YOUR PACKAGE COMMENTS HERE. BEGIN WITH A #####
-##### ONE-SENTENCE SUMMARY STARTING WITH A VERB LIKE: #####
--->
-
-Provides common interfaces for parsing segment files (aka test corpora to be translated). In order to support constraint annotations, we provide a general API for use by JoshuaDecoder and Chart.
-
-<!-- Put @see and @since tags down here. -->
-
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/lattice/Arc.java
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/Arc.java b/src/joshua/lattice/Arc.java
deleted file mode 100644
index 793a128..0000000
--- a/src/joshua/lattice/Arc.java
+++ /dev/null
@@ -1,118 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.lattice;
-
-
-/**
- * An arc in a directed graph.
- *
- * @author Lane Schwartz
- * @since 2008-07-08
- *
- * @param <Label> Type of label associated with an arc.
- */
-public class Arc<Label> {
-
- /**
- * Weight of this arc.
- */
- private float cost;
-
- /**
- * Node where this arc ends.
- */
- private Node<Label> head;
-
- /**
- * Node where this arc begins.
- */
- private Node<Label> tail;
-
- /**
- * Label associated with this arc.
- */
- private Label label;
-
- /**
- * Creates an arc with the specified tail, head, cost, and label.
- *
- * @param tail The node where this arc begins.
- * @param head The node where this arc ends.
- * @param cost The cost of this arc.
- * @param label The label associated with this arc.
- */
- public Arc(Node<Label> tail, Node<Label> head, float cost, Label label) {
- this.tail = tail;
- this.head = head;
- this.cost = cost;
- this.label = label;
- }
-
- /**
- * Gets the cost of this arc.
- *
- * @return The cost of this arc.
- */
- public float getCost() {
- return cost;
- }
-
- /**
- * Gets the tail of this arc (the node where this arc begins).
- *
- * @return The tail of this arc.
- */
- public Node<Label> getTail() {
- return tail;
- }
-
- /**
- * Gets the head of this arc (the node where this arc ends).
- *
- * @return The head of this arc.
- */
- public Node<Label> getHead() {
- return head;
- }
-
- /**
- * Gets the label associated with this arc.
- *
- * @return The label associated with this arc.
- */
- public Label getLabel() {
- return label;
- }
-
- @Override
- public String toString() {
- StringBuilder s = new StringBuilder();
-
- s.append(label.toString());
- s.append(" : ");
- s.append(tail.toString());
- s.append(" ==> ");
- s.append(head.toString());
- s.append(" : ");
- s.append(cost);
-
- return s.toString();
- }
-
-}
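
One detail worth calling out from Arc above: the constructor takes the tail first and the head second (Arc(tail, head, cost, label)), and Node#addArc below builds arcs in exactly that order. A brief sketch to confirm the direction:

import joshua.lattice.Arc;
import joshua.lattice.Node;

public class ArcDirectionSketch {
  public static void main(String[] args) {
    Node<String> from = new Node<String>(0);
    Node<String> to = new Node<String>(1);

    // Equivalent to new Arc<String>(from, to, 0.5f, "word").
    from.addArc(to, 0.5f, "word");

    Arc<String> arc = from.getOutgoingArcs().get(0);
    System.out.println(arc.getTail().id()); // 0: where the arc begins
    System.out.println(arc.getHead().id()); // 1: where the arc ends
    System.out.println(arc);                // word : Node-0 ==> Node-1 : 0.5
  }
}
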
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/lattice/Lattice.java
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/Lattice.java b/src/joshua/lattice/Lattice.java
deleted file mode 100644
index b0ef40f..0000000
--- a/src/joshua/lattice/Lattice.java
+++ /dev/null
@@ -1,515 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.lattice;
-
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Stack;
-import java.util.logging.Logger;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.segment_file.Token;
-import joshua.util.ChartSpan;
-
-/**
- * A lattice representation of a directed graph.
- *
- * @author Lane Schwartz
- * @author Matt Post <po...@cs.jhu.edu>
- * @since 2008-07-08
- *
- * @param <Value> Type of label associated with each arc.
- */
-public class Lattice<Value> implements Iterable<Node<Value>> {
-
- /**
- * True if there is more than one path through the lattice.
- */
- private boolean latticeHasAmbiguity;
-
- /**
- * Costs of the best path between each pair of nodes in the lattice.
- */
- private ChartSpan<Integer> distances = null;
-
- /**
- * List of all nodes in the lattice. Nodes are assumed to be in topological order.
- */
- private List<Node<Value>> nodes;
-
- /** Logger for this class. */
- private static final Logger logger = Logger.getLogger(Lattice.class.getName());
-
- JoshuaConfiguration config = null;
-
- /**
- * Constructs a new lattice from an existing list of (connected) nodes.
- * <p>
- * The list of nodes must already be in topological order. If the list is not in topological
- * order, the behavior of the lattice is not defined.
- *
- * @param nodes A list of nodes which must be in topological order.
- */
- public Lattice(List<Node<Value>> nodes, JoshuaConfiguration config) {
- this.nodes = nodes;
-// this.distances = calculateAllPairsShortestPath();
- this.latticeHasAmbiguity = true;
- }
-
- public Lattice(List<Node<Value>> nodes, boolean isAmbiguous, JoshuaConfiguration config) {
- // Node<Value> sink = new Node<Value>(nodes.size());
- // nodes.add(sink);
- this.nodes = nodes;
-// this.distances = calculateAllPairsShortestPath();
- this.latticeHasAmbiguity = isAmbiguous;
- }
-
- /**
- * Instantiates a lattice from a linear chain of values, i.e., a sentence.
- *
- * @param linearChain a sequence of Value objects
- * @param config the Joshua configuration
- */
- public Lattice(Value[] linearChain, JoshuaConfiguration config) {
- this.latticeHasAmbiguity = false;
- this.nodes = new ArrayList<Node<Value>>();
-
- Node<Value> previous = new Node<Value>(0);
- nodes.add(previous);
-
- int i = 1;
-
- for (Value value : linearChain) {
-
- Node<Value> current = new Node<Value>(i);
- float cost = 0.0f;
- // if (i > 4) cost = (float)i/1.53432f;
- previous.addArc(current, cost, value);
-
- nodes.add(current);
-
- previous = current;
- i++;
- }
-
-// this.distances = calculateAllPairsShortestPath();
- }
-
- public final boolean hasMoreThanOnePath() {
- return latticeHasAmbiguity;
- }
-
- /**
- * Computes the shortest distance between the two endpoints of an arc, which is used (perhaps
- * among other places) in computing which rules can apply over which spans of the input.
- *
- * @param arc the arc whose tail-to-head distance is computed
- * @return the distance, a positive number, or -1 if there is no path between the nodes
- */
- public int distance(Arc<Value> arc) {
- return this.getShortestPath(arc.getTail().getNumber(), arc.getHead().getNumber());
- }
-
- public int distance(int i, int j) {
- return this.getShortestPath(i, j);
- }
-
- /**
- * Convenience method to get a lattice from a plain sentence, splitting it on whitespace and
- * wrapping each word in a {@link Token}.
- *
- * @param source the whitespace-delimited source sentence
- * @param config the Joshua configuration
- * @return Lattice representation of the linear chain.
- */
- public static Lattice<Token> createTokenLatticeFromString(String source, JoshuaConfiguration config) {
- String[] tokens = source.split("\\s+");
- Token[] integerSentence = new Token[tokens.length];
- for (int i = 0; i < tokens.length; i++) {
- integerSentence[i] = new Token(tokens[i], config);
- }
-
- return new Lattice<Token>(integerSentence, config);
- }
-
- public static Lattice<Token> createTokenLatticeFromPLF(String data, JoshuaConfiguration config) {
- ArrayList<Node<Token>> nodes = new ArrayList<Node<Token>>();
-
- // This matches a sequence of tuples, which describe arcs leaving this node
- Pattern nodePattern = Pattern.compile("(.+?)\\(\\s*(\\(.+?\\),\\s*)\\s*\\)(.*)");
-
- /*
- * This matches a comma-delimited, parenthesized tuple of (a) a single-quoted word, (b) a number,
- * optionally in scientific notation, and (c) an offset (how many states to jump ahead)
- */
- Pattern arcPattern = Pattern
- .compile("\\s*\\('(.+?)',\\s*(-?\\d+\\.?\\d*?(?:[eE]-?\\d+)?),\\s*(\\d+)\\),\\s*(.*)");
-
- Matcher nodeMatcher = nodePattern.matcher(data);
-
- boolean latticeIsAmbiguous = false;
-
- int nodeID = 0;
- Node<Token> startNode = new Node<Token>(nodeID);
- nodes.add(startNode);
-
- while (nodeMatcher.matches()) {
-
- String nodeData = nodeMatcher.group(2);
- String remainingData = nodeMatcher.group(3);
-
- nodeID++;
-
- Node<Token> currentNode = null;
- if (nodeID < nodes.size() && nodes.get(nodeID) != null) {
- currentNode = nodes.get(nodeID);
- } else {
- currentNode = new Node<Token>(nodeID);
- while (nodeID > nodes.size())
- nodes.add(new Node<Token>(nodes.size()));
- nodes.add(currentNode);
- }
-
- Matcher arcMatcher = arcPattern.matcher(nodeData);
- int numArcs = 0;
- if (!arcMatcher.matches()) {
- throw new RuntimeException("Parse error!");
- }
- while (arcMatcher.matches()) {
- numArcs++;
- String arcLabel = arcMatcher.group(1);
- float arcWeight = Float.parseFloat(arcMatcher.group(2));
- int destinationNodeID = nodeID + Integer.parseInt(arcMatcher.group(3));
-
- Node<Token> destinationNode;
- if (destinationNodeID < nodes.size() && nodes.get(destinationNodeID) != null) {
- destinationNode = nodes.get(destinationNodeID);
- } else {
- destinationNode = new Node<Token>(destinationNodeID);
- while (destinationNodeID > nodes.size())
- nodes.add(new Node<Token>(nodes.size()));
- nodes.add(destinationNode);
- }
-
- String remainingArcs = arcMatcher.group(4);
-
- Token arcToken = new Token(arcLabel, config);
- currentNode.addArc(destinationNode, arcWeight, arcToken);
-
- arcMatcher = arcPattern.matcher(remainingArcs);
- }
- if (numArcs > 1)
- latticeIsAmbiguous = true;
-
- nodeMatcher = nodePattern.matcher(remainingData);
- }
-
- /* Add <s> to the start of the lattice. */
- if (nodes.size() > 1 && nodes.get(1) != null) {
- Node<Token> firstNode = nodes.get(1);
- startNode.addArc(firstNode, 0.0f, new Token(Vocabulary.START_SYM, config));
- }
-
- /* Add </s> as a final state, connect it to the previous end-state */
- nodeID = nodes.get(nodes.size()-1).getNumber() + 1;
- Node<Token> endNode = new Node<Token>(nodeID);
- nodes.get(nodes.size()-1).addArc(endNode, 0.0f, new Token(Vocabulary.STOP_SYM, config));
- nodes.add(endNode);
-
- return new Lattice<Token>(nodes, latticeIsAmbiguous, config);
- }
-
- /**
- * Constructs a lattice from a given string representation.
- *
- * @param data String representation of a lattice.
- * @param config the Joshua configuration
- * @return A lattice that corresponds to the given string.
- */
- public static Lattice<String> createStringLatticeFromString(String data, JoshuaConfiguration config) {
-
- Map<Integer, Node<String>> nodes = new HashMap<Integer, Node<String>>();
-
- Pattern nodePattern = Pattern.compile("(.+?)\\((\\(.+?\\),)\\)(.*)");
- Pattern arcPattern = Pattern.compile("\\('(.+?)',(\\d+.\\d+),(\\d+)\\),(.*)");
-
- Matcher nodeMatcher = nodePattern.matcher(data);
-
- int nodeID = -1;
-
- while (nodeMatcher.matches()) {
-
- String nodeData = nodeMatcher.group(2);
- String remainingData = nodeMatcher.group(3);
-
- nodeID++;
-
- Node<String> currentNode;
- if (nodes.containsKey(nodeID)) {
- currentNode = nodes.get(nodeID);
- } else {
- currentNode = new Node<String>(nodeID);
- nodes.put(nodeID, currentNode);
- }
-
- logger.fine("Node " + nodeID + ":");
-
- Matcher arcMatcher = arcPattern.matcher(nodeData);
-
- while (arcMatcher.matches()) {
- String arcLabel = arcMatcher.group(1);
- float arcWeight = Float.valueOf(arcMatcher.group(2));
- int destinationNodeID = nodeID + Integer.parseInt(arcMatcher.group(3));
-
- Node<String> destinationNode;
- if (nodes.containsKey(destinationNodeID)) {
- destinationNode = nodes.get(destinationNodeID);
- } else {
- destinationNode = new Node<String>(destinationNodeID);
- nodes.put(destinationNodeID, destinationNode);
- }
-
- String remainingArcs = arcMatcher.group(4);
-
- logger.fine("\t" + arcLabel + " " + arcWeight + " " + destinationNodeID);
-
- currentNode.addArc(destinationNode, arcWeight, arcLabel);
-
- arcMatcher = arcPattern.matcher(remainingArcs);
- }
-
- nodeMatcher = nodePattern.matcher(remainingData);
- }
-
- List<Node<String>> nodeList = new ArrayList<Node<String>>(nodes.values());
- Collections.sort(nodeList, new NodeIdentifierComparator());
-
- logger.fine(nodeList.toString());
-
- return new Lattice<String>(nodeList, config);
- }
-
- /**
- * Gets the length (number of arcs) of the shortest path between two nodes.
- *
- * @param from ID of the starting node.
- * @param to ID of the ending node.
- * @return The number of arcs on the shortest path between the two nodes.
- */
- public int getShortestPath(int from, int to) {
- // System.err.println(String.format("DISTANCE(%d,%d) = %f", from, to, costs[from][to]));
- if (distances == null)
- this.distances = calculateAllPairsShortestPath();
-
- return distances.get(from, to);
- }
-
- /**
- * Gets the shortest distance through the lattice, from the first node to the last.
- *
- * @return the number of arcs on the shortest path from node 0 to the final node
- */
- public int getShortestDistance() {
- if (distances == null)
- distances = calculateAllPairsShortestPath();
- return distances.get(0, nodes.size()-1);
- }
-
- /**
- * Gets the node with a specified integer identifier. If the identifier is negative, we count
- * backwards from the end of the array, Perl-style (-1 is the last element, -2 the penultimate,
- * etc).
- *
- * @param index Integer identifier for a node.
- * @return The node with the specified integer identifier
- */
- public Node<Value> getNode(int index) {
- if (index >= 0)
- return nodes.get(index);
- else
- return nodes.get(size() + index);
- }
-
- public List<Node<Value>> getNodes() {
- return nodes;
- }
-
- /**
- * Returns an iterator over the nodes in this lattice.
- *
- * @return An iterator over the nodes in this lattice.
- */
- public Iterator<Node<Value>> iterator() {
- return nodes.iterator();
- }
-
- /**
- * Returns the number of nodes in this lattice.
- *
- * @return The number of nodes in this lattice.
- */
- public int size() {
- return nodes.size();
- }
-
- /**
- * Calculate the all-pairs shortest path for all pairs of nodes.
- * <p>
- * Note: This method assumes no backward arcs. If there are backward arcs, the returned shortest
- * path costs for that node may not be accurate.
- *
- * @return A {@link ChartSpan} holding the all-pairs shortest path for all pairs of nodes.
- */
- private ChartSpan<Integer> calculateAllPairsShortestPath() {
-
- ChartSpan<Integer> distance = new ChartSpan<Integer>(nodes.size() - 1, Integer.MAX_VALUE);
- distance.setDiagonal(0);
-
- /* Mark reachability between immediate neighbors */
- for (Node<Value> tail : nodes) {
- for (Arc<Value> arc : tail.getOutgoingArcs()) {
- Node<Value> head = arc.getHead();
- distance.set(tail.id(), head.id(), 1);
- }
- }
-
- int size = nodes.size();
-
- for (int width = 2; width <= size; width++) {
- for (int i = 0; i < size - width; i++) {
- int j = i + width;
- for (int k = i + 1; k < j; k++) {
- distance.set(i, j, Math.min(distance.get(i, j), distance.get(i, k) + distance.get(k, j)));
- }
- }
- }
-
- return distance;
- }
-
- @Override
- public String toString() {
- StringBuilder s = new StringBuilder();
-
- for (Node<Value> start : this) {
- for (Arc<Value> arc : start.getOutgoingArcs()) {
- s.append(arc.toString());
- s.append('\n');
- }
- }
-
- return s.toString();
- }
-
- public static void main(String[] args) {
-
- List<Node<String>> nodes = new ArrayList<Node<String>>();
- for (int i = 0; i < 4; i++) {
- nodes.add(new Node<String>(i));
- }
-
- nodes.get(0).addArc(nodes.get(1), 1.0f, "x");
- nodes.get(1).addArc(nodes.get(2), 1.0f, "y");
- nodes.get(0).addArc(nodes.get(2), 1.5f, "a");
- nodes.get(2).addArc(nodes.get(3), 3.0f, "b");
- nodes.get(2).addArc(nodes.get(3), 5.0f, "c");
-
- Lattice<String> graph = new Lattice<String>(nodes, null);
-
- System.out.println("Shortest path from 0 to 3: " + graph.getShortestPath(0, 3));
- }
-
- /**
- * Replaces the arc from node i to node j with the supplied chain of nodes. This is used to do
- * OOV segmentation of words in a lattice.
- *
- * @param i index of the node the replaced arc leaves
- * @param j index of the node the replaced arc enters
- * @param newNodes the replacement nodes, in order
- */
- public void insert(int i, int j, List<Node<Value>> newNodes) {
-
- nodes.get(i).setOutgoingArcs(newNodes.get(0).getOutgoingArcs());
-
- newNodes.remove(0);
- nodes.remove(j);
- Collections.reverse(newNodes);
-
- for (Node<Value> node: newNodes)
- nodes.add(j, node);
-
- this.latticeHasAmbiguity = false;
- for (int x = 0; x < nodes.size(); x++) {
- nodes.get(x).setID(x);
- this.latticeHasAmbiguity |= (nodes.get(x).getOutgoingArcs().size() > 1);
- }
-
- this.distances = null;
- }
-
- /**
- * Topologically sorts the nodes and reassigns their numbers. Assumes that the first node is the
- * source, but otherwise assumes nothing about the input.
- *
- * Probably correct, but untested.
- */
- @SuppressWarnings("unused")
- private void topologicalSort() {
- // Kahn's algorithm: record incoming and outgoing arcs for every node, then repeatedly
- // emit nodes whose incoming arcs have all been removed.
- HashMap<Node<Value>, List<Arc<Value>>> outgraph = new HashMap<Node<Value>, List<Arc<Value>>>();
- HashMap<Node<Value>, List<Arc<Value>>> ingraph = new HashMap<Node<Value>, List<Arc<Value>>>();
- for (Node<Value> node: nodes) {
- outgraph.put(node, new ArrayList<Arc<Value>>(node.getOutgoingArcs()));
- for (Arc<Value> arc: node.getOutgoingArcs()) {
- if (! ingraph.containsKey(arc.getHead()))
- ingraph.put(arc.getHead(), new ArrayList<Arc<Value>>());
- ingraph.get(arc.getHead()).add(arc);
- }
- }
-
- ArrayList<Node<Value>> sortedNodes = new ArrayList<Node<Value>>();
- Stack<Node<Value>> stack = new Stack<Node<Value>>();
- stack.push(nodes.get(0));
-
- while (! stack.empty()) {
- Node<Value> node = stack.pop();
- sortedNodes.add(node);
- for (Arc<Value> arc: outgraph.get(node)) {
- ingraph.get(arc.getHead()).remove(arc);
-
- // A node becomes available once its last incoming arc has been removed.
- if (ingraph.get(arc.getHead()).size() == 0)
- stack.push(arc.getHead());
- }
- // All outgoing arcs of this node have now been consumed.
- outgraph.get(node).clear();
- }
-
- int id = 0;
- for (Node<Value> node : sortedNodes)
- node.setID(id++);
-
- this.nodes = sortedNodes;
- }
-}
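
Despite the "cost" wording in some of the Javadoc above, calculateAllPairsShortestPath() marks every arc with distance 1, so getShortestPath(i, j) returns the minimum number of arcs between two nodes and ignores arc weights. A small sketch mirroring the graph built in main() above:

import java.util.ArrayList;
import java.util.List;

import joshua.lattice.Lattice;
import joshua.lattice.Node;

public class LatticeDistanceSketch {
  public static void main(String[] args) {
    List<Node<String>> nodes = new ArrayList<Node<String>>();
    for (int i = 0; i < 4; i++)
      nodes.add(new Node<String>(i));

    nodes.get(0).addArc(nodes.get(1), 1.0f, "x");
    nodes.get(1).addArc(nodes.get(2), 1.0f, "y");
    nodes.get(0).addArc(nodes.get(2), 1.5f, "a"); // heavier weight, but only one arc
    nodes.get(2).addArc(nodes.get(3), 3.0f, "b");

    Lattice<String> lattice = new Lattice<String>(nodes, null);

    System.out.println(lattice.getShortestPath(0, 2)); // 1: the direct arc, weights ignored
    System.out.println(lattice.getShortestPath(0, 3)); // 2: 0 -> 2 -> 3
    System.out.println(lattice.getShortestDistance()); // 2: node 0 to the final node
  }
}
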
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/lattice/Node.java
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/Node.java b/src/joshua/lattice/Node.java
deleted file mode 100644
index 31dcea9..0000000
--- a/src/joshua/lattice/Node.java
+++ /dev/null
@@ -1,158 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.lattice;
-
-import java.util.ArrayList;
-import java.util.Iterator;
-import java.util.List;
-
-/**
- * A node in a directed graph.
- *
- * @author Lane Schwartz
- * @since 2008-07-08
- *
- * @param <Label> Type of label associated with an arc.
- */
-public class Node<Label> {
-
- // ===============================================================
- // Member variables
- // ===============================================================
-
- /**
- * Numeric integer identifier of this node.
- */
- private Integer id;
-
- /**
- * Arcs which begin at this node.
- */
- private List<Arc<Label>> outgoingArcs;
-
-
- // ===============================================================
- // Constructor(s)
- // ===============================================================
-
- /**
- * Constructs a new node with the specified numeric identifier.
- *
- * @param id numeric integer identifier for this node
- */
- public Node(int id) {
- this.id = id;
- this.outgoingArcs = new ArrayList<Arc<Label>>();
- }
-
-
- // ===========================================================
- // Accessor methods (set/get)
- // ===========================================================
-
- /**
- * Gets the numeric integer identifier of this node.
- *
- * @return Numeric integer identifier of this node.
- */
- public int getNumber() {
- return id;
-
- }
-
- public int id() {
- return id;
- }
-
- public void setID(int i) {
- this.id = i;
- }
-
- /**
- * Gets the arcs that begin at this node.
- *
- * @return The arcs that begin at this node.
- */
- public List<Arc<Label>> getOutgoingArcs() {
- return outgoingArcs;
- }
-
- public void setOutgoingArcs(List<Arc<Label>> arcs) {
- outgoingArcs = arcs;
- }
-
- /**
- * Gets an iterable object capable of iterating over all nodes directly reachable from this node.
- * This will be all nodes which are the target of an outgoing arc from this node.
- *
- * @return An iterable object capable of iterating over all nodes directly reachable from this
- * node.
- */
- public Iterable<Node<Label>> reachableNodes() {
- final Iterator<Arc<Label>> arcIterator = outgoingArcs.iterator();
-
- return new Iterable<Node<Label>>() {
- public Iterator<Node<Label>> iterator() {
- return new Iterator<Node<Label>>() {
-
- public boolean hasNext() {
- return arcIterator.hasNext();
- }
-
- public Node<Label> next() {
- return arcIterator.next().getHead();
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
- };
- }
- };
- }
-
-
- /**
- * Adds a new outgoing arc to this node that points to the specified destination. The new arc will
- * have the specified weight and specified label.
- *
- * @param destination Destination node of the new outgoing arc.
- * @param weight Weight of the new outgoing arc.
- * @param label Label of the new outgoing arc.
- */
- public void addArc(Node<Label> destination, float weight, Label label) {
- outgoingArcs.add(new Arc<Label>(this, destination, weight, label));
- }
-
-
- /**
- * Gets the number of outgoing arcs that begin at this node.
- *
- * @return The number of outgoing arcs that begin at this node.
- */
- public int size() {
- return outgoingArcs.size();
- }
-
- @Override
- public String toString() {
- return "Node-" + id;
- }
-
-}
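
Finally, reachableNodes() above wraps the outgoing-arc iterator so callers can walk successor nodes directly; note that it captures a single iterator, so the returned Iterable should be consumed only once. A brief usage sketch:

import joshua.lattice.Node;

public class ReachableNodesSketch {
  public static void main(String[] args) {
    Node<String> root = new Node<String>(0);
    root.addArc(new Node<String>(1), 0.0f, "a");
    root.addArc(new Node<String>(2), 0.0f, "b");

    // Visits the head of each outgoing arc: Node-1, then Node-2.
    for (Node<String> successor : root.reachableNodes())
      System.out.println(successor);

    System.out.println(root.size()); // 2 outgoing arcs
  }
}
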