Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:27:00 UTC
[44/66] [partial] incubator-joshua git commit: JOSHUA-252 Make it possible to use Maven to build Joshua
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/fragmentlm/FragmentLMFF.java b/src/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
deleted file mode 100644
index 0375dc0..0000000
--- a/src/joshua/decoder/ff/fragmentlm/FragmentLMFF.java
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.fragmentlm;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Stack;
-
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.chart_parser.SourcePath;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.ff.StatefulFF;
-import joshua.decoder.ff.state_maintenance.DPState;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.ff.tm.format.HieroFormatReader;
-import joshua.decoder.hypergraph.HGNode;
-import joshua.decoder.hypergraph.HyperEdge;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * Feature function that reads in a list of language model fragments and matches them against the
- * hypergraph. This allows for language model fragment "glue" features, which fire when LM fragments
- * (supplied as input) are assembled. These LM fragments are presumably useful in ensuring
- * grammaticality and can be independent of the translation model fragments.
- *
- * Usage: in the Joshua Configuration file, put
- *
- * feature-function = FragmentLM -lm LM_FRAGMENTS_FILE -map RULE_FRAGMENTS_MAP_FILE
- *
- * LM_FRAGMENTS_FILE is a pointer to a file containing a list of fragments that the feature should look for.
- * The format of the file is one fragment per line in PTB format, e.g.:
- *
- * (S NP (VP (VBD said) SBAR) (. .))
- *
- * RULE_FRAGMENTS_MAP_FILE points to a file that maps fragments to the flattened SCFG rule format
- * that Joshua uses. This mapping is necessary because Joshua's rules have been flattened, meaning
- * that their internal structure has been removed, yet this structure is needed for matching LM
- * fragments. The format of the file is
- *
- * FRAGMENT ||| RULE-TARGET-SIDE
- *
- * for example,
- *
- * (S (NP (DT the) (NN man)) VP .) ||| the man [VP,1] [.,2]
- * (SBAR (IN that) (S (NP (PRP he)) (VP (VBD was) (VB done)))) ||| that he was done
- * (VP (VBD said) SBAR) ||| said SBAR
- *
- * @author Matt Post <po...@cs.jhu.edu>
- */
-public class FragmentLMFF extends StatefulFF {
-
- /*
- * When building a fragment from a rule rooted in the hypergraph, this parameter determines how
- * deep we'll go. Smaller values mean less hypergraph traversal but may also limit the LM
- * fragments that can be fired.
- */
- private int BUILD_DEPTH = 1;
-
- /*
- * The maximum depth of a fragment, defined as the longest path from the fragment root to any of
- * its leaves.
- */
- private int MAX_DEPTH = 0;
-
- /*
- * This is the minimum depth for lexicalized LM fragments. This allows you to easily exclude small
- * depth-one fragments that may be overfit to the training data. A depth of 1 (the default) does
- * not exclude any fragments.
- */
- private int MIN_LEX_DEPTH = 1;
-
- /*
- * Set to true to activate meta-features.
- */
- private boolean OPTS_DEPTH = false;
-
- /*
- * This contains a list of the language model fragments, indexed by LHS.
- */
- private HashMap<String, ArrayList<Tree>> lmFragments = null;
-
- private int numFragments = 0;
-
- /* The location of the file containing the language model fragments */
- private String fragmentLMFile = "";
-
- /**
- * @param weights
- * @param args
- * @param config
- */
- public FragmentLMFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
- super(weights, "FragmentLMFF", args, config);
-
- lmFragments = new HashMap<String, ArrayList<Tree>>();
-
- fragmentLMFile = parsedArgs.get("lm");
- BUILD_DEPTH = Integer.parseInt(parsedArgs.get("build-depth"));
- MAX_DEPTH = Integer.parseInt(parsedArgs.get("max-depth"));
- MIN_LEX_DEPTH = Integer.parseInt(parsedArgs.get("min-lex-depth"));
-
- /* Read in the language model fragments */
- try {
- Collection<Tree> trees = PennTreebankReader.readTrees(fragmentLMFile);
- for (Tree fragment : trees) {
- addLMFragment(fragment);
-
- // System.err.println(String.format("Read fragment: %s",
- // lmFragments.get(lmFragments.size()-1)));
- }
- } catch (IOException e) {
- System.err.println(String.format("* WARNING: couldn't read fragment LM file '%s'",
- fragmentLMFile));
- System.exit(1);
- }
- System.err.println(String.format("FragmentLMFF: Read %d LM fragments from '%s'", numFragments,
- fragmentLMFile));
- }
-
- /**
- * Add the provided fragment to the language model, subject to some filtering.
- *
- * @param fragment
- */
- public void addLMFragment(Tree fragment) {
- if (lmFragments == null)
- return;
-
- int fragmentDepth = fragment.getDepth();
-
- if (MAX_DEPTH != 0 && fragmentDepth > MAX_DEPTH) {
- System.err.println(String.format(" Skipping fragment %s (depth %d > %d)", fragment,
- fragmentDepth, MAX_DEPTH));
- return;
- }
-
- if (MIN_LEX_DEPTH > 1 && fragment.isLexicalized() && fragmentDepth < MIN_LEX_DEPTH) {
- System.err.println(String.format(" Skipping fragment %s (lex depth %d < %d)", fragment,
- fragmentDepth, MIN_LEX_DEPTH));
- return;
- }
-
- if (lmFragments.get(fragment.getRule()) == null)
- lmFragments.put(fragment.getRule(), new ArrayList<Tree>());
- lmFragments.get(fragment.getRule()).add(fragment);
- numFragments++;
- }
-
- /**
- * This function computes the features that fire when the current rule is applied. The features
- * that fire are any LM fragments that match the fragment associated with the current rule. LM
- * fragments may recurse over the tail nodes, following 1-best backpointers until the fragment
- * either matches or fails.
- */
- @Override
- public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
- Sentence sentence, Accumulator acc) {
-
- /*
- * Get the fragment associated with the target side of this rule.
- *
- * This could be done more efficiently. For example, just build the tree fragment once and then
- * pattern match against it. This would circumvent having to build the tree possibly once every
- * time you try to apply a rule.
- */
- Tree baseTree = Tree.buildTree(rule, tailNodes, BUILD_DEPTH);
-
- Stack<Tree> nodeStack = new Stack<Tree>();
- nodeStack.add(baseTree);
- while (!nodeStack.empty()) {
- Tree tree = nodeStack.pop();
- if (tree == null)
- continue;
-
- if (lmFragments.get(tree.getRule()) != null) {
- for (Tree fragment : lmFragments.get(tree.getRule())) {
-// System.err.println(String.format("Does\n %s match\n %s??\n -> %s", fragment, tree,
-// match(fragment, tree)));
-
- if (fragment.getLabel() == tree.getLabel() && match(fragment, tree)) {
-// System.err.println(String.format(" FIRING: matched %s against %s", fragment, tree));
- acc.add(fragment.escapedString(), 1);
- if (OPTS_DEPTH)
- if (fragment.isLexicalized())
- acc.add(String.format("FragmentFF_lexdepth%d", fragment.getDepth()), 1);
- else
- acc.add(String.format("FragmentFF_depth%d", fragment.getDepth()), 1);
- }
- }
- }
-
- // We also need to try matching rules against internal nodes of the fragment corresponding
- // to this rule.
- if (tree.getChildren() != null)
- for (Tree childNode : tree.getChildren()) {
- if (!childNode.isBoundary())
- nodeStack.add(childNode);
- }
- }
-
- return new FragmentState(baseTree);
- }
-
- /**
- * Matches the fragment against the (possibly partially-built) tree.
- *
- * @param fragment the language model fragment
- * @param tree the tree to match against (expanded from the hypergraph)
- * @return true if the fragment matches the tree
- */
- private boolean match(Tree fragment, Tree tree) {
- // System.err.println(String.format("MATCH(%s,%s)", fragment, tree));
-
- /* Make sure the root labels match. */
- if (fragment.getLabel() != tree.getLabel()) {
- return false;
- }
-
- /* Same number of kids? */
- List<Tree> fkids = fragment.getChildren();
- if (fkids.size() > 0) {
- List<Tree> tkids = tree.getChildren();
- if (fkids.size() != tkids.size()) {
- return false;
- }
-
- /* Do the kids match on all labels? */
- for (int i = 0; i < fkids.size(); i++)
- if (fkids.get(i).getLabel() != tkids.get(i).getLabel())
- return false;
-
- /* Recursive match. */
- for (int i = 0; i < fkids.size(); i++) {
- if (!match(fkids.get(i), tkids.get(i)))
- return false;
- }
- }
-
- return true;
- }
-
- @Override
- public DPState computeFinal(HGNode tailNodes, int i, int j, SourcePath sourcePath, Sentence sentence,
- Accumulator acc) {
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public float estimateFutureCost(Rule rule, DPState state, Sentence sentence) {
- // TODO Auto-generated method stub
- return 0;
- }
-
- @Override
- public float estimateCost(Rule rule, Sentence sentence) {
- // TODO Auto-generated method stub
- return 0;
- }
-
- public static void main(String[] args) {
- /* Add an LM fragment, then create a dummy multi-level hypergraph to match the fragment against. */
- // FragmentLMFF fragmentLMFF = new FragmentLMFF(new FeatureVector(), (StateComputer) null, "");
- FragmentLMFF fragmentLMFF = new FragmentLMFF(new FeatureVector(),
- new String[] {"-lm", "test/fragments.txt", "-map", "test/mapping.txt"}, null);
-
- Tree fragment = Tree.fromString("(S NP (VP (VBD \"said\") SBAR) (. \".\"))");
-
- Rule ruleS = new HieroFormatReader()
- .parseLine("[S] ||| the man [VP,1] [.,2] ||| the man [VP,1] [.,2] ||| 0");
- Rule ruleVP = new HieroFormatReader()
- .parseLine("[VP] ||| said [SBAR,1] ||| said [SBAR,1] ||| 0");
- Rule ruleSBAR = new HieroFormatReader()
- .parseLine("[SBAR] ||| that he was done ||| that he was done ||| 0");
- Rule rulePERIOD = new HieroFormatReader().parseLine("[.] ||| . ||| . ||| 0");
-
- ruleS.setOwner(0);
- ruleVP.setOwner(0);
- ruleSBAR.setOwner(0);
- rulePERIOD.setOwner(0);
-
- HyperEdge edgeSBAR = new HyperEdge(ruleSBAR, 0.0f, 0.0f, null, (SourcePath) null);
-
- HGNode nodeSBAR = new HGNode(3, 7, ruleSBAR.getLHS(), null, edgeSBAR, 0.0f);
- ArrayList<HGNode> tailNodesVP = new ArrayList<HGNode>();
- Collections.addAll(tailNodesVP, nodeSBAR);
- HyperEdge edgeVP = new HyperEdge(ruleVP, 0.0f, 0.0f, tailNodesVP, (SourcePath) null);
- HGNode nodeVP = new HGNode(2, 7, ruleVP.getLHS(), null, edgeVP, 0.0f);
-
- HyperEdge edgePERIOD = new HyperEdge(rulePERIOD, 0.0f, 0.0f, null, (SourcePath) null);
- HGNode nodePERIOD = new HGNode(7, 8, rulePERIOD.getLHS(), null, edgePERIOD, 0.0f);
-
- ArrayList<HGNode> tailNodes = new ArrayList<HGNode>();
- Collections.addAll(tailNodes, nodeVP, nodePERIOD);
-
- Tree tree = Tree.buildTree(ruleS, tailNodes, 1);
- boolean matched = fragmentLMFF.match(fragment, tree);
- System.err.println(String.format("Does\n %s match\n %s??\n -> %s", fragment, tree, matched));
- }
-
- /**
- * Maintains the tree built so far as the dynamic-programming state for this feature.
- *
- * @author Matt Post <po...@cs.jhu.edu>
- * @author Juri Ganitkevitch <ju...@cs.jhu.edu>
- */
- public class FragmentState extends DPState {
-
- private Tree tree = null;
-
- public FragmentState(Tree tree) {
- this.tree = tree;
- }
-
- /**
- * Every tree is unique.
- *
- * Some savings could be had here if we grouped together items with the same string.
- */
- @Override
- public int hashCode() {
- return tree.hashCode();
- }
-
- @Override
- public boolean equals(Object other) {
- return (other instanceof FragmentState && this == other);
- }
-
- @Override
- public String toString() {
- return String.format("[FragmentState %s]", tree);
- }
- }
-
-}
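For orientation, the deleted class above can be exercised roughly along the lines of its own main() method. The sketch below is illustrative only: it assumes the usual imports (java.util.ArrayList, java.util.List), the file paths are placeholders, and the rule is one of the examples from the Javadoc; the actual feature firing happens inside compute(), which builds a partial tree for every applied rule and matches it against the stored fragments.

    // Illustrative sketch, adapted from FragmentLMFF.main() above; paths are placeholders.
    FragmentLMFF ff = new FragmentLMFF(new FeatureVector(),
        new String[] {"-lm", "test/fragments.txt", "-map", "test/mapping.txt"}, null);

    // Fragments use PTB notation; quoted leaves are terminals, unquoted leaves are
    // substitution points (nonterminals).
    Tree fragment = Tree.fromString("(S NP (VP (VBD \"said\") SBAR) (. \".\"))");

    // compute() builds a tree for each applied rule (down to BUILD_DEPTH levels) and fires
    // a feature whenever a stored fragment's root label and structure match it.
    Rule ruleVP = new HieroFormatReader()
        .parseLine("[VP] ||| said [SBAR,1] ||| said [SBAR,1] ||| 0");
    List<HGNode> tailNodes = new ArrayList<HGNode>();  // would hold the rule's antecedent nodes
    Tree tree = Tree.buildTree(ruleVP, tailNodes, 1);  // with no tail nodes, this is just the
                                                       // rule's own fragment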
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/fragmentlm/PennTreebankReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/fragmentlm/PennTreebankReader.java b/src/joshua/decoder/ff/fragmentlm/PennTreebankReader.java
deleted file mode 100644
index 6ab52e1..0000000
--- a/src/joshua/decoder/ff/fragmentlm/PennTreebankReader.java
+++ /dev/null
@@ -1,135 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.fragmentlm;
-
-import java.util.*;
-import java.io.*;
-import java.nio.charset.Charset;
-import java.nio.charset.UnsupportedCharsetException;
-
-/**
- * @author Dan Klein
- */
-public class PennTreebankReader {
-
- static class TreeCollection extends AbstractCollection<Tree> {
-
- List<File> files;
- Charset charset;
-
- static class TreeIteratorIterator implements Iterator<Iterator<Tree>> {
- Iterator<File> fileIterator;
- Iterator<Tree> nextTreeIterator;
- Charset charset;
-
- public boolean hasNext() {
- return nextTreeIterator != null;
- }
-
- public Iterator<Tree> next() {
- Iterator<Tree> currentTreeIterator = nextTreeIterator;
- advance();
- return currentTreeIterator;
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
-
- private void advance() {
- nextTreeIterator = null;
- while (nextTreeIterator == null && fileIterator.hasNext()) {
- File file = fileIterator.next();
- // System.out.println(file);
- try {
- nextTreeIterator = new Trees.PennTreeReader(new BufferedReader(new InputStreamReader(
- new FileInputStream(file), this.charset)));
- } catch (FileNotFoundException e) {
- } catch (UnsupportedCharsetException e) {
- throw new Error("Unsupported charset in file " + file.getPath());
- }
- }
- }
-
- TreeIteratorIterator(List<File> files, Charset charset) {
- this.fileIterator = files.iterator();
- this.charset = charset;
- advance();
- }
- }
-
- public Iterator<Tree> iterator() {
- return new ConcatenationIterator<Tree>(new TreeIteratorIterator(files, this.charset));
- }
-
- public int size() {
- int size = 0;
- Iterator<Tree> i = iterator();
- while (i.hasNext()) {
- size++;
- i.next();
- }
- return size;
- }
-
- @SuppressWarnings("unused")
- private List<File> getFilesUnder(String path, FileFilter fileFilter) {
- File root = new File(path);
- List<File> files = new ArrayList<File>();
- addFilesUnder(root, files, fileFilter);
- return files;
- }
-
- private void addFilesUnder(File root, List<File> files, FileFilter fileFilter) {
- if (!fileFilter.accept(root))
- return;
- if (root.isFile()) {
- files.add(root);
- return;
- }
- if (root.isDirectory()) {
- File[] children = root.listFiles();
- for (int i = 0; i < children.length; i++) {
- File child = children[i];
- addFilesUnder(child, files, fileFilter);
- }
- }
- }
-
- public TreeCollection(String file) throws FileNotFoundException, IOException {
- this.files = new ArrayList<File>();
- this.files.add(new File(file));
- this.charset = Charset.defaultCharset();
- }
- }
-
- public static Collection<Tree> readTrees(String path) throws FileNotFoundException, IOException {
- return new TreeCollection(path);
- }
-
- public static void main(String[] args) {
-/* Collection<Tree> trees = readTrees(args[0], Charset.defaultCharset());
- for (Tree tree : trees) {
- tree = (new Trees.StandardTreeNormalizer()).transformTree(tree);
- System.out.println(Trees.PennTreeRenderer.render(tree));
- }
- */
- }
-
-}
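The reader above is what FragmentLMFF uses to load its fragment file. A minimal usage sketch, assuming the usual imports (java.util.Collection, java.io.IOException) and a placeholder path:

    // Minimal sketch: iterate over the trees in a PTB-format file, one fragment per line.
    try {
      Collection<Tree> trees = PennTreebankReader.readTrees("test/fragments.txt");
      for (Tree fragment : trees) {
        System.err.println("Read fragment: " + fragment);
      }
    } catch (IOException e) {
      System.err.println("couldn't read fragment file");
    }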
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/fragmentlm/Tree.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/fragmentlm/Tree.java b/src/joshua/decoder/ff/fragmentlm/Tree.java
deleted file mode 100644
index b52ccce..0000000
--- a/src/joshua/decoder/ff/fragmentlm/Tree.java
+++ /dev/null
@@ -1,776 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.fragmentlm;
-
-import java.io.IOException;
-import java.io.Serializable;
-import java.io.StringReader;
-import java.util.*;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.ff.fragmentlm.Trees.PennTreeReader;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.hypergraph.HGNode;
-import joshua.decoder.hypergraph.HyperEdge;
-import joshua.decoder.hypergraph.KBestExtractor.DerivationState;
-import joshua.util.io.LineReader;
-
-/**
- * Represents phrase-structure trees, with each node consisting of a label and a list of children.
- * Borrowed from the Berkeley Parser, and extended to allow the representation of tree fragments in
- * addition to complete trees (the BP requires terminals to be immediately governed by a
- * preterminal). To distinguish terminals from nonterminals in fragments, the former must be
- * enclosed in double-quotes when read in.
- *
- * @author Dan Klein
- * @author Matt Post <po...@cs.jhu.edu>
- */
-public class Tree implements Serializable {
-
- private static final long serialVersionUID = 1L;
-
- protected int label;
-
- /* Marks a frontier node as a terminal (as opposed to a nonterminal). */
- boolean isTerminal = false;
-
- /*
- * Marks the root and frontier nodes of a fragment. Useful for denoting fragment derivations in
- * larger trees.
- */
- boolean isBoundary = false;
-
- /* A list of the node's children. */
- List<Tree> children;
-
- /* The maximum distance from the root to any of the frontier nodes. */
- int depth = -1;
-
- /* The number of lexicalized items among the tree's frontier. */
- private int numLexicalItems = -1;
-
- /*
- * This maps the flat right-hand sides of Joshua rules to the tree fragments they were derived
- * from. It is used to look up the fragment that language model fragments should be matched against.
- * For example, if the target (English) side of your rule is
- *
- * [NP,1] said [SBAR,2]
- *
- * we will retrieve the unflattened fragment
- *
- * (S NP (VP (VBD said) SBAR))
- *
- * which presumably was the frontier fragment used to derive the translation rule. With this in
- * hand, we can iterate through our store of language model fragments to match them against this,
- * following tail nodes if necessary.
- */
- public static HashMap<String, String> rulesToFragmentStrings = new HashMap<String, String>();
-
- public Tree(String label, List<Tree> children) {
- setLabel(label);
- this.children = children;
- }
-
- public Tree(String label) {
- setLabel(label);
- this.children = Collections.emptyList();
- }
-
- public Tree(int label2, ArrayList<Tree> newChildren) {
- this.label = label2;
- this.children = newChildren;
- }
-
- public void setChildren(List<Tree> c) {
- this.children = c;
- }
-
- public List<Tree> getChildren() {
- return children;
- }
-
- public int getLabel() {
- return label;
- }
-
- /**
- * Computes the depth-one rule rooted at this node. If the node has no children, null is returned.
- *
- * @return
- */
- public String getRule() {
- if (isLeaf()) {
- return null;
- }
- StringBuilder ruleString = new StringBuilder("(" + Vocabulary.word(getLabel()));
- for (Tree child : getChildren()) {
- ruleString.append(" ").append(Vocabulary.word(child.getLabel()));
- }
- return ruleString.toString();
- }
-
- /*
- * Boundary nodes are used externally to mark merge points between different fragments. This is
- * separate from the internal ( (substitution point) denotation.
- */
- public boolean isBoundary() {
- return isBoundary;
- }
-
- public void setBoundary(boolean b) {
- this.isBoundary = b;
- }
-
- public boolean isTerminal() {
- return isTerminal;
- }
-
- public boolean isLeaf() {
- return getChildren().isEmpty();
- }
-
- public boolean isPreTerminal() {
- return getChildren().size() == 1 && getChildren().get(0).isLeaf();
- }
-
- public List<Tree> getNonterminalYield() {
- List<Tree> yield = new ArrayList<Tree>();
- appendNonterminalYield(this, yield);
- return yield;
- }
-
- public List<Tree> getYield() {
- List<Tree> yield = new ArrayList<Tree>();
- appendYield(this, yield);
- return yield;
- }
-
- public List<Tree> getTerminals() {
- List<Tree> yield = new ArrayList<Tree>();
- appendTerminals(this, yield);
- return yield;
- }
-
- private static void appendTerminals(Tree tree, List<Tree> yield) {
- if (tree.isLeaf()) {
- yield.add(tree);
- return;
- }
- for (Tree child : tree.getChildren()) {
- appendTerminals(child, yield);
- }
- }
-
- /**
- * Clone the structure of the tree.
- *
- * @return a cloned tree
- */
- public Tree shallowClone() {
- ArrayList<Tree> newChildren = new ArrayList<Tree>(children.size());
- for (Tree child : children) {
- newChildren.add(child.shallowClone());
- }
-
- Tree newTree = new Tree(label, newChildren);
- newTree.setIsTerminal(isTerminal());
- newTree.setBoundary(isBoundary());
- return newTree;
- }
-
- private void setIsTerminal(boolean terminal) {
- isTerminal = terminal;
- }
-
- private static void appendNonterminalYield(Tree tree, List<Tree> yield) {
- if (tree.isLeaf() && !tree.isTerminal()) {
- yield.add(tree);
- return;
- }
- for (Tree child : tree.getChildren()) {
- appendNonterminalYield(child, yield);
- }
- }
-
- private static void appendYield(Tree tree, List<Tree> yield) {
- if (tree.isLeaf()) {
- yield.add(tree);
- return;
- }
- for (Tree child : tree.getChildren()) {
- appendYield(child, yield);
- }
- }
-
- public List<Tree> getPreTerminalYield() {
- List<Tree> yield = new ArrayList<Tree>();
- appendPreTerminalYield(this, yield);
- return yield;
- }
-
- private static void appendPreTerminalYield(Tree tree, List<Tree> yield) {
- if (tree.isPreTerminal()) {
- yield.add(tree);
- return;
- }
- for (Tree child : tree.getChildren()) {
- appendPreTerminalYield(child, yield);
- }
- }
-
- /**
- * A tree is lexicalized if it has terminal nodes among the leaves of its frontier. For normal
- * trees this is always true since they bottom out in terminals, but for fragments, this may or
- * may not be true.
- */
- public boolean isLexicalized() {
- if (this.numLexicalItems < 0) {
- if (isTerminal())
- this.numLexicalItems = 1;
- else {
- this.numLexicalItems = 0;
- for (Tree child : children)
- if (child.isLexicalized())
- this.numLexicalItems += 1;
- }
- }
-
- return (this.numLexicalItems > 0);
- }
-
- /**
- * The depth of a tree is the maximum distance from the root to any of the frontier nodes.
- *
- * @return the tree depth
- */
- public int getDepth() {
- if (this.depth >= 0)
- return this.depth;
-
- if (isLeaf()) {
- this.depth = 0;
- } else {
- int maxDepth = 0;
- for (Tree child : children) {
- int depth = child.getDepth();
- if (depth > maxDepth)
- maxDepth = depth;
- }
- this.depth = maxDepth + 1;
- }
- return this.depth;
- }
-
- public List<Tree> getAtDepth(int depth) {
- List<Tree> yield = new ArrayList<Tree>();
- appendAtDepth(depth, this, yield);
- return yield;
- }
-
- private static void appendAtDepth(int depth, Tree tree, List<Tree> yield) {
- if (depth < 0)
- return;
- if (depth == 0) {
- yield.add(tree);
- return;
- }
- for (Tree child : tree.getChildren()) {
- appendAtDepth(depth - 1, child, yield);
- }
- }
-
- public void setLabel(String label) {
- if (label.length() >= 3 && label.startsWith("\"") && label.endsWith("\"")) {
- this.isTerminal = true;
- label = label.substring(1, label.length() - 1);
- }
-
- this.label = Vocabulary.id(label);
- }
-
- public String toString() {
- StringBuilder sb = new StringBuilder();
- toStringBuilder(sb);
- return sb.toString();
- }
-
- /**
- * Removes the quotes around terminals. Note that the resulting tree could not be read back
- * in by this class, since unquoted leaves are interpreted as nonterminals.
- *
- * @return
- */
- public String unquotedString() {
- return toString().replaceAll("\"", "");
- }
-
- public String escapedString() {
- return toString().replaceAll(" ", "_");
- }
-
- public void toStringBuilder(StringBuilder sb) {
- if (!isLeaf())
- sb.append('(');
-
- if (isTerminal())
- sb.append(String.format("\"%s\"", Vocabulary.word(getLabel())));
- else
- sb.append(Vocabulary.word(getLabel()));
-
- if (!isLeaf()) {
- for (Tree child : getChildren()) {
- sb.append(' ');
- child.toStringBuilder(sb);
- }
- sb.append(')');
- }
- }
-
- /**
- * Get the set of all subtrees inside the tree by returning a tree rooted at each node. These are
- * <i>not</i> copies, but all share structure. The tree is regarded as a subtree of itself.
- *
- * @return the <code>Set</code> of all subtrees in the tree.
- */
- public Set<Tree> subTrees() {
- return (Set<Tree>) subTrees(new HashSet<Tree>());
- }
-
- /**
- * Get the list of all subtrees inside the tree by returning a tree rooted at each node. These are
- * <i>not</i> copies, but all share structure. The tree is regarded as a subtree of itself.
- *
- * @return the <code>List</code> of all subtrees in the tree.
- */
- public List<Tree> subTreeList() {
- return (List<Tree>) subTrees(new ArrayList<Tree>());
- }
-
- /**
- * Add the set of all subtrees inside a tree (including the tree itself) to the given
- * <code>Collection</code>.
- *
- * @param n A collection of nodes to which the subtrees will be added
- * @return The collection parameter with the subtrees added
- */
- public Collection<Tree> subTrees(Collection<Tree> n) {
- n.add(this);
- List<Tree> kids = getChildren();
- for (Tree kid : kids) {
- kid.subTrees(n);
- }
- return n;
- }
-
- /**
- * Returns an iterator over the nodes of the tree. This method implements the
- * <code>iterator()</code> method required by the <code>Collections</code> interface. It does a
- * preorder (children after node) traversal of the tree. (A possible extension to the class at
- * some point would be to allow different traversal orderings via variant iterators.)
- *
- * @return An iterator over the nodes of the tree
- */
- public TreeIterator iterator() {
- return new TreeIterator();
- }
-
- private class TreeIterator implements Iterator<Tree> {
-
- private List<Tree> treeStack;
-
- private TreeIterator() {
- treeStack = new ArrayList<Tree>();
- treeStack.add(Tree.this);
- }
-
- public boolean hasNext() {
- return (!treeStack.isEmpty());
- }
-
- public Tree next() {
- int lastIndex = treeStack.size() - 1;
- Tree tr = treeStack.remove(lastIndex);
- List<Tree> kids = tr.getChildren();
- // so that we can efficiently use one List, we reverse them
- for (int i = kids.size() - 1; i >= 0; i--) {
- treeStack.add(kids.get(i));
- }
- return tr;
- }
-
- /**
- * Not supported
- */
- public void remove() {
- throw new UnsupportedOperationException();
- }
-
- }
-
- public boolean hasUnaryChain() {
- return hasUnaryChainHelper(this, false);
- }
-
- private boolean hasUnaryChainHelper(Tree tree, boolean unaryAbove) {
- boolean result = false;
- if (tree.getChildren().size() == 1) {
- if (unaryAbove)
- return true;
- else if (tree.getChildren().get(0).isPreTerminal())
- return false;
- else
- return hasUnaryChainHelper(tree.getChildren().get(0), true);
- } else {
- for (Tree child : tree.getChildren()) {
- if (!child.isPreTerminal())
- result = result || hasUnaryChainHelper(child, false);
- }
- }
- return result;
- }
-
- /**
- * Inserts the SOS (and EOS) symbols into a parse tree, attaching them as a left (right) sibling
- * to the leftmost (rightmost) pre-terminal in the tree. This facilitates using trees as language
- * models. The arguments have to be passed in to preserve Java generics, even though this is only
- * ever used with String versions.
- *
- * @param sos presumably "<s>"
- * @param eos presumably "</s>"
- */
- public void insertSentenceMarkers(String sos, String eos) {
- insertSentenceMarker(sos, 0);
- insertSentenceMarker(eos, -1);
- }
-
- public void insertSentenceMarkers() {
- insertSentenceMarker("<s>", 0);
- insertSentenceMarker("</s>", -1);
- }
-
- /**
- *
- * @param symbol
- * @param pos
- */
- private void insertSentenceMarker(String symbol, int pos) {
-
- if (isLeaf() || isPreTerminal())
- return;
-
- List<Tree> children = getChildren();
- int index = (pos == -1) ? children.size() - 1 : pos;
- if (children.get(index).isPreTerminal()) {
- if (pos == -1)
- children.add(new Tree(symbol));
- else
- children.add(pos, new Tree(symbol));
- } else {
- children.get(index).insertSentenceMarker(symbol, pos);
- }
- }
-
- /**
- * This is a convenience function for producing a fragment from its string representation.
- */
- public static Tree fromString(String ptbStr) {
- PennTreeReader reader = new PennTreeReader(new StringReader(ptbStr));
- Tree fragment = reader.next();
- return fragment;
- }
-
- public static Tree getFragmentFromYield(String yield) {
- String fragmentString = rulesToFragmentStrings.get(yield);
- if (fragmentString != null)
- return fromString(fragmentString);
-
- return null;
- }
-
- public static void readMapping(String fragmentMappingFile) {
- /* Read in the rule / fragments mapping */
- try {
- LineReader reader = new LineReader(fragmentMappingFile);
- for (String line : reader) {
- String[] fields = line.split("\\s+\\|{3}\\s+");
- if (fields.length != 2 || !fields[0].startsWith("(")) {
- System.err.println(String.format("* WARNING: malformed line %d: %s", reader.lineno(),
- line));
- continue;
- }
-
- rulesToFragmentStrings.put(fields[1].trim(), fields[0].trim()); // buildFragment(fields[0]));
- }
- } catch (IOException e) {
- System.err.println(String.format("* WARNING: couldn't read fragment mapping file '%s'",
- fragmentMappingFile));
- System.exit(1);
- }
- System.err.println(String.format("FragmentLMFF: Read %d mappings from '%s'",
- rulesToFragmentStrings.size(), fragmentMappingFile));
- }
-
- /**
- * Builds a tree from the kth-best derivation state. This is done by initializing the tree with
- * the internal fragment corresponding to the rule; this will be the top of the tree. We then
- * recursively visit the derivation state objects, following the route through the hypergraph
- * defined by them.
- *
- * This function is like the other buildTree() function, but that one simply follows the best
- * incoming hyperedge for each node.
- *
- * @param rule
- * @param derivationStates should not be null
- * @param maxDepth
- * @return the constructed tree, or null if the rule has no fragment mapping
- */
- public static Tree buildTree(Rule rule, DerivationState[] derivationStates, int maxDepth) {
- Tree tree = getFragmentFromYield(rule.getEnglishWords());
-
- if (tree == null) {
- return null;
- }
-
- tree = tree.shallowClone();
-
- System.err.println(String.format("buildTree(%s)", tree));
- for (int i = 0; i < derivationStates.length; i++) {
- System.err.println(String.format(" -> %d: %s", i, derivationStates[i]));
- }
-
- List<Tree> frontier = tree.getNonterminalYield();
-
- /* The English side of a rule is a sequence of integers. Nonnegative integers are word
- * indices in the Vocabulary, while negative indices are used for nonterminals. These negative
- * indices are a *permutation* of the source side nonterminals, which contain the actual
- * nonterminal Vocabulary indices for the nonterminal names. Here, we convert this permutation
- * to a nonnegative 0-based permutation and store it in tailIndices. This is used to index
- * the incoming DerivationState items, which are ordered by the source side.
- */
- ArrayList<Integer> tailIndices = new ArrayList<Integer>();
- int[] englishInts = rule.getEnglish();
- for (int i = 0; i < englishInts.length; i++)
- if (englishInts[i] < 0)
- tailIndices.add(-(englishInts[i] + 1));
-
- /*
- * We now have the tree's yield. The substitution points on the yield should match the
- * nonterminals of the heads of the derivation states. Since we don't know which of the tree's
- * frontier items are terminals and which are nonterminals, we walk through the tail nodes,
- * and then match the label of each against the frontier node labels until we have a match.
- */
- // System.err.println(String.format("WORDS: %s\nTREE: %s", rule.getEnglishWords(), tree));
- for (int i = 0; i < derivationStates.length; i++) {
-
- Tree frontierTree = frontier.get(tailIndices.get(i));
- frontierTree.setBoundary(true);
-
- HyperEdge nextEdge = derivationStates[i].edge;
- if (nextEdge != null) {
- DerivationState[] nextStates = null;
- if (nextEdge.getTailNodes() != null && nextEdge.getTailNodes().size() > 0) {
- nextStates = new DerivationState[nextEdge.getTailNodes().size()];
- for (int j = 0; j < nextStates.length; j++)
- nextStates[j] = derivationStates[i].getChildDerivationState(nextEdge, j);
- }
- Tree childTree = buildTree(nextEdge.getRule(), nextStates, maxDepth - 1);
-
- /* This can be null if there is no entry for the rule in the map */
- if (childTree != null)
- frontierTree.children = childTree.children;
- } else {
- frontierTree.children = tree.children;
- }
- }
-
- return tree;
- }
-
- /**
- * Builds a tree from the kth-best derivation state. This is done by initializing the tree with
- * the internal fragment corresponding to the rule; this will be the top of the tree. We then
- * recursively visit the derivation state objects, following the route through the hypergraph
- * defined by them.
- *
- * This function is like the other buildTree() function, but that one simply follows the best
- * incoming hyperedge for each node.
- *
- * @param derivationState
- * @param maxDepth
- * @return the constructed tree, or null if the rule has no fragment mapping
- */
- public static Tree buildTree(DerivationState derivationState, int maxDepth) {
- Rule rule = derivationState.edge.getRule();
-
- Tree tree = getFragmentFromYield(rule.getEnglishWords());
-
- if (tree == null) {
- return null;
- }
-
- tree = tree.shallowClone();
-
- System.err.println(String.format("buildTree(%s)", tree));
-
- if (rule.getArity() > 0 && maxDepth > 0) {
- List<Tree> frontier = tree.getNonterminalYield();
-
- /* The English side of a rule is a sequence of integers. Nonnegative integers are word
- * indices in the Vocabulary, while negative indices are used for nonterminals. These negative
- * indices are a *permutation* of the source side nonterminals, which contain the actual
- * nonterminal Vocabulary indices for the nonterminal names. Here, we convert this permutation
- * to a nonnegative 0-based permutation and store it in tailIndices. This is used to index
- * the incoming DerivationState items, which are ordered by the source side.
- */
- ArrayList<Integer> tailIndices = new ArrayList<Integer>();
- int[] englishInts = rule.getEnglish();
- for (int i = 0; i < englishInts.length; i++)
- if (englishInts[i] < 0)
- tailIndices.add(-(englishInts[i] + 1));
-
- /*
- * We now have the tree's yield. The substitution points on the yield should match the
- * nonterminals of the heads of the derivation states. Since we don't know which of the tree's
- * frontier items are terminals and which are nonterminals, we walk through the tail nodes,
- * and then match the label of each against the frontier node labels until we have a match.
- */
- // System.err.println(String.format("WORDS: %s\nTREE: %s", rule.getEnglishWords(), tree));
- for (int i = 0; i < rule.getArity(); i++) {
-
- Tree frontierTree = frontier.get(tailIndices.get(i));
- frontierTree.setBoundary(true);
-
- DerivationState childState = derivationState.getChildDerivationState(derivationState.edge, i);
- Tree childTree = buildTree(childState, maxDepth - 1);
-
- /* This can be null if there is no entry for the rule in the map */
- if (childTree != null)
- frontierTree.children = childTree.children;
- }
- }
-
- return tree;
- }
-
- /**
- * Takes a rule and its tail pointers and recursively constructs a tree (up to maxDepth).
- *
- * This could be implemented by using the other buildTree() function and using the 1-best
- * DerivationState.
- *
- * @param rule
- * @param tailNodes
- * @param maxDepth
- * @return the constructed tree
- */
- public static Tree buildTree(Rule rule, List<HGNode> tailNodes, int maxDepth) {
- Tree tree = getFragmentFromYield(rule.getEnglishWords());
-
- if (tree == null) {
- tree = new Tree(String.format("(%s %s)", Vocabulary.word(rule.getLHS()), rule.getEnglishWords()));
- // System.err.println("COULDN'T FIND " + rule.getEnglishWords());
- // System.err.println("RULE " + rule);
- // for (Entry<String, Tree> pair: rulesToFragments.entrySet())
- // System.err.println(" FOUND " + pair.getKey());
-
-// return null;
- } else {
- tree = tree.shallowClone();
- }
-
- if (tree != null && tailNodes != null && tailNodes.size() > 0 && maxDepth > 0) {
- List<Tree> frontier = tree.getNonterminalYield();
-
- ArrayList<Integer> tailIndices = new ArrayList<Integer>();
- int[] englishInts = rule.getEnglish();
- for (int i = 0; i < englishInts.length; i++)
- if (englishInts[i] < 0)
- tailIndices.add(-1 * englishInts[i] - 1);
-
- /*
- * We now have the tree's yield. The substitution points on the yield should match the
- * nonterminals of the tail nodes. Since we don't know which of the tree's frontier items are
- * terminals and which are nonterminals, we walk through the tail nodes, and then match the
- * label of each against the frontier node labels until we have a match.
- */
- // System.err.println(String.format("WORDS: %s\nTREE: %s", rule.getEnglishWords(), tree));
- for (int i = 0; i < tailNodes.size(); i++) {
-
- // String lhs = tailNodes.get(i).getLHS().replaceAll("[\\[\\]]", "");
- // System.err.println(String.format(" %d: %s", i, lhs));
- try {
- Tree frontierTree = frontier.get(tailIndices.get(i).intValue());
- frontierTree.setBoundary(true);
-
- HyperEdge edge = tailNodes.get(i).bestHyperedge;
- if (edge != null) {
- Tree childTree = buildTree(edge.getRule(), edge.getTailNodes(), maxDepth - 1);
- /* This can be null if there is no entry for the rule in the map */
- if (childTree != null)
- frontierTree.children = childTree.children;
- } else {
- frontierTree.children = tree.children;
- }
- } catch (IndexOutOfBoundsException e) {
- System.err.println(String.format("ERROR at index %d", i));
- System.err.println(String.format("RULE: %s TREE: %s", rule.getEnglishWords(), tree));
- System.err.println(" FRONTIER:");
- for (Tree kid : frontier)
- System.err.println(" " + kid);
- e.printStackTrace();
- System.exit(1);
- }
- }
- }
-
- return tree;
- }
-
- public static void main(String[] args) {
- LineReader reader = new LineReader(System.in);
-
- for (String line : reader) {
- try {
- Tree tree = Tree.fromString(line);
- tree.insertSentenceMarkers();
- System.out.println(tree);
- } catch (Exception e) {
- System.out.println("");
- }
- }
-
- /*
- * Tree fragment = Tree
- * .fromString("(TOP (S (NP (DT the) (NN boy)) (VP (VBD ate) (NP (DT the) (NN food)))))");
- * fragment.insertSentenceMarkers("<s>", "</s>");
- *
- * System.out.println(fragment);
- *
- * ArrayList<Tree> trees = new ArrayList<Tree>(); trees.add(Tree.fromString("(NN \"mat\")"));
- * trees.add(Tree.fromString("(S (NP DT NN) VP)"));
- * trees.add(Tree.fromString("(S (NP (DT \"the\") NN) VP)"));
- * trees.add(Tree.fromString("(S (NP (DT the) NN) VP)"));
- *
- * for (Tree tree : trees) { System.out.println(String.format("TREE %s DEPTH %d LEX? %s", tree,
- * tree.getDepth(), tree.isLexicalized())); }
- */
- }
-}
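Two of the static helpers above are the usual entry points for the fragment feature: readMapping() loads the FRAGMENT ||| RULE-TARGET-SIDE file described in FragmentLMFF, and getFragmentFromYield() recovers the unflattened fragment for a rule's target side. A small sketch; the path is a placeholder and the yield must match a key in the mapping file (here, the example from the FragmentLMFF Javadoc):

    // Sketch: recover the fragment behind a flattened rule target side.
    Tree.readMapping("test/mapping.txt");
    Tree fragment = Tree.getFragmentFromYield("the man [VP,1] [.,2]");
    if (fragment != null) {
      System.err.println(String.format("TREE %s DEPTH %d LEX? %s", fragment,
          fragment.getDepth(), fragment.isLexicalized()));
    }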
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/fragmentlm/Trees.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/fragmentlm/Trees.java b/src/joshua/decoder/ff/fragmentlm/Trees.java
deleted file mode 100644
index 94a0f44..0000000
--- a/src/joshua/decoder/ff/fragmentlm/Trees.java
+++ /dev/null
@@ -1,265 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.fragmentlm;
-
-import java.io.IOException;
-import java.io.PushbackReader;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.*;
-
-import joshua.corpus.Vocabulary;
-
-/**
- * Tools for displaying, reading, and modifying trees. Borrowed from the Berkeley Parser.
- *
- * @author Dan Klein
- */
-public class Trees {
-
- public static class PennTreeReader implements Iterator<Tree> {
- public static String ROOT_LABEL = "ROOT";
-
- PushbackReader in;
- Tree nextTree;
-
- public boolean hasNext() {
- return (nextTree != null);
- }
-
- public Tree next() {
- if (!hasNext())
- throw new NoSuchElementException();
- Tree tree = nextTree;
- nextTree = readRootTree();
- // System.out.println(nextTree);
- return tree;
- }
-
- private Tree readRootTree() {
- try {
- readWhiteSpace();
- if (!isLeftParen(peek()))
- return null;
- return readTree(true);
- } catch (IOException e) {
- throw new RuntimeException("Error reading tree.");
- }
- }
-
- private Tree readTree(boolean isRoot) throws IOException {
- if (!isLeftParen(peek())) {
- return readLeaf();
- } else {
- readLeftParen();
- String label = readLabel();
- if (label.length() == 0 && isRoot)
- label = ROOT_LABEL;
- List<Tree> children = readChildren();
- readRightParen();
- return new Tree(label, children);
- }
- }
-
- private String readLabel() throws IOException {
- readWhiteSpace();
- return readText();
- }
-
- private String readText() throws IOException {
- StringBuilder sb = new StringBuilder();
- int ch = in.read();
- while (!isWhiteSpace(ch) && !isLeftParen(ch) && !isRightParen(ch)) {
- sb.append((char) ch);
- ch = in.read();
- }
- in.unread(ch);
- // System.out.println("Read text: ["+sb+"]");
- return sb.toString().intern();
- }
-
- private List<Tree> readChildren() throws IOException {
- readWhiteSpace();
- // if (!isLeftParen(peek()))
- // return Collections.singletonList(readLeaf());
- return readChildList();
- }
-
- private int peek() throws IOException {
- int ch = in.read();
- in.unread(ch);
- return ch;
- }
-
- private Tree readLeaf() throws IOException {
- String label = readText();
- return new Tree(label);
- }
-
- private List<Tree> readChildList() throws IOException {
- List<Tree> children = new ArrayList<Tree>();
- readWhiteSpace();
- while (!isRightParen(peek())) {
- children.add(readTree(false));
- readWhiteSpace();
- }
- return children;
- }
-
- private void readLeftParen() throws IOException {
- // System.out.println("Read left.");
- readWhiteSpace();
- int ch = in.read();
- if (!isLeftParen(ch))
- throw new RuntimeException("Format error reading tree. (leftParen)");
- }
-
- private void readRightParen() throws IOException {
- // System.out.println("Read right.");
- readWhiteSpace();
- int ch = in.read();
-
- if (!isRightParen(ch)) {
- System.out.println((char) ch);
- throw new RuntimeException("Format error reading tree. (rightParen)");
- }
- }
-
- private void readWhiteSpace() throws IOException {
- int ch = in.read();
- while (isWhiteSpace(ch)) {
- ch = in.read();
- }
- in.unread(ch);
- }
-
- private boolean isWhiteSpace(int ch) {
- return (ch == ' ' || ch == '\t' || ch == '\f' || ch == '\r' || ch == '\n');
- }
-
- private boolean isLeftParen(int ch) {
- return ch == '(';
- }
-
- private boolean isRightParen(int ch) {
- return ch == ')';
- }
-
- public void remove() {
- throw new UnsupportedOperationException();
- }
-
- public PennTreeReader(Reader in) {
- this.in = new PushbackReader(in);
- nextTree = readRootTree();
- // System.out.println(nextTree);
- }
- }
-
- /**
- * Renderer for pretty-printing trees according to the Penn Treebank indenting guidelines
- * (multiline). Adapted from code originally written by Dan Klein and modified by Chris Manning.
- */
- public static class PennTreeRenderer {
-
- /**
- * Print the tree as done in Penn Treebank merged files. The formatting should be exactly the
- * same, but we don't print the trailing whitespace found in Penn Treebank trees. The basic
- * deviation from a bracketed indented tree is to in general collapse the printing of adjacent
- * preterminals onto one line of tags and words. Additional complexities are that conjunctions
- * (tag CC) are not collapsed in this way, and that the unlabeled outer brackets are collapsed
- * onto the same line as the next bracket down.
- */
- public static String render(Tree tree) {
- StringBuilder sb = new StringBuilder();
- renderTree(tree, 0, false, false, false, true, sb);
- sb.append('\n');
- return sb.toString();
- }
-
- /**
- * Display a node, implementing Penn Treebank style layout
- */
- private static void renderTree(Tree tree, int indent, boolean parentLabelNull,
- boolean firstSibling, boolean leftSiblingPreTerminal, boolean topLevel, StringBuilder sb) {
- // the condition for staying on the same line in Penn Treebank
- boolean suppressIndent = (parentLabelNull || (firstSibling && tree.isPreTerminal()) || (leftSiblingPreTerminal
- && tree.isPreTerminal()));
- if (suppressIndent) {
- sb.append(' ');
- } else {
- if (!topLevel) {
- sb.append('\n');
- }
- for (int i = 0; i < indent; i++) {
- sb.append(" ");
- }
- }
- if (tree.isLeaf() || tree.isPreTerminal()) {
- renderFlat(tree, sb);
- return;
- }
- sb.append('(');
- sb.append(Vocabulary.word(tree.getLabel())); // the label is a Vocabulary id; print the word
- renderChildren(tree.getChildren(), indent + 1, false, sb);
- sb.append(')');
- }
-
- private static void renderFlat(Tree tree, StringBuilder sb) {
- if (tree.isLeaf()) {
- sb.append(Vocabulary.word(tree.getLabel()));
- return;
- }
- sb.append('(');
- sb.append(Vocabulary.word(tree.getLabel()));
- sb.append(' ');
- sb.append(Vocabulary.word(tree.getChildren().get(0).getLabel()));
- sb.append(')');
- }
-
- private static void renderChildren(List<Tree> children, int indent,
- boolean parentLabelNull, StringBuilder sb) {
- boolean firstSibling = true;
- boolean leftSibIsPreTerm = true; // counts as true at beginning
- for (Tree child : children) {
- renderTree(child, indent, parentLabelNull, firstSibling, leftSibIsPreTerm, false, sb);
- leftSibIsPreTerm = child.isPreTerminal();
- firstSibling = false;
- }
- }
- }
-
- public static void main(String[] args) {
- String ptbTreeString = "((S (NP (DT the) (JJ quick) (JJ brown) (NN fox)) (VP (VBD jumped) (PP (IN over) (NP (DT the) (JJ lazy) (NN dog)))) (. .)))";
-
- if (args.length > 0) {
- String tree = "";
- for (String str : args) {
- tree += " " + str;
- }
- ptbTreeString = tree.substring(1);
- }
-
- PennTreeReader reader = new PennTreeReader(new StringReader(ptbTreeString));
-
- Tree tree = reader.next();
- System.out.println(PennTreeRenderer.render(tree));
- System.out.println(tree);
- }
-}
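The reader/renderer pair above can be used directly, as the main() method shows. A compact sketch with an inline tree string, assuming java.io.StringReader is imported:

    // Sketch: parse a bracketed tree from a string and pretty-print it in PTB layout.
    Trees.PennTreeReader reader = new Trees.PennTreeReader(
        new StringReader("(S (NP (DT the) (NN boy)) (VP (VBD slept)))"));
    Tree tree = reader.next();
    System.out.println(Trees.PennTreeRenderer.render(tree));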
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java b/src/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java
deleted file mode 100644
index 20f29f1..0000000
--- a/src/joshua/decoder/ff/lm/DefaultNGramLanguageModel.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.lm;
-
-import java.util.Arrays;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import joshua.corpus.Vocabulary;
-
-/**
- * This class provides a default implementation for the Equivalent LM State optimization (namely,
- * don't back off anywhere). It also provides some default implementations for more general
- * functions on the interface to fall back to more specific ones (e.g. from ArrayList<Integer> to
- * int[]) and a default implementation for sentenceLogProbability which enumerates the n-grams and
- * calls ngramLogProbability for each of them.
- *
- * @author Zhifei Li, <zh...@gmail.com>
- * @author wren ng thornton <wr...@users.sourceforge.net>
- */
-public abstract class DefaultNGramLanguageModel implements NGramLanguageModel {
-
- /** Logger for this class. */
- private static final Logger logger = Logger.getLogger(DefaultNGramLanguageModel.class.getName());
-
- protected final int ngramOrder;
-
- protected float ceiling_cost = -100;
-
- // ===============================================================
- // Constructors
- // ===============================================================
- public DefaultNGramLanguageModel(int order, float ceiling_cost) {
- this.ngramOrder = order;
- this.ceiling_cost = ceiling_cost;
- }
-
- public DefaultNGramLanguageModel(int order) {
- this.ngramOrder = order;
- }
-
-
- // ===============================================================
- // Attributes
- // ===============================================================
- @Override
- public final int getOrder() {
- return this.ngramOrder;
- }
-
-
- // ===============================================================
- // NGramLanguageModel Methods
- // ===============================================================
-
- @Override
- public boolean registerWord(String token, int id) {
- // No private LM ID mapping, do nothing
- return false;
- }
-
- @Override
- public float sentenceLogProbability(int[] sentence, int order, int startIndex) {
- if (sentence == null) return 0.0f;
- int sentenceLength = sentence.length;
- if (sentenceLength <= 0) return 0.0f;
-
- float probability = 0.0f;
- // partial ngrams at the beginning
- for (int j = startIndex; j < order && j <= sentenceLength; j++) {
- // TODO: startIndex depends on the order, e.g., this.ngramOrder-1 (in SRILM, for a 3-gram LM,
- // start_index=2); other cases need to be checked
- int[] ngram = Arrays.copyOfRange(sentence, 0, j);
- double logProb = ngramLogProbability(ngram, order);
- if (logger.isLoggable(Level.FINE)) {
- String words = Vocabulary.getWords(ngram);
- logger.fine("\tlogp ( " + words + " ) = " + logProb);
- }
- probability += logProb;
- }
-
- // regular-order ngrams
- for (int i = 0; i <= sentenceLength - order; i++) {
- int[] ngram = Arrays.copyOfRange(sentence, i, i + order);
- double logProb = ngramLogProbability(ngram, order);
- if (logger.isLoggable(Level.FINE)) {
- String words = Vocabulary.getWords(ngram);
- logger.fine("\tlogp ( " + words + " ) = " + logProb);
- }
- probability += logProb;
- }
-
- return probability;
- }
-
- @Override
- public float ngramLogProbability(int[] ngram) {
- return this.ngramLogProbability(ngram, this.ngramOrder);
- }
-
- protected abstract float ngramLogProbability_helper(int[] ngram, int order);
-
- @Override
- public float ngramLogProbability(int[] ngram, int order) {
- if (ngram.length > order) {
- throw new RuntimeException("ngram length is greater than the max order");
- }
- // if (ngram.length==1 && "we".equals(Vocabulary.getWord(ngram[0]))) {
- // System.err.println("Something weird is about to happen");
- // }
-
- int historySize = ngram.length - 1;
- if (historySize >= order || historySize < 0) {
- // BUG: use logger or exception. Don't zero default
- throw new RuntimeException("Error: history size is " + historySize);
- // return 0;
- }
- float probability = ngramLogProbability_helper(ngram, order);
- if (probability < ceiling_cost) {
- probability = ceiling_cost;
- }
- return probability;
- }
-}
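To make the decomposition in sentenceLogProbability() above concrete, here is how a four-word sentence would be scored with a trigram model and startIndex = 2 (an illustrative trace, not part of the original file):

    // sentence = [w1 w2 w3 w4], order = 3, startIndex = 2
    //
    // partial n-grams at the start (j = startIndex .. order-1):
    //   logp(w1 w2)
    // full-order n-grams (i = 0 .. length-order):
    //   logp(w1 w2 w3) + logp(w2 w3 w4)
    //
    // Each term is computed by ngramLogProbability(ngram, order), which delegates to
    // ngramLogProbability_helper() and floors the log probability at ceiling_cost
    // (-100 by default).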
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/KenLM.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/KenLM.java b/src/joshua/decoder/ff/lm/KenLM.java
deleted file mode 100644
index 329b631..0000000
--- a/src/joshua/decoder/ff/lm/KenLM.java
+++ /dev/null
@@ -1,224 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.lm;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.ff.lm.NGramLanguageModel;
-import joshua.decoder.ff.state_maintenance.KenLMState;
-
-/**
- * JNI wrapper for KenLM. This version of KenLM supports two use cases, implemented by the separate
- * feature functions KenLMFF and LanguageModelFF. KenLMFF uses the RuleScore() interface in
- * lm/left.hh, returning a state pointer representing the KenLM state, while LanguageModelFF handles
- * state by itself and just passes in the ngrams for scoring.
- *
- * @author Kenneth Heafield
- * @author Matt Post <po...@cs.jhu.edu>
- */
-
-public class KenLM implements NGramLanguageModel, Comparable<KenLM> {
-
- static {
- try {
- System.loadLibrary("ken");
- } catch (UnsatisfiedLinkError e) {
- System.err.println("* FATAL: Can't find libken.so (libken.dylib on OS X) in $JOSHUA/lib");
- System.err.println("* This probably means that the KenLM library didn't compile.");
- System.err.println("* Make sure that BOOST_ROOT is set to the root of your boost");
- System.err.println("* installation (it's not /opt/local/, the default), change to");
- System.err.println("* $JOSHUA, and type 'ant kenlm'. If problems persist, see the");
- System.err.println("* website (joshua-decoder.org).");
- System.exit(1);
- }
- }
-
- private final long pointer;
-
- // this is read from the config file, used to set maximum order
- private final int ngramOrder;
- // inferred from model file (may be larger than ngramOrder)
- private final int N;
- // whether left-state minimization was requested
- private boolean minimizing;
-
- private final static native long construct(String file_name);
-
- private final static native void destroy(long ptr);
-
- private final static native int order(long ptr);
-
- private final static native boolean registerWord(long ptr, String word, int id);
-
- private final static native float prob(long ptr, int words[]);
-
- private final static native float probForString(long ptr, String[] words);
-
- private final static native boolean isKnownWord(long ptr, String word);
-
- private final static native StateProbPair probRule(long ptr, long pool, long words[]);
-
- private final static native float estimateRule(long ptr, long words[]);
-
- private final static native float probString(long ptr, int words[], int start);
-
- public final static native long createPool();
- public final static native void destroyPool(long pointer);
-
- public KenLM(int order, String file_name) {
- ngramOrder = order;
-
- pointer = construct(file_name);
- N = order(pointer);
- }
-
- /**
- * Constructor if order is not known.
- * Order will be inferred from the model.
- */
- public KenLM(String file_name) {
- pointer = construct(file_name);
- N = order(pointer);
- ngramOrder = N;
- }
-
- public void destroy() {
- destroy(pointer);
- }
-
- public int getOrder() {
- return ngramOrder;
- }
-
- public boolean registerWord(String word, int id) {
- return registerWord(pointer, word, id);
- }
-
- public float prob(int[] words) {
- return prob(pointer, words);
- }
-
- /**
- * Query for n-gram probability using strings.
- */
- public float prob(String[] words) {
- return probForString(pointer, words);
- }
-
- // Apparently Zhifei starts some array indices at 1. Change to 0-indexing.
- public float probString(int words[], int start) {
- return probString(pointer, words, start - 1);
- }
-
- /**
- * This function is the bridge to the interface in kenlm/lm/left.hh, which has KenLM score the
- * whole rule. It takes a list of words and states retrieved from tail nodes (nonterminals in the
- * rule). Nonterminals have a negative value so KenLM can distinguish them. A pointer to the
- * sentence's memory pool is passed so KenLM knows which pool to allocate state from. When
- * finished, it returns the updated KenLM state and the LM probability incurred along this rule.
- *
- * @param words the rule's words, with negative values marking tail-node (nonterminal) states
- * @param poolPointer pointer to the KenLM memory pool for the current sentence
- * @return the updated KenLM state and the log probability incurred along the rule
- */
- public StateProbPair probRule(long[] words, long poolPointer) {
-
- StateProbPair pair = null;
- try {
- pair = probRule(pointer, poolPointer, words);
- } catch (NoSuchMethodError e) {
- e.printStackTrace();
- System.exit(1);
- }
-
- return pair;
- }
-
- /**
- * Public-facing function that estimates the cost of a rule, whose value is used for sorting
- * rules during cube pruning.
- *
- * @param words
- * @return the estimated cost of the rule (the (partial) n-gram probabilities of all words in the rule)
- */
- public float estimateRule(long[] words) {
- float estimate = 0.0f;
- try {
- estimate = estimateRule(pointer, words);
- } catch (NoSuchMethodError e) {
- e.printStackTrace();
- System.exit(1);
- }
-
- return estimate;
- }
-
- /**
- * The start symbol for KenLM is Vocabulary.START_SYM.
- */
- public String getStartSymbol() {
- return Vocabulary.START_SYM;
- }
-
- public boolean isKnownWord(String word) {
- return isKnownWord(pointer, word);
- }
-
-
- /**
- * Inner class used to hold the results returned from KenLM with left-state minimization. Note
- * that inner classes have to be static to be accessible from the JNI!
- */
- public static class StateProbPair {
- public KenLMState state = null;
- public float prob = 0.0f;
-
- public StateProbPair(long state, float prob) {
- this.state = new KenLMState(state);
- this.prob = prob;
- }
- }
-
- @Override
- public int compareTo(KenLM other) {
- if (this == other)
- return 0;
- else
- return -1;
- }
-
- /**
- * These functions are used if KenLM is invoked under LanguageModelFF instead of KenLMFF.
- */
- @Override
- public float sentenceLogProbability(int[] sentence, int order, int startIndex) {
- return probString(sentence, startIndex);
- }
-
- @Override
- public float ngramLogProbability(int[] ngram, int order) {
- if (order != N && order != ngram.length)
- throw new RuntimeException("Lower order not supported.");
- return prob(ngram);
- }
-
- @Override
- public float ngramLogProbability(int[] ngram) {
- return prob(ngram);
- }
-}
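
For reference, a minimal sketch of driving this wrapper directly (it is normally instantiated by LanguageModelFF); the model path is a placeholder, and libken must be on java.library.path for the static initializer above to succeed:

    import joshua.decoder.ff.lm.KenLM;

    public class KenLMDemo {
      public static void main(String[] args) {
        // The single-argument constructor infers the order from the model file.
        KenLM lm = new KenLM("/path/to/lm.kenlm");
        System.out.println("order = " + lm.getOrder());

        // String-based query (probForString under the hood).
        float logProb = lm.prob(new String[] { "the", "man", "said" });
        System.out.println("logP = " + logProb);

        System.out.println("'man' known? " + lm.isKnownWord("man"));
        lm.destroy();
      }
    }
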
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/LanguageModelFF.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/LanguageModelFF.java b/src/joshua/decoder/ff/lm/LanguageModelFF.java
deleted file mode 100644
index a002de7..0000000
--- a/src/joshua/decoder/ff/lm/LanguageModelFF.java
+++ /dev/null
@@ -1,520 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.lm;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.HashMap;
-import java.util.LinkedList;
-import java.util.List;
-
-import com.google.common.primitives.Ints;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.decoder.Support;
-import joshua.decoder.chart_parser.SourcePath;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.ff.StatefulFF;
-import joshua.decoder.ff.lm.berkeley_lm.LMGrammarBerkeley;
-import joshua.decoder.ff.lm.KenLM;
-import joshua.decoder.ff.state_maintenance.DPState;
-import joshua.decoder.ff.state_maintenance.NgramDPState;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.hypergraph.HGNode;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * This class performs the following:
- * <ol>
- * <li>Gets the additional LM score due to combinations of small items into larger ones by using
- * rules
- * <li>Gets the LM state
- * <li>Gets the left-side LM state estimation score
- * </ol>
- *
- * @author Matt Post <po...@cs.jhu.edu>
- * @author Juri Ganitkevitch <ju...@cs.jhu.edu>
- * @author Zhifei Li, <zh...@gmail.com>
- */
-public class LanguageModelFF extends StatefulFF {
-
- public static int LM_INDEX = 0;
- private int startSymbolId;
-
- /**
- * N-gram language model. We assume the language model is in ARPA format for equivalent state:
- *
- * <ol>
- * <li>We assume it is a backoff LM: a high-order n-gram implies its lower-order n-grams, and the
- * absence of a low-order n-gram implies the absence of any higher-order n-gram containing it</li>
- * <li>For an n-gram, the existence of a backoff weight implies the existence of a probability.
- * Two ways of dealing with low counts:
- * <ul>
- * <li>SRILM: don't multiply zeros in for unknown words</li>
- * <li>Pharaoh: cap at a minimum score exp(-10), including unknown words</li>
- * </ul>
- * </li>
- * </ol>
- */
- protected NGramLanguageModel languageModel;
-
- /**
- * We always use this n-gram order, even though the underlying LM may support a higher order.
- */
- protected final int ngramOrder;
-
- /*
- * We cache the weight of the feature since there is only one.
- */
- protected float weight;
- protected String type;
- protected String path;
-
- /* Whether this is a class-based LM */
- private boolean isClassLM;
- private ClassMap classMap;
-
- protected class ClassMap {
-
- private final int OOV_id = Vocabulary.getUnknownId();
- private HashMap<Integer, Integer> classMap;
-
- public ClassMap(String file_name) throws IOException {
- this.classMap = new HashMap<Integer, Integer>();
- read(file_name);
- }
-
- public int getClassID(int wordID) {
- return this.classMap.getOrDefault(wordID, OOV_id);
- }
-
- /**
- * Reads a class map from file.
- *
- * @param file_name
- * @throws IOException
- */
- private void read(String file_name) throws IOException {
-
- int lineno = 0;
- for (String line: new joshua.util.io.LineReader(file_name, false)) {
- lineno++;
- String[] lineComp = line.trim().split("\\s+");
- try {
- this.classMap.put(Vocabulary.id(lineComp[0]), Vocabulary.id(lineComp[1]));
- } catch (java.lang.ArrayIndexOutOfBoundsException e) {
- System.err.println(String.format("* WARNING: bad vocab line #%d '%s'", lineno, line));
- }
- }
- }
-
- }
-
- public LanguageModelFF(FeatureVector weights, String[] args, JoshuaConfiguration config) {
- super(weights, String.format("lm_%d", LanguageModelFF.LM_INDEX++), args, config);
-
- this.type = parsedArgs.get("lm_type");
- this.ngramOrder = Integer.parseInt(parsedArgs.get("lm_order"));
- this.path = parsedArgs.get("lm_file");
-
- if (parsedArgs.containsKey("class_map"))
- try {
- this.isClassLM = true;
- this.classMap = new ClassMap(parsedArgs.get("class_map"));
- } catch (IOException e) {
- // TODO Auto-generated catch block
- e.printStackTrace();
- }
-
- // The dense feature initialization hasn't happened yet, so we have to retrieve this as sparse
- this.weight = weights.getSparse(name);
-
- initializeLM();
- }
-
- @Override
- public ArrayList<String> reportDenseFeatures(int index) {
- denseFeatureIndex = index;
-
- ArrayList<String> names = new ArrayList<String>();
- names.add(name);
- return names;
- }
-
- /**
- * Initializes the underlying language model.
- *
- * The backend is selected by the 'lm_type' argument ('kenlm' or 'berkeleylm'), and the model is
- * loaded from the 'lm_file' path.
- */
- protected void initializeLM() {
- if (type.equals("kenlm")) {
- this.languageModel = new KenLM(ngramOrder, path);
-
- } else if (type.equals("berkeleylm")) {
- this.languageModel = new LMGrammarBerkeley(ngramOrder, path);
-
- } else {
- System.err.println(String.format("* FATAL: Invalid backend lm_type '%s' for LanguageModel", type));
- System.err.println(String.format("* Permissible values for 'lm_type' are 'kenlm' and 'berkeleylm'"));
- System.exit(-1);
- }
-
- Vocabulary.registerLanguageModel(this.languageModel);
- Vocabulary.id(config.default_non_terminal);
-
- startSymbolId = Vocabulary.id(Vocabulary.START_SYM);
- }
-
- public NGramLanguageModel getLM() {
- return this.languageModel;
- }
-
- public String logString() {
- if (languageModel != null)
- return String.format("%s, order %d (weight %.3f)", name, languageModel.getOrder(), weight);
- else
- return "WHOA";
- }
-
- /**
- * Computes the features incurred along this edge. Note that the accumulated values are unweighted
- * feature costs; they are not the model cost (the inner product of feature values and weights).
- */
- @Override
- public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
- Sentence sentence, Accumulator acc) {
-
- NgramDPState newState = null;
- if (rule != null) {
- if (config.source_annotations) {
- // Get source side annotations and project them to the target side
- newState = computeTransition(getTags(rule, i, j, sentence), tailNodes, acc);
- }
- else {
- if (this.isClassLM) {
- // Use a class language model
- // Return target side classes
- newState = computeTransition(getClasses(rule), tailNodes, acc);
- }
- else {
- // Default LM
- newState = computeTransition(rule.getEnglish(), tailNodes, acc);
- }
- }
-
- }
-
- return newState;
- }
-
- /**
- * Input sentences can be tagged with information specific to the language model. This looks for
- * such annotations by following a word's alignments back to the source words, checking for
- * annotations, and replacing the surface word if such annotations are found.
- *
- */
- protected int[] getTags(Rule rule, int begin, int end, Sentence sentence) {
- /* Very important to make a copy here, so the original rule is not modified */
- int[] tokens = Arrays.copyOf(rule.getEnglish(), rule.getEnglish().length);
- byte[] alignments = rule.getAlignment();
-
-// System.err.println(String.format("getTags() %s", rule.getRuleString()));
-
- /* For each target-side token, project it to each of its source-language alignments. If any of those
- * are annotated, take the first annotation and quit.
- */
- if (alignments != null) {
- for (int i = 0; i < tokens.length; i++) {
- if (tokens[i] > 0) { // skip nonterminals
- for (int j = 0; j < alignments.length; j += 2) {
- if (alignments[j] == i) {
- String annotation = sentence.getAnnotation((int)alignments[i] + begin, "class");
- if (annotation != null) {
-// System.err.println(String.format(" word %d source %d abs %d annotation %d/%s",
-// i, alignments[i], alignments[i] + begin, annotation, Vocabulary.word(annotation)));
- tokens[i] = Vocabulary.id(annotation);
- break;
- }
- }
- }
- }
- }
- }
-
- return tokens;
- }
-
- /**
- * Sets the class map if this is a class-based LM.
- * @param fileName path to the class map file
- * @throws IOException
- */
- public void setClassMap(String fileName) throws IOException {
- this.classMap = new ClassMap(fileName);
- }
-
-
- /**
- * Replace each word in a rule with the target side classes.
- */
- protected int[] getClasses(Rule rule) {
- if (this.classMap == null) {
- System.err.println("The class map is not set. Cannot use the class LM ");
- System.exit(2);
- }
- /* Very important to make a copy here, so the original rule is not modified */
- int[] tokens = Arrays.copyOf(rule.getEnglish(), rule.getEnglish().length);
- for (int i = 0; i < tokens.length; i++) {
- if (tokens[i] > 0 ) {
- tokens[i] = this.classMap.getClassID(tokens[i]);
- }
- }
- return tokens;
- }
-
- @Override
- public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath sourcePath, Sentence sentence,
- Accumulator acc) {
- return computeFinalTransition((NgramDPState) tailNode.getDPState(stateIndex), acc);
- }
-
- /**
- * This function computes all the complete n-grams found in the rule, as well as the incomplete
- * n-grams on the left-hand side.
- */
- @Override
- public float estimateCost(Rule rule, Sentence sentence) {
-
- float estimate = 0.0f;
- boolean considerIncompleteNgrams = true;
-
- int[] enWords = rule.getEnglish();
-
- List<Integer> words = new ArrayList<Integer>();
- boolean skipStart = (enWords[0] == startSymbolId);
-
- /*
- * Move through the words, accumulating language model costs each time we have an n-gram (n >=
- * 2), and resetting the series of words when we hit a nonterminal.
- */
- for (int c = 0; c < enWords.length; c++) {
- int currentWord = enWords[c];
- if (Vocabulary.nt(currentWord)) {
- estimate += scoreChunkLogP(words, considerIncompleteNgrams, skipStart);
- words.clear();
- skipStart = false;
- } else {
- words.add(currentWord);
- }
- }
- estimate += scoreChunkLogP(words, considerIncompleteNgrams, skipStart);
-
- return weight * estimate;
- }
-
- /**
- * Estimates the future cost of a rule. For the language model feature, this is the sum of the
- * costs of the leftmost k-grams, k = [1..n-1].
- */
- @Override
- public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) {
- NgramDPState state = (NgramDPState) currentState;
-
- float estimate = 0.0f;
- int[] leftContext = state.getLeftLMStateWords();
-
- if (null != leftContext) {
- boolean skipStart = true;
- if (leftContext[0] != startSymbolId) {
- skipStart = false;
- }
- estimate += scoreChunkLogP(leftContext, true, skipStart);
- }
- return weight * estimate;
- }
-
- /**
- * Compute the cost of a rule application. The cost of applying a rule is computed by determining
- * the n-gram costs for all n-grams created by this rule application, and summing them. N-grams
- * are created when (a) terminal words in the rule string are followed by a nonterminal (b)
- * terminal words in the rule string are preceded by a nonterminal (c) we encounter adjacent
- * nonterminals. In all of these situations, the corresponding boundary words of the node in the
- * hypergraph represented by the nonterminal must be retrieved.
- *
- * IMPORTANT: only complete n-grams are scored. This means that hypotheses with fewer words
- * than the complete n-gram state remain *unscored*. This fact adds a lot of complication to the
- * code, including the use of the computeFinal* family of functions, which correct this fact for
- * sentences that are too short on the final transition.
- */
- private NgramDPState computeTransition(int[] enWords, List<HGNode> tailNodes, Accumulator acc) {
-
- int[] current = new int[this.ngramOrder];
- int[] shadow = new int[this.ngramOrder];
- int ccount = 0;
- float transitionLogP = 0.0f;
- int[] left_context = null;
-
- for (int c = 0; c < enWords.length; c++) {
- int curID = enWords[c];
-
- if (Vocabulary.nt(curID)) {
- int index = -(curID + 1);
-
- NgramDPState state = (NgramDPState) tailNodes.get(index).getDPState(stateIndex);
- int[] left = state.getLeftLMStateWords();
- int[] right = state.getRightLMStateWords();
-
- // Left context.
- for (int i = 0; i < left.length; i++) {
- current[ccount++] = left[i];
-
- if (left_context == null && ccount == this.ngramOrder - 1)
- left_context = Arrays.copyOf(current, ccount);
-
- if (ccount == this.ngramOrder) {
- // Compute the current word probability, and remove it.
- float prob = this.languageModel.ngramLogProbability(current, this.ngramOrder);
-// System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob));
- transitionLogP += prob;
- System.arraycopy(current, 1, shadow, 0, this.ngramOrder - 1);
- int[] tmp = current;
- current = shadow;
- shadow = tmp;
- --ccount;
- }
- }
- System.arraycopy(right, 0, current, ccount - right.length, right.length);
- } else { // terminal words
- current[ccount++] = curID;
-
- if (left_context == null && ccount == this.ngramOrder - 1)
- left_context = Arrays.copyOf(current, ccount);
-
- if (ccount == this.ngramOrder) {
- // Compute the current word probability, and remove it.
- float prob = this.languageModel.ngramLogProbability(current, this.ngramOrder);
-// System.err.println(String.format("-> prob(%s) = %f", Vocabulary.getWords(current), prob));
- transitionLogP += prob;
- System.arraycopy(current, 1, shadow, 0, this.ngramOrder - 1);
- int[] tmp = current;
- current = shadow;
- shadow = tmp;
- --ccount;
- }
- }
- }
-// acc.add(name, transitionLogP);
- acc.add(denseFeatureIndex, transitionLogP);
-
- if (left_context != null) {
- return new NgramDPState(left_context, Arrays.copyOfRange(current, ccount - this.ngramOrder
- + 1, ccount));
- } else {
- int[] context = Arrays.copyOf(current, ccount);
- return new NgramDPState(context, context);
- }
- }
-
- /**
- * This function differs from regular transitions because we incorporate the cost of incomplete
- * left-hand ngrams, as well as including the start- and end-of-sentence markers (if they were
- * requested when the object was created).
- *
- * @param state the dynamic programming state
- * @return the final dynamic programming state (the incomplete n-gram cost is added to the accumulator)
- */
- private NgramDPState computeFinalTransition(NgramDPState state, Accumulator acc) {
-
-// System.err.println(String.format("LanguageModel::computeFinalTransition()"));
-
- float res = 0.0f;
- LinkedList<Integer> currentNgram = new LinkedList<Integer>();
- int[] leftContext = state.getLeftLMStateWords();
- int[] rightContext = state.getRightLMStateWords();
-
- for (int i = 0; i < leftContext.length; i++) {
- int t = leftContext[i];
- currentNgram.add(t);
-
- if (currentNgram.size() >= 2) { // start from bigram
- float prob = this.languageModel.ngramLogProbability(Support.toArray(currentNgram),
- currentNgram.size());
- res += prob;
- }
- if (currentNgram.size() == this.ngramOrder)
- currentNgram.removeFirst();
- }
-
- // Tell the accumulator
-// acc.add(name, res);
- acc.add(denseFeatureIndex, res);
-
- // State is the same
- return new NgramDPState(leftContext, rightContext);
- }
-
-
- /**
- * Compatibility method for {@link #scoreChunkLogP(int[], boolean, boolean)}
- */
- private float scoreChunkLogP(List<Integer> words, boolean considerIncompleteNgrams,
- boolean skipStart) {
- return scoreChunkLogP(Ints.toArray(words), considerIncompleteNgrams, skipStart);
- }
-
- /**
- * This function is basically a wrapper for NGramLanguageModel::sentenceLogProbability(). It
- * computes the probability of a phrase ("chunk"), using lower-order n-grams for the first n-1
- * words.
- *
- * @param words
- * @param considerIncompleteNgrams
- * @param skipStart
- * @return the phrase log probability
- */
- private float scoreChunkLogP(int[] words, boolean considerIncompleteNgrams,
- boolean skipStart) {
-
- float score = 0.0f;
- if (words.length > 0) {
- int startIndex;
- if (!considerIncompleteNgrams) {
- startIndex = this.ngramOrder;
- } else if (skipStart) {
- startIndex = 2;
- } else {
- startIndex = 1;
- }
- score = this.languageModel.sentenceLogProbability(words, this.ngramOrder, startIndex);
- }
-
- return score;
- }
-
- /**
- * Public method to set LM_INDEX back to 0.
- * Required if multiple instances of the JoshuaDecoder live in the same JVM.
- */
- public static void resetLmIndex() {
- LM_INDEX = 0;
- }
-}
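
Tying the constructor arguments above to configuration: the feature reads lm_type, lm_order, lm_file, and (optionally) class_map from its argument list, so a Joshua configuration line would look roughly like the following (the paths are placeholders):

    feature-function = LanguageModel -lm_type kenlm -lm_order 5 -lm_file /path/to/lm.kenlm

For a class-based LM, a class map is supplied as well; as ClassMap.read() shows, each line of that file is a whitespace-separated pair of word and class, for example:

    man NOUN_CLASS
    said VERB_CLASS
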
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/lm/NGramLanguageModel.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/lm/NGramLanguageModel.java b/src/joshua/decoder/ff/lm/NGramLanguageModel.java
deleted file mode 100644
index 15da650..0000000
--- a/src/joshua/decoder/ff/lm/NGramLanguageModel.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.lm;
-
-/**
- * An interface for new language models to implement. An object of this type is passed to
- * LanguageModelFF, which will handle all the dynamic programming and state maintenance.
- *
- * @author wren ng thornton <wr...@users.sourceforge.net>
- * @author Zhifei Li, <zh...@gmail.com>
- * @author Matt Post <po...@cs.jhu.edu>
- * @author Juri Ganitkevitch <ju...@cs.jhu.edu>
- */
-public interface NGramLanguageModel {
-
- // ===============================================================
- // Attributes
- // ===============================================================
- int getOrder();
-
- // ===============================================================
- // Methods
- // ===============================================================
-
- /**
- * Language models may have their own private vocabulary mapping strings to integers; for example,
- * if they use a compiled format (as KenLM and BerkeleyLM do). This mapping is likely
- * different from the global mapping contained in joshua.corpus.Vocabulary, which is used to
- * convert the input string and grammars. This function is used to tell the language model what
- * the global mapping is, so that the language model can convert it into its own private mapping.
- *
- * @param token the word, as a string from the global vocabulary
- * @param id the word's id in the global vocabulary
- * @return Whether any collisions were detected.
- */
- boolean registerWord(String token, int id);
-
- /**
- * @param sentence the sentence to be scored
- * @param order the order of N-grams for the LM
- * @param startIndex the index of the first word whose probability we want; to score the whole
- * sentence, startIndex should be 1
- * @return the log probability of the sentence (words from startIndex onward)
- */
- float sentenceLogProbability(int[] sentence, int order, int startIndex);
-
- /**
- * Compute the probability of a single word given its context.
- *
- * @param ngram the word and its preceding context
- * @param order the n-gram order to use for the query
- * @return the log probability of the final word given the preceding context
- */
- float ngramLogProbability(int[] ngram, int order);
-
- float ngramLogProbability(int[] ngram);
-}
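
To make the contract concrete, here is a minimal sketch of an implementation of this interface; the class name and the constant per-word score are illustrative only, not part of Joshua:

    package joshua.decoder.ff.lm;

    // Illustrative stub: assigns every word a constant log probability.
    public class ConstantNGramLanguageModel implements NGramLanguageModel {

      private static final float WORD_LOGPROB = -1.0f;
      private final int order;

      public ConstantNGramLanguageModel(int order) {
        this.order = order;
      }

      @Override
      public int getOrder() {
        return order;
      }

      @Override
      public boolean registerWord(String token, int id) {
        return false; // no private vocabulary, so no collisions are possible
      }

      @Override
      public float sentenceLogProbability(int[] sentence, int order, int startIndex) {
        // Sum the per-word scores from startIndex (1-based) to the end of the sentence.
        float total = 0.0f;
        for (int i = startIndex - 1; i < sentence.length; i++)
          total += WORD_LOGPROB;
        return total;
      }

      @Override
      public float ngramLogProbability(int[] ngram, int order) {
        return WORD_LOGPROB;
      }

      @Override
      public float ngramLogProbability(int[] ngram) {
        return ngramLogProbability(ngram, this.order);
      }
    }
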