You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:26:58 UTC
[42/66] [partial] incubator-joshua git commit: JOSHUA-252 Make it
possible to use Maven to build Joshua
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/CreateGlueGrammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/CreateGlueGrammar.java b/src/joshua/decoder/ff/tm/CreateGlueGrammar.java
deleted file mode 100644
index 51e9fc3..0000000
--- a/src/joshua/decoder/ff/tm/CreateGlueGrammar.java
+++ /dev/null
@@ -1,127 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import static joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
-import static joshua.util.FormatUtils.cleanNonTerminal;
-import static joshua.util.FormatUtils.isNonterminal;
-
-import java.io.File;
-import java.io.IOException;
-import java.util.HashSet;
-import java.util.Set;
-import java.util.logging.Logger;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.util.io.LineReader;
-
-import org.kohsuke.args4j.CmdLineException;
-import org.kohsuke.args4j.CmdLineParser;
-import org.kohsuke.args4j.Option;
-
-
-/**
- * Command-line tool that prints glue rules to standard output for every nonterminal symbol found
- * in a grammar. The grammar is either a directory (treated as a packed grammar, whose serialized
- * vocabulary is read) or a file (treated as a text grammar, whose rule left-hand sides are read).
- */
-public class CreateGlueGrammar {
-
-  /** Nonterminal symbols collected from the grammar, in cleaned form. */
-  private final Set<String> nonTerminalSymbols = new HashSet<>();
-  private static final Logger log = Logger.getLogger(CreateGlueGrammar.class.getName());
-
-  @Option(name = "--grammar", aliases = {"-g"}, required = true, usage = "provide grammar to determine list of NonTerminal symbols.")
-  private String grammarPath;
-
-  @Option(name = "--goal", aliases = {"-goal"}, required = false, usage = "specify custom GOAL symbol. Default: 'GOAL'")
-  private String goalSymbol = cleanNonTerminal(new JoshuaConfiguration().goal_symbol);
-
-  /* Rule templates: %1$s is the goal symbol, %2$s is a nonterminal from the grammar. */
-  // [GOAL] ||| <s> ||| <s> ||| 0
-  private static final String R_START = "[%1$s] ||| <s> ||| <s> ||| 0";
-  // [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
-  private static final String R_TWO = "[%1$s] ||| [%1$s,1] [%2$s,2] ||| [%1$s,1] [%2$s,2] ||| -1";
-  // [GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
-  private static final String R_END = "[%1$s] ||| [%1$s,1] </s> ||| [%1$s,1] </s> ||| 0";
-  // [GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0
-  private static final String R_TOP = "[%1$s] ||| <s> [%2$s,1] </s> ||| <s> [%2$s,1] </s> ||| 0";
-
-  /**
-   * Collects the nonterminal symbols of the grammar at {@code grammarPath} and writes the
-   * corresponding glue rules to standard output.
-   *
-   * @throws IOException if the grammar path does not exist or the grammar cannot be read
-   */
-  private void run() throws IOException {
-    final File grammarFile = new File(grammarPath);
-    if (!grammarFile.exists()) {
-      throw new IOException("Grammar file doesn't exist: " + grammarPath);
-    }
-
-    if (grammarFile.isDirectory()) {
-      collectFromPackedGrammar();
-    } else {
-      collectFromTextGrammar();
-    }
-
-    log.info(
-        String.format("%d nonTerminal symbols read: %s",
-            nonTerminalSymbols.size(),
-            nonTerminalSymbols.toString()));
-
-    writeGlueRules();
-  }
-
-  /** Reads the serialized vocabulary of a packed grammar and keeps every cleaned nonterminal. */
-  private void collectFromPackedGrammar() throws IOException {
-    Vocabulary.read(new File(grammarPath + File.separator + VOCABULARY_FILENAME));
-    for (int i = 0; i < Vocabulary.size(); ++i) {
-      final String token = Vocabulary.word(i);
-      if (isNonterminal(token)) {
-        nonTerminalSymbols.add(cleanNonTerminal(token));
-      }
-    }
-  }
-
-  /** Reads a text grammar line by line and keeps the text between the first '[' and first ']'. */
-  private void collectFromTextGrammar() throws IOException {
-    final LineReader reader = new LineReader(grammarPath);
-    try {
-      while (reader.hasNext()) {
-        final String line = reader.next();
-        final int lhsStart = line.indexOf("[") + 1;
-        final int lhsEnd = line.indexOf("]");
-        // lhsStart < 1: no '[' at all. lhsEnd <= lhsStart: no ']', a ']' that precedes the '['
-        // (substring would throw), or an empty "[]" that would yield an unusable symbol.
-        if (lhsStart < 1 || lhsEnd <= lhsStart) {
-          log.info(String.format("malformed rule: %s\n", line));
-          continue;
-        }
-        nonTerminalSymbols.add(line.substring(lhsStart, lhsEnd));
-      }
-    } finally {
-      // Release the underlying file handle even if reading fails part-way through.
-      reader.close();
-    }
-  }
-
-  /** Writes the start rule, the binary rules, the end rule and the top rules to stdout. */
-  private void writeGlueRules() {
-    System.out.println(String.format(R_START, goalSymbol));
-
-    for (String nt : nonTerminalSymbols) {
-      System.out.println(String.format(R_TWO, goalSymbol, nt));
-    }
-
-    System.out.println(String.format(R_END, goalSymbol));
-
-    for (String nt : nonTerminalSymbols) {
-      System.out.println(String.format(R_TOP, goalSymbol, nt));
-    }
-  }
-
-  /**
-   * Parses the command line and runs the tool. On a command-line error the usage is printed to
-   * standard error and the process exits with status 1.
-   *
-   * @param args command-line arguments (see the {@code @Option} fields)
-   * @throws IOException if the grammar cannot be read
-   */
-  public static void main(String[] args) throws IOException {
-    final CreateGlueGrammar glueCreator = new CreateGlueGrammar();
-    final CmdLineParser parser = new CmdLineParser(glueCreator);
-
-    try {
-      parser.parseArgument(args);
-      glueCreator.run();
-    } catch (CmdLineException e) {
-      log.info(e.toString());
-      parser.printUsage(System.err);
-      System.exit(1);
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/Grammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/Grammar.java b/src/joshua/decoder/ff/tm/Grammar.java
deleted file mode 100644
index a834442..0000000
--- a/src/joshua/decoder/ff/tm/Grammar.java
+++ /dev/null
@@ -1,140 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import java.util.List;
-
-import joshua.decoder.ff.FeatureFunction;
-
-/**
- * Grammar is an interface for wrapping a trie of TrieGrammar in order to store holistic metadata.
- *
- * @author wren ng thornton <wr...@users.sourceforge.net>
- * @author Zhifei Li, <zh...@gmail.com>
- */
-public interface Grammar {
-
- /**
- * Gets the root of the <code>Trie</code> backing this grammar.
- * <p>
- * <em>Note</em>: This method should run as a small constant-time function.
- *
- * @return the root of the <code>Trie</code> backing this grammar
- */
- Trie getTrieRoot();
-
- /**
- * After calling this method, the rules in this grammar are guaranteed to be sorted based on the
- * latest feature function values.
- * <p>
- * Cube-pruning requires that the grammar be sorted based on the latest feature functions.
- *
- * @param models the feature functions whose latest values determine the sort order
- */
- void sortGrammar(List<FeatureFunction> models);
-
- /**
- * Determines whether the rules in this grammar have been sorted based on the latest feature
- * function values.
- * <p>
- * This method is needed for the cube-pruning algorithm.
- *
- * @return <code>true</code> if the rules in this grammar have been sorted based on the latest
- * feature function values, <code>false</code> otherwise
- */
- boolean isSorted();
-
- /**
- * Returns whether this grammar has any valid rules for covering a particular span of a sentence.
- * Hiero's "glue" grammar will only say True if the span is longer than our span limit, and is
- * anchored at startIndex==0. Hiero's "regular" grammar will only say True if the span is less
- * than the span limit. Other grammars, e.g. for rule-based systems, may have different behaviors.
- *
- * @param startIndex Indicates the starting index of a phrase in a source input phrase, or a
- * starting node identifier in a source input lattice
- * @param endIndex Indicates the ending index of a phrase in a source input phrase, or an ending
- * node identifier in a source input lattice
- * @param pathLength Length of the input path in a source input lattice. If a source input phrase
- * is used instead of a lattice, this value will likely be ignored by the underlying
- * implementation, but would normally be defined as <code>endIndex-startIndex</code>
- * @return <code>true</code> if this grammar has any valid rules for covering the given span,
- * <code>false</code> otherwise
- */
- boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength);
-
- /**
- * Gets the number of rules stored in the grammar.
- *
- * @return the number of rules stored in the grammar
- */
- int getNumRules();
-
- /**
- * Returns the number of dense features.
- *
- * @return the number of dense features
- */
- int getNumDenseFeatures();
-
- /**
- * This is used to construct a manual rule supported from outside the grammar, but the owner
- * should be the same as the grammar. The rule ID will be the same as OOVRuleId, and there is no
- * lattice cost.
- *
- * @param lhs the left-hand side of the rule
- * @param sourceWords the source side of the rule
- * @param targetWords the target side of the rule
- * @param scores the scores of the rule
- * @param arity the arity of the rule
- * @return the constructed rule
- */
- @Deprecated
- Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores, int arity);
-
- /**
- * Dump the grammar to disk.
- *
- * @param file where to write the grammar (presumably a file path; confirm against
- * implementations)
- */
- @Deprecated
- void writeGrammarOnDisk(String file);
-
- /**
- * This returns true if the grammar contains rules that are regular expressions, possibly matching
- * many different inputs.
- *
- * @return true if the grammar's rules may contain regular expressions.
- */
- boolean isRegexpGrammar();
-
- /**
- * Return the grammar's owner.
- *
- * @return the grammar's owner
- */
- int getOwner();
-
- /**
- * Return the maximum source phrase length (terminals + nonterminals).
- *
- * @return the maximum source phrase length
- */
- int getMaxSourcePhraseLength();
-
- /**
- * Add an OOV rule for the requested word for the grammar.
- *
- * @param word the word to add an OOV rule for
- * @param featureFunctions the feature functions supplied by the caller (how they are used is
- * up to the implementation)
- */
- void addOOVRules(int word, List<FeatureFunction> featureFunctions);
-
- /**
- * Add a rule to the grammar.
- *
- * @param rule the rule to add
- */
- void addRule(Rule rule);
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/GrammarReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/GrammarReader.java b/src/joshua/decoder/ff/tm/GrammarReader.java
deleted file mode 100644
index f94a472..0000000
--- a/src/joshua/decoder/ff/tm/GrammarReader.java
+++ /dev/null
@@ -1,207 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import java.io.IOException;
-import java.util.Iterator;
-import java.util.logging.Level;
-import java.util.logging.Logger;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.Decoder;
-import joshua.util.io.LineReader;
-
-/**
- * This is a base class for simple, ASCII line-based grammars that are stored on disk.
- *
- * @author Juri Ganitkevitch
- *
- */
-public abstract class GrammarReader<R extends Rule> implements Iterable<R>, Iterator<R> {
-
- protected static String fieldDelimiter;
- protected static String nonTerminalRegEx;
- protected static String nonTerminalCleanRegEx;
-
- protected static String description;
-
- protected String fileName;
- protected LineReader reader;
- protected String lookAhead;
- protected int numRulesRead;
-
- private static final Logger logger = Logger.getLogger(GrammarReader.class.getName());
-
- // dummy constructor for
- public GrammarReader() {
- this.fileName = null;
- }
-
- public GrammarReader(String fileName) {
- this.fileName = fileName;
- }
-
- public void initialize() {
- try {
- this.reader = new LineReader(fileName);
- } catch (IOException e) {
- throw new RuntimeException("Error opening translation model file: " + fileName + "\n"
- + (null != e.getMessage() ? e.getMessage() : "No details available. Sorry."), e);
- }
-
- Decoder.LOG(1, String.format("Reading grammar from file %s...", fileName));
- numRulesRead = 0;
- advanceReader();
- }
-
- // the reader is the iterator itself
- public Iterator<R> iterator() {
- return this;
- }
-
- /** Unsupported Iterator method. */
- public void remove() throws UnsupportedOperationException {
- throw new UnsupportedOperationException();
- }
-
- public void close() {
- if (null != this.reader) {
- try {
- this.reader.close();
- } catch (IOException e) {
- // FIXME: is this the right logging level?
- if (logger.isLoggable(Level.WARNING))
- logger.info("Error closing grammar file stream: " + this.fileName);
- }
- this.reader = null;
- }
- }
-
- /**
- * For correct behavior <code>close</code> must be called on every GrammarReader, however this
- * code attempts to avoid resource leaks.
- *
- * @see joshua.util.io.LineReader
- */
- @Override
- protected void finalize() throws Throwable {
- if (this.reader != null) {
- logger.severe("Grammar file stream was not closed, this indicates a coding error: "
- + this.fileName);
- }
-
- this.close();
- super.finalize();
- }
-
- @Override
- public boolean hasNext() {
- return lookAhead != null;
- }
-
- private void advanceReader() {
- try {
- lookAhead = reader.readLine();
- numRulesRead++;
- } catch (IOException e) {
- logger.severe("Error reading grammar from file: " + fileName);
- }
- if (lookAhead == null && reader != null) {
- this.close();
- }
- }
-
- /**
- * Read the next line, and print reader progress.
- */
- @Override
- public R next() {
- String line = lookAhead;
-
- int oldProgress = reader.progress();
- advanceReader();
-
- if (Decoder.VERBOSE >= 1) {
- int newProgress = (reader != null) ? reader.progress() : 100;
-
- if (newProgress > oldProgress) {
- for (int i = oldProgress + 1; i <= newProgress; i++)
- if (i == 97) {
- System.err.print("1");
- } else if (i == 98) {
- System.err.print("0");
- } else if (i == 99) {
- System.err.print("0");
- } else if (i == 100) {
- System.err.println("%");
- } else if (i % 10 == 0) {
- System.err.print(String.format("%d", i));
- System.err.flush();
- } else if ((i - 1) % 10 == 0)
- ; // skip at 11 since 10, 20, etc take two digits
- else {
- System.err.print(".");
- System.err.flush();
- }
- }
- }
- return parseLine(line);
- }
-
- protected abstract R parseLine(String line);
-
- // TODO: keep these around or not?
- public abstract String toWords(R rule);
-
- public abstract String toWordsWithoutFeatureScores(R rule);
-
- /**
- * Removes square brackets (and index, if present) from nonterminal id
- * @param tokenID
- * @return cleaned ID
- */
- public static int cleanNonTerminal(int tokenID) {
- // cleans NT of any markup, e.g., [X,1] may becomes [X], depending
- return Vocabulary.id(cleanNonTerminal(Vocabulary.word(tokenID)));
- }
-
- /**
- * Removes square brackets (and index, if present) from nonterminal id
- * @param token
- * @return cleaned token
- */
- public static String cleanNonTerminal(String token) {
- // cleans NT of any markup, e.g., [X,1] may becomes [X], depending on nonTerminalCleanRegEx
- return token.replaceAll(nonTerminalCleanRegEx, "");
- }
-
- public static boolean isNonTerminal(final String word) {
- // checks if word matches NT regex
- return word.matches(nonTerminalRegEx);
- }
-
- public String getNonTerminalRegEx() {
- return nonTerminalRegEx;
- }
-
- public String getNonTerminalCleanRegEx() {
- return nonTerminalCleanRegEx;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/PhraseRule.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/PhraseRule.java b/src/joshua/decoder/ff/tm/PhraseRule.java
deleted file mode 100644
index 8f5d249..0000000
--- a/src/joshua/decoder/ff/tm/PhraseRule.java
+++ /dev/null
@@ -1,94 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import com.google.common.base.Supplier;
-import com.google.common.base.Suppliers;
-
-/**
- * A class for reading in rules from a Moses phrase table. Most of the conversion work is done
- * in {@link joshua.decoder.ff.tm.format.PhraseFormatReader}. This includes prepending every
- * rule with a nonterminal, so that the phrase-based decoder can assume the same hypergraph
- * format as the hierarchical decoder (by pretending to be a strictly left-branching grammar and
- * dispensing with the notion of coverage spans). However, prepending the nonterminals means all
- * the alignments are off by 1. We do not want to fix those when reading in due to the expense,
- * so instead we use this rule which adjusts the alignments on the fly.
- *
- * Also, we only convert the Moses dense features on the fly, via this class.
- *
- * TODO: this class should also be responsible for prepending the nonterminals.
- *
- * @author Matt Post
- *
- */
-public class PhraseRule extends Rule {
-
-  /** The feature values exactly as passed to the constructor (space-separated numbers). */
-  private final String mosesFeatureString;
-  private final Supplier<byte[]> alignmentSupplier;
-  private final Supplier<String> sparseFeaturesStringSupplier;
-
-  /**
-   * Creates a phrase rule. The feature string and the alignment are converted lazily, on first
-   * use.
-   *
-   * @param lhs left-hand side of the rule
-   * @param french source side of the rule
-   * @param english target side of the rule
-   * @param sparse_features space-separated Moses feature values
-   * @param arity arity of the rule
-   * @param alignment the alignment string, passed on to the parent class
-   */
-  public PhraseRule(int lhs, int[] french, int[] english, String sparse_features, int arity,
-      String alignment) {
-    super(lhs, french, english, null, arity, alignment);
-    this.mosesFeatureString = sparse_features;
-    this.alignmentSupplier = initializeAlignmentSupplier();
-    this.sparseFeaturesStringSupplier = initializeSparseFeaturesStringSupplier();
-  }
-
-  /**
-   * Moses features are probabilities; we need to convert them here by taking the negative log prob.
-   * We do this only when the rule is used to amortize. Values that are not positive are mapped to
-   * -100.
-   */
-  private Supplier<String> initializeSparseFeaturesStringSupplier() {
-    return Suppliers.memoize(() -> {
-      final StringBuilder values = new StringBuilder();
-      // Split on any run of whitespace so repeated or surrounding blanks do not produce empty
-      // tokens that Float.parseFloat would reject.
-      for (String value : mosesFeatureString.trim().split("\\s+")) {
-        final float f = Float.parseFloat(value);
-        // Locale.ROOT keeps '.' as the decimal separator regardless of the default locale, so
-        // the resulting string can always be parsed back as numbers.
-        values.append(
-            String.format(java.util.Locale.ROOT, "%f ", f <= 0.0 ? -100 : -Math.log(f)));
-      }
-      return values.toString().trim();
-    });
-  }
-
-  /**
-   * This is the same as the parent implementation, but we need to add 1 to each alignment
-   * point to account for the nonterminal [X] that was prepended to each rule, and a leading
-   * 0-0 point is added. As in the parent, a missing alignment string yields {@code null}.
-   */
-  private Supplier<byte[]> initializeAlignmentSupplier() {
-    return Suppliers.memoize(() -> {
-      final String alignmentString = getAlignmentString();
-      if (alignmentString == null) {
-        return null;
-      }
-      final String[] tokens = alignmentString.split("[-\\s]+");
-      final byte[] alignmentArray = new byte[tokens.length + 2];
-      alignmentArray[0] = alignmentArray[1] = 0;
-      for (int i = 0; i < tokens.length; i++) {
-        alignmentArray[i + 2] = (byte) (Short.parseShort(tokens[i]) + 1);
-      }
-      return alignmentArray;
-    });
-  }
-
-  @Override
-  public String getFeatureString() {
-    return this.sparseFeaturesStringSupplier.get();
-  }
-
-  @Override
-  public byte[] getAlignment() {
-    return this.alignmentSupplier.get();
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/Rule.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/Rule.java b/src/joshua/decoder/ff/tm/Rule.java
deleted file mode 100644
index 9f1fb8f..0000000
--- a/src/joshua/decoder/ff/tm/Rule.java
+++ /dev/null
@@ -1,606 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Map;
-import java.util.regex.Pattern;
-
-import com.google.common.base.Supplier;
-import com.google.common.base.Suppliers;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.Decoder;
-import joshua.decoder.ff.FeatureFunction;
-import joshua.decoder.ff.FeatureVector;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * This class defines the interface for Rule.
- *
- * All feature scores are interpreted as negative log probabilities, and are therefore negated.
- * Note that not all features need to be negative log probs, but you should be aware that they
- * will be negated, so if you want a positive count, it should come in as negative.
- *
- * @author Zhifei Li, <zh...@gmail.com>
- */
-
-
-/**
- * Normally, the feature score in the rule should be *cost* (i.e., -LogP), so that the feature
- * weight should be positive
- *
- * @author Zhifei Li, <zh...@gmail.com>
- * @author Matt Post <po...@cs.jhu.edu>
- */
-public class Rule implements Comparator<Rule>, Comparable<Rule> {
-
- private int lhs; // tag of this rule
- private int[] pFrench; // pointer to the RuleCollection, as all the rules under it share the same
- // Source side
- protected int arity;
-
- // And a string containing the sparse ones
- //protected final String sparseFeatureString;
- protected final Supplier<String> sparseFeatureStringSupplier;
- private final Supplier<FeatureVector> featuresSupplier;
-
- /*
- * a feature function will be fired for this rule only if the owner of the rule matches the owner
- * of the feature function
- */
- private int owner = -1;
-
- /**
- * This is the cost computed only from the features present with the grammar rule. This cost is
- * needed to sort the rules in the grammar for cube pruning, but isn't the full cost of applying
- * the rule (which will include contextual features that can't be computed until the rule is
- * applied).
- */
- private float estimatedCost = Float.NEGATIVE_INFINITY;
-
- private float precomputableCost = Float.NEGATIVE_INFINITY;
-
- private int[] english;
-
- // The alignment string, e.g., 0-0 0-1 1-1 2-1
- private String alignmentString;
- private final Supplier<byte[]> alignmentSupplier;
-
- /**
- * Constructs a new rule using the provided parameters. Rule id for this rule is
- * undefined. Note that some of the sparse features may be unlabeled, but they cannot be mapped to
- * their default names ("tm_OWNER_INDEX") until later, when we know the owner of the rule. This is
- * not known until the rule is actually added to a grammar in Grammar::addRule().
- *
- * Constructor used by other constructors below;
- *
- * @param lhs Left-hand side of the rule.
- * @param sourceRhs Source language right-hand side of the rule.
- * @param targetRhs Target language right-hand side of the rule.
- * @param sparseFeatures Feature value scores for the rule.
- * @param arity Number of nonterminals in the source language right-hand side.
- * @param owner
- */
- public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, int owner) {
- this.lhs = lhs;
- this.pFrench = sourceRhs;
- this.arity = arity;
- this.owner = owner;
- this.english = targetRhs;
- this.sparseFeatureStringSupplier = Suppliers.memoize(() -> { return sparseFeatures; });
- this.featuresSupplier = initializeFeatureSupplierFromString();
- this.alignmentSupplier = initializeAlignmentSupplier();
- }
-
- /**
- * Constructor used by PackedGrammar's sortRules().
- */
- public Rule(int lhs, int[] sourceRhs, int[] targetRhs, FeatureVector features, int arity, int owner) {
- this.lhs = lhs;
- this.pFrench = sourceRhs;
- this.arity = arity;
- this.owner = owner;
- this.english = targetRhs;
- this.featuresSupplier = Suppliers.memoize(() -> { return features; });
- this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
- this.alignmentSupplier = initializeAlignmentSupplier();
- }
-
- /**
- * Constructor used for SamtFormatReader and GrammarBuilderWalkerFunction's getRuleWithSpans()
- * Owner set to -1
- */
- public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity) {
- this(lhs, sourceRhs, targetRhs, sparseFeatures, arity, -1);
- }
-
- /**
- * Constructor used for addOOVRules(), HieroFormatReader and PhraseRule.
- */
- public Rule(int lhs, int[] sourceRhs, int[] targetRhs, String sparseFeatures, int arity, String alignment) {
- this(lhs, sourceRhs, targetRhs, sparseFeatures, arity);
- this.alignmentString = alignment;
- }
-
- /**
- * Constructor (implicitly) used by PackedRule
- */
- public Rule() {
- this.lhs = -1;
- this.sparseFeatureStringSupplier = initializeSparseFeaturesStringSupplier();
- this.featuresSupplier = initializeFeatureSupplierFromString();
- this.alignmentSupplier = initializeAlignmentSupplier();
- }
-
- // ==========================================================================
- // Lazy loading Suppliers for alignments, feature vector, and feature strings
- // ==========================================================================
-
- private Supplier<byte[]> initializeAlignmentSupplier(){
- return Suppliers.memoize(() ->{
- byte[] alignment = null;
- String alignmentString = getAlignmentString();
- if (alignmentString != null) {
- String[] tokens = alignmentString.split("[-\\s]+");
- alignment = new byte[tokens.length];
- for (int i = 0; i < tokens.length; i++)
- alignment[i] = (byte) Short.parseShort(tokens[i]);
- }
- return alignment;
- });
- }
-
- /**
- * If Rule was constructed with sparseFeatures String, we lazily populate the
- * FeatureSupplier.
- */
- private Supplier<FeatureVector> initializeFeatureSupplierFromString(){
- return Suppliers.memoize(() ->{
- if (owner != -1) {
- return new FeatureVector(getFeatureString(), "tm_" + Vocabulary.word(owner) + "_");
- } else {
- return new FeatureVector();
- }
- });
- }
-
- /**
- * If Rule was constructed with a FeatureVector, we lazily populate the sparseFeaturesStringSupplier.
- */
- private Supplier<String> initializeSparseFeaturesStringSupplier() {
- return Suppliers.memoize(() -> {
- return getFeatureVector().toString();
- });
- }
-
- // ===============================================================
- // Attributes
- // ===============================================================
-
- public void setEnglish(int[] eng) {
- this.english = eng;
- }
-
- public int[] getEnglish() {
- return this.english;
- }
-
- /**
- * Two Rules are equal if they have the same LHS, the same source RHS and the same target
- * RHS.
- *
- * @param o the object to check for equality
- * @return true if o is the same Rule as this rule, false otherwise
- */
- public boolean equals(Object o) {
- if (!(o instanceof Rule)) {
- return false;
- }
- Rule other = (Rule) o;
- if (getLHS() != other.getLHS()) {
- return false;
- }
- if (!Arrays.equals(getFrench(), other.getFrench())) {
- return false;
- }
- if (!Arrays.equals(english, other.getEnglish())) {
- return false;
- }
- return true;
- }
-
- public int hashCode() {
- // I just made this up. If two rules are equal they'll have the
- // same hashcode. Maybe someone else can do a better job though?
- int frHash = Arrays.hashCode(getFrench());
- int enHash = Arrays.hashCode(english);
- return frHash ^ enHash ^ getLHS();
- }
-
- // ===============================================================
- // Attributes
- // ===============================================================
-
- public void setArity(int arity) {
- this.arity = arity;
- }
-
- public int getArity() {
- return this.arity;
- }
-
- public void setOwner(int owner) {
- this.owner = owner;
- }
-
- public int getOwner() {
- return this.owner;
- }
-
- public void setLHS(int lhs) {
- this.lhs = lhs;
- }
-
- public int getLHS() {
- return this.lhs;
- }
-
- public void setFrench(int[] french) {
- this.pFrench = french;
- }
-
- public int[] getFrench() {
- return this.pFrench;
- }
-
- /**
- * This function does the work of turning the string version of the sparse features (passed in
- * when the rule was created) into an actual set of features. This is a bit complicated because we
- * support intermingled labeled and unlabeled features, where the unlabeled features are mapped to
- * a default name template of the form "tm_OWNER_INDEX".
- *
- * This function returns the dense (phrasal) features discovered when the rule was loaded. Dense
- * features are the list of unlabeled features that preceded labeled ones. They can also be
- * specified as labeled features of the form "tm_OWNER_INDEX", but the former format is preferred.
- */
- public FeatureVector getFeatureVector() {
- return featuresSupplier.get();
- }
-
- /**
- * This function returns the estimated cost of a rule, which should have been computed when the
- * grammar was first sorted via a call to Rule::estimateRuleCost(). This function is a getter
- * only; it will not compute the value if it has not already been set. It is necessary in addition
- * to estimateRuleCost(models) because sometimes the value needs to be retrieved from contexts
- * that do not have access to the feature functions.
- *
- * This function is called by the rule comparator when sorting the grammar. As such it may be
- * called many times and any implementation of it should be a cached implementation.
- *
- * @return the estimated cost of the rule (a lower bound on the true cost)
- */
- public float getEstimatedCost() {
- return estimatedCost;
- }
-
- /**
- * Precomputable costs is the inner product of the weights found on each grammar rule and the
- * weight vector. This is slightly different from the estimated rule cost, which can include other
- * features (such as a language model estimate). This getter and setter should also be cached, and
- * is basically provided to allow the PhraseModel feature to cache its (expensive) computation for
- * each rule.
- *
- * @return the precomputable cost of each rule
- */
- public float getPrecomputableCost() {
- return precomputableCost;
- }
-
- public float getDenseFeature(int k) {
- return getFeatureVector().getDense(k);
- }
-
- public void setPrecomputableCost(float[] phrase_weights, FeatureVector weights) {
-   final FeatureVector ruleFeatures = getFeatureVector();
-   float total = 0.0f;
-
-   // Dense part: pair each dense feature with the phrase weight at the same index, stopping as
-   // soon as either the dense features or the phrase weights run out.
-   int d = 0;
-   while (d < ruleFeatures.getDenseFeatures().size() && d < phrase_weights.length) {
-     total += phrase_weights[d] * ruleFeatures.getDense(d);
-     d++;
-   }
-
-   // Sparse part: every sparse feature on the rule is multiplied by the weight of the same name.
-   for (String name : ruleFeatures.getSparseFeatures().keySet()) {
-     total += weights.getSparse(name) * ruleFeatures.getSparse(name);
-   }
-
-   this.precomputableCost = total;
- }
-
- /**
- * This function estimates the cost of a rule, which is used for sorting the rules for cube
- * pruning. The estimated cost is basically the set of precomputable features (features listed
- * along with the rule in the grammar file) along with any other estimates that other features
- * would like to contribute (e.g., a language model estimate). This cost will be a lower bound on
- * the rule's actual cost.
- *
- * The value of this function is used only for sorting the rules. When the rule is later applied
- * in context to particular hypernodes, the rule's actual cost is computed.
- *
- * @param models the list of models available to the decoder
- * @return estimated cost of the rule
- */
- public float estimateRuleCost(List<FeatureFunction> models) {
-   if (models == null) {
-     return 0.0f;
-   }
-
-   // A stored value at negative infinity is treated as "not estimated yet"; any other value is
-   // returned as-is without consulting the models again.
-   final boolean needsEstimate = this.estimatedCost <= Float.NEGATIVE_INFINITY;
-   if (!needsEstimate) {
-     return this.estimatedCost;
-   }
-
-   final boolean trace = Decoder.VERBOSE >= 4;
-   this.estimatedCost = 0.0f; // weights.innerProduct(computeFeatures());
-   if (trace) {
-     System.err.println(String.format("estimateCost(%s ;; %s)", getFrenchWords(), getEnglishWords()));
-   }
-
-   // Sum the estimate contributed by each feature function, accumulating directly in the field.
-   for (FeatureFunction model : models) {
-     final float contribution = model.estimateCost(this, null);
-     if (trace) {
-       System.err.println(String.format(" FEATURE %s -> %.3f", model.getName(), contribution));
-     }
-     this.estimatedCost += contribution;
-   }
-
-   return this.estimatedCost;
- }
-
- // ===============================================================
- // Methods
- // ===============================================================
-
- @Override
- public String toString() {
-   // Layout: LHS ||| source ||| target ||| features ||| est=<cost> pre=<cost>
-   final StringBuilder out = new StringBuilder();
-   out.append(Vocabulary.word(this.getLHS())).append(" ||| ");
-   out.append(getFrenchWords()).append(" ||| ");
-   out.append(getEnglishWords()).append(" |||");
-   out.append(" ").append(getFeatureVector());
-   out.append(String.format(" ||| est=%.3f", getEstimatedCost()));
-   out.append(String.format(" pre=%.3f", getPrecomputableCost()));
-   return out.toString();
- }
-
- /**
-  * Returns a version of the rule suitable for reading in from a text file, in the form
-  * "LHS ||| source ||| target ||| features", followed by " ||| alignment" when an alignment
-  * string is present.
-  *
-  * Source-side nonterminals are numbered 1, 2, ... in order of appearance. Target-side
-  * nonterminals are stored as negative pointers (-1, -2, ...) to the source-side nonterminals, so
-  * each one is resolved through getForeignNonTerminals() and printed with the index of the source
-  * nonterminal it refers to. This keeps the source/target correspondence intact for reordering
-  * rules.
-  *
-  * @return the textual form of this rule
-  */
- public String textFormat() {
-   StringBuilder sb = new StringBuilder();
-   sb.append(Vocabulary.word(this.getLHS()));
-   sb.append(" |||");
-
-   // Source side: negative IDs are nonterminal symbols; tag each with its running index.
-   int nt = 1;
-   for (int id : getFrench()) {
-     if (id < 0)
-       sb.append(" ").append(Vocabulary.word(id).replaceFirst("\\]", String.format(",%d]", nt++)));
-     else
-       sb.append(" ").append(Vocabulary.word(id));
-   }
-   sb.append(" |||");
-
-   // Target side: a negative entry is a pointer into the source nonterminals, not a vocabulary
-   // ID, so look the symbol up on the source side and keep the pointer's own index.
-   int[] foreignNTs = getForeignNonTerminals();
-   for (int id : getEnglish()) {
-     if (id < 0)
-       sb.append(" ").append(
-           Vocabulary.word(foreignNTs[-id - 1]).replaceFirst("\\]", String.format(",%d]", -id)));
-     else
-       sb.append(" ").append(Vocabulary.word(id));
-   }
-   sb.append(" |||");
-   sb.append(" ").append(getFeatureString());
-
-   String alignment = getAlignmentString();
-   if (alignment != null)
-     sb.append(" ||| ").append(alignment);
-   return sb.toString();
- }
-
- public String getFeatureString() {
-   // The textual feature representation is produced by its supplier.
-   final String featureString = sparseFeatureStringSupplier.get();
-   return featureString;
- }
-
- /**
- * Returns an alignment as a sequence of integers. The integers at positions i and i+1 are paired,
- * with position i indexing the source and i+1 the target.
- */
- public byte[] getAlignment() {
-   // The alignment array is produced by its supplier.
-   final byte[] alignment = this.alignmentSupplier.get();
-   return alignment;
- }
-
- public String getAlignmentString() {
-   // May be null; textFormat() checks for that before appending the alignment field.
-   return alignmentString;
- }
-
- /**
- * The nonterminals on the English side are pointers to the source side nonterminals (-1 and -2),
- * rather than being directly encoded. These number indicate the correspondence between the
- * nonterminals on each side, introducing a level of indirection however when we want to resolve
- * them. So to get the ID, we need to look up the corresponding source side ID.
- *
- * @return The string of English words
- */
- public String getEnglishWords() {
- int[] foreignNTs = getForeignNonTerminals();
-
- StringBuilder sb = new StringBuilder();
- for (Integer index : getEnglish()) {
- if (index >= 0)
- sb.append(Vocabulary.word(index) + " ");
- else
- sb.append(Vocabulary.word(foreignNTs[-index - 1]).replace("]",
- String.format(",%d] ", Math.abs(index))));
- }
-
- return sb.toString().trim();
- }
-
- public boolean isTerminal() {
-   // The rule counts as terminal when no target-side entry is negative (i.e. a nonterminal).
-   for (int symbol : getEnglish()) {
-     if (symbol < 0) {
-       return false;
-     }
-   }
-   return true;
- }
-
- /**
-  * Collects the French (source) nonterminals in order of appearance.
-  *
-  * @return an array of size getArity() holding, for each negative entry of the source side, its
-  *         negation (a positive value)
-  */
- public int[] getForeignNonTerminals() {
-   final int[] result = new int[getArity()];
-   int filled = 0;
-   for (int symbol : getFrench()) {
-     if (symbol >= 0) {
-       continue; // terminals are skipped
-     }
-     result[filled] = -symbol;
-     filled++;
-   }
-   return result;
- }
-
- /**
-  * Returns an array of size getArity() containing the source indices of the nonterminals, i.e.
-  * the positions in the source side whose entry is negative, in left-to-right order.
-  */
- public int[] getNonTerminalSourcePositions() {
-   final int[] positions = new int[getArity()];
-   int filled = 0;
-   int sourceIdx = 0;
-   while (sourceIdx < getFrench().length) {
-     if (getFrench()[sourceIdx] < 0) {
-       positions[filled++] = sourceIdx;
-     }
-     sourceIdx++;
-   }
-   return positions;
- }
-
- /**
- * Parses the Alignment byte[] into a Map from target to (possibly a list of) source positions.
- * Used by the WordAlignmentExtractor.
- */
- public Map<Integer, List<Integer>> getAlignmentMap() {
-   final byte[] pairs = getAlignment();
-   final Map<Integer, List<Integer>> targetToSources = new HashMap<Integer, List<Integer>>();
-   if (pairs == null) {
-     return targetToSources;
-   }
-
-   // The array holds (source, target) pairs back to back; walk it two entries at a time.
-   for (int p = 0; p < pairs.length; p += 2) {
-     final int sourcePos = pairs[p];
-     final int targetPos = pairs[p + 1];
-
-     List<Integer> sources = targetToSources.get(targetPos);
-     if (sources == null) {
-       sources = new ArrayList<Integer>();
-       targetToSources.put(targetPos, sources);
-     }
-     sources.add(sourcePos);
-   }
-   return targetToSources;
- }
-
- /**
-  * Returns the English (target) nonterminals in target-side order. Target-side nonterminals are
-  * stored as negative pointers (-1, -2, ...) to the source-side nonterminals, so each one is
-  * resolved through the array returned by getForeignNonTerminals().
-  *
-  * @return an array of size getArity() holding, for each target-side nonterminal, the
-  *         corresponding entry of getForeignNonTerminals()
-  */
- public int[] getEnglishNonTerminals() {
-   int[] nts = new int[getArity()];
-   int[] foreignNTs = getForeignNonTerminals();
-   int index = 0;
-
-   for (int symbol : getEnglish()) {
-     // symbol is the array element itself (a negative pointer), not a position in getEnglish(),
-     // so it is used directly; indexing getEnglish() with it would always be out of bounds.
-     if (symbol < 0)
-       nts[index++] = foreignNTs[Math.abs(symbol) - 1];
-   }
-
-   return nts;
- }
-
- private int[] getNormalizedEnglishNonterminalIndices() {
- int[] result = new int[getArity()];
-
- int ntIndex = 0;
- for (Integer index : getEnglish()) {
- if (index < 0)
- result[ntIndex++] = -index - 1;
- }
-
- return result;
- }
-
- public boolean isInverting() {
-   // Only two-nonterminal rules can invert: they do so when the first target-side nonterminal
-   // points at the second source-side one (zero-based index 1).
-   final int[] order = getNormalizedEnglishNonterminalIndices();
-   return order.length == 2 && order[0] == 1;
- }
-
- public String getFrenchWords() {
-   // The source side is rendered directly from its IDs by the vocabulary.
-   final int[] sourceIds = getFrench();
-   return Vocabulary.getWords(sourceIds);
- }
-
- public static final String NT_REGEX = "\\[[^\\]]+?\\]";
-
- private Pattern getPattern() {
- String source = getFrenchWords();
- String pattern = Pattern.quote(source);
- pattern = pattern.replaceAll(NT_REGEX, "\\\\E.+\\\\Q");
- pattern = pattern.replaceAll("\\\\Q\\\\E", "");
- pattern = "(?:^|\\s)" + pattern + "(?:$|\\s)";
- return Pattern.compile(pattern);
- }
-
- /**
-  * Matches the string representation of the rule's source side against a sentence, with each
-  * nonterminal standing for one or more arbitrary characters.
-  *
-  * @param sentence the sentence whose full source string is searched
-  * @return true if the source-side pattern is found somewhere in the sentence
-  */
- public boolean matches(Sentence sentence) {
-   final String haystack = sentence.fullSource();
-   return getPattern().matcher(haystack).find();
- }
-
- /**
- * This comparator is used for sorting the rules during cube pruning. An estimate of the cost
- * of each rule is computed and used to sort.
- */
- public static Comparator<Rule> EstimatedCostComparator = new Comparator<Rule>() {
-   @Override
-   public int compare(Rule rule1, Rule rule2) {
-     final float first = rule1.getEstimatedCost();
-     final float second = rule2.getEstimatedCost();
-     // Operands are deliberately swapped: the rule with the larger estimate sorts first.
-     return Float.compare(second, first);
-   }
- };
-
- public int compare(Rule rule1, Rule rule2) {
-   // Same ordering as the shared estimated-cost comparator.
-   final int order = EstimatedCostComparator.compare(rule1, rule2);
-   return order;
- }
-
- public int compareTo(Rule other) {
-   // Natural ordering is the estimated-cost ordering, with this rule as the left operand.
-   final int order = EstimatedCostComparator.compare(this, other);
-   return order;
- }
-
- public String getRuleString() {
-   // Compact "LHS -> source ||| target" form, without features or costs.
-   final String lhs = Vocabulary.word(getLHS());
-   final String source = getFrenchWords();
-   final String target = getEnglishWords();
-   return String.format("%s -> %s ||| %s", lhs, source, target);
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/RuleCollection.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/RuleCollection.java b/src/joshua/decoder/ff/tm/RuleCollection.java
deleted file mode 100644
index 6812fd5..0000000
--- a/src/joshua/decoder/ff/tm/RuleCollection.java
+++ /dev/null
@@ -1,72 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import java.util.List;
-
-import joshua.decoder.ff.FeatureFunction;
-
-/**
- * A RuleCollection represents a set of rules that share the same source side (and hence the same
- * arity). These rules are likely stored together in a Trie data structure, although the interface
- * allows any implementation to be used.
- *
- * @author Zhifei Li
- * @author Lane Schwartz
- * @author Matt Post <po...@cs.jhu.edu>
- */
-public interface RuleCollection {
-
- /**
- * Returns true if the rules are sorted. This is used to allow rules to be sorted in an amortized
- * fashion; rather than sorting all trie nodes when the grammar is originally loaded, we sort them
- * only as the decoder actually needs them.
- *
- * @return true if the rules in this collection are currently sorted
- */
- boolean isSorted();
-
- /**
- * Returns a list of the rules, sorting them first if necessary.
- *
- * Implementations of this function should be synchronized.
- *
- * @param models the feature functions passed to the sort (presumably used to estimate each
- * rule's cost; confirm against the implementations)
- * @return the sorted list of rules
- */
- List<Rule> getSortedRules(List<FeatureFunction> models);
-
- /**
- * Gets the list of rules. There are no guarantees about whether they're sorted or not.
- *
- * @return the rules in this collection, in unspecified order
- */
- List<Rule> getRules();
-
- /**
- * Gets the source side for all rules in this RuleCollection. This source side is the same for all
- * the rules in the RuleCollection.
- *
- * @return the (common) source side for all rules in this RuleCollection
- */
- int[] getSourceSide();
-
- /**
- * Gets the number of nonterminals in the source side of the rules in this RuleCollection. The
- * source side is the same for all the rules in the RuleCollection, so the arity will also be the
- * same for all of these rules.
- *
- * @return the (common) number of nonterminals in the source side of the rules in this
- * RuleCollection
- */
- int getArity();
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java b/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
deleted file mode 100644
index d540727..0000000
--- a/src/joshua/decoder/ff/tm/SentenceFilteredGrammar.java
+++ /dev/null
@@ -1,373 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map.Entry;
-
-import joshua.decoder.ff.tm.hash_based.ExtensionIterator;
-import joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
-import joshua.decoder.segment_file.Sentence;
-
-/**
- * This class implements dynamic sentence-level filtering. This is accomplished with a parallel
- * trie, a subset of the original trie, that only contains trie paths that are reachable from
- * traversals of the current sentence.
- *
- * @author Matt Post <po...@cs.jhu.edu>
- */
-public class SentenceFilteredGrammar extends MemoryBasedBatchGrammar {
- private AbstractGrammar baseGrammar;
- private SentenceFilteredTrie filteredTrie;
- private int[] tokens;
- private Sentence sentence;
-
- /**
- * Construct a new sentence-filtered grammar. The main work is done in the enclosed trie (obtained
- * from the base grammar, which contains the complete grammar).
- *
- * @param baseGrammar
- * @param sentence
- */
- SentenceFilteredGrammar(AbstractGrammar baseGrammar, Sentence sentence) {
- super(baseGrammar.joshuaConfiguration);
- this.baseGrammar = baseGrammar;
- this.sentence = sentence;
- this.tokens = sentence.getWordIDs();
-
- int origCount = getNumRules(baseGrammar.getTrieRoot());
- long startTime = System.currentTimeMillis();
-
- /* Filter the rules; returns non-null object */
- this.filteredTrie = filter(baseGrammar.getTrieRoot());
- int filteredCount = getNumRules();
-
- float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
-
- System.err.println(String.format(
- "Sentence-level filtering of sentence %d (%d -> %d rules) in %.3f seconds", sentence.id(),
- origCount, filteredCount, seconds));
- }
-
- @Override
- public Trie getTrieRoot() {
-   // Expose the sentence-filtered trie, not the base grammar's full trie.
-   return this.filteredTrie;
- }
-
- /**
- * This function is poorly named: it doesn't mean whether a rule exists in the grammar for the
- * current span, but whether the grammar is permitted to apply rules to the current span (a
- * grammar-level parameter). As such we can just chain to the underlying grammar.
- */
- @Override
- public boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength) {
-   // Delegates unchanged to the wrapped base grammar.
-   final boolean permitted = baseGrammar.hasRuleForSpan(startIndex, endIndex, pathLength);
-   return permitted;
- }
-
- @Override
- public int getNumRules() {
-   // Count from whatever getTrieRoot() currently exposes (the filtered trie).
-   final Trie root = getTrieRoot();
-   return getNumRules(root);
- }
-
- /**
- * A convenience function that counts the number of rules in a grammar's trie.
- *
- * @param node
- * @return
- */
- public int getNumRules(Trie node) {
-   if (node == null) {
-     return 0;
-   }
-
-   // Rules stored at this node...
-   int total = 0;
-   if (node.getRuleCollection() != null) {
-     total += node.getRuleCollection().getRules().size();
-   }
-
-   // ...plus, recursively, the rules stored beneath each of its extensions.
-   if (node.getExtensions() != null) {
-     for (Trie extension : node.getExtensions()) {
-       total += getNumRules(extension);
-     }
-   }
-   return total;
- }
-
- @Override
- public Rule constructManualRule(int lhs, int[] sourceWords, int[] targetWords, float[] scores,
- int aritity) {
- // Not implemented for sentence-filtered grammars: every argument is ignored and null is
- // returned, so callers must be prepared for a null rule.
- // TODO Auto-generated method stub
- return null;
- }
-
- @Override
- public boolean isRegexpGrammar() {
- // Always reports false for a sentence-filtered grammar.
- return false;
- }
-
- /**
- * What is the algorithm?
- *
- * Take the first word of the sentence, and start at the root of the trie. There are two things to
- * consider: (a) word matches and (b) nonterminal matches.
- *
- * For a word match, simply follow that arc along the trie. We create a parallel arc in our
- * filtered grammar to represent it. Each arc in the filtered trie knows about its
- * corresponding/underlying node in the unfiltered grammar trie.
- *
- * A nonterminal is always permitted to match. The question then is how much of the input sentence
- * we imagine it consumed. The answer is that it could have been any amount. So the recursive call
- * has to be a set of calls, one each to the next trie node with different lengths of the sentence
- * remaining.
- *
- * A problem occurs when we have multiple sequential nonterminals. For scope-3 grammars, there can
- * be four sequential nonterminals (in the case when they are grounded by terminals on both ends
- * of the nonterminal chain). We'd like to avoid looking at all possible ways to split up the
- * subsequence, because with respect to filtering rules, they are all the same.
- *
- * We accomplish this with the following restriction: for purposes of grammar filtering, only the
- * first in a sequence of nonterminal traversals can consume more than one word. Each of the
- * subsequent ones would have to consume just one word. We then just have to record in the
- * recursive call whether the last traversal was a nonterminal or not.
- *
- * @return the root of the filtered trie
- */
- private SentenceFilteredTrie filter(Trie unfilteredTrieRoot) {
- SentenceFilteredTrie filteredTrieRoot = new SentenceFilteredTrie(unfilteredTrieRoot);
-
- // System.err.println(String.format("FILTERING TO SENTENCE\n %s\n",
- // Vocabulary.getWords(tokens)));
-
- /*
- * The root of the trie is where rule applications start, so we simply try all possible
- * positions in the sentence.
- */
- for (int i = 0; i < tokens.length; i++) {
- filter(i, filteredTrieRoot, false);
- }
-
- return filteredTrieRoot;
- }
-
- /**
- * Matches rules against the sentence. Intelligently handles chains of sequential nonterminals.
- * Marks arcs that are traversable for this sentence.
- *
- * @param i the position in the sentence to start matching
- * @param trie the trie node to match against
- * @param lastWasNT true if the match that brought us here was against a nonterminal
- */
- private void filter(int i, SentenceFilteredTrie trieNode, boolean lastWasNT) {
- if (i >= tokens.length)
- return;
-
- /* Make sure the underlying unfiltered node has children. */
- Trie unfilteredTrieNode = trieNode.unfilteredTrieNode;
- if (unfilteredTrieNode.getChildren() == null) {
- // trieNode.path.retreat();
- return;
- }
-
- /* Match a word */
- Trie trie = unfilteredTrieNode.match(tokens[i]);
- if (trie != null) {
- /*
- * The current filtered node might already have an arc for this label. If so, retrieve it
- * (since we still need to follow it); if not, create it.
- */
- SentenceFilteredTrie nextFilteredTrie = trieNode.match(tokens[i]);
- if (nextFilteredTrie == null) {
- nextFilteredTrie = new SentenceFilteredTrie(trie);
- trieNode.children.put(tokens[i], nextFilteredTrie);
- }
-
- /*
- * Now continue, trying to match the child node against the next position in the sentence. The
- * third argument records that this match was not against a nonterminal.
- */
- filter(i + 1, nextFilteredTrie, false);
- }
-
- /*
- * Now we attempt to match nonterminals. Any nonterminal is permitted to match any region of the
- * sentence, up to the maximum span for that grammar. So we enumerate all children of the
- * current (unfiltered) trie grammar node, looking for nonterminals (items whose label value is
- * less than 0), then recurse.
- *
- * There is one subtlely. Adjacent nonterminals in a grammar rule can match a span (i, j) in (j
- * - i - 1) ways, but for purposes of determining whether a rule fits, this is all wasted
- * effort. To handle this, we allow the first nonterminal in a sequence to record 1, 2, 3, ...
- * terminals (up to the grammar's span limit, or the rest of the sentence, whichever is
- * shorter). Subsequent adjacent nonterminals are permitted to consume only a single terminal.
- */
- HashMap<Integer, ? extends Trie> children = unfilteredTrieNode.getChildren();
- if (children != null) {
- for (int label : children.keySet()) {
- if (label < 0) {
- SentenceFilteredTrie nextFilteredTrie = trieNode.match(label);
- if (nextFilteredTrie == null) {
- nextFilteredTrie = new SentenceFilteredTrie(unfilteredTrieNode.match(label));
- trieNode.children.put(label, nextFilteredTrie);
- }
-
- /*
- * Recurse. If the last match was a nonterminal, we can only consume one more token.
- *
- * TODO: This goes too far by looking at the whole sentence; each grammar has a maximum
- * span limit which should be consulted. What we should be doing is passing the point
- * where we started matching the current sentence, so we can apply this span limit, which
- * is easily accessible (baseGrammar.spanLimit).
- */
- int maxJ = lastWasNT ? (i + 1) : tokens.length;
- for (int j = i + 1; j <= maxJ; j++) {
- filter(j, nextFilteredTrie, true);
- }
- }
- }
- }
- }
-
- /**
- * Alternate filter that uses regular expressions, walking the grammar trie and matching the
- * source side of each rule collection against the input sentence. Failed matches are discarded,
- * and trie nodes extending from that position need not be explored.
- *
- * @return the root of the filtered trie if any rules were retained, otherwise null
- */
- @SuppressWarnings("unused")
- private SentenceFilteredTrie filter_regexp(Trie unfilteredTrie) {
- SentenceFilteredTrie trie = null;
-
- /* Case 1: keep the trie node if it has a rule collection that matches the sentence */
- if (unfilteredTrie.hasRules())
- if (matchesSentence(unfilteredTrie))
- trie = new SentenceFilteredTrie(unfilteredTrie);
- else
- return null;
-
- /* Case 2: keep the trie node if it has children who have valid rule collections */
- if (unfilteredTrie.hasExtensions())
- for (Entry<Integer, ? extends Trie> arc : unfilteredTrie.getChildren().entrySet()) {
- Trie unfilteredChildTrie = arc.getValue();
- SentenceFilteredTrie nextTrie = filter_regexp(unfilteredChildTrie);
- if (nextTrie != null) {
- if (trie == null)
- trie = new SentenceFilteredTrie(unfilteredTrie);
- trie.children.put(arc.getKey(), nextTrie);
- }
- }
-
- return trie;
- }
-
- private boolean matchesSentence(Trie childTrie) {
- Rule rule = childTrie.getRuleCollection().getRules().get(0);
- return rule.matches(sentence);
- }
-
- /**
- * Implements a filtered trie, by sitting on top of a base trie and annotating nodes that match
- * the given input sentence.
- *
- * @author Matt Post <po...@cs.jhu.edu>
- *
- */
- public class SentenceFilteredTrie implements Trie {
-
- /* The underlying unfiltered trie node. */
- private Trie unfilteredTrieNode;
-
- /* The child nodes in the filtered trie. */
- private HashMap<Integer, SentenceFilteredTrie> children = null;
-
- /**
- * Constructor.
- *
- * @param trieRoot
- * @param source
- */
- public SentenceFilteredTrie(Trie unfilteredTrieNode) {
- this.unfilteredTrieNode = unfilteredTrieNode;
- this.children = new HashMap<Integer, SentenceFilteredTrie>();
- }
-
- @Override
- public SentenceFilteredTrie match(int wordID) {
- if (children != null)
- return children.get(wordID);
- return null;
- }
-
- @Override
- public boolean hasExtensions() {
- return children != null;
- }
-
- @Override
- public Collection<SentenceFilteredTrie> getExtensions() {
- if (children != null)
- return children.values();
-
- return null;
- }
-
- @Override
- public HashMap<Integer, SentenceFilteredTrie> getChildren() {
- return children;
- }
-
- @Override
- public boolean hasRules() {
- // Chain to the underlying unfiltered node.
- return unfilteredTrieNode.hasRules();
- }
-
- @Override
- public RuleCollection getRuleCollection() {
- // Chain to the underlying unfiltered node, since the rule collection just varies by target
- // side.
- return unfilteredTrieNode.getRuleCollection();
- }
-
- /**
- * Counts the number of rules.
- *
- * @return the number of rules rooted at this node.
- */
- public int getNumRules() {
- int numRules = 0;
- if (getTrieRoot() != null)
- if (getTrieRoot().getRuleCollection() != null)
- numRules += getTrieRoot().getRuleCollection().getRules().size();
-
- for (SentenceFilteredTrie node : getExtensions())
- numRules += node.getNumRules();
-
- return numRules;
- }
-
- @Override
- public Iterator<Integer> getTerminalExtensionIterator() {
- return new ExtensionIterator(children, true);
- }
-
- @Override
- public Iterator<Integer> getNonterminalExtensionIterator() {
- return new ExtensionIterator(children, false);
- }
- }
-}
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/Trie.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/Trie.java b/src/joshua/decoder/ff/tm/Trie.java
deleted file mode 100644
index df481d6..0000000
--- a/src/joshua/decoder/ff/tm/Trie.java
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-import java.util.Collection;
-import java.util.HashMap;
-import java.util.Iterator;
-
-/**
- * An interface for trie-like data structures.
- *
- * @author wren ng thornton <wr...@users.sourceforge.net>
- * @author Zhifei Li, <zh...@gmail.com>
- */
-public interface Trie {
-
- /**
- * Traverse one ply further down the trie. If there is no match, the result is null.
- *
- * @param wordID the label of the arc to follow
- * @return Child node of this trie, or null if there is no arc with that label
- */
- Trie match(int wordID);
-
-
- /**
- * Returns whether {@link #match(int)} could succeed for any symbol.
- *
- * @return <code>true</code> if {@link #match(int)} could succeed for some symbol,
- * <code>false</code> otherwise
- */
- boolean hasExtensions();
-
-
- /**
- * If the trie node has extensions, then return a list of extended trie nodes, otherwise return
- * null.
- *
- * @return A list of extended <code>Trie</code> nodes if this node has extensions,
- * <code>null</code>
- * otherwise
- */
- Collection<? extends Trie> getExtensions();
-
-
- /**
- * If the trie node has extensions, get them as a map from arc label to child node.
- *
- * @return the map of children keyed by label
- */
- HashMap<Integer,? extends Trie> getChildren();
-
- /**
- * Returns an iterator over the trie node's extensions with terminal labels.
- *
- * @return an iterator over the terminal arc labels
- */
- Iterator<Integer> getTerminalExtensionIterator();
-
- /**
- * Returns an iterator over the trie node's extensions with nonterminal labels.
- *
- * @return an iterator over the nonterminal arc labels
- */
- Iterator<Integer> getNonterminalExtensionIterator();
-
-
- /**
- * Gets whether the current node/state is a "final state" that has matching rules.
- *
- * @return <code>true</code> if the current node/state is a "final state" that has matching rules,
- * <code>false</code> otherwise
- */
- boolean hasRules();
-
-
- /**
- * Retrieve the rules at the current node/state. The implementation of this method must adhere to
- * the following laws:
- *
- * <ol>
- * <li>The return value is always non-null. The collection may be empty however.</li>
- * <li>The collection must be empty if hasRules() is false, and must be non-empty if hasRules() is
- * true.</li>
- * <li>The collection must be sorted (at least as used by TMGrammar)</li>
- * </ol>
- *
- * @return the rule collection stored at this node
- */
- RuleCollection getRuleCollection();
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java b/src/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
deleted file mode 100644
index 71fe6b2..0000000
--- a/src/joshua/decoder/ff/tm/UnsortedRuleCollectionException.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm;
-
-/**
- * Unchecked runtime exception thrown to indicate that a collection of rules has not been properly
- * sorted according to the feature functions in effect.
- *
- * @author Lane Schwartz
- */
-public class UnsortedRuleCollectionException extends RuntimeException {
-
- /* Fixed serialization version identifier for this exception type. */
- private static final long serialVersionUID = -4819014771607378835L;
-
- /**
- * Constructs an <code>UnsortedRuleCollectionException</code> with the specified detail message.
- * The message is passed straight to the <code>RuntimeException</code> constructor.
- *
- * @param message the detail message
- */
- public UnsortedRuleCollectionException(String message) {
- super(message);
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java b/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
deleted file mode 100644
index a47813d..0000000
--- a/src/joshua/decoder/ff/tm/format/HieroFormatReader.java
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm.format;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.ff.tm.GrammarReader;
-import joshua.decoder.ff.tm.Rule;
-
-/**
- * This class implements reading files in the format defined by David Chiang for Hiero. As parsed
- * by {@link #parseLine(String)}, a rule line has the shape
- *
- * <pre>
- * LHS ||| source side ||| target side [||| features [||| alignment]]
- * </pre>
- *
- * where the features and alignment fields are optional.
- *
- * @author Unknown
- * @author Matt Post <po...@cs.jhu.edu>
- */
-
-public class HieroFormatReader extends GrammarReader<Rule> {
-
-  // NOTE(review): these names are assigned without being declared in this class, so they are
-  // presumably static fields inherited from GrammarReader and therefore shared with any other
-  // reader that assigns them -- confirm that readers of different formats are never mixed.
-  static {
-    fieldDelimiter = "\\s\\|{3}\\s";
-    nonTerminalRegEx = "^\\[[^\\s]+\\,[0-9]*\\]$";
-    nonTerminalCleanRegEx = ",[0-9\\s]+";
-    // nonTerminalRegEx = "^\\[[A-Z]+\\,[0-9]*\\]$";
-    // nonTerminalCleanRegEx = "[\\[\\]\\,0-9\\s]+";
-    description = "Original Hiero format";
-  }
-
-  public HieroFormatReader() {
-    super();
-  }
-
-  public HieroFormatReader(String grammarFile) {
-    super(grammarFile);
-  }
-
-  /**
-   * Parses one line of a Hiero grammar into a {@link Rule}.
-   *
-   * @param line a rule with at least the LHS, source-side and target-side fields
-   * @return the rule described by the line
-   * @throws RuntimeException if the line has fewer than three fields
-   */
-  @Override
-  public Rule parseLine(String line) {
-    String[] fields = line.split(fieldDelimiter);
-    if (fields.length < 3) {
-      // Only the first three fields are mandatory; the message now matches the check.
-      throw new RuntimeException(
-          String.format("Rule '%s' does not have at least three fields", line));
-    }
-
-    int lhs = Vocabulary.id(cleanNonTerminal(fields[0]));
-
-    int arity = 0;
-    // foreign side: every nonterminal increases the arity and is stored in its cleaned form
-    String[] foreignWords = fields[1].split("\\s+");
-    int[] french = new int[foreignWords.length];
-    for (int i = 0; i < foreignWords.length; i++) {
-      french[i] = Vocabulary.id(foreignWords[i]);
-      if (Vocabulary.nt(french[i])) {
-        arity++;
-        french[i] = cleanNonTerminal(french[i]);
-      }
-    }
-
-    // English side: nonterminals are stored as the negated target nonterminal index
-    String[] englishWords = fields[2].split("\\s+");
-    int[] english = new int[englishWords.length];
-    for (int i = 0; i < englishWords.length; i++) {
-      english[i] = Vocabulary.id(englishWords[i]);
-      if (Vocabulary.nt(english[i])) {
-        english[i] = -Vocabulary.getTargetNonterminalIndex(english[i]);
-      }
-    }
-
-    // Both trailing fields are optional.
-    String sparse_features = (fields.length > 3 ? fields[3] : "");
-    String alignment = (fields.length > 4) ? fields[4] : null;
-
-    return new Rule(lhs, french, english, sparse_features, arity, alignment);
-  }
-
-  /**
-   * Renders a rule as <code>LHS ||| source ||| target ||| features</code>.
-   *
-   * @param rule the rule to render
-   * @return the textual form of the rule, including its feature vector
-   */
-  @Override
-  public String toWords(Rule rule) {
-    StringBuilder sb = new StringBuilder();
-    sb.append(Vocabulary.word(rule.getLHS()));
-    sb.append(" ||| ");
-    sb.append(Vocabulary.getWords(rule.getFrench()));
-    sb.append(" ||| ");
-    sb.append(Vocabulary.getWords(rule.getEnglish()));
-    sb.append(" |||");
-    sb.append(" " + rule.getFeatureVector());
-
-    return sb.toString();
-  }
-
-  /**
-   * Renders a rule as <code>LHS ||| source ||| target |||</code>, omitting the features.
-   *
-   * @param rule the rule to render
-   * @return the textual form of the rule without feature scores
-   */
-  @Override
-  public String toWordsWithoutFeatureScores(Rule rule) {
-    StringBuilder sb = new StringBuilder();
-    // Print the LHS symbol itself rather than its integer vocabulary id, as toWords does.
-    sb.append(Vocabulary.word(rule.getLHS()));
-    sb.append(" ||| ");
-    sb.append(Vocabulary.getWords(rule.getFrench()));
-    sb.append(" ||| ");
-    sb.append(Vocabulary.getWords(rule.getEnglish()));
-    sb.append(" |||");
-
-    return sb.toString();
-  }
-
-
-  /**
-   * @return the regular expression currently used to split a rule line into fields
-   */
-  public static String getFieldDelimiter() {
-    return fieldDelimiter;
-  }
-
-  /**
-   * Delegates to {@link GrammarReader#isNonTerminal(String)}.
-   *
-   * @param word the token to test
-   * @return whatever the parent class reports for the token
-   */
-  public static boolean isNonTerminal(final String word) {
-    return GrammarReader.isNonTerminal(word);
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/format/PhraseFormatReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/format/PhraseFormatReader.java b/src/joshua/decoder/ff/tm/format/PhraseFormatReader.java
deleted file mode 100644
index be4d522..0000000
--- a/src/joshua/decoder/ff/tm/format/PhraseFormatReader.java
+++ /dev/null
@@ -1,128 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm.format;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.ff.tm.PhraseRule;
-import joshua.util.io.LineReader;
-
-/**
- * This class reads in the Moses phrase table format, with support for the source and target side,
- * list of features, and word alignments. It works by simply casting the phrase-based rules to
- * left-branching hierarchical rules; it extends {@link HieroFormatReader}.
- *
- * There is also a tool to convert the grammars directly, so that they can be suitably packed. Usage:
- *
- * <pre>
- * cat PHRASE_TABLE | java -cp $JOSHUA/class joshua.decoder.ff.tm.format.PhraseFormatReader > grammar
- * </pre>
- *
- * @author Matt Post <po...@cs.jhu.edu>
- *
- */
-
-public class PhraseFormatReader extends HieroFormatReader {
-
-  /* Vocabulary id of "[X]", used as the LHS and as the leading source-side symbol of every rule */
-  private final int lhs;
-
-  /* Whether we are reading a Moses phrase table or Thrax phrase table */
-  private final boolean moses_format;
-
-  /**
-   * Creates a reader over a grammar file.
-   *
-   * @param grammarFile the file to read, passed on to the parent class
-   * @param is_moses true for a Moses phrase table, false for a Thrax phrase table
-   */
-  public PhraseFormatReader(String grammarFile, boolean is_moses) {
-    super(grammarFile);
-    this.lhs = Vocabulary.id("[X]");
-    this.moses_format = is_moses;
-  }
-
-  /**
-   * Creates a reader that is not bound to a file and expects Thrax-style lines.
-   */
-  public PhraseFormatReader() {
-    this(false);
-  }
-
-  /* File-less reader with an explicit input format; used by the command-line converter. */
-  private PhraseFormatReader(boolean is_moses) {
-    super();
-    this.lhs = Vocabulary.id("[X]");
-    this.moses_format = is_moses;
-  }
-
-  /**
-   * When dealing with Moses format, this munges a Moses-style phrase table into a grammar.
-   *
-   *    mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
-   *
-   * becomes
-   *
-   *    [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3 ||| 0-1 1-0
-   *
-   * For thrax-extracted phrasal grammars, it transforms
-   *
-   *    [X] ||| mots francaises ||| French words ||| 1 2 3 ||| 0-1 1-0
-   *
-   * into
-   *
-   *    [X] ||| [X,1] mots francaises ||| [X,1] French words ||| 1 2 3 ||| 0-1 1-0
-   *
-   * @param line one line of the phrase table
-   * @return the phrase rule described by the line
-   * @throws RuntimeException if the line lacks the source, target or feature field
-   */
-  @Override
-  public PhraseRule parseLine(String line) {
-    String[] fields = line.split(fieldDelimiter);
-
-    // Every phrase rule carries exactly one nonterminal: the one prepended below.
-    int arity = 1;
-
-    /* For Thrax phrase-based grammars, skip over the beginning nonterminal */
-    int fieldIndex = moses_format ? 0 : 1;
-
-    // Source, target and features are all read unconditionally below, so fail with a readable
-    // message instead of an ArrayIndexOutOfBoundsException when one of them is missing.
-    if (fields.length < fieldIndex + 3) {
-      throw new RuntimeException(String.format(
-          "Phrase rule '%s' does not have source, target and feature fields", line));
-    }
-
-    // foreign side, prefixed with the [X] nonterminal
-    String[] foreignWords = fields[fieldIndex].split("\\s+");
-    int[] french = new int[foreignWords.length + 1];
-    french[0] = lhs;
-    for (int i = 0; i < foreignWords.length; i++) {
-      french[i+1] = Vocabulary.id(foreignWords[i]);
-    }
-
-    // English side, prefixed with -1 in the slot matching the source-side nonterminal
-    fieldIndex++;
-    String[] englishWords = fields[fieldIndex].split("\\s+");
-    int[] english = new int[englishWords.length + 1];
-    english[0] = -1;
-    for (int i = 0; i < englishWords.length; i++) {
-      english[i+1] = Vocabulary.id(englishWords[i]);
-    }
-
-    // feature values are passed through untouched
-    fieldIndex++;
-    String sparse_features = fields[fieldIndex];
-
-    // alignments (optional)
-    fieldIndex++;
-    String alignment = (fields.length > fieldIndex) ? fields[fieldIndex] : null;
-
-    return new PhraseRule(lhs, french, english, sparse_features, arity, alignment);
-  }
-
-  /**
-   * Converts a Moses phrase table to a Joshua grammar, reading lines from standard input and
-   * writing one converted rule per line to standard output.
-   *
-   * @param args ignored
-   */
-  public static void main(String[] args) {
-    // The input is a Moses phrase table, so the reader must not skip a leading LHS field.
-    PhraseFormatReader reader = new PhraseFormatReader(true);
-    for (String line: new LineReader(System.in)) {
-      PhraseRule rule = reader.parseLine(line);
-      System.out.println(rule.textFormat());
-    }
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/format/SamtFormatReader.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/format/SamtFormatReader.java b/src/joshua/decoder/ff/tm/format/SamtFormatReader.java
deleted file mode 100644
index 6539d38..0000000
--- a/src/joshua/decoder/ff/tm/format/SamtFormatReader.java
+++ /dev/null
@@ -1,136 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm.format;
-
-import java.util.logging.Logger;
-
-import joshua.corpus.Vocabulary;
-import joshua.decoder.ff.tm.Rule;
-import joshua.decoder.ff.tm.GrammarReader;
-
-/**
- * Reads grammar rules written in the SAMT format, whose four fields are separated by
- * <code>#</code>: source side, target side, left-hand side and feature scores.
- */
-public class SamtFormatReader extends GrammarReader<Rule> {
-
-  private static final Logger logger = Logger.getLogger(SamtFormatReader.class.getName());
-
-  /** Prefix that marks a nonterminal in SAMT rules. */
-  private static final String samtNonTerminalMarkup = "@";
-
-  static {
-    fieldDelimiter = "#";
-    nonTerminalRegEx = "^@[^\\s]+";
-    nonTerminalCleanRegEx = ",[0-9\\s]+";
-
-    description = "Original SAMT format";
-  }
-
-  public SamtFormatReader(String grammarFile) {
-    super(grammarFile);
-  }
-
-  // Format example:
-  // @VZ-HD @APPR-DA+ART-DA minutes#@2 protokoll @1#@PP-MO+VZ-HD#0 1 1 -0 0.5 -0
-
-  /**
-   * Parses one SAMT rule line.
-   *
-   * @param line the raw rule line
-   * @return the parsed rule, or <code>null</code> (after logging) when the line does not consist
-   *         of exactly four fields
-   */
-  @Override
-  protected Rule parseLine(String line) {
-    final String[] columns = line.split(fieldDelimiter);
-    if (columns.length != 4) {
-      logger.severe("Rule line does not have four fields: " + line);
-      logger.severe("Skipped.");
-      return null;
-    }
-
-    // The left-hand side lives in the third column and is looked up first.
-    final int lhs = Vocabulary.id(adaptNonTerminalMarkup(columns[2]));
-
-    // Source side: each nonterminal receives the next index, counting from 1.
-    final String[] sourceTokens = columns[0].split("\\s+");
-    final int[] french = new int[sourceTokens.length];
-    int arity = 0;
-    for (int k = 0; k < sourceTokens.length; k++) {
-      final String token = sourceTokens[k];
-      french[k] = isNonTerminal(token)
-          ? Vocabulary.id(adaptNonTerminalMarkup(token, ++arity))
-          : Vocabulary.id(token);
-    }
-
-    // Target side: a nonterminal is stored as the negation of the number following its markup.
-    final String[] targetTokens = columns[1].split("\\s+");
-    final int[] english = new int[targetTokens.length];
-    for (int k = 0; k < targetTokens.length; k++) {
-      final String token = targetTokens[k];
-      english[k] = isNonTerminal(token)
-          ? -Integer.parseInt(cleanSamtNonTerminal(token))
-          : Vocabulary.id(token);
-    }
-
-    // The last column holds the feature scores and is handed over verbatim.
-    return new Rule(lhs, french, english, columns[3], arity);
-  }
-
-  /**
-   * Strips the SAMT nonterminal markup from a token.
-   *
-   * @param word the token to clean
-   * @return the token with every occurrence of the markup removed
-   */
-  protected String cleanSamtNonTerminal(String word) {
-    return word.replace(samtNonTerminalMarkup, "");
-  }
-
-  /**
-   * Rewrites a SAMT nonterminal as a bracketed, Hiero-style symbol without an index.
-   *
-   * @param word the SAMT nonterminal
-   * @return the bracketed symbol
-   */
-  protected String adaptNonTerminalMarkup(String word) {
-    return "[" + escapeSamtLabel(word) + "]";
-  }
-
-  /**
-   * Rewrites a SAMT nonterminal as a bracketed, Hiero-style symbol carrying an index.
-   *
-   * @param word the SAMT nonterminal
-   * @param ntIndex the index to place after the comma
-   * @return the bracketed, indexed symbol
-   */
-  protected String adaptNonTerminalMarkup(String word, int ntIndex) {
-    return "[" + escapeSamtLabel(word) + "," + ntIndex + "]";
-  }
-
-  /** Escapes commas and dollar signs, then drops the SAMT markup. */
-  private static String escapeSamtLabel(String word) {
-    final String noCommas = word.replace(",", "_COMMA_");
-    final String noDollars = noCommas.replace("$", "_DOLLAR_");
-    return noDollars.replace(samtNonTerminalMarkup, "");
-  }
-
-  /**
-   * Renders a rule as <code>LHS ||| source ||| target ||| features</code>.
-   *
-   * @param rule the rule to render
-   * @return the textual form including the feature string
-   */
-  @Override
-  public String toWords(Rule rule) {
-    return describeSides(rule).append(" ||| " + rule.getFeatureString()).toString();
-  }
-
-  /**
-   * Renders a rule as <code>LHS ||| source ||| target |||</code>.
-   *
-   * @param rule the rule to render
-   * @return the textual form without feature scores
-   */
-  @Override
-  public String toWordsWithoutFeatureScores(Rule rule) {
-    return describeSides(rule).append(" |||").toString();
-  }
-
-  /** Builds the shared <code>LHS ||| source ||| target</code> prefix of both renderings. */
-  private static StringBuilder describeSides(Rule rule) {
-    final StringBuilder out = new StringBuilder();
-    out.append(Vocabulary.word(rule.getLHS()));
-    out.append(" ||| ");
-    out.append(Vocabulary.getWords(rule.getFrench()));
-    out.append(" ||| ");
-    out.append(Vocabulary.getWords(rule.getEnglish()));
-    return out;
-  }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java
----------------------------------------------------------------------
diff --git a/src/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java b/src/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java
deleted file mode 100644
index d6b5b97..0000000
--- a/src/joshua/decoder/ff/tm/hash_based/ExtensionIterator.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.decoder.ff.tm.hash_based;
-
-import java.util.HashMap;
-import java.util.Iterator;
-
-/**
- * Iterates over the integer keys of a map, returning only the keys on one side of zero: keys
- * strictly greater than zero when created with <code>terminal == true</code>, keys strictly less
- * than zero otherwise. A key equal to zero is never returned. Keys come out in the iteration
- * order of the map's key set.
- * <p>
- * The next matching key is looked up eagerly, so {@link #hasNext()} is a simple flag check.
- */
-public class ExtensionIterator implements Iterator<Integer> {
-
-  /** Iterator over the underlying key set; <code>null</code> when no map was supplied. */
-  private final Iterator<Integer> iterator;
-  /** Selects positive keys when true, negative keys when false. */
-  private final boolean terminal;
-  /** Set once no further matching key exists. */
-  private boolean done;
-  /** The already-located key that the following call to {@link #next()} returns. */
-  private int next;
-
-  /**
-   * Creates an iterator over the matching keys of a map.
-   *
-   * @param map the map whose keys are filtered; <code>null</code> yields an empty iterator
-   * @param terminal true to return keys greater than zero, false to return keys less than zero
-   */
-  public ExtensionIterator(HashMap<Integer, ?> map, boolean terminal) {
-    this.terminal = terminal;
-    if (map == null) {
-      this.iterator = null;
-      this.done = true;
-    } else {
-      this.iterator = map.keySet().iterator();
-      this.done = false;
-      forward();
-    }
-  }
-
-  /** Advances to the next key with the requested sign, or marks the iterator as exhausted. */
-  private void forward() {
-    if (done) {
-      return;
-    }
-    while (iterator.hasNext()) {
-      int candidate = iterator.next();
-      if ((terminal && candidate > 0) || (!terminal && candidate < 0)) {
-        next = candidate;
-        return;
-      }
-    }
-    done = true;
-  }
-
-  @Override
-  public boolean hasNext() {
-    return !done;
-  }
-
-  /**
-   * Returns the next matching key.
-   *
-   * @return the next key with the requested sign
-   * @throws java.util.NoSuchElementException if no matching key remains
-   */
-  @Override
-  public Integer next() {
-    if (done) {
-      // The Iterator contract asks for this type; it is still a RuntimeException, so callers
-      // that caught the previously thrown bare RuntimeException keep working.
-      throw new java.util.NoSuchElementException();
-    }
-    int consumed = next;
-    forward();
-    return consumed;
-  }
-
-  /**
-   * Removal is not supported.
-   *
-   * @throws UnsupportedOperationException always
-   */
-  @Override
-  public void remove() {
-    throw new UnsupportedOperationException();
-  }
-}