Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/17 10:32:20 UTC
[39/56] [partial] incubator-joshua git commit: maven multi-module
layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
new file mode 100644
index 0000000..4c56aac
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/BloomFilterLanguageModel.java
@@ -0,0 +1,567 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.bloomfilter_lm;
+
+import java.io.Externalizable;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.ObjectInput;
+import java.io.ObjectInputStream;
+import java.io.ObjectOutput;
+import java.io.ObjectOutputStream;
+import java.util.HashMap;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.lm.DefaultNGramLanguageModel;
+import org.apache.joshua.util.Regex;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * An n-gram language model with linearly-interpolated Witten-Bell smoothing, using a Bloom filter
+ * as its main data structure. A Bloom filter is a lossy data structure that can be used to test for
+ * set membership.
+ */
+public class BloomFilterLanguageModel extends DefaultNGramLanguageModel implements Externalizable {
+ /**
+ * An initial value used for hashing n-grams so that they can be stored in a bloom filter.
+ */
+ public static final int HASH_SEED = 17;
+
+ /**
+ * Another value used in the process of hashing n-grams.
+ */
+ public static final int HASH_OFFSET = 37;
+
+ /**
+ * The maximum score that a language model feature function can return to the Joshua decoder.
+ */
+ public static final double MAX_SCORE = 100.0;
+
+ /**
+ * The logger for this class.
+ */
+ private static final Logger LOG = LoggerFactory.getLogger(BloomFilterLanguageModel.class);
+
+ /**
+ * The Bloom filter data structure itself.
+ */
+ private BloomFilter bf;
+
+ /**
+ * The base of the logarithm used to quantize n-gram counts. N-gram counts are quantized
+ * logarithmically to reduce the number of times we need to query the Bloom filter.
+ */
+ private double quantizationBase;
+
+ /**
+ * Natural log of the number of tokens seen in the training corpus.
+ */
+ private double numTokens;
+
+ /**
+ * An array of pairs of long, used as hash functions for storing or retrieving the count of an
+ * n-gram in the Bloom filter.
+ */
+ private long[][] countFuncs;
+ /**
+ * An array of pairs of long, used as hash functions for storing or retrieving the number of
+ * distinct types observed after an n-gram.
+ */
+ private long[][] typesFuncs;
+
+ /**
+ * The smoothed probability of an unseen n-gram. This is also the probability of any n-gram under
+ * the zeroth-order model.
+ */
+ transient private double p0;
+
+ /**
+ * The interpolation constant between Witten-Bell models of order zero and one. Stored in a field
+ * because it can be calculated ahead of time; it doesn't depend on the particular n-gram.
+ */
+ transient private double lambda0;
+
+ /**
+ * The maximum possible quantized count of any n-gram stored in the Bloom filter. Used as an upper
+ * bound on the count that could be returned when querying the Bloom filter.
+ */
+ transient private int maxQ; // max quantized count
+
+ /**
+ * Constructor called from the Joshua decoder. This constructor assumes that the LM has already
+ * been built, and takes the name of the file where the LM is stored.
+ *
+ * @param order the order of the language model
+ * @param filename path to the file where the language model is stored
+ * @throws IOException if the bloom filter language model cannot be rebuilt from the input file
+ */
+ public BloomFilterLanguageModel(int order, String filename) throws IOException {
+ super(order);
+ try {
+ readExternal(new ObjectInputStream(new GZIPInputStream(new FileInputStream(filename))));
+ } catch (ClassNotFoundException e) {
+ IOException ioe = new IOException("Could not rebuild bloom filter LM from file " + filename);
+ ioe.initCause(e);
+ throw ioe;
+ }
+
+ int vocabSize = Vocabulary.size();
+ p0 = -Math.log(vocabSize + 1);
+ double oneMinusLambda0 = numTokens - logAdd(Math.log(vocabSize), numTokens);
+ p0 += oneMinusLambda0;
+ lambda0 = Math.log(vocabSize) - logAdd(Math.log(vocabSize), numTokens);
+ maxQ = quantize((long) Math.exp(numTokens));
+ }
+
+ /**
+ * Constructor to be used by the main function. This constructor is used to build a new language
+ * model from scratch. An LM should be built with the main function before using it in the Joshua
+ * decoder.
+ *
+ * @param filename path to the file of training corpus statistics
+ * @param order the order of the language model
+ * @param size the size of the Bloom filter, in bits
+ * @param base the base of the logarithm used for quantization
+ */
+ private BloomFilterLanguageModel(String filename, int order, int size, double base) {
+ super(order);
+ quantizationBase = base;
+ populateBloomFilter(size, filename);
+ }
+
+ /**
+ * Calculates the linearly-interpolated Witten-Bell probability for a given ngram. This is
+ * calculated as: p(w|h) = L(h) pML(w|h) + (1 - L(h)) p(w|h'), where w is a word, h is a history,
+ * and h' is the history h with its first word removed. pML is the maximum-likelihood estimate of
+ * the probability, and L(.) is lambda, the interpolation factor, which depends only on the
+ * history h: L(h) = s(h) / (s(h) + c(h)), where s(.) is the observed number of distinct types
+ * after h, and c(.) is the observed count of h in the training corpus.
+ * <p>
+ * In fact, this model calculates the probability starting from the lowest order and working its
+ * way up, to take advantage of the one-sided error rate inherent in using a Bloom filter data
+ * structure.
+ *
+ * @param ngram the ngram whose probability is to be calculated
+ * @param ngramOrder the order of the ngram.
+ *
+ * @return the linearly-interpolated Witten-Bell smoothed probability of an ngram
+ */
+ private float wittenBell(int[] ngram, int ngramOrder) {
+ int end = ngram.length;
+ double p = p0; // current calculated probability
+ // note that p0 and lambda0 are independent of the given
+ // ngram so they are calculated ahead of time.
+ int MAX_QCOUNT = getCount(ngram, ngram.length - 1, ngram.length, maxQ);
+ if (MAX_QCOUNT == 0) // OOV!
+ return (float) p;
+ double pML = Math.log(unQuantize(MAX_QCOUNT)) - numTokens;
+
+ // p += lambda0 * pML;
+ p = logAdd(p, (lambda0 + pML));
+ if (ngram.length == 1) { // if it's a unigram, we're done
+ return (float) p;
+ }
+ // otherwise we calculate the linear interpolation
+ // with higher order models.
+ for (int i = end - 2; i >= end - ngramOrder && i >= 0; i--) {
+ int historyCnt = getCount(ngram, i, end, MAX_QCOUNT);
+ // if the count for the history is zero, all higher
+ // terms in the interpolation must be zero, so we
+ // are done here.
+ if (historyCnt == 0) {
+ return (float) p;
+ }
+ int historyTypesAfter = getTypesAfter(ngram, i, end, historyCnt);
+ // unQuantize the counts we got from the BF
+ double HC = unQuantize(historyCnt);
+ double HTA = 1 + unQuantize(historyTypesAfter);
+ // interpolation constant
+ double lambda = Math.log(HTA) - Math.log(HTA + HC);
+ double oneMinusLambda = Math.log(HC) - Math.log(HTA + HC);
+ // p *= 1 - lambda
+ p += oneMinusLambda;
+ int wordCount = getCount(ngram, i + 1, end, historyTypesAfter);
+ double WC = unQuantize(wordCount);
+ // p += lambda * p_ML(w|h)
+ if (WC == 0) return (float) p;
+ p = logAdd(p, lambda + Math.log(WC) - Math.log(HC));
+ MAX_QCOUNT = wordCount;
+ }
+ return (float) p;
+ }
+
+ /**
+ * Retrieve the count of an ngram from the Bloom filter. That is, how many times did we see this
+ * ngram in the training corpus? This corresponds roughly to algorithm 2 in Talbot and Osborne's
+ * "Tera-Scale LMs on the Cheap."
+ *
+ * @param ngram array containing the ngram as a sub-array
+ * @param start the index of the first word of the ngram
+ * @param end the index after the last word of the ngram
+ * @param qcount the maximum possible count to be returned
+ *
+ * @return the number of times the ngram was seen in the training corpus, quantized
+ */
+ private int getCount(int[] ngram, int start, int end, int qcount) {
+ for (int i = 1; i <= qcount; i++) {
+ int hash = hashNgram(ngram, start, end, i);
+ if (!bf.query(hash, countFuncs)) {
+ return i - 1;
+ }
+ }
+ return qcount;
+ }
+
+ /**
+ * Retrieve the number of distinct types that follow an ngram in the training corpus.
+ *
+ * This is another version of algorithm 2. As noted in the paper, we have different algorithms for
+ * getting ngram counts versus suffix counts because c(x) = 1 is a proxy item for s(x) = 1
+ *
+ * @param ngram an array that contains the ngram as a sub-array
+ * @param start the index of the first word of the ngram
+ * @param end the index after the last word of the ngram
+ * @param qcount the maximum possible return value
+ *
+ * @return the number of distinct types observed to follow an ngram in the training corpus,
+ * quantized
+ */
+ private int getTypesAfter(int[] ngram, int start, int end, int qcount) {
+ // first we check c(x) >= 1
+ int hash = hashNgram(ngram, start, end, 1);
+ if (!bf.query(hash, countFuncs)) {
+ return 0;
+ }
+ // if c(x) >= 1, we check for the stored suffix count
+ for (int i = 1; i < qcount; i++) {
+ hash = hashNgram(ngram, start, end, i);
+ if (!bf.query(hash, typesFuncs)) {
+ return i - 1;
+ }
+ }
+ return qcount;
+ }
+
+ /**
+ * Logarithmically quantizes raw counts. The quantization scheme is described in Talbot and
+ * Osborne's paper "Tera-Scale LMs on the Cheap."
+ *
+ * @param x long giving the raw count to be quantized
+ *
+ * @return the quantized count
+ */
+ private int quantize(long x) {
+ return 1 + (int) Math.floor(Math.log(x) / Math.log(quantizationBase));
+ }
+
+ /**
+ * Unquantizes a quantized count.
+ *
+ * @param x the quantized count
+ *
+ * @return the expected raw value of the quantized count
+ */
+ private double unQuantize(int x) {
+ if (x == 0) {
+ return 0;
+ } else {
+ return ((quantizationBase + 1) * Math.pow(quantizationBase, x - 1) - 1) / 2;
+ }
+ }
+
+ /**
+ * Converts an n-gram and a count into a value that can be stored into a Bloom filter. This is
+ * adapted directly from <code>AbstractPhrase.hashCode()</code> elsewhere in the Joshua code base.
+ *
+ * @param ngram an array containing the ngram as a sub-array
+ * @param start the index of the first word of the ngram
+ * @param end the index after the last word of the ngram
+ * @param val the count of the ngram
+ *
+ * @return a value suitable to be stored in a Bloom filter
+ */
+ private int hashNgram(int[] ngram, int start, int end, int val) {
+ int result = HASH_OFFSET * HASH_SEED + val;
+ for (int i = start; i < end; i++)
+ result = HASH_OFFSET * result + ngram[i];
+ return result;
+ }
+
+ /**
+ * Adds two numbers that are in the log domain, avoiding underflow.
+ *
+ * @param x one summand
+ * @param y the other summand
+ *
+ * @return the log of the sum of the exponentials of the two numbers
+ */
+ private static double logAdd(double x, double y) {
+ if (y <= x) {
+ return x + Math.log1p(Math.exp(y - x));
+ } else {
+ return y + Math.log1p(Math.exp(x - y));
+ }
+ }
+
+ /**
+ * Builds a language model and stores it in a file.
+ *
+ * @param argv command-line arguments
+ */
+ public static void main(String[] argv) {
+ if (argv.length < 5) {
+ String msg = "usage: BloomFilterLanguageModel <statistics file> <order> <size>"
+ + " <quantization base> <output file>";
+ System.err.println(msg);
+ LOG.error(msg);
+ return;
+ }
+ int order = Integer.parseInt(argv[1]);
+ int size = (int) (Integer.parseInt(argv[2]) * Math.pow(2, 23));
+ double base = Double.parseDouble(argv[3]);
+
+ try {
+ BloomFilterLanguageModel lm = new BloomFilterLanguageModel(argv[0], order, size, base);
+
+ try (ObjectOutputStream out =
+ new ObjectOutputStream(new GZIPOutputStream(new FileOutputStream(argv[4])))) {
+ lm.writeExternal(out);
+ }
+ } catch (IOException e) {
+ LOG.error(e.getMessage(), e);
+ }
+ }
+
+ /**
+ * Adds ngram counts and counts of distinct types after ngrams, read from a file, to the Bloom
+ * filter.
+ * <p>
+ * The file format should look like this:
+ * <pre>
+ * ngram1 count types-after
+ * ngram2 count types-after
+ * ...
+ * </pre>
+ *
+ * @param bloomFilterSize the size of the Bloom filter, in bits
+ * @param filename path to the statistics file
+ */
+ private void populateBloomFilter(int bloomFilterSize, String filename) {
+ HashMap<String, Long> typesAfter = new HashMap<String, Long>();
+ try {
+ FileInputStream file_in = new FileInputStream(filename);
+ FileInputStream file_in_copy = new FileInputStream(filename);
+ InputStream in;
+ InputStream estimateStream;
+ if (filename.endsWith(".gz")) {
+ in = new GZIPInputStream(file_in);
+ estimateStream = new GZIPInputStream(file_in_copy);
+ } else {
+ in = file_in;
+ estimateStream = file_in_copy;
+ }
+ int numObjects = estimateNumberOfObjects(estimateStream);
+ LOG.debug("Estimated number of objects: {}", numObjects);
+ bf = new BloomFilter(bloomFilterSize, numObjects);
+ countFuncs = bf.initializeHashFunctions();
+ populateFromInputStream(in, typesAfter);
+ in.close();
+ } catch (IOException e) {
+ LOG.error(e.getMessage(), e);
+ return;
+ }
+ typesFuncs = bf.initializeHashFunctions();
+ for (String history : typesAfter.keySet()) {
+ String[] toks = Regex.spaces.split(history);
+ int[] hist = new int[toks.length];
+ for (int i = 0; i < toks.length; i++)
+ hist[i] = Vocabulary.id(toks[i]);
+ add(hist, typesAfter.get(history), typesFuncs);
+ }
+ return;
+ }
+
+ /**
+ * Estimate the number of objects that will be stored in the Bloom filter. The optimum number of
+ * hash functions depends on the number of items that will be stored, so we want a guess before we
+ * begin to read the statistics file and store it.
+ *
+ * @param source an InputStream pointing to the training corpus stats
+ *
+ * @return an estimate of the number of objects to be stored in the Bloom filter
+ */
+ private int estimateNumberOfObjects(InputStream source) {
+ int numLines = 0;
+ long maxCount = 0;
+ for (String line: new LineReader(source)) {
+ if (line.trim().equals("")) continue;
+ String[] toks = Regex.spaces.split(line);
+ if (toks.length > ngramOrder + 1) continue;
+ try {
+ long cnt = Long.parseLong(toks[toks.length - 1]);
+ if (cnt > maxCount) maxCount = cnt;
+ } catch (NumberFormatException e) {
+ LOG.error(e.getMessage(), e);
+ break;
+ }
+ numLines++;
+ }
+ double estimate = Math.log(maxCount) / Math.log(quantizationBase);
+ return (int) Math.round(numLines * estimate);
+ }
+
+ /**
+ * Reads the statistics from a source and stores them in the Bloom filter. The ngram counts are
+ * stored immediately in the Bloom filter, but the counts of distinct types following each ngram
+ * are accumulated from the file as we go.
+ *
+ * @param source an InputStream pointing to the statistics
+ * @param types a HashMap that will store the accumulated counts of distinct types observed to
+ * follow each ngram
+ */
+ private void populateFromInputStream(InputStream source, HashMap<String, Long> types) {
+ numTokens = Double.NEGATIVE_INFINITY; // = log(0)
+ for (String line: new LineReader(source)) {
+ String[] toks = Regex.spaces.split(line);
+ if ((toks.length < 2) || (toks.length > ngramOrder + 1)) continue;
+ int[] ngram = new int[toks.length - 1];
+ StringBuilder history = new StringBuilder();
+ for (int i = 0; i < toks.length - 1; i++) {
+ ngram[i] = Vocabulary.id(toks[i]);
+ if (i < toks.length - 2) history.append(toks[i]).append(" ");
+ }
+
+ long cnt = Long.parseLong(toks[toks.length - 1]);
+ add(ngram, cnt, countFuncs);
+ if (toks.length == 2) { // unigram
+ numTokens = logAdd(numTokens, Math.log(cnt));
+ // no need to count types after ""
+ // that's what vocabulary.size() is for.
+ continue;
+ }
+ // Key the map by the String form of the history; a StringBuilder key would never match.
+ String historyKey = history.toString();
+ Long typeCount = types.get(historyKey);
+ if (typeCount == null)
+ types.put(historyKey, 1L);
+ else
+ types.put(historyKey, typeCount + 1);
+ }
+ return;
+ }
+
+ /**
+ * Adds an ngram, along with an associated value, to the Bloom filter. This corresponds to Talbot
+ * and Osborne's "Tera-scale LMs on the cheap", algorithm 1.
+ *
+ * @param ngram an array representing the ngram
+ * @param value the value to be associated with the ngram
+ * @param funcs an array of long to be used as hash functions
+ */
+ private void add(int[] ngram, long value, long[][] funcs) {
+ if (ngram == null) return;
+ int qValue = quantize(value);
+ for (int i = 1; i <= qValue; i++) {
+ int hash = hashNgram(ngram, 0, ngram.length, i);
+ bf.add(hash, funcs);
+ }
+ }
+
+ /**
+ * Read a Bloom filter LM from an external file.
+ *
+ * @param in an ObjectInput stream to read from
+ */
+ public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
+ int vocabSize = in.readInt();
+ for (int i = 0; i < vocabSize; i++) {
+ String line = in.readUTF();
+ Vocabulary.id(line);
+ }
+ numTokens = in.readDouble();
+ countFuncs = new long[in.readInt()][2];
+ for (int i = 0; i < countFuncs.length; i++) {
+ countFuncs[i][0] = in.readLong();
+ countFuncs[i][1] = in.readLong();
+ }
+ typesFuncs = new long[in.readInt()][2];
+ for (int i = 0; i < typesFuncs.length; i++) {
+ typesFuncs[i][0] = in.readLong();
+ typesFuncs[i][1] = in.readLong();
+ }
+ quantizationBase = in.readDouble();
+ bf = new BloomFilter();
+ bf.readExternal(in);
+ }
+
+ /**
+ * Write a Bloom filter LM to some external location.
+ *
+ * @param out an ObjectOutput stream to write to
+ *
+ * @throws IOException if an input or output exception occurred
+ */
+ public void writeExternal(ObjectOutput out) throws IOException {
+ out.writeInt(Vocabulary.size());
+ for (int i = 0; i < Vocabulary.size(); i++) {
+ // out.writeBytes(vocabulary.getWord(i));
+ // out.writeChar('\n'); // newline
+ out.writeUTF(Vocabulary.word(i));
+ }
+ out.writeDouble(numTokens);
+ out.writeInt(countFuncs.length);
+ for (int i = 0; i < countFuncs.length; i++) {
+ out.writeLong(countFuncs[i][0]);
+ out.writeLong(countFuncs[i][1]);
+ }
+ out.writeInt(typesFuncs.length);
+ for (int i = 0; i < typesFuncs.length; i++) {
+ out.writeLong(typesFuncs[i][0]);
+ out.writeLong(typesFuncs[i][1]);
+ }
+ out.writeDouble(quantizationBase);
+ bf.writeExternal(out);
+ }
+
+ /**
+ * Returns the language model score for an n-gram. This is called from the rest of the Joshua
+ * decoder.
+ *
+ * @param ngram the ngram to score
+ * @param order the order of the model
+ *
+ * @return the language model score of the ngram
+ */
+ @Override
+ protected float ngramLogProbability_helper(int[] ngram, int order) {
+ int[] lm_ngram = new int[ngram.length];
+ for (int i = 0; i < ngram.length; i++) {
+ lm_ngram[i] = Vocabulary.id(Vocabulary.word(ngram[i]));
+ }
+ return wittenBell(lm_ngram, order);
+ }
+
+ @Override
+ public boolean isOov(int id) {
+ int[] ngram = new int[] {id};
+ int MAX_QCOUNT = getCount(ngram, ngram.length - 1, ngram.length, maxQ);
+ return (MAX_QCOUNT == 0);
+ }
+}
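The quantize(), unQuantize(), and logAdd() javadocs above describe the logarithmic count-quantization scheme from Talbot and Osborne. As a minimal standalone sketch of that arithmetic (not part of this commit; the class name QuantizationSketch and the base value 2.0 are invented for the example, since Joshua reads the base from the command line):

public class QuantizationSketch {
  static final double BASE = 2.0; // illustrative; the real base comes from the command line

  // Same formula as BloomFilterLanguageModel.quantize(long).
  static int quantize(long x) {
    return 1 + (int) Math.floor(Math.log(x) / Math.log(BASE));
  }

  // Same formula as BloomFilterLanguageModel.unQuantize(int): the expected raw value of a
  // count that fell into quantization bucket x.
  static double unQuantize(int x) {
    return x == 0 ? 0 : ((BASE + 1) * Math.pow(BASE, x - 1) - 1) / 2;
  }

  // Same formula as BloomFilterLanguageModel.logAdd(double, double): log(e^x + e^y) without underflow.
  static double logAdd(double x, double y) {
    return (y <= x) ? x + Math.log1p(Math.exp(y - x)) : y + Math.log1p(Math.exp(x - y));
  }

  public static void main(String[] args) {
    for (long count : new long[] { 1, 5, 100, 10000 }) {
      int q = quantize(count);
      System.out.printf("count=%d -> bucket=%d -> expected raw value=%.1f%n", count, q, unQuantize(q));
    }
    // Recover 0.25 + 0.5 from the two log-domain operands.
    System.out.println(Math.exp(logAdd(Math.log(0.25), Math.log(0.5)))); // ~0.75
  }
}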
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/package-info.java
new file mode 100644
index 0000000..19fa695
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/bloomfilter_lm/package-info.java
@@ -0,0 +1,25 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+/**
+ * Provides an implementation of a bloom filter language model, and
+ * an associated implementation of the language model feature function typically used in
+ * hierarchical phrase-based decoding for statistical machine translation.
+ */
+package org.apache.joshua.decoder.ff.lm.bloomfilter_lm;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java
new file mode 100644
index 0000000..bb4732f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/TrieLM.java
@@ -0,0 +1,335 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.buildin_lm;
+
+import java.io.File;
+import java.io.FileNotFoundException;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.HashMap;
+import java.util.LinkedList;
+import java.util.Map;
+import java.util.Scanner;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.lm.AbstractLM;
+import org.apache.joshua.decoder.ff.lm.ArpaFile;
+import org.apache.joshua.decoder.ff.lm.ArpaNgram;
+import org.apache.joshua.util.Bits;
+import org.apache.joshua.util.Regex;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Relatively memory-compact language model
+ * stored as a reversed-word-order trie.
+ * <p>
+ * The trie itself represents language model context.
+ * <p>
+ * Conceptually, each node in the trie stores a map
+ * from conditioning word to log probability.
+ * <p>
+ * Additionally, each node in the trie stores
+ * the backoff weight for that context.
+ *
+ * @author Lane Schwartz
+ * @see <a href="http://www.speech.sri.com/projects/srilm/manpages/ngram-discount.7.html">SRILM ngram-discount documentation</a>
+ */
+public class TrieLM extends AbstractLM { //DefaultNGramLanguageModel {
+
+ private static final Logger LOG = LoggerFactory.getLogger(TrieLM.class);
+
+ /**
+ * Node ID for the root node.
+ */
+ private static final int ROOT_NODE_ID = 0;
+
+
+ /**
+ * Maps from (node id, word id for child) --> node id of child.
+ */
+ private final Map<Long,Integer> children;
+
+ /**
+ * Maps from (node id, word id for lookup word) -->
+ * log prob of lookup word given context
+ *
+ * (the context is defined by where you are in the tree).
+ */
+ private final Map<Long,Float> logProbs;
+
+ /**
+ * Maps from (node id) -->
+ * backoff weight for that context
+ *
+ * (the context is defined by where you are in the tree).
+ */
+ private final Map<Integer,Float> backoffs;
+
+ public TrieLM(Vocabulary vocab, String file) throws FileNotFoundException {
+ this(new ArpaFile(file,vocab));
+ }
+
+ /**
+ * Constructs a language model object from the specified ARPA file.
+ *
+ * @param arpaFile input ARPA file
+ * @throws FileNotFoundException if the input file cannot be located
+ */
+ public TrieLM(ArpaFile arpaFile) throws FileNotFoundException {
+ super(arpaFile.getVocab().size(), arpaFile.getOrder());
+
+ int ngramCounts = arpaFile.size();
+ LOG.debug("ARPA file contains {} n-grams", ngramCounts);
+
+ this.children = new HashMap<Long,Integer>(ngramCounts);
+ this.logProbs = new HashMap<Long,Float>(ngramCounts);
+ this.backoffs = new HashMap<Integer,Float>(ngramCounts);
+
+ int nodeCounter = 0;
+
+ int lineNumber = 0;
+ for (ArpaNgram ngram : arpaFile) {
+ lineNumber += 1;
+ if (lineNumber % 100000 == 0){
+ LOG.info("Line: {}", lineNumber);
+ }
+
+ LOG.debug("{}-gram: ({} | {})", ngram.order(), ngram.getWord(),
+ Arrays.toString(ngram.getContext()));
+ int word = ngram.getWord();
+
+ int[] context = ngram.getContext();
+
+ {
+ // Find where the log prob should be stored
+ int contextNodeID = ROOT_NODE_ID;
+ {
+ for (int i=context.length-1; i>=0; i--) {
+ long key = Bits.encodeAsLong(contextNodeID, context[i]);
+ int childID;
+ if (children.containsKey(key)) {
+ childID = children.get(key);
+ } else {
+ childID = ++nodeCounter;
+ LOG.debug("children.put({}:{}, {})", contextNodeID, context[i], childID);
+ children.put(key, childID);
+ }
+ contextNodeID = childID;
+ }
+ }
+
+ // Store the log prob for this n-gram at this node in the trie
+ {
+ long key = Bits.encodeAsLong(contextNodeID, word);
+ float logProb = ngram.getValue();
+ LOG.debug("logProbs.put({}:{}, {}", contextNodeID, word, logProb);
+ this.logProbs.put(key, logProb);
+ }
+ }
+
+ {
+ // Find where the backoff should be stored
+ int backoffNodeID = ROOT_NODE_ID;
+ {
+ long backoffNodeKey = Bits.encodeAsLong(backoffNodeID, word);
+ int wordChildID;
+ if (children.containsKey(backoffNodeKey)) {
+ wordChildID = children.get(backoffNodeKey);
+ } else {
+ wordChildID = ++nodeCounter;
+ LOG.debug("children.put({}: {}, {})", backoffNodeID, word, wordChildID);
+ children.put(backoffNodeKey, wordChildID);
+ }
+ backoffNodeID = wordChildID;
+
+ for (int i=context.length-1; i>=0; i--) {
+ long key = Bits.encodeAsLong(backoffNodeID, context[i]);
+ int childID;
+ if (children.containsKey(key)) {
+ childID = children.get(key);
+ } else {
+ childID = ++nodeCounter;
+ LOG.debug("children.put({}:{}, {})", backoffNodeID, context[i], childID);
+ children.put(key, childID);
+ }
+ backoffNodeID = childID;
+ }
+ }
+
+ // Store the backoff for this n-gram at this node in the trie
+ {
+ float backoff = ngram.getBackoff();
+ LOG.debug("backoffs.put({}:{}, {})", backoffNodeID, word, backoff);
+ this.backoffs.put(backoffNodeID, backoff);
+ }
+ }
+
+ }
+ }
+
+
+ @Override
+ protected double logProbabilityOfBackoffState_helper(
+ int[] ngram, int order, int qtyAdditionalBackoffWeight
+ ) {
+ throw new UnsupportedOperationException("probabilityOfBackoffState_helper undefined for TrieLM");
+ }
+
+ @Override
+ protected float ngramLogProbability_helper(int[] ngram, int order) {
+
+ float logProb = Float.NEGATIVE_INFINITY; // log(0.0f)
+ float backoff = 0.0f; // log(1.0f)
+
+ int i = ngram.length - 1;
+ int word = ngram[i];
+ i -= 1;
+
+ int nodeID = ROOT_NODE_ID;
+
+ while (true) {
+
+ {
+ long key = Bits.encodeAsLong(nodeID, word);
+ if (logProbs.containsKey(key)) {
+ logProb = logProbs.get(key);
+ backoff = 0.0f; // log(1.0f): reset the backoff once this context has a stored probability
+ }
+ }
+
+ if (i < 0) {
+ break;
+ }
+
+ {
+ long key = Bits.encodeAsLong(nodeID, ngram[i]);
+
+ if (children.containsKey(key)) {
+ nodeID = children.get(key);
+
+ backoff += backoffs.get(nodeID);
+
+ i -= 1;
+
+ } else {
+ break;
+ }
+ }
+
+ }
+
+ return logProb + backoff;
+ }
+
+ public Map<Long,Integer> getChildren() {
+ return this.children;
+ }
+
+ public static void main(String[] args) throws IOException {
+
+ LOG.info("Constructing ARPA file");
+ ArpaFile arpaFile = new ArpaFile(args[0]);
+
+ LOG.info("Getting symbol table");
+ Vocabulary vocab = arpaFile.getVocab();
+
+ LOG.info("Constructing TrieLM");
+ TrieLM lm = new TrieLM(arpaFile);
+
+ int n = Integer.valueOf(args[2]);
+ LOG.info("N-gram order will be {}", n);
+
+ Scanner scanner = new Scanner(new File(args[1]));
+
+ LinkedList<String> wordList = new LinkedList<String>();
+ LinkedList<String> window = new LinkedList<String>();
+
+ LOG.info("Starting to scan {}", args[1]);
+ while (scanner.hasNext()) {
+
+ LOG.info("Getting next line...");
+ String line = scanner.nextLine();
+ LOG.info("Line: {}", line);
+
+ String[] words = Regex.spaces.split(line);
+ wordList.clear();
+
+ wordList.add("<s>");
+ for (String word : words) {
+ wordList.add(word);
+ }
+ wordList.add("</s>");
+
+ ArrayList<Integer> sentence = new ArrayList<Integer>();
+ // int[] ids = new int[wordList.size()];
+ for (int i=0, size=wordList.size(); i<size; i++) {
+ sentence.add(vocab.id(wordList.get(i)));
+ // ids[i] = ;
+ }
+
+
+
+ while (! wordList.isEmpty()) {
+ window.clear();
+
+ {
+ int i=0;
+ for (String word : wordList) {
+ if (i>=n) break;
+ window.add(word);
+ i++;
+ }
+ wordList.remove();
+ }
+
+ {
+ int i=0;
+ int[] wordIDs = new int[window.size()];
+ for (String word : window) {
+ wordIDs[i] = vocab.id(word);
+ i++;
+ }
+
+ LOG.info("logProb {} = {}", window, lm.ngramLogProbability(wordIDs, n));
+ }
+ }
+
+ double logProb = lm.sentenceLogProbability(sentence, n, 2);//.ngramLogProbability(ids, n);
+ double prob = Math.exp(logProb);
+
+ LOG.info("Total logProb = {}", logProb);
+ LOG.info("Total prob = {}", prob);
+ }
+
+ }
+
+ @Override
+ public boolean isOov(int id) {
+ throw new RuntimeException("Not implemented!");
+ }
+
+}
\ No newline at end of file
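The TrieLM javadoc above describes a trie that stores language-model context in reversed word order, keyed by (node id, word id) pairs packed into a long. A minimal standalone sketch of that lookup structure (not part of this commit; the class ReversedTrieSketch, its key() helper, and the example word ids are invented, and the key packing only mimics the Bits.encodeAsLong calls above):

import java.util.HashMap;
import java.util.Map;

public class ReversedTrieSketch {
  static final int ROOT = 0;
  static final Map<Long, Integer> children = new HashMap<>();
  static final Map<Long, Float> logProbs = new HashMap<>();
  static int nextNode = 0;

  // Pack (node id, word id) into a single long key, analogous to Bits.encodeAsLong above.
  static long key(int node, int word) {
    return ((long) node << 32) | (word & 0xFFFFFFFFL);
  }

  // Store p(word | context) by walking the context right-to-left from the root.
  static void put(int[] context, int word, float logProb) {
    int node = ROOT;
    for (int i = context.length - 1; i >= 0; i--) {
      long k = key(node, context[i]);
      node = children.computeIfAbsent(k, unused -> ++nextNode);
    }
    logProbs.put(key(node, word), logProb);
  }

  public static void main(String[] args) {
    // p(3 | 1 2) is stored under the path root -> 2 -> 1 (context in reversed order).
    put(new int[] { 1, 2 }, 3, -0.5f);
    int node = ROOT;
    node = children.get(key(node, 2)); // last context word first
    node = children.get(key(node, 1));
    System.out.println(logProbs.get(key(node, 3))); // -0.5
  }
}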
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java
new file mode 100644
index 0000000..6c84703
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/buildin_lm/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.lm.buildin_lm;
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java
new file mode 100644
index 0000000..22da71e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/lm/package-info.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * <p>Provides abstraction and support for the language model
+ * feature function typically used in hierarchical phrase-based
+ * decoding for statistical machine translation.</p>
+ * <p>The classes contained within this directory are
+ * responsible for two tasks: implementing the feature function,
+ * and representing the language model itself. The class
+ * `LanguageModelFF` implements the feature function by extending
+ * the class `DefaultStatefulFF`. One of these is instantiated
+ * for each language model present in the decoder.</p>
+ * <p>The language models themselves are implemented as a
+ * combination of an interface (`NGramLanguageModel`), a default
+ * implementation (`DefaultNGramLanguageModel`), and an abstract
+ * implementation of the default (`AbstractLM`).</p>
+ *
+ * <pre>
+ * DefaultStatefulFF
+ * |- LanguageModelFF
+ *
+ * DefaultNGramLanguageModel implements interface NGramLanguageModel
+ * |- AbstractLM
+ * </pre>
+ */
+package org.apache.joshua.decoder.ff.lm;
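The hierarchy sketched above can be extended by subclassing DefaultNGramLanguageModel. As a hedged illustration (not part of this commit; the class UniformLanguageModel is invented, the super(order) constructor and the two overridden signatures are taken from BloomFilterLanguageModel earlier in this commit, and it is assumed no further abstract methods need implementing), a toy model that assigns every n-gram the same probability:

import org.apache.joshua.decoder.ff.lm.DefaultNGramLanguageModel;

// Hypothetical example class, not part of the Joshua code base.
public class UniformLanguageModel extends DefaultNGramLanguageModel {

  private final float logProb;

  public UniformLanguageModel(int order, int vocabSize) {
    super(order);
    // Every n-gram gets probability 1/|V|, i.e. a log-probability of -log(vocabSize).
    this.logProb = (float) -Math.log(vocabSize);
  }

  @Override
  protected float ngramLogProbability_helper(int[] ngram, int order) {
    return logProb;
  }

  @Override
  public boolean isOov(int id) {
    return false; // the toy model treats every id as in-vocabulary
  }
}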
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/package-info.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/package-info.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/package-info.java
new file mode 100644
index 0000000..b0af73e
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/package-info.java
@@ -0,0 +1,42 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+/**
+ * <p>Provides an implementation of the linear feature functions
+ * typically used in hierarchical phrase-based decoding for
+ * statistical machine translation.</p>
+ * <p>The following is a note from Juri describing some of the
+ * functionality of the feature function interfaces and default
+ * abstract classes.</p>
+ * <pre>
+ * The equality that I intended for is ff.transitionLogP() =
+ * ff.estimateLogP() + ff.reEstimateTransitionLogP(). The re-estimate
+ * fixes the estimate to be the true transition cost that takes into
+ * account the state. Before decoding the cost of applying a rule is
+ * estimated via estimateLogP() and yields the phrasal feature costs plus
+ * an LM estimate of the cost of the lexical portions of the rule.
+ * transitionLogP() takes rule and state and computes everything from
+ * scratch, whereas reEstimateTransitionLogP() adds in the cost of new
+ * n-grams that result from combining the rule with the LM states and
+ * subtracts out the cost of superfluous less-than-n-grams that were
+ * overridden by the updated cost calculation.
+ *
+ * Hope this helps.
+ * </pre>
+ */
+package org.apache.joshua.decoder.ff;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
new file mode 100644
index 0000000..f9e6a29
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/phrase/Distortion.java
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.phrase;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.StatelessFF;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.phrase.Hypothesis;
+import org.apache.joshua.decoder.segment_file.Sentence;
+
+public class Distortion extends StatelessFF {
+
+ public Distortion(FeatureVector weights, String[] args, JoshuaConfiguration config) {
+ super(weights, "Distortion", args, config);
+
+ if (! config.search_algorithm.equals("stack")) {
+ String msg = "* FATAL: Distortion feature only application for phrase-based decoding. "
+ + "Use -search phrase or remove this feature";
+ throw new RuntimeException(msg);
+ }
+ }
+
+ @Override
+ public ArrayList<String> reportDenseFeatures(int index) {
+ denseFeatureIndex = index;
+
+ ArrayList<String> names = new ArrayList<String>();
+ names.add(name);
+ return names;
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ if (rule != Hypothesis.BEGIN_RULE && rule != Hypothesis.END_RULE) {
+ int start_point = j - rule.getFrench().length + rule.getArity();
+
+ int jump_size = Math.abs(tailNodes.get(0).j - start_point);
+// acc.add(name, -jump_size);
+ acc.add(denseFeatureIndex, -jump_size);
+ }
+
+// System.err.println(String.format("DISTORTION(%d, %d) from %d = %d", i, j, tailNodes != null ? tailNodes.get(0).j : -1, jump_size));
+
+ return null;
+ }
+}
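The distortion penalty computed above is simply the absolute jump between where the previous hypothesis ended and where the new phrase starts on the source side. A small worked example (not part of this commit; all values are invented):

public class DistortionExample {
  public static void main(String[] args) {
    // Suppose the rule covers source words [5, 9), its source side has 4 tokens and arity 0,
    // and the previous hypothesis (tailNodes.get(0)) ended at source position 3.
    int j = 9;             // right edge of the span being extended
    int sourceLength = 4;  // rule.getFrench().length
    int arity = 0;         // rule.getArity()
    int previousEnd = 3;   // tailNodes.get(0).j

    int startPoint = j - sourceLength + arity;          // = 5, where this phrase starts
    int jumpSize = Math.abs(previousEnd - startPoint);  // = 2
    System.out.println("distortion feature value = " + (-jumpSize)); // -2, added to the accumulator
  }
}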
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
new file mode 100644
index 0000000..91af58b
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/similarity/EdgePhraseSimilarityFF.java
@@ -0,0 +1,279 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.similarity;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+import java.net.Socket;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.List;
+
+import com.google.common.base.Throwables;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.chart_parser.SourcePath;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.StatefulFF;
+import org.apache.joshua.decoder.ff.SourceDependentFF;
+import org.apache.joshua.decoder.ff.state_maintenance.DPState;
+import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HGNode;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.Cache;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class EdgePhraseSimilarityFF extends StatefulFF implements SourceDependentFF {
+
+ private static final Logger LOG = LoggerFactory.getLogger(EdgePhraseSimilarityFF.class);
+
+ private static Cache<String, Float> cache = new Cache<String, Float>(100000000);
+
+ private String host;
+ private int port;
+
+ private Socket socket;
+ private PrintWriter serverAsk;
+ private BufferedReader serverReply;
+
+ private int[] source;
+
+ private final int MAX_PHRASE_LENGTH = 4;
+ private final int GAP = 0;
+
+ public EdgePhraseSimilarityFF(FeatureVector weights, String[] args, JoshuaConfiguration config) throws NumberFormatException, UnknownHostException, IOException {
+ super(weights, "EdgePhraseSimilarity", args, config);
+
+ this.host = parsedArgs.get("host");
+ this.port = Integer.parseInt(parsedArgs.get("port"));
+
+ initializeConnection();
+ }
+
+ private void initializeConnection() throws NumberFormatException, IOException {
+ LOG.info("Opening connection.");
+ socket = new Socket(host, port);
+ serverAsk = new PrintWriter(socket.getOutputStream(), true);
+ serverReply = new BufferedReader(new InputStreamReader(socket.getInputStream()));
+ }
+
+ @Override
+ public DPState compute(Rule rule, List<HGNode> tailNodes, int i, int j, SourcePath sourcePath,
+ Sentence sentence, Accumulator acc) {
+
+ float value = computeScore(rule, tailNodes);
+ acc.add(name, value);
+
+ // TODO 07/2013: EdgePhraseSimilarity needs to know its order rather than inferring it from tail
+ // nodes.
+ return new NgramDPState(new int[1], new int[1]);
+ }
+
+ @Override
+ public DPState computeFinal(HGNode tailNode, int i, int j, SourcePath path, Sentence sentence, Accumulator acc) {
+ return null;
+ }
+
+ public float computeScore(Rule rule, List<HGNode> tailNodes) {
+ if (tailNodes == null || tailNodes.isEmpty())
+ return 0;
+
+ // System.err.println("RULE [" + spanStart + ", " + spanEnd + "]: " + rule.toString());
+
+ int[] target = rule.getEnglish();
+ int lm_state_size = 0;
+ for (HGNode node : tailNodes) {
+ NgramDPState state = (NgramDPState) node.getDPState(stateIndex);
+ lm_state_size += state.getLeftLMStateWords().length + state.getRightLMStateWords().length;
+ }
+
+ ArrayList<int[]> batch = new ArrayList<int[]>();
+
+ // Build joined target string.
+ int[] join = new int[target.length + lm_state_size];
+
+ int idx = 0, num_gaps = 1, num_anchors = 0;
+ int[] anchors = new int[rule.getArity() * 2];
+ int[] indices = new int[rule.getArity() * 2];
+ int[] gaps = new int[rule.getArity() + 2];
+ gaps[0] = 0;
+ for (int t = 0; t < target.length; t++) {
+ if (target[t] < 0) {
+ HGNode node = tailNodes.get(-(target[t] + 1));
+ if (t != 0) {
+ indices[num_anchors] = node.i;
+ anchors[num_anchors++] = idx;
+ }
+ NgramDPState state = (NgramDPState) node.getDPState(stateIndex);
+ // System.err.print("LEFT: ");
+ // for (int w : state.getLeftLMStateWords()) System.err.print(Vocabulary.word(w) + " ");
+ // System.err.println();
+ for (int w : state.getLeftLMStateWords())
+ join[idx++] = w;
+ join[idx++] = GAP;
+ gaps[num_gaps++] = idx;
+ // System.err.print("RIGHT: ");
+ // for (int w : state.getRightLMStateWords()) System.err.print(Vocabulary.word(w) + " ");
+ // System.err.println();
+ for (int w : state.getRightLMStateWords())
+ join[idx++] = w;
+ if (t != target.length - 1) {
+ indices[num_anchors] = node.j;
+ anchors[num_anchors++] = idx;
+ }
+ } else {
+ join[idx++] = target[t];
+ }
+ }
+ gaps[gaps.length - 1] = join.length + 1;
+
+ // int c = 0;
+ // System.err.print("> ");
+ // for (int k = 0; k < join.length; k++) {
+ // if (c < num_anchors && anchors[c] == k) {
+ // c++;
+ // System.err.print("| ");
+ // }
+ // System.err.print(Vocabulary.word(join[k]) + " ");
+ // }
+ // System.err.println("<");
+
+ int g = 0;
+ for (int a = 0; a < num_anchors; a++) {
+ if (a > 0 && anchors[a - 1] == anchors[a])
+ continue;
+ if (anchors[a] > gaps[g + 1])
+ g++;
+ int left = Math.max(gaps[g], anchors[a] - MAX_PHRASE_LENGTH + 1);
+ int right = Math.min(gaps[g + 1] - 1, anchors[a] + MAX_PHRASE_LENGTH - 1);
+
+ int[] target_phrase = new int[right - left];
+ System.arraycopy(join, left, target_phrase, 0, target_phrase.length);
+ int[] source_phrase = getSourcePhrase(indices[a]);
+
+ if (source_phrase != null && target_phrase.length != 0) {
+ // System.err.println("ANCHOR: " + indices[a]);
+ batch.add(source_phrase);
+ batch.add(target_phrase);
+ }
+ }
+ return getSimilarity(batch);
+ }
+
+ @Override
+ public float estimateFutureCost(Rule rule, DPState currentState, Sentence sentence) {
+ return 0.0f;
+ }
+
+ /**
+ * From SourceDependentFF interface.
+ */
+ @Override
+ public void setSource(Sentence sentence) {
+ if (! sentence.isLinearChain())
+ throw new RuntimeException("EdgePhraseSimilarity not defined for lattices");
+ this.source = sentence.getWordIDs();
+ }
+
+ public EdgePhraseSimilarityFF clone() {
+ try {
+ return new EdgePhraseSimilarityFF(this.weights, args, config);
+ } catch (Exception e) {
+ throw Throwables.propagate(e);
+ }
+ }
+
+ @Override
+ public float estimateCost(Rule rule, Sentence sentence) {
+ return 0.0f;
+ }
+
+ private final int[] getSourcePhrase(int anchor) {
+ int idx;
+ int length = Math.min(anchor, MAX_PHRASE_LENGTH - 1)
+ + Math.min(source.length - anchor, MAX_PHRASE_LENGTH - 1);
+ if (length <= 0)
+ return null;
+ int[] phrase = new int[length];
+ idx = 0;
+ for (int p = Math.max(0, anchor - MAX_PHRASE_LENGTH + 1); p < Math.min(source.length, anchor
+ + MAX_PHRASE_LENGTH - 1); p++)
+ phrase[idx++] = source[p];
+ return phrase;
+ }
+
+ private float getSimilarity(List<int[]> batch) {
+ float similarity = 0.0f;
+ int count = 0;
+ StringBuilder query = new StringBuilder();
+ List<String> to_cache = new ArrayList<String>();
+ query.append("xb");
+ for (int i = 0; i < batch.size(); i += 2) {
+ int[] source = batch.get(i);
+ int[] target = batch.get(i + 1);
+
+ if (Arrays.equals(source, target)) {
+ similarity += 1;
+ count++;
+ } else {
+ String source_string = Vocabulary.getWords(source);
+ String target_string = Vocabulary.getWords(target);
+
+ String both;
+ if (source_string.compareTo(target_string) > 0)
+ both = source_string + " ||| " + target_string;
+ else
+ both = target_string + " ||| " + source_string;
+
+ Float cached = cache.get(both);
+ if (cached != null) {
+ // System.err.println("SIM: " + source_string + " X " + target_string + " = " + cached);
+ similarity += cached;
+ count++;
+ } else {
+ query.append("\t").append(source_string);
+ query.append("\t").append(target_string);
+ to_cache.add(both);
+ }
+ }
+ }
+ if (!to_cache.isEmpty()) {
+ try {
+ serverAsk.println(query.toString());
+ String response = serverReply.readLine();
+ String[] scores = response.split("\\s+");
+ for (int i = 0; i < scores.length; i++) {
+ Float score = Float.parseFloat(scores[i]);
+ cache.put(to_cache.get(i), score);
+ similarity += score;
+ count++;
+ }
+ } catch (Exception e) {
+ return 0;
+ }
+ }
+ return (count == 0 ? 0 : similarity / count);
+ }
+}
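getSimilarity() above batches the uncached phrase pairs into a single request line and reads back one whitespace-separated score per pair. A standalone sketch of that exchange (not part of this commit; the phrase pairs and the reply line are invented, while the "xb" prefix, the tab-separated pairs, and the reply parsing mirror the code above):

public class SimilarityProtocolExample {
  public static void main(String[] args) {
    // Build the request exactly as getSimilarity() does: "xb" followed by tab-separated pairs.
    StringBuilder query = new StringBuilder();
    query.append("xb");
    query.append("\t").append("the house").append("\t").append("the home");
    query.append("\t").append("a red car").append("\t").append("a crimson automobile");
    System.out.println(query); // one line sent via serverAsk.println(...)

    // A hypothetical one-line reply: one score per pair, parsed exactly as in getSimilarity().
    String response = "0.91 0.47";
    String[] scores = response.split("\\s+");
    float sum = 0f;
    for (String s : scores) {
      sum += Float.parseFloat(s);
    }
    System.out.println("mean similarity = " + sum / scores.length); // ~0.69
  }
}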
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java
new file mode 100644
index 0000000..e117fde
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/DPState.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.state_maintenance;
+
+/**
+ * Abstract class enforcing explicit implementation of the standard methods.
+ *
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author Juri Ganitkevitch, juri@cs.jhu.edu
+ */
+public abstract class DPState {
+
+ public abstract String toString();
+
+ public abstract int hashCode();
+
+ public abstract boolean equals(Object other);
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java
new file mode 100644
index 0000000..4fdc631
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/KenLMState.java
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.state_maintenance;
+
+/**
+ * Maintains a state pointer used by KenLM to implement left-state minimization.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Juri Ganitkevitch juri@cs.jhu.edu
+ */
+public class KenLMState extends DPState {
+
+ private long state = 0;
+
+ public KenLMState() {
+ }
+
+ public KenLMState(long stateId) {
+ this.state = stateId;
+ }
+
+ public long getState() {
+ return state;
+ }
+
+ @Override
+ public int hashCode() {
+ return (int) ((getState() >> 32) ^ getState());
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ return (other instanceof KenLMState && this.getState() == ((KenLMState) other).getState());
+ }
+
+ @Override
+ public String toString() {
+ return String.format("[KenLMState %d]", getState());
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java
new file mode 100644
index 0000000..b269bd9
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/state_maintenance/NgramDPState.java
@@ -0,0 +1,100 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.state_maintenance;
+
+import java.util.Arrays;
+
+import org.apache.joshua.corpus.Vocabulary;
+
+/**
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author Juri Ganitkevitch, juri@cs.jhu.edu
+ */
+public class NgramDPState extends DPState {
+
+ private int[] left;
+ private int[] right;
+
+ private int hash = 0;
+
+ public NgramDPState(int[] l, int[] r) {
+ left = l;
+ right = r;
+ assertLengths();
+ }
+
+ public void setLeftLMStateWords(int[] words) {
+ left = words;
+ assertLengths();
+ }
+
+ public int[] getLeftLMStateWords() {
+ return left;
+ }
+
+ public void setRightLMStateWords(int[] words) {
+ right = words;
+ assertLengths();
+ }
+
+ public int[] getRightLMStateWords() {
+ return right;
+ }
+
+ private final void assertLengths() {
+ if (left.length != right.length)
+ throw new RuntimeException("Unequal lengths in left and right state: < "
+ + Vocabulary.getWords(left) + " | " + Vocabulary.getWords(right) + " >");
+ }
+
+ @Override
+ public int hashCode() {
+ if (hash == 0) {
+ hash = 31 + Arrays.hashCode(left);
+ hash = hash * 19 + Arrays.hashCode(right);
+ }
+ return hash;
+ }
+
+ @Override
+ public boolean equals(Object other) {
+ if (other instanceof NgramDPState) {
+ NgramDPState that = (NgramDPState) other;
+ if (this.left.length == that.left.length && this.right.length == that.right.length) {
+ for (int i = 0; i < left.length; ++i)
+ if (this.left[i] != that.left[i] || this.right[i] != that.right[i])
+ return false;
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public String toString() {
+ StringBuilder sb = new StringBuilder();
+ sb.append("<");
+ for (int id : left)
+ sb.append(" " + Vocabulary.word(id));
+ sb.append(" |");
+ for (int id : right)
+ sb.append(" " + Vocabulary.word(id));
+ sb.append(" >");
+ return sb.toString();
+ }
+}
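
A rough usage sketch (the word IDs below are made-up placeholders rather than real Vocabulary.id() results): for a trigram LM each hypothesis keeps at most two boundary words on each side, the two context arrays must have equal length or the constructor throws, and states with identical boundary words compare equal, which is what lets the decoder recombine hypotheses the LM cannot distinguish.

import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState;

public class NgramDPStateSketch {
  public static void main(String[] args) {
    int[] leftContext  = {17, 42};  // first (order - 1) word IDs of the hypothesis
    int[] rightContext = {99, 7};   // last (order - 1) word IDs of the hypothesis
    NgramDPState state = new NgramDPState(leftContext, rightContext);
    // Equal boundary words imply equal states, enabling recombination.
    NgramDPState same = new NgramDPState(new int[] {17, 42}, new int[] {99, 7});
    System.out.println(state.equals(same)); // prints: true
  }
}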
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
new file mode 100644
index 0000000..01260ab
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/AbstractGrammar.java
@@ -0,0 +1,230 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.Arrays;
+import java.util.HashSet;
+import java.util.List;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.phrase.PhraseTable;
+import org.apache.joshua.decoder.segment_file.Token;
+import org.apache.joshua.lattice.Arc;
+import org.apache.joshua.lattice.Lattice;
+import org.apache.joshua.lattice.Node;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Partial implementation of the <code>Grammar</code> interface that provides logic for sorting a
+ * grammar.
+ * <p>
+ * <em>Note</em>: New classes implementing the <code>Grammar</code> interface should probably
+ * inherit from this class, unless a specific sorting technique different from that implemented by
+ * this class is required.
+ *
+ * @author Zhifei Li
+ * @author Lane Schwartz
+ * @author Matt Post post@cs.jhu.edu
+ */
+public abstract class AbstractGrammar implements Grammar {
+
+ /** Logger for this class. */
+ private static final Logger LOG = LoggerFactory.getLogger(AbstractGrammar.class);
+ /**
+ * Indicates whether the rules in this grammar have been sorted based on the latest feature
+ * function values.
+ */
+ protected boolean sorted = false;
+
+ /*
+ * The grammar's owner, used to determine which weights are applicable to the dense features found
+ * within.
+ */
+ protected final OwnerId owner;
+
+ /*
+ * The maximum length of a source-side phrase. Mostly used by the phrase-based decoder.
+ */
+ protected int maxSourcePhraseLength = -1;
+
+ /**
+ * Returns the longest source phrase read.
+ *
+ * @return the longest source phrase read (nonterminal + terminal symbols).
+ */
+ @Override
+ public int getMaxSourcePhraseLength() {
+ return maxSourcePhraseLength;
+ }
+
+ @Override
+ public OwnerId getOwner() {
+ return owner;
+ }
+
+ public int getSpanLimit() {
+ return spanLimit;
+ }
+
+ /* The maximum span of the input this grammar's rules can be applied to. */
+ protected final int spanLimit;
+
+ protected final JoshuaConfiguration joshuaConfiguration;
+
+ /**
+ * Creates an empty, unsorted grammar with the given owner and span limit.
+ *
+ * @see Grammar#isSorted()
+ * @param owner the associated decoder-wide {@link org.apache.joshua.decoder.ff.tm.OwnerMap}
+ * @param config a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
+ * @param spanLimit the maximum span of the input that this grammar's rules can be applied to.
+ */
+ public AbstractGrammar(final String owner, final JoshuaConfiguration config, final int spanLimit) {
+ this.sorted = false;
+ this.owner = OwnerMap.register(owner);
+ this.joshuaConfiguration = config;
+ this.spanLimit = spanLimit;
+ }
+
+ public static final int OOV_RULE_ID = 0;
+
+ /**
+ * Cube-pruning requires that the grammar be sorted based on the latest feature functions. To
+ * avoid synchronization, this method should be called before multiple threads are initialized for
+ * parallel decoding.
+ * @param models {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ public void sortGrammar(List<FeatureFunction> models) {
+ Trie root = getTrieRoot();
+ if (root != null) {
+ sort(root, models);
+ setSorted(true);
+ }
+ }
+
+ /* See Javadoc comments for Grammar interface. */
+ public boolean isSorted() {
+ return sorted;
+ }
+
+ /**
+ * Sets the flag indicating whether this grammar is sorted.
+ * <p>
+ * This method is called by {@link org.apache.joshua.decoder.ff.tm.AbstractGrammar#sortGrammar(List)}
+ * to indicate that the grammar has been sorted.</p>
+ *
+ * <p>Its scope is protected so that child classes that override <code>sortGrammar</code> will also
+ * be able to call this method to indicate that the grammar has been sorted.</p>
+ *
+ * @param sorted set to true if the grammar is sorted
+ */
+ protected void setSorted(boolean sorted) {
+ this.sorted = sorted;
+ LOG.debug("This grammar is now sorted: {}", this);
+ }
+
+ /**
+ * Recursively sorts the grammar using the provided feature functions.
+ * <p>
+ * This method first sorts the rules stored at the provided node, then recursively calls itself on
+ * the child nodes of the provided node.
+ *
+ * @param node Grammar node in the <code>Trie</code> whose rules should be sorted.
+ * @param models Feature function models to use during sorting.
+ */
+ private void sort(Trie node, List<FeatureFunction> models) {
+
+ if (node != null) {
+ if (node.hasRules()) {
+ RuleCollection rules = node.getRuleCollection();
+ LOG.debug("Sorting node {}", Arrays.toString(rules.getSourceSide()));
+
+ /* This causes the rules at this trie node to be sorted */
+ rules.getSortedRules(models);
+
+ if (LOG.isDebugEnabled()) {
+ StringBuilder s = new StringBuilder();
+ for (Rule r : rules.getSortedRules(models)) {
+ s.append("\n\t" + r.getLHS() + " ||| " + Arrays.toString(r.getFrench()) + " ||| "
+ + Arrays.toString(r.getEnglish()) + " ||| " + r.getFeatureVector() + " ||| "
+ + r.getEstimatedCost() + " " + r.getClass().getName() + "@"
+ + Integer.toHexString(System.identityHashCode(r)));
+ }
+ LOG.debug("{}", s);
+ }
+ }
+
+ if (node.hasExtensions()) {
+ for (Trie child : node.getExtensions()) {
+ sort(child, models);
+ }
+ } else {
+ LOG.debug("Node has 0 children to extend: {}", node);
+ }
+ }
+ }
+
+ // Write grammar to disk. The default implementation is a no-op; subclasses may override.
+ public void writeGrammarOnDisk(String file) {
+ }
+
+ /**
+ * Adds OOV rules for all words in the input lattice to the current grammar. Uses addOOVRules() so that
+ * sub-grammars can define different types of OOV rules if needed (as is used in {@link PhraseTable}).
+ *
+ * @param grammar Grammar in the Trie
+ * @param inputLattice the lattice representing the input sentence
+ * @param featureFunctions a list of feature functions used for scoring
+ * @param onlyTrue if true, only add rules for words that are actually OOV.
+ */
+ public static void addOOVRules(Grammar grammar, Lattice<Token> inputLattice,
+ List<FeatureFunction> featureFunctions, boolean onlyTrue) {
+ /*
+ * Add OOV rules; This should be called after the manual constraints have
+ * been set up.
+ */
+ HashSet<Integer> words = new HashSet<Integer>();
+ for (Node<Token> node : inputLattice) {
+ for (Arc<Token> arc : node.getOutgoingArcs()) {
+ // create a rule, but do not add into the grammar trie
+ // TODO: which grammar should we use to create an OOV rule?
+ int sourceWord = arc.getLabel().getWord();
+ if (sourceWord == Vocabulary.id(Vocabulary.START_SYM)
+ || sourceWord == Vocabulary.id(Vocabulary.STOP_SYM))
+ continue;
+
+ // Determine if word is actual OOV.
+ if (onlyTrue && ! Vocabulary.hasId(sourceWord))
+ continue;
+
+ words.add(sourceWord);
+ }
+ }
+
+ for (int sourceWord: words)
+ grammar.addOOVRules(sourceWord, featureFunctions);
+
+ // Sort all the rules (not much to actually do, this just marks it as sorted)
+ grammar.sortGrammar(featureFunctions);
+ }
+}
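
A hedged sketch of the intended call order (the grammar, input lattice, and feature functions are assumed to come from the caller; this is not the decoder's actual setup code): addOOVRules() is invoked once per input before decoding, and since it finishes by re-sorting the grammar, it should run on the main thread before any parallel decoding threads touch the rule lists.

import java.util.List;

import org.apache.joshua.decoder.ff.FeatureFunction;
import org.apache.joshua.decoder.ff.tm.AbstractGrammar;
import org.apache.joshua.decoder.ff.tm.Grammar;
import org.apache.joshua.decoder.segment_file.Token;
import org.apache.joshua.lattice.Lattice;

public class GrammarSetupSketch {
  /** Prepares a grammar for one input before decoding threads are started. */
  public static void prepare(Grammar grammar, Lattice<Token> inputLattice,
      List<FeatureFunction> featureFunctions) {
    // Adds OOV rules for uncovered input words and then re-sorts the grammar,
    // so cube pruning later sees rule lists in estimated-cost order without
    // needing any synchronization.
    AbstractGrammar.addOOVRules(grammar, inputLattice, featureFunctions, true);
  }
}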
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java
new file mode 100644
index 0000000..4cffb2f
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/BasicRuleCollection.java
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.joshua.decoder.ff.FeatureFunction;
+
+/**
+ * Basic collection of translation rules.
+ *
+ * @author Lane Schwartz
+ * @author Zhifei Li
+ */
+public class BasicRuleCollection implements RuleCollection {
+
+ /**
+ * Indicates whether the rules in this collection have been sorted based on the latest feature
+ * function values.
+ */
+ protected boolean sorted;
+
+ /** List of rules stored in this collection. */
+ protected final List<Rule> rules;
+
+ /** Number of nonterminals in the source pattern. */
+ protected int arity;
+
+ /**
+ * Sequence of terminals and nonterminals in the source pattern.
+ */
+ protected int[] sourceTokens;
+
+ /**
+ * Constructs an initially empty rule collection.
+ *
+ * @param arity Number of nonterminals in the source pattern
+ * @param sourceTokens Sequence of terminals and nonterminals in the source pattern
+ */
+ public BasicRuleCollection(int arity, int[] sourceTokens) {
+ this.rules = new ArrayList<Rule>();
+ this.sourceTokens = sourceTokens;
+ this.arity = arity;
+ this.sorted = false;
+ }
+
+ public int getArity() {
+ return this.arity;
+ }
+
+ /**
+ * Returns a list of the rules, without ensuring that they are first sorted.
+ */
+ @Override
+ public List<Rule> getRules() {
+ return this.rules;
+ }
+
+ @Override
+ public boolean isSorted() {
+ return sorted;
+ }
+
+ /**
+ * Return a list of rules sorted according to their estimated model costs.
+ */
+ @Override
+ public synchronized List<Rule> getSortedRules(List<FeatureFunction> models) {
+ if (! isSorted()) {
+ for (Rule rule: getRules())
+ rule.estimateRuleCost(models);
+
+ Collections.sort(rules, Rule.EstimatedCostComparator);
+ this.sorted = true;
+ }
+
+ return this.rules;
+ }
+
+ public int[] getSourceSide() {
+ return this.sourceTokens;
+ }
+}
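
The lazy, synchronized sort in getSortedRules() is a small idiom worth isolating. The standalone sketch below (not Joshua code) shows the same pattern, with an arbitrary comparator standing in for Rule.EstimatedCostComparator: sort on first access only, remember that the list is sorted, and synchronize so concurrent readers see a consistent ordering.

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class LazySortedList<T> {
  private final List<T> items = new ArrayList<>();
  private boolean sorted = false;

  public void add(T item) {
    items.add(item);
    sorted = false; // any previous ordering is now stale
  }

  /** Sorts on first access only; synchronized so concurrent readers see a consistent list. */
  public synchronized List<T> getSorted(Comparator<? super T> byEstimatedCost) {
    if (!sorted) {
      items.sort(byEstimatedCost);
      sorted = true;
    }
    return items;
  }
}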
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
new file mode 100644
index 0000000..ce1e7d1
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/CreateGlueGrammar.java
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import static org.apache.joshua.decoder.ff.tm.packed.PackedGrammar.VOCABULARY_FILENAME;
+import static org.apache.joshua.util.FormatUtils.cleanNonTerminal;
+import static org.apache.joshua.util.FormatUtils.isNonterminal;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.util.io.LineReader;
+
+import org.kohsuke.args4j.CmdLineException;
+import org.kohsuke.args4j.CmdLineParser;
+import org.kohsuke.args4j.Option;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+public class CreateGlueGrammar {
+
+
+ private static final Logger LOG = LoggerFactory.getLogger(CreateGlueGrammar.class);
+
+ private final Set<String> nonTerminalSymbols = new HashSet<>();
+
+ @Option(name = "--grammar", aliases = {"-g"}, required = true, usage = "provide grammar to determine list of NonTerminal symbols.")
+ private String grammarPath;
+
+ @Option(name = "--goal", aliases = {"-goal"}, required = false, usage = "specify custom GOAL symbol. Default: 'GOAL'")
+ private String goalSymbol = cleanNonTerminal(new JoshuaConfiguration().goal_symbol);
+
+ /* Rule templates */
+ // [GOAL] ||| <s> ||| <s> ||| 0
+ private static final String R_START = "[%1$s] ||| <s> ||| <s> ||| 0";
+ // [GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
+ private static final String R_TWO = "[%1$s] ||| [%1$s,1] [%2$s,2] ||| [%1$s,1] [%2$s,2] ||| -1";
+ // [GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
+ private static final String R_END = "[%1$s] ||| [%1$s,1] </s> ||| [%1$s,1] </s> ||| 0";
+ // [GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0
+ private static final String R_TOP = "[%1$s] ||| <s> [%2$s,1] </s> ||| <s> [%2$s,1] </s> ||| 0";
+
+ private void run() throws IOException {
+
+ File grammar_file = new File(grammarPath);
+ if (!grammar_file.exists()) {
+ throw new IOException("Grammar file doesn't exist: " + grammarPath);
+ }
+
+ // in case of a packedGrammar, we read the serialized vocabulary,
+ // collecting all cleaned nonTerminal symbols.
+ if (grammar_file.isDirectory()) {
+ Vocabulary.read(new File(grammarPath + File.separator + VOCABULARY_FILENAME));
+ for (int i = 0; i < Vocabulary.size(); ++i) {
+ final String token = Vocabulary.word(i);
+ if (isNonterminal(token)) {
+ nonTerminalSymbols.add(cleanNonTerminal(token));
+ }
+ }
+ // otherwise we collect cleaned left-hand sides from the rules in the text grammar.
+ } else {
+ final LineReader reader = new LineReader(grammarPath);
+ while (reader.hasNext()) {
+ final String line = reader.next();
+ int lhsStart = line.indexOf("[") + 1;
+ int lhsEnd = line.indexOf("]");
+ if (lhsStart < 1 || lhsEnd < 0) {
+ LOG.info("malformed rule: {}\n", line);
+ continue;
+ }
+ final String lhs = line.substring(lhsStart, lhsEnd);
+ nonTerminalSymbols.add(lhs);
+ }
+ }
+
+ LOG.info("{} nonTerminal symbols read: {}", nonTerminalSymbols.size(),
+ nonTerminalSymbols.toString());
+
+ // write glue rules to stdout
+
+ System.out.println(String.format(R_START, goalSymbol));
+
+ for (String nt : nonTerminalSymbols)
+ System.out.println(String.format(R_TWO, goalSymbol, nt));
+
+ System.out.println(String.format(R_END, goalSymbol));
+
+ for (String nt : nonTerminalSymbols)
+ System.out.println(String.format(R_TOP, goalSymbol, nt));
+
+ }
+
+ public static void main(String[] args) throws IOException {
+ final CreateGlueGrammar glueCreator = new CreateGlueGrammar();
+ final CmdLineParser parser = new CmdLineParser(glueCreator);
+
+ try {
+ parser.parseArgument(args);
+ glueCreator.run();
+ } catch (CmdLineException e) {
+ LOG.error(e.getMessage(), e);
+ parser.printUsage(System.err);
+ System.exit(1);
+ }
+ }
+}
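
For illustration, with the default goal symbol GOAL and a grammar whose only left-hand side is X, the four templates above expand to the following glue grammar on stdout:

[GOAL] ||| <s> ||| <s> ||| 0
[GOAL] ||| [GOAL,1] [X,2] ||| [GOAL,1] [X,2] ||| -1
[GOAL] ||| [GOAL,1] </s> ||| [GOAL,1] </s> ||| 0
[GOAL] ||| <s> [X,1] </s> ||| <s> [X,1] </s> ||| 0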
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
new file mode 100644
index 0000000..8f90d1b
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/Grammar.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.util.List;
+
+import org.apache.joshua.decoder.ff.FeatureFunction;
+
+/**
+ * Grammar is an interface for wrapping a trie of grammar rules in order to store holistic metadata.
+ *
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public interface Grammar {
+
+ /**
+ * Gets the root of the <code>Trie</code> backing this grammar.
+ * <p>
+ * <em>Note</em>: This method should run as a small constant-time function.
+ *
+ * @return the root of the <code>Trie</code> backing this grammar
+ */
+ Trie getTrieRoot();
+
+ /**
+ * After calling this method, the rules in this grammar are guaranteed to be sorted based on the
+ * latest feature function values.
+ * <p>
+ * Cube-pruning requires that the grammar be sorted based on the latest feature functions.
+ *
+ * @param models list of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ void sortGrammar(List<FeatureFunction> models);
+
+ /**
+ * Determines whether the rules in this grammar have been sorted based on the latest feature
+ * function values.
+ * <p>
+ * This method is needed for the cube-pruning algorithm.
+ *
+ * @return <code>true</code> if the rules in this grammar have been sorted based on the latest
+ * feature function values, <code>false</code> otherwise
+ */
+ boolean isSorted();
+
+ /**
+ * Returns whether this grammar has any valid rules for covering a particular span of a sentence.
+ * Hiero's "glue" grammar will only say True if the span is longer than our span limit, and is
+ * anchored at startIndex==0. Hiero's "regular" grammar will only say True if the span is less
+ * than the span limit. Other grammars, e.g. for rule-based systems, may have different behaviors.
+ *
+ * @param startIndex Indicates the starting index of a phrase in a source input phrase, or a
+ * starting node identifier in a source input lattice
+ * @param endIndex Indicates the ending index of a phrase in a source input phrase, or an ending
+ * node identifier in a source input lattice
+ * @param pathLength Length of the input path in a source input lattice. If a source input phrase
+ * is used instead of a lattice, this value will likely be ignored by the underlying
+ * implementation, but would normally be defined as <code>endIndex-startIndex</code>
+ * @return true if there is a rule for this span
+ */
+ boolean hasRuleForSpan(int startIndex, int endIndex, int pathLength);
+
+ /**
+ * Gets the number of rules stored in the grammar.
+ *
+ * @return the number of rules stored in the grammar
+ */
+ int getNumRules();
+
+ /**
+ * Returns the number of dense features.
+ *
+ * @return the number of dense features
+ */
+ int getNumDenseFeatures();
+
+ /**
+ * Return the grammar's owner.
+ * @return grammar owner
+ */
+ OwnerId getOwner();
+
+ /**
+ * Return the maximum source phrase length (terminals + nonterminals)
+ * @return the maximum source phrase length
+ */
+ int getMaxSourcePhraseLength();
+
+ /**
+ * Add an OOV rule for the requested word for the grammar.
+ *
+ * @param word input word to add rules to
+ * @param featureFunctions a {@link java.util.List} of {@link org.apache.joshua.decoder.ff.FeatureFunction}'s
+ */
+ void addOOVRules(int word, List<FeatureFunction> featureFunctions);
+
+ /**
+ * Add a rule to the grammar.
+ *
+ * @param rule the {@link org.apache.joshua.decoder.ff.tm.Rule}
+ */
+ void addRule(Rule rule);
+}
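
An illustrative sketch of the two hasRuleForSpan() policies described in the interface comments (simplified stand-ins following that description, not the actual Joshua implementations):

public class SpanPolicySketch {
  /** A "regular" grammar: usable only on spans shorter than the span limit. */
  static boolean regularGrammarHasRuleForSpan(int startIndex, int endIndex, int spanLimit) {
    return (endIndex - startIndex) < spanLimit;
  }

  /** A "glue" grammar: usable on spans anchored at the sentence start that exceed the limit. */
  static boolean glueGrammarHasRuleForSpan(int startIndex, int endIndex, int spanLimit) {
    return startIndex == 0 && (endIndex - startIndex) > spanLimit;
  }
}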
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/5735d9ae/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
new file mode 100644
index 0000000..df00255
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ff/tm/GrammarReader.java
@@ -0,0 +1,158 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder.ff.tm;
+
+import java.io.IOException;
+import java.util.Iterator;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This is a base class for simple, ASCII line-based grammars that are stored on disk.
+ *
+ * @author Juri Ganitkevitch
+ *
+ */
+public abstract class GrammarReader<R extends Rule> implements Iterable<R>, Iterator<R> {
+
+ private static final Logger LOG = LoggerFactory.getLogger(GrammarReader.class);
+
+ protected static String fieldDelimiter;
+ protected static String description;
+
+ protected String fileName;
+ protected LineReader reader;
+ protected String lookAhead;
+ protected int numRulesRead;
+
+
+ // Dummy constructor, for subclasses that do not read from a file.
+ public GrammarReader() {
+ this.fileName = null;
+ }
+
+ public GrammarReader(String fileName) throws IOException {
+ this.fileName = fileName;
+ this.reader = new LineReader(fileName);
+ LOG.info("Reading grammar from file {}...", fileName);
+ numRulesRead = 0;
+ advanceReader();
+ }
+
+ // the reader is the iterator itself
+ public Iterator<R> iterator() {
+ return this;
+ }
+
+ /** Unsupported Iterator method. */
+ public void remove() throws UnsupportedOperationException {
+ throw new UnsupportedOperationException();
+ }
+
+ public void close() {
+ if (null != this.reader) {
+ try {
+ this.reader.close();
+ } catch (IOException e) {
+ LOG.warn(e.getMessage(), e);
+ LOG.error("Error closing grammar file stream: {}", this.fileName);
+ }
+ this.reader = null;
+ }
+ }
+
+ /**
+ * For correct behavior, <code>close</code> must be called on every GrammarReader; however, this
+ * code attempts to avoid resource leaks.
+ *
+ * @see org.apache.joshua.util.io.LineReader
+ */
+ @Override
+ protected void finalize() throws Throwable {
+ if (this.reader != null) {
+ LOG.error("Grammar file stream was not closed, this indicates a coding error: {}",
+ this.fileName);
+ }
+
+ this.close();
+ super.finalize();
+ }
+
+ @Override
+ public boolean hasNext() {
+ return lookAhead != null;
+ }
+
+ private void advanceReader() {
+ try {
+ lookAhead = reader.readLine();
+ numRulesRead++;
+ } catch (IOException e) {
+ LOG.error("Error reading grammar from file: {}", fileName);
+ LOG.error(e.getMessage(), e);
+ }
+ if (lookAhead == null && reader != null) {
+ this.close();
+ }
+ }
+
+ /**
+ * Read the next line, and print reader progress.
+ */
+ @Override
+ public R next() {
+ String line = lookAhead;
+
+ int oldProgress = reader.progress();
+ advanceReader();
+
+
+ if (Decoder.VERBOSE >= 1) {
+ int newProgress = (reader != null) ? reader.progress() : 100;
+
+ //TODO: review this code. It is better to print progress based on time gap (like for every 1s or 2sec) than %!
+ if (newProgress > oldProgress) {
+ for (int i = oldProgress + 1; i <= newProgress; i++)
+ if (i == 97) {
+ System.err.print("1");
+ } else if (i == 98) {
+ System.err.print("0");
+ } else if (i == 99) {
+ System.err.print("0");
+ } else if (i == 100) {
+ System.err.println("%");
+ } else if (i % 10 == 0) {
+ System.err.print(String.format("%d", i));
+ System.err.flush();
+ } else if ((i - 1) % 10 == 0)
+ ; // skip at 11 since 10, 20, etc take two digits
+ else {
+ System.err.print(".");
+ System.err.flush();
+ }
+ }
+ }
+ return parseLine(line);
+ }
+
+ protected abstract R parseLine(String line);
+}
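
GrammarReader's iteration is built on a one-line look-ahead: the constructor primes lookAhead, hasNext() simply checks whether it is non-null, and next() hands back the buffered line while advancing the reader. The standalone sketch below (plain java.io, not Joshua code) isolates that pattern under those assumptions:

import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.util.Iterator;

public class LookAheadLineIterator implements Iterator<String> {
  private final BufferedReader reader;
  private String lookAhead;

  public LookAheadLineIterator(Reader in) throws IOException {
    this.reader = new BufferedReader(in);
    this.lookAhead = reader.readLine(); // prime the buffer, like advanceReader()
  }

  @Override
  public boolean hasNext() {
    return lookAhead != null;
  }

  @Override
  public String next() {
    String line = lookAhead;
    try {
      lookAhead = reader.readLine(); // advance so the next hasNext() is cheap
    } catch (IOException e) {
      lookAhead = null;
    }
    return line;
  }
}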