You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:55 UTC
[44/60] [partial] incubator-joshua git commit: maven multi-module
layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/corpus/syntax/ArraySyntaxTree.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/corpus/syntax/ArraySyntaxTree.java b/joshua-core/src/main/java/org/apache/joshua/corpus/syntax/ArraySyntaxTree.java
new file mode 100644
index 0000000..f374279
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/corpus/syntax/ArraySyntaxTree.java
@@ -0,0 +1,411 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.corpus.syntax;
+
+import java.io.Externalizable;
+import java.io.IOException;
+import java.io.ObjectInput;
+import java.io.ObjectOutput;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Map;
+import java.util.Set;
+import java.util.Stack;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.util.io.LineReader;
+
+public class ArraySyntaxTree implements SyntaxTree, Externalizable {
+
+ /**
+ * Offsets into forwardLattice, one per token boundary: the list starts with 0 and gains one entry
+ * (the lattice size at that moment) for every terminal read, so in a well-formed tree the last
+ * element points to forwardLattice.size().
+ */
+ private ArrayList<Integer> forwardIndex;
+ // Flat list of (label id, span length) pairs, written in the order constituents are opened.
+ private ArrayList<Integer> forwardLattice;
+ // Offsets into backwardLattice, recorded at every terminal in the same way as forwardIndex.
+ private ArrayList<Integer> backwardIndex;
+ // Flat list of (label id, span length) pairs, appended in the order constituents are closed.
+ private ArrayList<Integer> backwardLattice;
+
+ // Vocabulary ids of the terminal tokens, in reading order.
+ private ArrayList<Integer> terminals;
+
+ // Controls whether the backward index/lattice are built; nothing in this class changes it.
+ private boolean useBackwardLattice = true;
+
+ // Upper bound on how many labels getConcatenatedLabels joins with "+".
+ private static final int MAX_CONCATENATIONS = 3;
+ // getConcatenatedLabels stops expanding once this many labels have been collected.
+ private static final int MAX_LABELS = 100;
+
+ /**
+ * Creates an empty tree. The index, lattice and terminal lists are all left unset (null); none
+ * of the query methods can be used on such an instance until the lists are populated.
+ */
+ public ArraySyntaxTree() {
+   this.terminals = null;
+
+   this.forwardLattice = null;
+   this.forwardIndex = null;
+   this.backwardLattice = null;
+   this.backwardIndex = null;
+ }
+
+
+ /**
+ * Creates a tree from a single bracketed (Penn Treebank style) parse.
+ *
+ * @param parsed_line the parse to load
+ */
+ public ArraySyntaxTree(String parsed_line) {
+   this.initialize();
+   this.appendFromPennFormat(parsed_line);
+ }
+
+
+ /**
+ * Returns a collection of single-non-terminal labels that exactly cover the specified span in the
+ * lattice.
+ */
+ public Collection<Integer> getConstituentLabels(int from, int to) {
+ Collection<Integer> labels = new HashSet<Integer>();
+ int span_length = to - from;
+ for (int i = forwardIndex.get(from); i < forwardIndex.get(from + 1); i += 2) {
+ int current_span = forwardLattice.get(i + 1);
+ if (current_span == span_length)
+ labels.add(forwardLattice.get(i));
+ else if (current_span < span_length) break;
+ }
+ return labels;
+ }
+
+
+ /**
+ * Returns the label of the first stored constituent that starts at {@code from} and is exactly
+ * {@code to - from} tokens long.
+ *
+ * The previous version also built a ":"-joined label from a local stack, but nothing was ever
+ * pushed onto that stack, so that path could never run; it has been removed without changing
+ * the result.
+ *
+ * @param from start position of the span
+ * @param to end position of the span (exclusive)
+ * @return the label id of a matching constituent, or 0 if there is none
+ */
+ public int getOneConstituent(int from, int to) {
+   int spanLength = to - from;
+
+   for (int i = forwardIndex.get(from); i < forwardIndex.get(from + 1); i += 2) {
+     int currentSpan = forwardLattice.get(i + 1);
+     if (currentSpan == spanLength) {
+       return forwardLattice.get(i);
+     }
+     // stop scanning at the first entry shorter than the requested span
+     if (currentSpan < spanLength) {
+       break;
+     }
+   }
+   return 0;
+ }
+
+
+ /**
+ * Looks for the first split point at which the span is covered by two adjacent constituents and
+ * returns the id of the label "X+Y" built from them.
+ *
+ * @param from start position of the span
+ * @param to end position of the span (exclusive)
+ * @return the id of the concatenated label, or 0 if no split point works
+ */
+ public int getOneSingleConcatenation(int from, int to) {
+   int split = from + 1;
+   while (split < to) {
+     final int left = getOneConstituent(from, split);
+     if (left != 0) {
+       final int right = getOneConstituent(split, to);
+       if (right != 0) {
+         final String joined = Vocabulary.word(left) + "+" + Vocabulary.word(right);
+         return Vocabulary.id(adjustMarkup(joined));
+       }
+     }
+     split++;
+   }
+   return 0;
+ }
+
+
+ /**
+ * Looks for the first pair of split points at which the span is covered by three adjacent
+ * constituents and returns the id of the label "X+Y+Z" built from them.
+ *
+ * @param from start position of the span
+ * @param to end position of the span (exclusive)
+ * @return the id of the concatenated label, or 0 if no pair of split points works
+ */
+ public int getOneDoubleConcatenation(int from, int to) {
+   for (int first = from + 1; first < to - 1; first++) {
+     for (int second = first + 1; second < to; second++) {
+       final int left = getOneConstituent(from, first);
+       if (left != 0) {
+         final int middle = getOneConstituent(first, second);
+         if (middle != 0) {
+           final int right = getOneConstituent(second, to);
+           if (right != 0) {
+             final String joined =
+                 Vocabulary.word(left) + "+" + Vocabulary.word(middle) + "+" + Vocabulary.word(right);
+             return Vocabulary.id(adjustMarkup(joined));
+           }
+         }
+       }
+     }
+   }
+   return 0;
+ }
+
+
+ /**
+ * Looks for a label of the form "X/Y": X is a constituent covering [from, end) for some
+ * {@code end > to}, and Y is a constituent covering the missing right part [to, end).
+ *
+ * The candidate end positions are bounded by the number of terminals. The previous bound,
+ * {@code forwardLattice.size()}, is twice the number of constituents and unrelated to sentence
+ * positions: in flat trees it could be smaller than the sentence length, so valid end positions
+ * were never tried. No constituent can end beyond the last terminal, so nothing is lost.
+ *
+ * @param from start position of the span
+ * @param to end position of the span (exclusive)
+ * @return the id of the "X/Y" label, or 0 if none exists
+ */
+ public int getOneRightSideCCG(int from, int to) {
+   final int sentenceEnd = terminals.size();
+   for (int end = to + 1; end <= sentenceEnd; end++) {
+     int x = getOneConstituent(from, end);
+     if (x == 0) continue;
+     int y = getOneConstituent(to, end);
+     if (y == 0) continue;
+     String label = Vocabulary.word(x) + "/" + Vocabulary.word(y);
+     return Vocabulary.id(adjustMarkup(label));
+   }
+   return 0;
+ }
+
+
+ /**
+ * Looks for a backslash label: walking start positions leftwards from {@code from - 1}, finds
+ * a constituent X covering [start, to) together with a constituent Y covering the missing left
+ * part [start, from), and returns the id of the label Y, backslash, X.
+ *
+ * @param from start position of the span
+ * @param to end position of the span (exclusive)
+ * @return the id of the label, or 0 if none exists
+ */
+ public int getOneLeftSideCCG(int from, int to) {
+   int start = from - 1;
+   while (start >= 0) {
+     final int whole = getOneConstituent(start, to);
+     if (whole != 0) {
+       final int missing = getOneConstituent(start, from);
+       if (missing != 0) {
+         final String label = Vocabulary.word(missing) + "\\" + Vocabulary.word(whole);
+         return Vocabulary.id(adjustMarkup(label));
+       }
+     }
+     start--;
+   }
+   return 0;
+ }
+
+
+ /**
+ * Returns a collection of concatenated non-terminal labels that exactly cover the specified span
+ * in the lattice. The number of non-terminals concatenated is limited by MAX_CONCATENATIONS and
+ * the total number of labels returned is bounded by MAX_LABELS.
+ *
+ * The search is an explicit depth-first walk: three parallel stacks hold, for every partial
+ * cover, its label id so far, the position it reaches, and how many labels it joins.
+ *
+ * @param from start position of the span
+ * @param to end position of the span (exclusive)
+ * @return ids of "+"-joined labels whose constituents tile the span; empty if there are none
+ */
+ public Collection<Integer> getConcatenatedLabels(int from, int to) {
+ Collection<Integer> labels = new HashSet<Integer>();
+
+ int span_length = to - from;
+ Stack<Integer> nt_stack = new Stack<Integer>();
+ Stack<Integer> pos_stack = new Stack<Integer>();
+ Stack<Integer> depth_stack = new Stack<Integer>();
+
+ // seed stacks (reverse order to save on iterations, longer spans)
+ // only constituents strictly shorter than the span are seeded; the walk over the entries at
+ // 'from' stops at the first one that is as long as the span or longer
+ for (int i = forwardIndex.get(from + 1) - 2; i >= forwardIndex.get(from); i -= 2) {
+ int current_span = forwardLattice.get(i + 1);
+ if (current_span < span_length) {
+ nt_stack.push(forwardLattice.get(i));
+ pos_stack.push(from + current_span);
+ depth_stack.push(1);
+ } else if (current_span >= span_length) break;
+ }
+
+ // expand partial covers until the stacks run dry or enough labels were found
+ while (!nt_stack.isEmpty() && labels.size() < MAX_LABELS) {
+ int nt = nt_stack.pop();
+ int pos = pos_stack.pop();
+ int depth = depth_stack.pop();
+
+ // maximum depth reached without filling span
+ if (depth == MAX_CONCATENATIONS) continue;
+
+ int remaining_span = to - pos;
+ // try every constituent starting at pos, last stored entry first; stop at the first one that
+ // would overshoot the end of the span
+ for (int i = forwardIndex.get(pos + 1) - 2; i >= forwardIndex.get(pos); i -= 2) {
+ int current_span = forwardLattice.get(i + 1);
+ if (current_span > remaining_span) break;
+
+ // create and look up concatenated label
+ int concatenated_nt =
+ Vocabulary.id(adjustMarkup(Vocabulary.word(nt) + "+"
+ + Vocabulary.word(forwardLattice.get(i))));
+ if (current_span < remaining_span) {
+ // still short of the span end: keep as a partial cover for further extension
+ nt_stack.push(concatenated_nt);
+ pos_stack.push(pos + current_span);
+ depth_stack.push(depth + 1);
+ } else if (current_span == remaining_span) {
+ // the span is tiled exactly: record the finished label
+ labels.add(concatenated_nt);
+ }
+ }
+ }
+
+ return labels;
+ }
+
+ // TODO: can pre-compute all that in top-down fashion.
+ /**
+ * Collects slash-style labels for a span that is part of a larger constituent.
+ *
+ * If the first lattice entry read for 'from' is longer than the span, forward-slash labels
+ * main/missing are built: 'main' starts at 'from' and ends at some position beyond 'to',
+ * 'missing' starts at 'to' and ends at that same position. Otherwise, and only when the
+ * backward lattice is in use, backslash labels are built the mirrored way: 'main' ends at 'to'
+ * and starts left of 'from', 'missing' ends at 'from' and starts at that same position.
+ *
+ * @param from start position of the span
+ * @param to end position of the span (exclusive)
+ * @return ids of the slash labels found; empty if there are none
+ */
+ public Collection<Integer> getCcgLabels(int from, int to) {
+ Collection<Integer> labels = new HashSet<Integer>();
+
+ int span_length = to - from;
+ // TODO: range checks on the to and from
+
+ // NOTE(review): this reads the entry at offset forwardIndex[from] without checking that a
+ // constituent actually starts at 'from'; if none does, the entry belongs to a later position
+ // (or lies past the end of the lattice) -- confirm callers guarantee one exists.
+ boolean is_prefix = (forwardLattice.get(forwardIndex.get(from) + 1) > span_length);
+ if (is_prefix) {
+ // end position -> labels of constituents that start at 'from' and end there
+ Map<Integer, Set<Integer>> main_constituents = new HashMap<Integer, Set<Integer>>();
+ // find missing to the right
+ for (int i = forwardIndex.get(from); i < forwardIndex.get(from + 1); i += 2) {
+ int current_span = forwardLattice.get(i + 1);
+ if (current_span <= span_length)
+ break;
+ else {
+ int end_pos = forwardLattice.get(i + 1) + from;
+ Set<Integer> nts = main_constituents.get(end_pos);
+ if (nts == null) main_constituents.put(end_pos, new HashSet<Integer>());
+ main_constituents.get(end_pos).add(forwardLattice.get(i));
+ }
+ }
+ // pair every constituent starting at 'to' with the main constituents sharing its end position
+ for (int i = forwardIndex.get(to); i < forwardIndex.get(to + 1); i += 2) {
+ Set<Integer> main_set = main_constituents.get(to + forwardLattice.get(i + 1));
+ if (main_set != null) {
+ for (int main : main_set)
+ labels.add(Vocabulary.id(adjustMarkup(Vocabulary.word(main) + "/"
+ + Vocabulary.word(forwardLattice.get(i)))));
+ }
+ }
+ }
+
+ if (!is_prefix) {
+ if (useBackwardLattice) {
+ // check if there is any possible higher-level constituent overlapping
+ // to_end is the offset just past the backward entries recorded for position 'to'
+ int to_end =
+ (to == backwardIndex.size() - 1) ? backwardLattice.size() : backwardIndex.get(to + 1);
+ // check longest span ending in to..
+ if (backwardLattice.get(to_end - 1) <= span_length) return labels;
+
+ // start position -> labels of constituents that end at 'to' and start there
+ Map<Integer, Set<Integer>> main_constituents = new HashMap<Integer, Set<Integer>>();
+ // find missing to the left
+ for (int i = to_end - 2; i >= backwardIndex.get(to); i -= 2) {
+ int current_span = backwardLattice.get(i + 1);
+ if (current_span <= span_length)
+ break;
+ else {
+ int start_pos = to - backwardLattice.get(i + 1);
+ Set<Integer> nts = main_constituents.get(start_pos);
+ if (nts == null) main_constituents.put(start_pos, new HashSet<Integer>());
+ main_constituents.get(start_pos).add(backwardLattice.get(i));
+ }
+ }
+ // pair every constituent ending at 'from' with the main constituents sharing its start
+ for (int i = backwardIndex.get(from); i < backwardIndex.get(from + 1); i += 2) {
+ Set<Integer> main_set = main_constituents.get(from - backwardLattice.get(i + 1));
+ if (main_set != null) {
+ for (int main : main_set)
+ labels.add(Vocabulary.id(adjustMarkup(Vocabulary.word(main) + "\\"
+ + Vocabulary.word(backwardLattice.get(i)))));
+ }
+ }
+ } else {
+ // TODO: bothersome no-backwards-arrays method.
+ }
+ }
+ return labels;
+ }
+
+ /**
+ * Returns the ids of all terminals stored in this tree.
+ *
+ * @return a fresh array holding every terminal id in order
+ */
+ @Override
+ public int[] getTerminals() {
+   final int count = terminals.size();
+   return getTerminals(0, count);
+ }
+
+ /**
+ * Returns the ids of the terminals in the half-open range [from, to).
+ *
+ * @param from index of the first terminal to copy
+ * @param to index one past the last terminal to copy
+ * @return a fresh array of length {@code to - from}
+ */
+ @Override
+ public int[] getTerminals(int from, int to) {
+   final int length = to - from;
+   final int[] result = new int[length];
+   for (int offset = 0; offset < length; offset++) {
+     result[offset] = terminals.get(from + offset);
+   }
+   return result;
+ }
+
+ /**
+ * Not implemented: the body is empty, so nothing is read from the stream and the object keeps
+ * whatever state it had before the call.
+ */
+ public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException {
+ // TODO Auto-generated method stub
+ }
+
+ /**
+ * Not implemented: the body is empty, so nothing is written to the stream.
+ */
+ public void writeExternal(ObjectOutput out) throws IOException {
+ // TODO Auto-generated method stub
+ }
+
+ /**
+ * Reads a Penn Treebank format file, replacing the current contents of this tree. Every
+ * non-blank line is appended as one bracketed parse.
+ *
+ * The reader is now closed in a finally block; previously it was never closed, leaking the
+ * underlying file handle.
+ *
+ * @param file_name the string path of the Penn Treebank file
+ * @throws IOException if the file cannot be opened, read or closed
+ */
+ public void readExternalText(String file_name) throws IOException {
+   LineReader reader = new LineReader(file_name);
+   try {
+     initialize();
+     for (String line : reader) {
+       if (line.trim().equals("")) continue;
+       appendFromPennFormat(line);
+     }
+   } finally {
+     reader.close();
+   }
+ }
+
+ /**
+ * Not implemented: the body is empty, so no file is written.
+ *
+ * @param file_name currently ignored
+ * @throws IOException declared but never thrown by the current body
+ */
+ public void writeExternalText(String file_name) throws IOException {
+ // TODO Auto-generated method stub
+ }
+
+ /**
+ * Dumps the forward index, forward lattice and terminals, followed by the backward index and
+ * backward lattice when they are in use, one entry per line.
+ */
+ @Override
+ public String toString() {
+   final StringBuilder out = new StringBuilder();
+
+   for (int k = 0; k < forwardIndex.size(); k++) {
+     out.append("FI[").append(k).append("] =\t").append(forwardIndex.get(k)).append("\n");
+   }
+   out.append("\n");
+   for (int k = 0; k < forwardLattice.size(); k += 2) {
+     out.append("F[").append(k).append("] =\t")
+         .append(Vocabulary.word(forwardLattice.get(k)))
+         .append(" , ").append(forwardLattice.get(k + 1)).append("\n");
+   }
+
+   out.append("\n");
+   for (int k = 0; k < terminals.size(); k += 1) {
+     out.append("T[").append(k).append("] =\t")
+         .append(Vocabulary.word(terminals.get(k))).append(" , 1 \n");
+   }
+
+   if (!this.useBackwardLattice) {
+     return out.toString();
+   }
+
+   out.append("\n");
+   for (int k = 0; k < backwardIndex.size(); k++) {
+     out.append("BI[").append(k).append("] =\t").append(backwardIndex.get(k)).append("\n");
+   }
+   out.append("\n");
+   for (int k = 0; k < backwardLattice.size(); k += 2) {
+     out.append("B[").append(k).append("] =\t")
+         .append(Vocabulary.word(backwardLattice.get(k)))
+         .append(" , ").append(backwardLattice.get(k + 1)).append("\n");
+   }
+   return out.toString();
+ }
+
+
+ /**
+ * Resets the tree to an empty state: fresh lattice and terminal lists, and index lists that
+ * contain only the initial offset 0. The backward structures are created only when the
+ * backward lattice is in use.
+ */
+ private void initialize() {
+   terminals = new ArrayList<Integer>();
+
+   forwardLattice = new ArrayList<Integer>();
+   forwardIndex = new ArrayList<Integer>();
+   forwardIndex.add(0);
+
+   if (!this.useBackwardLattice) {
+     return;
+   }
+   backwardLattice = new ArrayList<Integer>();
+   backwardIndex = new ArrayList<Integer>();
+   backwardIndex.add(0);
+ }
+
+
+ // TODO: could make this way more efficient
+ /**
+ * Parses one bracketed parse and appends its constituents and terminals to the lattices.
+ *
+ * Parentheses are split off as separate tokens. The token following "(" is taken as a
+ * non-terminal label; every other non-parenthesis token is taken as a terminal.
+ *
+ * @param line the bracketed parse to append
+ */
+ private void appendFromPennFormat(String line) {
+ String[] tokens = line.replaceAll("\\(", " ( ").replaceAll("\\)", " ) ").trim().split("\\s+");
+
+ boolean next_nt = false;
+ int current_id = 0;
+ // offsets (into forwardLattice) of the span fields of the constituents that are still open
+ Stack<Integer> stack = new Stack<Integer>();
+
+ for (String token : tokens) {
+ if ("(".equals(token)) {
+ next_nt = true;
+ continue;
+ }
+ if (")".equals(token)) {
+ int closing_pos = stack.pop();
+ // the span field held forwardIndex.size() at opening time; the difference to the current
+ // size is the number of terminals read in between, i.e. the span length
+ forwardLattice.set(closing_pos, forwardIndex.size() - forwardLattice.get(closing_pos));
+ if (this.useBackwardLattice) {
+ // copy the finished (label, span) pair; closing_pos - 1 is the label slot
+ backwardLattice.add(forwardLattice.get(closing_pos - 1));
+ backwardLattice.add(forwardLattice.get(closing_pos));
+ }
+ continue;
+ }
+ if (next_nt) {
+ // get NT id
+ current_id = Vocabulary.id(adjustMarkup(token));
+ // add into lattice
+ forwardLattice.add(current_id);
+ // push NT span field onto stack (added hereafter, we're just saving the "- 1")
+ stack.push(forwardLattice.size());
+ // add NT span field
+ forwardLattice.add(forwardIndex.size());
+ } else {
+ // terminal: store its id and record the current lattice sizes as the next index entries
+ current_id = Vocabulary.id(token);
+ terminals.add(current_id);
+
+ forwardIndex.add(forwardLattice.size());
+ if (this.useBackwardLattice) backwardIndex.add(backwardLattice.size());
+ }
+ next_nt = false;
+ }
+ }
+
+ private String adjustMarkup(String nt) {
+ return "[" + nt.replaceAll("[\\[\\]]", "") + "]";
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/corpus/syntax/SyntaxTree.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/corpus/syntax/SyntaxTree.java b/joshua-core/src/main/java/org/apache/joshua/corpus/syntax/SyntaxTree.java
new file mode 100644
index 0000000..6bb4c0b
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/corpus/syntax/SyntaxTree.java
@@ -0,0 +1,34 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.corpus.syntax;
+
+import java.util.Collection;
+
+/**
+ * Query interface over a syntax tree: label lookups for a token span and access to the terminal
+ * ids. Labels and terminals are reported as integer ids.
+ */
+public interface SyntaxTree {
+
+  /**
+   * @param from start of the span
+   * @param to end of the span
+   * @return label ids of single constituents for the span
+   */
+  Collection<Integer> getConstituentLabels(int from, int to);
+
+  /**
+   * @param from start of the span
+   * @param to end of the span
+   * @return label ids built by concatenating constituents for the span
+   */
+  Collection<Integer> getConcatenatedLabels(int from, int to);
+
+  /**
+   * @param from start of the span
+   * @param to end of the span
+   * @return CCG-style label ids for the span
+   */
+  Collection<Integer> getCcgLabels(int from, int to);
+
+  /**
+   * @return the ids of all terminals
+   */
+  int[] getTerminals();
+
+  /**
+   * @param from start of the range
+   * @param to end of the range
+   * @return the ids of the terminals in the range
+   */
+  int[] getTerminals(int from, int to);
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/ArgsParser.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/ArgsParser.java b/joshua-core/src/main/java/org/apache/joshua/decoder/ArgsParser.java
new file mode 100644
index 0000000..5af6d11
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/ArgsParser.java
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder;
+
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Files;
+import java.nio.file.Paths;
+
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Parses the decoder's command-line arguments: handles the -v, -version and -license flags,
+ * loads the configuration file named by -c / -config, and then hands all arguments to the
+ * configuration object.
+ *
+ * @author orluke
+ *
+ */
+public class ArgsParser {
+
+  private static final Logger LOG = LoggerFactory.getLogger(ArgsParser.class);
+
+  private String configFile = null;
+
+  /**
+   * Parse the arguments passed from the command line when the JoshuaDecoder application was
+   * executed from the command line.
+   *
+   * @param args string array of input arguments
+   * @param config the {@link org.apache.joshua.decoder.JoshuaConfiguration}
+   * @throws IOException if there is an error with the input arguments
+   * @throws IllegalArgumentException if -v, -c or -config is the last argument and therefore
+   *         has no value
+   */
+  public ArgsParser(String[] args, JoshuaConfiguration config) throws IOException {
+
+    /*
+     * Look for a verbose flag, -v.
+     *
+     * Look for an argument to the "-config" flag to find the config file, if any.
+     */
+    if (args.length >= 1) {
+      // Search for a verbose flag
+      for (int i = 0; i < args.length; i++) {
+        if (args[i].equals("-v")) {
+          // requireValue guards against "-v" being the last argument
+          Decoder.VERBOSE = Integer.parseInt(requireValue(args, i).trim());
+          config.setVerbosity(Decoder.VERBOSE);
+        }
+
+        if (args[i].equals("-version")) {
+          LineReader reader = new LineReader(String.format("%s/VERSION", System.getenv("JOSHUA")));
+          // the version is the third whitespace-separated field of the second line
+          reader.readLine();
+          String version = reader.readLine().split("\\s+")[2];
+          System.out.println(String.format("The Apache Joshua machine translator, version %s", version));
+          System.out.println("joshua.incubator.apache.org");
+          System.exit(0);
+
+        } else if (args[i].equals("-license")) {
+          try {
+            for (String line: Files.readAllLines(Paths.get(String.format("%s/../LICENSE",
+                JoshuaConfiguration.class.getProtectionDomain().getCodeSource().getLocation().getPath())),
+                Charset.defaultCharset())) {
+              System.out.println(line);
+            }
+          } catch (IOException e) {
+            throw new RuntimeException("FATAL: missing license file!", e);
+          }
+          System.exit(0);
+        }
+      }
+
+      // Search for the configuration file from the end (so as to take the last one)
+      for (int i = args.length-1; i >= 0; i--) {
+        if (args[i].equals("-c") || args[i].equals("-config")) {
+
+          setConfigFile(requireValue(args, i).trim());
+          try {
+            LOG.info("Parameters read from configuration file: {}", getConfigFile());
+            config.readConfigFile(getConfigFile());
+          } catch (IOException e) {
+            throw new RuntimeException(e);
+          }
+          break;
+        }
+      }
+
+      // Now process all the command-line args
+      config.processCommandLineOptions(args);
+    }
+  }
+
+  /**
+   * Returns the argument following the flag at {@code flagIndex}, failing with a descriptive
+   * message instead of an ArrayIndexOutOfBoundsException when the flag is the last argument.
+   */
+  private static String requireValue(String[] args, int flagIndex) {
+    if (flagIndex + 1 >= args.length) {
+      throw new IllegalArgumentException(
+          String.format("Missing value for command-line flag '%s'", args[flagIndex]));
+    }
+    return args[flagIndex + 1];
+  }
+
+  /**
+   * @return the configFile
+   */
+  public String getConfigFile() {
+    return configFile;
+  }
+
+  /**
+   * @param configFile the configFile to set
+   */
+  public void setConfigFile(String configFile) {
+    this.configFile = configFile;
+  }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/BLEU.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/BLEU.java b/joshua-core/src/main/java/org/apache/joshua/decoder/BLEU.java
new file mode 100644
index 0000000..8b51403
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/BLEU.java
@@ -0,0 +1,562 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Map.Entry;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.state_maintenance.NgramDPState;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.hypergraph.HyperEdge;
+import org.apache.joshua.util.Ngram;
+import org.apache.joshua.util.Regex;
+
+/**
+ * this class implements: (1) sentence-level bleu, with smoothing
+ *
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public class BLEU {
+ // do_ngram_clip: consider global n-gram clip
+
+ /**
+ * Convenience overload that delegates with n-gram clipping switched on, a BLEU order of 4 and
+ * useShortestRef set to false.
+ *
+ * @param refSents the reference sentences
+ * @param hypSent the hypothesis sentence
+ * @return the score computed by the five-argument overload
+ */
+ public static float computeSentenceBleu(String[] refSents, String hypSent) {
+ return computeSentenceBleu(refSents, hypSent, true, 4, false);
+ }
+
+ // ====================multiple references
+ /**
+ * Scores a hypothesis against several references: builds the per-n-gram maximum reference
+ * counts, derives the effective reference length, counts the hypothesis n-grams and delegates
+ * to the table-based overload.
+ *
+ * @param refSents reference sentences, split with Regex.spaces
+ * @param hypSent hypothesis sentence, split the same way
+ * @param doNgramClip Should usually be true
+ * @param bleuOrder Should usually be 4
+ * @param useShortestRef Probably use false
+ * @return the score returned by the table-based overload
+ */
+ public static float computeSentenceBleu(String[] refSents, String hypSent, boolean doNgramClip,
+     int bleuOrder, boolean useShortestRef) {
+   // reference side: clipped counts first, then the token length of every reference
+   HashMap<String, Integer> clippedRefCounts = constructMaxRefCountTable(refSents, bleuOrder);
+
+   int[] lengths = new int[refSents.length];
+   int slot = 0;
+   for (String ref : refSents) {
+     lengths[slot++] = Regex.spaces.split(ref).length;
+   }
+   float refLength = computeEffectiveLen(lengths, useShortestRef);
+
+   // hypothesis side
+   String[] hypTokens = Regex.spaces.split(hypSent);
+   HashMap<String, Integer> hypCounts = new HashMap<String, Integer>();
+   Ngram.getNgrams(hypCounts, 1, bleuOrder, hypTokens);
+
+   return computeSentenceBleu(refLength, clippedRefCounts, hypTokens.length, hypCounts,
+       doNgramClip, bleuOrder);
+ }
+
+ /**
+ * Reduces the reference lengths to a single effective length.
+ *
+ * @param refLens token counts of the references
+ * @param useShortestRef if true the minimum is used, otherwise the arithmetic mean
+ * @return the minimum or the mean of {@code refLens}
+ */
+ public static float computeEffectiveLen(int[] refLens, boolean useShortestRef) {
+   if (!useShortestRef) {
+     // default is average length
+     float total = 0;
+     for (int len : refLens) {
+       total += len;
+     }
+     return total * 1.0f / refLens.length;
+   }
+
+   int shortest = Integer.MAX_VALUE;
+   for (int len : refLens) {
+     shortest = Math.min(shortest, len);
+   }
+   return shortest;
+ }
+
+ /**
+ * Counts the n-grams (orders 1 to bleuOrder) of every reference and merges the per-reference
+ * tables into one table of maximum counts.
+ *
+ * @param refSents reference sentences, split with Regex.spaces
+ * @param bleuOrder highest n-gram order to count
+ * @return the merged table produced by computeMaxRefCountTbl
+ */
+ public static HashMap<String, Integer> constructMaxRefCountTable(String[] refSents, int bleuOrder) {
+   List<HashMap<String, Integer>> perReference = new ArrayList<HashMap<String, Integer>>();
+
+   for (String ref : refSents) {
+     String[] tokens = Regex.spaces.split(ref);
+     HashMap<String, Integer> counts = new HashMap<String, Integer>();
+     Ngram.getNgrams(counts, 1, bleuOrder, tokens);
+     perReference.add(counts);
+   }
+
+   return computeMaxRefCountTbl(perReference);
+ }
+
+ /**
+ * Computes, for every n-gram that occurs in any reference table, the largest count it has in a
+ * single table. Counts are compared against a floor of 0, so the result never holds a negative
+ * value.
+ *
+ * @param listRefNgramTbl one n-gram count table per reference
+ * @return a new table mapping each n-gram to its maximum count
+ */
+ public static HashMap<String, Integer> computeMaxRefCountTbl(
+     List<HashMap<String, Integer>> listRefNgramTbl) {
+
+   HashMap<String, Integer> merged = new HashMap<String, Integer>();
+
+   // single pass: the first sighting stores max(0, count), later sightings only raise it
+   for (HashMap<String, Integer> tbl : listRefNgramTbl) {
+     for (Map.Entry<String, Integer> entry : tbl.entrySet()) {
+       String ngram = entry.getKey();
+       Integer count = entry.getValue();
+       int candidate = (count == null) ? 0 : Math.max(0, count);
+
+       Integer best = merged.get(ngram);
+       if (best == null || candidate > best) {
+         merged.put(ngram, candidate);
+       }
+     }
+   }
+   return merged;
+ }
+
+ /**
+ * Computes sentence BLEU from pre-built n-gram tables (multiple-reference variant).
+ *
+ * @param effectiveRefLen effective reference length
+ * @param maxRefCountTbl maximum reference count per n-gram
+ * @param hypLen number of hypothesis tokens
+ * @param hypNgramTbl hypothesis n-gram counts
+ * @param doNgramClip if true each match count is limited by the reference count
+ * @param bleuOrder highest n-gram order
+ * @return the value returned by computeBleu for the collected match counts
+ */
+ public static float computeSentenceBleu(float effectiveRefLen,
+     HashMap<String, Integer> maxRefCountTbl, int hypLen, HashMap<String, Integer> hypNgramTbl,
+     boolean doNgramClip, int bleuOrder) {
+
+   int[] matchesByOrder = new int[bleuOrder];
+
+   for (Map.Entry<String, Integer> entry : hypNgramTbl.entrySet()) {
+     String ngram = entry.getKey();
+     if (!maxRefCountTbl.containsKey(ngram)) {
+       continue;
+     }
+
+     int hypNgramCount = entry.getValue();
+     int matched = hypNgramCount;
+     if (doNgramClip) {
+       // min{hypNgramCount, maxRefCount}
+       int maxRefCount = maxRefCountTbl.get(ngram);
+       matched = (int) Support.findMin(hypNgramCount, maxRefCount);
+     }
+
+     // the n-gram order is its token count; slot 0 holds unigrams
+     matchesByOrder[Regex.spaces.split(ngram).length - 1] += matched;
+   }
+
+   return computeBleu(hypLen, effectiveRefLen, matchesByOrder, bleuOrder);
+ }
+
+ // ==============================multiple references end
+
+ /**
+ * Scores a hypothesis against a single reference sentence.
+ *
+ * @param refSent reference sentence, split with Regex.spaces
+ * @param hypSent hypothesis sentence, split the same way
+ * @param doNgramClip whether match counts are clipped by the reference counts
+ * @param bleuOrder highest n-gram order
+ * @return the score returned by the table-based overload
+ */
+ public static float computeSentenceBleu(String refSent, String hypSent, boolean doNgramClip,
+     int bleuOrder) {
+   String[] refTokens = Regex.spaces.split(refSent);
+   String[] hypTokens = Regex.spaces.split(hypSent);
+
+   HashMap<String, Integer> refCounts = new HashMap<String, Integer>();
+   Ngram.getNgrams(refCounts, 1, bleuOrder, refTokens);
+
+   HashMap<String, Integer> hypCounts = new HashMap<String, Integer>();
+   Ngram.getNgrams(hypCounts, 1, bleuOrder, hypTokens);
+
+   int refLen = refTokens.length;
+   int hypLen = hypTokens.length;
+   return computeSentenceBleu(refLen, refCounts, hypLen, hypCounts, doNgramClip, bleuOrder);
+ }
+
+ /**
+ * Computes sentence BLEU from pre-built n-gram tables (single-reference variant).
+ *
+ * @param refLen number of reference tokens
+ * @param refNgramTbl reference n-gram counts
+ * @param hypLen number of hypothesis tokens
+ * @param hypNgramTbl hypothesis n-gram counts
+ * @param doNgramClip if true each match count is limited by the reference count
+ * @param bleuOrder highest n-gram order
+ * @return the value returned by computeBleu for the collected match counts
+ */
+ public static float computeSentenceBleu(int refLen, HashMap<String, Integer> refNgramTbl,
+     int hypLen, HashMap<String, Integer> hypNgramTbl, boolean doNgramClip, int bleuOrder) {
+
+   int[] matchesByOrder = new int[bleuOrder];
+
+   for (Map.Entry<String, Integer> entry : hypNgramTbl.entrySet()) {
+     String ngram = entry.getKey();
+     if (!refNgramTbl.containsKey(ngram)) {
+       continue;
+     }
+     if (doNgramClip) {
+       // ngram clip
+       matchesByOrder[Regex.spaces.split(ngram).length - 1] += Support.findMin(
+           refNgramTbl.get(ngram), entry.getValue());
+     } else {
+       // without ngram count clipping
+       matchesByOrder[Regex.spaces.split(ngram).length - 1] += entry.getValue();
+     }
+   }
+
+   return computeBleu(hypLen, refLen, matchesByOrder, bleuOrder);
+ }
+
+ /**
+ * Sentence BLEU: brevity penalty times the exponential of the equally weighted sum of log
+ * n-gram precisions. An order with zero matches contributes a smoothed precision: the smoothing
+ * factor starts at 1 and is halved every time such an order is met.
+ *
+ * @param hypLen number of hypothesis tokens
+ * @param refLen (effective) reference length
+ * @param numNgramMatch match counts per order, unigrams at index 0
+ * @param bleuOrder highest n-gram order
+ * @return the BLEU score
+ * @throws RuntimeException if hypLen or refLen is not positive
+ */
+ public static float computeBleu(int hypLen, float refLen, int[] numNgramMatch, int bleuOrder) {
+   if (hypLen <= 0 || refLen <= 0) {
+     throw new RuntimeException("error: ref or hyp is zero len");
+   }
+
+   final float weight = 1.0f / bleuOrder;
+   float logPrecision = 0;
+   float smoothing = 1.0f;
+
+   int order = 0;
+   while (order < bleuOrder && order < hypLen) {
+     // number of n-gram slots of this order in the hypothesis
+     final int slots = hypLen - order;
+     if (numNgramMatch[order] > 0) {
+       logPrecision += weight * Math.log(numNgramMatch[order] * 1.0 / slots);
+     } else {
+       smoothing *= 0.5;// TODO
+       logPrecision += weight * Math.log(smoothing / slots);
+     }
+     order++;
+   }
+
+   float brevityPenalty;
+   if (hypLen >= refLen) {
+     brevityPenalty = 1.0f;
+   } else {
+     brevityPenalty = (float) Math.exp(1 - refLen / hypLen);
+   }
+   return brevityPenalty * (float) Math.exp(logPrecision);
+ }
+
+ /**
+ * Counts the n-grams of orders 1 to bleuOrder in one sentence.
+ *
+ * @param sentence the sentence, split with Regex.spaces
+ * @param bleuOrder highest n-gram order to count
+ * @return a new table filled by Ngram.getNgrams
+ */
+ public static HashMap<String, Integer> constructNgramTable(String sentence, int bleuOrder) {
+   final HashMap<String, Integer> counts = new HashMap<String, Integer>();
+   final String[] tokens = Regex.spaces.split(sentence);
+   Ngram.getNgrams(counts, 1, bleuOrder, tokens);
+   return counts;
+ }
+
+ // ================================ Google linear corpus gain
+ // ============================================
+ /**
+ * Convenience overload: builds the reference and hypothesis n-gram tables with order 4 and
+ * delegates to the table-based overload.
+ *
+ * @param linearCorpusGainThetas the weights, index 0 for the length term
+ * @param refSents the reference sentences
+ * @param hypSent the hypothesis sentence
+ * @return the gain computed by the table-based overload
+ */
+ public static float computeLinearCorpusGain(float[] linearCorpusGainThetas, String[] refSents,
+     String hypSent) {
+   final int order = 4;
+   final int hypTokenCount = Regex.spaces.split(hypSent).length;
+
+   final HashMap<String, Integer> refTable = BLEU.constructMaxRefCountTable(refSents, order);
+   final HashMap<String, Integer> hypTable = BLEU.constructNgramTable(hypSent, order);
+
+   return computeLinearCorpusGain(linearCorpusGainThetas, hypTokenCount, hypTable, refTable);
+ }
+
+ /**
+ * Linear corpus gain: the length weight times the hypothesis length, plus, for every
+ * hypothesis n-gram that also occurs in the reference table, its count times the weight of its
+ * order.
+ *
+ * speed consideration: assume hypNgramTable has a smaller size than referenceNgramTable does
+ *
+ * @param linearCorpusGainThetas weights; index 0 for the length, index n for n-grams of order n
+ * @param hypLength number of hypothesis tokens
+ * @param hypNgramTable hypothesis n-gram counts
+ * @param referenceNgramTable reference n-grams; only key membership is consulted
+ * @return the accumulated gain
+ */
+ public static float computeLinearCorpusGain(float[] linearCorpusGainThetas, int hypLength,
+     Map<String, Integer> hypNgramTable, Map<String, Integer> referenceNgramTable) {
+   float gain = 0;
+   gain += linearCorpusGainThetas[0] * hypLength;
+
+   for (Entry<String, Integer> candidate : hypNgramTable.entrySet()) {
+     final String ngram = candidate.getKey();
+     if (!referenceNgramTable.containsKey(ngram)) {
+       continue; // delta function
+     }
+     final int order = Regex.spaces.split(ngram).length;
+     gain += candidate.getValue() * linearCorpusGainThetas[order];
+   }
+   return gain;
+ }
+
+ /**
+ * Convenience function: builds the reference and hypothesis n-gram tables with order 4 and
+ * delegates to the table-based overload.
+ *
+ * @param refSents the reference sentences
+ * @param hypSent the hypothesis sentence
+ * @return the match counts computed by the table-based overload
+ */
+ public static int[] computeNgramMatches(String[] refSents, String hypSent) {
+   final int order = 4;
+   final int hypTokenCount = Regex.spaces.split(hypSent).length;
+
+   final HashMap<String, Integer> refTable = BLEU.constructMaxRefCountTable(refSents, order);
+   final HashMap<String, Integer> hypTable = BLEU.constructNgramTable(hypSent, order);
+
+   return computeNgramMatches(hypTokenCount, hypTable, refTable, order);
+ }
+
+ /**
+ * Counts matching n-grams per order. Slot 0 of the result holds the hypothesis length; slot n
+ * holds the summed hypothesis counts of the order-n n-grams that also occur in the reference
+ * table.
+ *
+ * @param hypLength number of hypothesis tokens
+ * @param hypNgramTable hypothesis n-gram counts
+ * @param referenceNgramTable reference n-grams; only key membership is consulted
+ * @param highestOrder highest n-gram order; the result has highestOrder + 1 slots
+ * @return the array of counts
+ */
+ public static int[] computeNgramMatches(int hypLength, Map<String, Integer> hypNgramTable,
+     Map<String, Integer> referenceNgramTable, int highestOrder) {
+   final int[] counts = new int[highestOrder + 1];
+   counts[0] = hypLength;
+
+   for (Entry<String, Integer> candidate : hypNgramTable.entrySet()) {
+     final String ngram = candidate.getKey();
+     if (!referenceNgramTable.containsKey(ngram)) {
+       continue; // delta function
+     }
+     final int order = Regex.spaces.split(ngram).length;
+     counts[order] += candidate.getValue();
+   }
+
+   return counts;
+ }
+
+ /**
+ * Builds the five weights of the linear corpus gain (index 0: length term, indices 1 to 4:
+ * n-gram orders), divides all of them by the absolute value of the first one, and prints the
+ * result to standard output.
+ *
+ * @param numUnigramTokens number of unigram tokens
+ * @param unigramPrecision unigram precision
+ * @param decayRatio ratio applied once more for every additional n-gram order
+ * @return the normalized weights
+ */
+ static public float[] computeLinearCorpusThetas(int numUnigramTokens, float unigramPrecision,
+     float decayRatio) {
+   final int size = 5;
+   final float[] thetas = new float[size];
+
+   thetas[0] = -1.0f / numUnigramTokens;
+   for (int order = 1; order < size; order++) {
+     thetas[order] = (1.0f / (4.0f * numUnigramTokens * unigramPrecision * (float) Math.pow(decayRatio,
+         order - 1)));
+   }
+
+   // normalize by first one
+   final float firstWeight = thetas[0];
+   for (int k = 0; k < size; k++) {
+     thetas[k] /= Math.abs(firstWeight);
+   }
+
+   final StringBuilder report = new StringBuilder("Normalized Thetas are: ");
+   for (int k = 0; k < size; k++) {
+     report.append(thetas[k]).append(" ");
+   }
+   report.append("\n");
+   System.out.print(report.toString());
+
+   return thetas;
+ }
+
+ public static final int maxOrder = 4;
+
+ /**
+ * Computes BLEU statistics incurred by a rule. This is (a) all ngram (n <= 4) for terminal rules
+ * and (b) all ngrams overlying boundary points between terminals in the rule and ngram state from
+ * tail nodes.
+ *
+ * There are four cases to handle:
+ * <ul>
+ * <li>only words
+ * <li>a number of words followed by a nonterminal (left context of tail node)
+ * <li>a nonterminal (right context of tail node) followed by one or more words
+ * <li>two nonterminals (right context of tail node 1, left context of tail node 2)
+ * </ul>
+ *
+ * Of these, all but the first have a boundary point to consider.
+ *
+ * @param edge the hyperedge whose rule (target side, via getEnglish()) and tail nodes are
+ * examined; the first DP state of every tail node must be an NgramDPState
+ * @param spanPct multiplied by references.reflen and truncated to an int to give the returned
+ * reflen
+ * @param references the reference to compute statistics against
+ * @return the accumulated statistics: matched ngram counts, len incremented once per terminal
+ * symbol of the rule, and the scaled reflen. If the edge has no rule only reflen is set.
+ * @throws RuntimeException if a tail node's first DP state is not an NgramDPState
+ */
+ public static Stats compute(HyperEdge edge, float spanPct, References references) {
+ Stats stats = new Stats();
+ // TODO: this should not be the span width, but the real ref scaled to the span percentage
+ stats.reflen = (int) (spanPct * references.reflen);
+
+ Rule rule = edge.getRule();
+ if (rule != null) {
+ int[] symbols = rule.getEnglish();
+
+// System.err.println(String.format("compute(%s)", rule));
+
+ // currentNgram is the running word window; boundary is the index inside it where tail-node
+ // context ends and rule words begin (or vice versa), -1 when no boundary is pending.
+ ArrayList<Integer> currentNgram = new ArrayList<Integer>();
+ int boundary = -1;
+ int tailIndex = -1;
+ for (int i = 0; i < symbols.length; i++) {
+ // Negative symbol ids are treated as nonterminals; each one consumes the next tail node.
+ if (symbols[i] < 0) {
+ tailIndex++;
+
+ NgramDPState ngramState = null;
+ try {
+ ngramState = (NgramDPState) edge.getTailNodes().get(tailIndex).getDPState(0);
+ } catch (ClassCastException e) {
+ throw new RuntimeException(String.format(
+ "* FATAL: first state needs to be NgramDPState (found %s)", edge.getTailNodes()
+ .get(tailIndex).getDPState(0).getClass()));
+ }
+
+ // Compute ngrams overlapping with left context of tail node
+ if (currentNgram.size() > 0) {
+ boundary = currentNgram.size();
+ for (int id : ngramState.getLeftLMStateWords())
+ currentNgram.add(id);
+
+ // Compute the BLEU statistics
+ BLEU.Stats partStats = computeOverDivide(currentNgram, references, boundary);
+ stats.add(partStats);
+
+// System.err.println(" " + Vocabulary.getWords(ngramState.getLeftLMStateWords()));
+
+ currentNgram.clear();
+ }
+
+// System.err.println(" " + Vocabulary.getWords(ngramState.getRightLMStateWords()));
+
+ // Accumulate ngrams from right context of tail node
+ for (int id : ngramState.getRightLMStateWords())
+ currentNgram.add(id);
+
+ // Everything currently in the window came from the tail node; words added after this
+ // index belong to the rule.
+ boundary = currentNgram.size();
+
+ } else { // terminal symbol
+ currentNgram.add(symbols[i]);
+ stats.len++;
+
+// System.err.println(" " + Vocabulary.word(symbols[i]));
+
+ if (boundary != -1) {
+ BLEU.Stats partStats = computeOverDivide(currentNgram, references, boundary);
+ stats.add(partStats);
+
+ // Shift off the context from the nonterminal's righthand side
+ for (int j = 0; j < boundary; j++)
+ currentNgram.remove(0);
+ boundary = -1;
+ }
+ }
+
+ /*
+ * At the end, we might have (a) nothing, (b) a sequence of words from a nonterminal's
+ * righthand side, (c) a sequence of words from the rule, or (d) a sequence of words from a
+ * nonterminal's righthand context and from the rule
+ */
+ // NOTE(review): despite the "At the end" wording, this check is inside the for loop and so
+ // runs after every symbol. With consecutive terminals the growing window is rescored on each
+ // iteration, which appears to count earlier ngrams more than once -- confirm whether this
+ // was meant to sit after the loop.
+ if (currentNgram.size() > 0 && currentNgram.size() != boundary) { // skip cases (a) and (b)
+ BLEU.Stats partStats = computeOverDivide(currentNgram, references, boundary);
+ stats.add(partStats);
+ }
+ }
+ }
+ return stats;
+ }
+
+ /**
+ * When computing BLEU statistics over a rule, we need to avoid adding in ngrams that are
+ * exclusively contained inside tail nodes. This function accumulates all the eligible ngrams
+ * (up to {@code maxOrder} words wide) from a word sequence, respecting an optional boundary
+ * point, and then calls computeNgramMatches().
+ *
+ * @param ngram the current sequence of word ids
+ * @param references contains the set of ngrams to compare against
+ * @param boundary the boundary over which all ngrams must fall (-1 means ignore boundary)
+ * @return a Stats object whose counts hold the matched ngrams per order (index 0 is order 1);
+ * its len and reflen are left at zero
+ */
+ private static Stats computeOverDivide(ArrayList<Integer> ngram, References references,
+ int boundary) {
+
+ HashMap<String, Integer> boundaryNgrams = new HashMap<String, Integer>();
+ final int widest = Math.min(maxOrder, ngram.size());
+ for (int width = 1; width <= widest; width++) {
+ for (int i = 0; i < ngram.size() - width + 1; i++) {
+ int j = i + width;
+
+ // With a boundary, keep only spans [i, j) that have words on both sides of it.
+ if (boundary == -1 || (boundary > i && boundary < j)) {
+ String ngramStr = Vocabulary.getWords(ngram.subList(i, j));
+ // Count every occurrence. The previous code re-stored the old value for a repeated
+ // ngram, so its count never rose above 1.
+ Integer seen = boundaryNgrams.get(ngramStr);
+ boundaryNgrams.put(ngramStr, seen == null ? 1 : seen + 1);
+ }
+ }
+ }
+
+ BLEU.Stats result = new BLEU.Stats();
+ int[] stats = BLEU.computeNgramMatches(0, boundaryNgrams, references.ngramCounts, maxOrder);
+ // Slot 0 of stats is the (unused) length; orders 1..maxOrder move down to indices 0..maxOrder-1.
+ System.arraycopy(stats, 1, result.counts, 0, maxOrder);
+
+ return result;
+ }
+
+ /**
+ * Reference-side data for BLEU: the ngram table filled by Ngram.getNgrams() from every
+ * reference (orders 1 through maxOrder) and the average number of space-separated tokens per
+ * reference.
+ */
+ public static class References {
+ HashMap<String, Integer> ngramCounts;
+ float reflen;
+
+ /** Builds the data from a single reference string. */
+ public References(String reference) {
+ fill(new String[] { reference });
+ }
+
+ /** Builds the data from several reference strings. */
+ public References(String[] references) {
+ fill(references);
+ }
+
+ /* Collects the ngrams of every reference and averages their token counts. */
+ private void fill(String[] references) {
+ ngramCounts = new HashMap<String, Integer>();
+ reflen = 0.0f;
+ for (String reference : references) {
+ String[] tokens = reference.split(" ");
+ Ngram.getNgrams(ngramCounts, 1, maxOrder, tokens);
+ reflen += tokens.length;
+ }
+ reflen = reflen / references.length;
+ }
+ }
+
+ /**
+ * Turns accumulated statistics into a smoothed BLEU score. Each order t (up to maxOrder and
+ * below stats.len) contributes the log of counts[t] / (len - t), weighted by 1 / maxOrder; an
+ * order with no matches uses a smoothing numerator that is halved each time it is needed. The
+ * result is scaled by a brevity penalty of exp(1 - reflen / len) when len is below reflen.
+ *
+ * @param stats the accumulated counts, hypothesis length and reference length
+ * @return the BLEU score
+ */
+ public static float score(Stats stats) {
+ final float orderWeight = 1.0f / maxOrder;
+ float logPrecision = 0;
+ float smoothing = 1.0f;
+
+ for (int t = 0; t < maxOrder && t < stats.len; t++) {
+ final double ratio;
+ if (stats.counts[t] > 0) {
+ ratio = stats.counts[t] * 1.0 / (stats.len - t);
+ } else {
+ smoothing *= 0.5;// TODO
+ ratio = smoothing / (stats.len - t);
+ }
+ logPrecision += orderWeight * Math.log(ratio);
+ }
+
+ final float brevityPenalty;
+ if (stats.len >= stats.reflen) {
+ brevityPenalty = 1.0f;
+ } else {
+ brevityPenalty = (float) Math.exp(1 - stats.reflen / stats.len);
+ }
+
+ return brevityPenalty * (float) Math.exp(logPrecision);
+ }
+
+ /**
+ * Accumulated sufficient statistics for computing BLEU.
+ */
+ public static class Stats {
+ /** Matched ngram counts, one slot per order. */
+ public int[] counts;
+ /** Hypothesis length. */
+ public float len;
+ /** Reference length. */
+ public float reflen;
+
+ /** Creates empty statistics: four zero counts and zero lengths. */
+ public Stats() {
+ this(new int[4], 0.0f, 0.0f);
+ }
+
+ /** Creates statistics from the given values; the counts array is stored, not copied. */
+ public Stats(int[] counts, float len, float reflen) {
+ this.counts = counts;
+ this.len = len;
+ this.reflen = reflen;
+ }
+
+ /**
+ * Adds the other object's counts and len to this one. The reference length is deliberately
+ * left untouched by this method.
+ */
+ public void add(Stats otherStats) {
+ for (int order = 0; order < this.counts.length; order++) {
+ this.counts[order] += otherStats.counts[order];
+ }
+ this.len += otherStats.len;
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/Decoder.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/Decoder.java b/joshua-core/src/main/java/org/apache/joshua/decoder/Decoder.java
new file mode 100644
index 0000000..097ce59
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/Decoder.java
@@ -0,0 +1,813 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder;
+
+import static org.apache.joshua.decoder.ff.FeatureVector.DENSE_FEATURE_NAMES;
+
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.IOException;
+import java.io.FileNotFoundException;
+import java.lang.reflect.Constructor;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.List;
+import java.util.concurrent.ArrayBlockingQueue;
+import java.util.concurrent.BlockingQueue;
+
+import com.google.common.base.Strings;
+
+import org.apache.joshua.corpus.Vocabulary;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.PhraseModel;
+import org.apache.joshua.decoder.ff.StatefulFF;
+import org.apache.joshua.decoder.ff.lm.LanguageModelFF;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.decoder.ff.tm.Rule;
+import org.apache.joshua.decoder.ff.tm.format.HieroFormatReader;
+import org.apache.joshua.decoder.ff.tm.hash_based.MemoryBasedBatchGrammar;
+import org.apache.joshua.decoder.ff.tm.packed.PackedGrammar;
+import org.apache.joshua.decoder.io.TranslationRequestStream;
+import org.apache.joshua.decoder.phrase.PhraseTable;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.util.FileUtility;
+import org.apache.joshua.util.FormatUtils;
+import org.apache.joshua.util.Regex;
+import org.apache.joshua.util.io.LineReader;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class handles decoder initialization and the complication introduced by multithreading.
+ *
+ * After initialization, the main entry point to the Decoder object is
+ * decodeAll(TranslationRequest), which returns a set of Translation objects wrapped in an iterable
+ * Translations object. It is important that we support multithreading both (a) across the sentences
+ * within a request and (b) across requests, in a round-robin fashion. This is done by maintaining a
+ * fixed sized concurrent thread pool. When a new request comes in, a RequestParallelizer thread is
+ * launched. This object iterates over the request's sentences, obtaining a thread from the
+ * thread pool, and using that thread to decode the sentence. If a decoding thread is not available,
+ * it will block until one is in a fair (FIFO) manner. RequestParallelizer thereby permits intra-request
+ * parallelization by separating out reading the input stream from processing the translated sentences,
+ * but also ensures that round-robin parallelization occurs, since RequestParallelizer uses the
+ * thread pool before translating each request.
+ *
+ * A decoding thread is handled by DecoderThread and launched from DecoderThreadRunner. The purpose
+ * of the runner is to record where to place the translated sentence when it is done (i.e., which
+ * Translations object). Translations itself is an iterator whose next() call blocks until the next
+ * translation is available.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Zhifei Li, zhifei.work@gmail.com
+ * @author wren ng thornton wren@users.sourceforge.net
+ * @author Lane Schwartz dowobeha@users.sourceforge.net
+ */
+public class Decoder {
+
+ private static final Logger LOG = LoggerFactory.getLogger(Decoder.class);
+
+ private final JoshuaConfiguration joshuaConfiguration;
+
+ public JoshuaConfiguration getJoshuaConfiguration() {
+ return joshuaConfiguration;
+ }
+
+ /*
+ * Many of these objects themselves are global objects. We pass them in when constructing other
+ * objects, so that they all share pointers to the same object. This is good because it reduces
+ * overhead, but it can be problematic because of unseen dependencies (for example, in the
+ * Vocabulary shared by language model, translation grammar, etc).
+ */
+ private List<Grammar> grammars;
+ private ArrayList<FeatureFunction> featureFunctions;
+ private Grammar customPhraseTable;
+
+ /* The feature weights. */
+ public static FeatureVector weights;
+
+ public static int VERBOSE = 1;
+
+ private BlockingQueue<DecoderThread> threadPool = null;
+
+ // ===============================================================
+ // Constructors
+ // ===============================================================
+
+ /**
+ * Constructor method that creates a new decoder using the specified configuration file.
+ * It stores the configuration via the private constructor and then calls
+ * {@link #initialize(String)}.
+ *
+ * @param joshuaConfiguration a populated {@link org.apache.joshua.decoder.JoshuaConfiguration}
+ * @param configFile name of configuration file.
+ */
+ public Decoder(JoshuaConfiguration joshuaConfiguration, String configFile) {
+ this(joshuaConfiguration);
+ this.initialize(configFile);
+ }
+
+ /**
+ * Factory method that creates a new decoder using the specified configuration file. A newly
+ * constructed {@link org.apache.joshua.decoder.JoshuaConfiguration} is used as the decoder's
+ * configuration.
+ *
+ * @param configFile Name of configuration file.
+ * @return a configured {@link org.apache.joshua.decoder.Decoder}
+ */
+ public static Decoder createDecoder(String configFile) {
+ JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration();
+ return new Decoder(joshuaConfiguration, configFile);
+ }
+
+ /**
+ * Constructs an uninitialized decoder: it stores the configuration, creates an empty grammar
+ * list and an empty, fair thread pool sized by num_parallel_decoders, and has no custom phrase
+ * table yet.
+ * <p>
+ * This constructor is private. It is reached through the public constructor (which goes on to
+ * call {@link #initialize(String)}) and through
+ * {@link #getUninitalizedDecoder(JoshuaConfiguration)}, which provides an uninitialized decoder
+ * for use in testing.
+ */
+ private Decoder(JoshuaConfiguration joshuaConfiguration) {
+ this.joshuaConfiguration = joshuaConfiguration;
+ this.grammars = new ArrayList<Grammar>();
+ // The second argument requests FIFO (fair) ordering among threads waiting on the queue.
+ this.threadPool = new ArrayBlockingQueue<DecoderThread>(
+ this.joshuaConfiguration.num_parallel_decoders, true);
+ this.customPhraseTable = null;
+ }
+
+ /**
+ * Gets an uninitialized decoder for use in testing.
+ * <p>
+ * This method is called by unit tests or any outside packages (e.g., MERT) relying on the
+ * decoder. The returned decoder has not had {@link #initialize(String)} called on it.
+ *
+ * @param joshuaConfiguration a {@link org.apache.joshua.decoder.JoshuaConfiguration} object
+ * @return an uninitialized decoder for use in testing
+ */
+ static public Decoder getUninitalizedDecoder(JoshuaConfiguration joshuaConfiguration) {
+ return new Decoder(joshuaConfiguration);
+ }
+
+ // ===============================================================
+ // Public Methods
+ // ===============================================================
+
+ /**
+ * This class is responsible for getting sentences from the TranslationRequest and procuring a
+ * DecoderThreadRunner to translate it. Each call to decodeAll(TranslationRequest) launches a
+ * thread that will read the request's sentences, obtain a DecoderThread to translate them, and
+ * then place the Translation in the appropriate place.
+ *
+ * @author Matt Post <po...@cs.jhu.edu>
+ *
+ */
+ private class RequestParallelizer extends Thread {
+ /* Source of sentences to translate. */
+ private final TranslationRequestStream request;
+
+ /* Where to put translated sentences. */
+ private final Translations response;
+
+ RequestParallelizer(TranslationRequestStream request, Translations response) {
+ this.request = request;
+ this.response = response;
+ }
+
+ @Override
+ public void run() {
+ /*
+ * Repeatedly get an input sentence, wait for a DecoderThread, and then start a new thread to
+ * translate the sentence. We start a new thread (via DecoderRunnerThread) as opposed to
+ * blocking, so that the RequestHandler can go on to the next sentence in this request, which
+ * allows parallelization across the sentences of the request.
+ */
+ for (;;) {
+ Sentence sentence = request.next();
+
+ if (sentence == null) {
+ response.finish();
+ break;
+ }
+
+ // This will block until a DecoderThread becomes available.
+ DecoderThread thread = Decoder.this.getThread();
+ if (thread == null) {
+ // getThread() returns null when the wait was interrupted. Handing null to a runner would
+ // only make it fail, so stop reading and close the response instead.
+ response.finish();
+ break;
+ }
+ new DecoderThreadRunner(thread, sentence, response).start();
+ }
+ }
+
+ /**
+ * Formats a rule as "source ||| target", skipping the first symbol on each side.
+ *
+ * @param rule the rule to format
+ * @return the formatted rule string
+ */
+ private String formatRule(Rule rule) {
+ StringBuilder ruleString = new StringBuilder();
+ boolean first = true;
+ for (int word : rule.getFrench()) {
+ if (!first)
+ ruleString.append(" ").append(Vocabulary.word(word));
+ first = false;
+ }
+
+ ruleString.append(" |||"); // space will get added with first English word
+ first = true;
+ for (int word : rule.getEnglish()) {
+ if (!first)
+ ruleString.append(" ").append(Vocabulary.word(word));
+ first = false;
+ }
+
+ // strip off the leading space (there always is one: every appended piece starts with a space)
+ return ruleString.substring(1);
+ }
+ }
+
+ /**
+ * Retrieve a thread from the thread pool, blocking until one is available. The blocking occurs in
+ * a fair fashion (i.e,. FIFO across requests).
+ *
+ * @return a thread that can be used for decoding, or null if the calling thread was interrupted
+ * while waiting (its interrupt flag is set again before returning)
+ */
+ public DecoderThread getThread() {
+ try {
+ return threadPool.take();
+ } catch (InterruptedException e) {
+ // Restore the interrupt flag so code further up the stack can still observe the request.
+ Thread.currentThread().interrupt();
+ LOG.warn("Interrupted while waiting for a decoder thread", e);
+ return null;
+ }
+ }
+
+ /**
+ * This class handles running a DecoderThread (which takes care of the actual translation of an
+ * input Sentence, returning a Translation object when it's done). This is done in a thread so as
+ * not to tie up the RequestHandler that launched it, freeing it to go on to the next sentence in
+ * the TranslationRequest, in turn permitting parallelization across the sentences of a request.
+ *
+ * When the decoder thread is finished, the Translation object is placed in the correct place in
+ * the corresponding Translations object that was returned to the caller of
+ * Decoder.decodeAll(TranslationRequest).
+ *
+ * @author Matt Post <po...@cs.jhu.edu>
+ */
+ private class DecoderThreadRunner extends Thread {
+
+ private final DecoderThread decoderThread;
+ private final Sentence sentence;
+ private final Translations translations;
+
+ DecoderThreadRunner(DecoderThread thread, Sentence sentence, Translations translations) {
+ this.decoderThread = thread;
+ this.sentence = sentence;
+ this.translations = translations;
+ }
+
+ @Override
+ public void run() {
+ /*
+ * Use the thread to translate the sentence. Then record the translation with the
+ * corresponding Translations object, and return the thread to the pool.
+ */
+ try {
+ Translation translation = decoderThread.translate(this.sentence);
+ translations.record(translation);
+ } catch (Exception e) {
+ throw new RuntimeException(String.format(
+ "Input %d: FATAL UNCAUGHT EXCEPTION: %s", sentence.id(), e.getMessage()), e);
+ // translations.record(new Translation(sentence, null, featureFunctions, joshuaConfiguration));
+ } finally {
+ /*
+ * This is crucial! It's what makes the thread available for the next sentence to be
+ * translated. It happens in a finally block so that a failed sentence does not permanently
+ * remove a decoder thread from the pool and starve later sentences.
+ */
+ if (decoderThread != null) {
+ try {
+ threadPool.put(decoderThread);
+ } catch (InterruptedException e) {
+ Thread.currentThread().interrupt();
+ LOG.warn("Interrupted while returning a decoder thread to the pool", e);
+ }
+ }
+ }
+ }
+ }
+
+ /**
+ * This function is the main entry point into the decoder. It translates all the sentences in a
+ * (possibly boundless) set of input sentences. Each request launches its own thread to read the
+ * sentences of the request, so this method returns without waiting for any translation to
+ * complete.
+ *
+ * @param request the populated {@link org.apache.joshua.decoder.io.TranslationRequestStream}
+ * @throws IOException if there is an error with the input stream or writing the output
+ * @return an iterable, asynchronously-filled list of Translations
+ */
+ public Translations decodeAll(TranslationRequestStream request) throws IOException {
+ Translations translations = new Translations(request);
+
+ /* Start a thread to handle requests on the input stream */
+ new RequestParallelizer(request, translations).start();
+
+ return translations;
+ }
+
+
+ /**
+ * We can also just decode a single sentence. This blocks until a decoder thread is free, runs
+ * the translation on the calling thread, and always hands the decoder thread back to the pool.
+ *
+ * @param sentence the {@link org.apache.joshua.decoder.segment_file.Sentence} to translate
+ * @return the sentence {@link org.apache.joshua.decoder.Translation}, or null if the calling
+ * thread was interrupted while waiting on the pool
+ */
+ public Translation decode(Sentence sentence) {
+ try {
+ // Get a thread.
+ DecoderThread thread = threadPool.take();
+ try {
+ return thread.translate(sentence);
+ } finally {
+ // Return the thread even when translate() throws, so the pool never shrinks.
+ threadPool.put(thread);
+ }
+ } catch (InterruptedException e) {
+ // Restore the interrupt flag for the caller instead of silently consuming it.
+ Thread.currentThread().interrupt();
+ LOG.warn("Interrupted while decoding a single sentence", e);
+ }
+
+ return null;
+ }
+
+ /**
+ * Clean shutdown of Decoder, resetting all
+ * static variables, such that any other instance of Decoder
+ * afterwards gets a fresh start.
+ * <p>
+ * If the calling thread is interrupted while joining the pooled threads, the remaining joins
+ * are skipped, the interrupt flag is restored, and the static state is still reset.
+ */
+ public void cleanUp() {
+ // shut down DecoderThreads
+ for (DecoderThread thread : threadPool) {
+ try {
+ thread.join();
+ } catch (InterruptedException e) {
+ // Any further join() would fail immediately once the flag is set again, so stop here.
+ Thread.currentThread().interrupt();
+ LOG.warn("Interrupted while waiting for decoder threads to finish", e);
+ break;
+ }
+ }
+ resetGlobalState();
+ }
+
+ /**
+ * Resets the static state shared by all decoders: the dense feature name list, the vocabulary
+ * and its registered language models, the language-model index, and the global state index of
+ * stateful feature functions.
+ */
+ public static void resetGlobalState() {
+ // clear/reset static variables
+ DENSE_FEATURE_NAMES.clear();
+ Vocabulary.clear();
+ Vocabulary.unregisterLanguageModels();
+ LanguageModelFF.resetLmIndex();
+ StatefulFF.resetGlobalStateIndex();
+ }
+
+ public static void writeConfigFile(double[] newWeights, String template, String outputFile,
+ String newDiscriminativeModel) {
+ try {
+ int columnID = 0;
+
+ BufferedWriter writer = FileUtility.getWriteFileStream(outputFile);
+ LineReader reader = new LineReader(template);
+ try {
+ for (String line : reader) {
+ line = line.trim();
+ if (Regex.commentOrEmptyLine.matches(line) || line.indexOf("=") != -1) {
+ // comment, empty line, or parameter lines: just copy
+ writer.write(line);
+ writer.newLine();
+
+ } else { // models: replace the weight
+ String[] fds = Regex.spaces.split(line);
+ StringBuffer newSent = new StringBuffer();
+ if (!Regex.floatingNumber.matches(fds[fds.length - 1])) {
+ throw new IllegalArgumentException("last field is not a number; the field is: "
+ + fds[fds.length - 1]);
+ }
+
+ if (newDiscriminativeModel != null && "discriminative".equals(fds[0])) {
+ newSent.append(fds[0]).append(' ');
+ newSent.append(newDiscriminativeModel).append(' ');// change the
+ // file name
+ for (int i = 2; i < fds.length - 1; i++) {
+ newSent.append(fds[i]).append(' ');
+ }
+ } else {// regular
+ for (int i = 0; i < fds.length - 1; i++) {
+ newSent.append(fds[i]).append(' ');
+ }
+ }
+ if (newWeights != null)
+ newSent.append(newWeights[columnID++]);// change the weight
+ else
+ newSent.append(fds[fds.length - 1]);// do not change
+
+ writer.write(newSent.toString());
+ writer.newLine();
+ }
+ }
+ } finally {
+ reader.close();
+ writer.close();
+ }
+
+ if (newWeights != null && columnID != newWeights.length) {
+ throw new IllegalArgumentException("number of models does not match number of weights");
+ }
+
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+
+ // ===============================================================
+ // Initialization Methods
+ // ===============================================================
+
+ /**
+ * Moses requires the pattern .*_.* for sparse features, and prohibits underscores in dense features.
+ * This conforms to that pattern. We assume non-conforming dense features start with tm_ or lm_,
+ * and the only sparse feature that needs converting is OOVPenalty.
+ *
+ * @param feature
+ * @return the feature in Moses format
+ */
+ private String mosesize(String feature) {
+ if (joshuaConfiguration.moses) {
+ if (feature.startsWith("tm_") || feature.startsWith("lm_"))
+ return feature.replace("_", "-");
+ }
+
+ return feature;
+ }
+
+ /**
+ * Initialize all parts of the JoshuaDecoder: read the weights, load the translation grammars,
+ * create the feature functions, optionally sort the grammars, and fill the thread pool.
+ * <p>
+ * All settings are taken from the JoshuaConfiguration held by this decoder; the configFile
+ * argument is not read by this method. An IOException or an interruption during
+ * initialization is logged as a warning and not rethrown.
+ *
+ * @param configFile File containing configuration options
+ * @return An initialized decoder
+ * @throws IllegalArgumentException if the command-line weight overwrite string does not consist
+ * of feature/value pairs
+ */
+ public Decoder initialize(String configFile) {
+ try {
+
+ long pre_load_time = System.currentTimeMillis();
+
+ /* Weights can be listed in a separate file (denoted by parameter "weights-file") or directly
+ * in the Joshua config file. Config file values take precedence.
+ */
+ this.readWeights(joshuaConfiguration.weights_file);
+
+
+ /* Add command-line-passed weights to the weights array for processing below */
+ if (!Strings.isNullOrEmpty(joshuaConfiguration.weight_overwrite)) {
+ String[] tokens = joshuaConfiguration.weight_overwrite.split("\\s+");
+ // Tokens are consumed in (feature, value) pairs; reject a dangling feature name up front
+ // instead of running off the end of the array.
+ if (tokens.length % 2 != 0) {
+ throw new IllegalArgumentException(String.format(
+ "Weight overwrite must be a list of 'feature value' pairs, but was '%s'",
+ joshuaConfiguration.weight_overwrite));
+ }
+ for (int i = 0; i < tokens.length; i += 2) {
+ String feature = tokens[i];
+ float value = Float.parseFloat(tokens[i+1]);
+
+ if (joshuaConfiguration.moses)
+ feature = demoses(feature);
+
+ joshuaConfiguration.weights.add(String.format("%s %s", feature, tokens[i+1]));
+ LOG.info("COMMAND LINE WEIGHT: {} -> {}", feature, value);
+ }
+ }
+
+ /* Read the weights found in the config file */
+ for (String pairStr: joshuaConfiguration.weights) {
+ String pair[] = pairStr.split("\\s+");
+
+ /* Sanity check for old-style unsupported feature invocations. */
+ if (pair.length != 2) {
+ StringBuilder errMsg = new StringBuilder();
+ errMsg.append("FATAL: Invalid feature weight line found in config file.\n");
+ errMsg.append(String.format("The line was '%s'\n", pairStr));
+ errMsg.append("You might be using an old version of the config file that is no longer supported\n");
+ errMsg.append("Check joshua-decoder.org or email joshua_support@googlegroups.com for help\n");
+ errMsg.append("Code = " + 17);
+ throw new RuntimeException(errMsg.toString());
+ }
+
+ weights.set(pair[0], Float.parseFloat(pair[1]));
+ }
+
+ LOG.info("Read {} weights ({} of them dense)", weights.size(), DENSE_FEATURE_NAMES.size());
+
+ // Do this before loading the grammars and the LM.
+ this.featureFunctions = new ArrayList<FeatureFunction>();
+
+ // Initialize and load grammars. This must happen first, since the vocab gets defined by
+ // the packed grammar (if any)
+ this.initializeTranslationGrammars();
+ LOG.info("Grammar loading took: {} seconds.",
+ (System.currentTimeMillis() - pre_load_time) / 1000);
+
+ // Initialize the features: requires that LM model has been initialized.
+ this.initializeFeatureFunctions();
+
+ // This is mostly for compatibility with the Moses tuning script
+ if (joshuaConfiguration.show_weights_and_quit) {
+ for (int i = 0; i < DENSE_FEATURE_NAMES.size(); i++) {
+ String name = DENSE_FEATURE_NAMES.get(i);
+ if (joshuaConfiguration.moses)
+ System.out.println(String.format("%s= %.5f", mosesize(name), weights.getDense(i)));
+ else
+ System.out.println(String.format("%s %.5f", name, weights.getDense(i)));
+ }
+ System.exit(0);
+ }
+
+ // Sort the TM grammars (needed to do cube pruning)
+ if (joshuaConfiguration.amortized_sorting) {
+ LOG.info("Grammar sorting happening lazily on-demand.");
+ } else {
+ long pre_sort_time = System.currentTimeMillis();
+ for (Grammar grammar : this.grammars) {
+ grammar.sortGrammar(this.featureFunctions);
+ }
+ LOG.info("Grammar sorting took {} seconds.",
+ (System.currentTimeMillis() - pre_sort_time) / 1000);
+ }
+
+ // Create the threads
+ for (int i = 0; i < joshuaConfiguration.num_parallel_decoders; i++) {
+ this.threadPool.put(new DecoderThread(this.grammars, Decoder.weights,
+ this.featureFunctions, joshuaConfiguration));
+ }
+ } catch (IOException e) {
+ LOG.warn(e.getMessage(), e);
+ } catch (InterruptedException e) {
+ // Keep the interruption visible to the caller rather than consuming it here.
+ Thread.currentThread().interrupt();
+ LOG.warn(e.getMessage(), e);
+ }
+
+ return this;
+ }
+
+ /**
+ * Initializes the translation grammars listed in the configuration (or a dummy glue grammar if
+ * none is listed), adds the custom-entry grammar and, for lattice decoding, an epsilon-deleting
+ * grammar, and finally registers one PhraseModel feature function per distinct grammar owner.
+ *
+ * @throws IOException if a grammar cannot be read
+ * @throws RuntimeException if a packed grammar directory cannot be loaded, or if several packed
+ * grammars were built with different vocabularies
+ */
+ private void initializeTranslationGrammars() throws IOException {
+
+ if (joshuaConfiguration.tms.size() > 0) {
+
+ // collect packedGrammars to check if they use a shared vocabulary
+ final List<PackedGrammar> packed_grammars = new ArrayList<>();
+
+ // tm = {thrax/hiero,packed,samt,moses} OWNER LIMIT FILE
+ for (String tmLine : joshuaConfiguration.tms) {
+
+ String type = tmLine.substring(0, tmLine.indexOf(' '));
+ String[] args = tmLine.substring(tmLine.indexOf(' ')).trim().split("\\s+");
+ HashMap<String, String> parsedArgs = FeatureFunction.parseArgs(args);
+
+ String owner = parsedArgs.get("owner");
+ int span_limit = Integer.parseInt(parsedArgs.get("maxspan"));
+ String path = parsedArgs.get("path");
+
+ Grammar grammar = null;
+ if (! type.equals("moses") && ! type.equals("phrase")) {
+ if (new File(path).isDirectory()) {
+ try {
+ PackedGrammar packed_grammar = new PackedGrammar(path, span_limit, owner, type, joshuaConfiguration);
+ packed_grammars.add(packed_grammar);
+ grammar = packed_grammar;
+ } catch (FileNotFoundException e) {
+ String msg = String.format("Couldn't load packed grammar from '%s'. ", path)
+ + "Perhaps it doesn't exist, or it may be an old packed file format.";
+ // Keep the original exception as the cause so the missing file is identifiable.
+ throw new RuntimeException(msg, e);
+ }
+ } else {
+ // thrax, hiero, samt
+ grammar = new MemoryBasedBatchGrammar(type, path, owner,
+ joshuaConfiguration.default_non_terminal, span_limit, joshuaConfiguration);
+ }
+
+ } else {
+
+ // NOTE(review): maxSourceLen is parsed (which validates the option) but is not passed on
+ // to the PhraseTable below -- confirm whether it should be.
+ int maxSourceLen = parsedArgs.containsKey("max-source-len")
+ ? Integer.parseInt(parsedArgs.get("max-source-len"))
+ : -1;
+
+ joshuaConfiguration.search_algorithm = "stack";
+ grammar = new PhraseTable(path, owner, type, joshuaConfiguration);
+ }
+
+ this.grammars.add(grammar);
+ }
+
+ checkSharedVocabularyChecksumsForPackedGrammars(packed_grammars);
+
+ } else {
+ LOG.warn("no grammars supplied! Supplying dummy glue grammar.");
+ MemoryBasedBatchGrammar glueGrammar = new MemoryBasedBatchGrammar("glue", joshuaConfiguration);
+ glueGrammar.setSpanLimit(-1);
+ glueGrammar.addGlueRules(featureFunctions);
+ this.grammars.add(glueGrammar);
+ }
+
+ /* Add the grammar for custom entries */
+ if (joshuaConfiguration.search_algorithm.equals("stack"))
+ this.customPhraseTable = new PhraseTable(null, "custom", "phrase", joshuaConfiguration);
+ else
+ this.customPhraseTable = new MemoryBasedBatchGrammar("custom", joshuaConfiguration);
+ this.grammars.add(this.customPhraseTable);
+
+ /* Create an epsilon-deleting grammar */
+ if (joshuaConfiguration.lattice_decoding) {
+ LOG.info("Creating an epsilon-deleting grammar");
+ MemoryBasedBatchGrammar latticeGrammar = new MemoryBasedBatchGrammar("lattice", joshuaConfiguration);
+ latticeGrammar.setSpanLimit(-1);
+ HieroFormatReader reader = new HieroFormatReader();
+
+ String goalNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.goal_symbol);
+ String defaultNT = FormatUtils.cleanNonTerminal(joshuaConfiguration.default_non_terminal);
+
+ // The format has three placeholders; two surplus arguments that the formatter ignored were
+ // dropped, so the produced string is unchanged.
+ // NOTE(review): the source side uses the goal nonterminal and the target side the default
+ // one -- confirm this is the intended rule.
+ String ruleString = String.format("[%s] ||| [%s,1] <eps> ||| [%s,1] ||| ", goalNT, goalNT,
+ defaultNT);
+
+ Rule rule = reader.parseLine(ruleString);
+ latticeGrammar.addRule(rule);
+ rule.estimateRuleCost(featureFunctions);
+
+ this.grammars.add(latticeGrammar);
+ }
+
+ /* Now create a feature function for each owner */
+ HashSet<String> ownersSeen = new HashSet<String>();
+
+ for (Grammar grammar: this.grammars) {
+ String owner = Vocabulary.word(grammar.getOwner());
+ if (! ownersSeen.contains(owner)) {
+ this.featureFunctions.add(new PhraseModel(weights, new String[] { "tm", "-owner", owner },
+ joshuaConfiguration, grammar));
+ ownersSeen.add(owner);
+ }
+ }
+
+ LOG.info("Memory used {} MB",
+ ((Runtime.getRuntime().totalMemory() - Runtime.getRuntime().freeMemory()) / 1000000.0));
+ }
+
+ /**
+ * Checks if multiple packedGrammars have the same vocabulary by comparing their vocabulary file
+ * checksums.
+ *
+ * @param packed_grammars the packed grammars to compare; an empty or single-element list passes
+ * @throws RuntimeException if two grammars report different checksums
+ */
+ private static void checkSharedVocabularyChecksumsForPackedGrammars(final List<PackedGrammar> packed_grammars) {
+ String previous_checksum = "";
+ for (PackedGrammar grammar : packed_grammars) {
+ final String checksum = grammar.computeVocabularyChecksum();
+ if (previous_checksum.isEmpty()) {
+ // First grammar seen: its checksum is the one all others must match.
+ previous_checksum = checksum;
+ } else if (!checksum.equals(previous_checksum)) {
+ throw new RuntimeException(
+ "Trying to load multiple packed grammars with different vocabularies! " +
+ "Have you packed them jointly?");
+ }
+ }
+ }
+
+ /*
+ * This function reads the weights for the model. Feature names and their weights are listed one
+ * per line in the following format:
+ *
+ * FEATURE_NAME WEIGHT
+ *
+ * Decoder.weights is replaced by a fresh FeatureVector first, so a null or empty file name
+ * simply leaves the weights empty. Empty lines, lines starting with "#" or "//", and lines
+ * without a space are skipped. An I/O error is rethrown wrapped in a RuntimeException.
+ */
+ private void readWeights(String fileName) {
+ Decoder.weights = new FeatureVector();
+
+ if (fileName == null || fileName.equals(""))
+ return;
+
+ try {
+ LineReader lineReader = new LineReader(fileName);
+ try {
+ for (String line : lineReader) {
+ // Collapse runs of whitespace and drop leading/trailing blanks so that indented lines and
+ // indented comments are tokenized like unindented ones.
+ line = line.replaceAll("\\s+", " ").trim();
+
+ if (line.equals("") || line.startsWith("#") || line.startsWith("//")
+ || line.indexOf(' ') == -1)
+ continue;
+
+ String tokens[] = line.split("\\s+");
+ String feature = tokens[0];
+ float value = Float.parseFloat(tokens[1]);
+
+ // Kludge for compatibility with Moses tuners
+ if (joshuaConfiguration.moses) {
+ feature = demoses(feature);
+ }
+
+ weights.increment(feature, value);
+ }
+ } finally {
+ // Release the file handle whether or not parsing succeeded.
+ lineReader.close();
+ }
+ } catch (IOException ioe) {
+ throw new RuntimeException(ioe);
+ }
+ LOG.info("Read {} weights from file '{}'", weights.size(), fileName);
+ }
+
+ private String demoses(String feature) {
+ if (feature.endsWith("="))
+ feature = feature.replace("=", "");
+ if (feature.equals("OOV_Penalty"))
+ feature = "OOVPenalty";
+ else if (feature.startsWith("tm-") || feature.startsWith("lm-"))
+ feature = feature.replace("-", "_");
+ return feature;
+ }
+
+ /**
+ * Feature functions are instantiated with a line of the form
+ *
+ * <pre>
+ * FEATURE OPTIONS
+ * </pre>
+ *
+ * Weights for features are listed separately.
+ *
+ * @throws IOException
+ *
+ */
+ private void initializeFeatureFunctions() throws IOException {
+
+ for (String featureLine : joshuaConfiguration.features) {
+ // line starts with NAME, followed by args
+ // 1. create new class named NAME, pass it config, weights, and the args
+
+ String fields[] = featureLine.split("\\s+");
+ String featureName = fields[0];
+
+ try {
+
+ Class<?> clas = getFeatureFunctionClass(featureName);
+ Constructor<?> constructor = clas.getConstructor(FeatureVector.class,
+ String[].class, JoshuaConfiguration.class);
+ FeatureFunction feature = (FeatureFunction) constructor.newInstance(weights, fields, joshuaConfiguration);
+ this.featureFunctions.add(feature);
+
+ } catch (Exception e) {
+ throw new RuntimeException(String.format("Unable to instantiate feature function '%s'!", featureLine), e);
+ }
+ }
+
+ for (FeatureFunction feature : featureFunctions) {
+ LOG.info("FEATURE: {}", feature.logString());
+ }
+
+ weights.registerDenseFeatures(featureFunctions);
+ }
+
+ /**
+ * Searches a list of predefined paths for classes, and returns the first one found. Meant for
+ * instantiating feature functions.
+ *
+ * @param name
+ * @return the class, found in one of the search paths
+ * @throws ClassNotFoundException
+ */
+ private Class<?> getFeatureFunctionClass(String featureName) {
+ Class<?> clas = null;
+
+ String[] packages = { "org.apache.joshua.decoder.ff", "org.apache.joshua.decoder.ff.lm", "org.apache.joshua.decoder.ff.phrase" };
+ for (String path : packages) {
+ try {
+ clas = Class.forName(String.format("%s.%s", path, featureName));
+ break;
+ } catch (ClassNotFoundException e) {
+ try {
+ clas = Class.forName(String.format("%s.%sFF", path, featureName));
+ break;
+ } catch (ClassNotFoundException e2) {
+ // do nothing
+ }
+ }
+ }
+ return clas;
+ }
+
+ /**
+ * Adds a rule to the custom grammar and then estimates the rule's cost against the decoder's
+ * current feature functions.
+ *
+ * @param rule the rule to add
+ */
+ public void addCustomRule(Rule rule) {
+ customPhraseTable.addRule(rule);
+ rule.estimateRuleCost(featureFunctions);
+ }
+
+ /**
+ * Returns the grammar that holds custom rules. It is null until the translation grammars have
+ * been initialized.
+ *
+ * @return the custom grammar, or null before initialization
+ */
+ public Grammar getCustomPhraseTable() {
+ return customPhraseTable;
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/decoder/DecoderThread.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/decoder/DecoderThread.java b/joshua-core/src/main/java/org/apache/joshua/decoder/DecoderThread.java
new file mode 100644
index 0000000..d6f5233
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/decoder/DecoderThread.java
@@ -0,0 +1,201 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.decoder;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.List;
+
+import org.apache.joshua.decoder.chart_parser.Chart;
+import org.apache.joshua.decoder.ff.FeatureFunction;
+import org.apache.joshua.decoder.ff.FeatureVector;
+import org.apache.joshua.decoder.ff.SourceDependentFF;
+import org.apache.joshua.decoder.ff.tm.Grammar;
+import org.apache.joshua.decoder.hypergraph.ForestWalker;
+import org.apache.joshua.decoder.hypergraph.GrammarBuilderWalkerFunction;
+import org.apache.joshua.decoder.hypergraph.HyperGraph;
+import org.apache.joshua.decoder.phrase.Stacks;
+import org.apache.joshua.decoder.segment_file.Sentence;
+import org.apache.joshua.corpus.Vocabulary;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This class handles decoding of individual Sentence objects (which can represent plain sentences
+ * or lattices). A single sentence is decoded by a call to {@link #translate(Sentence)}, which
+ * builds a hypergraph with either the stack-based search or the chart parser and, when
+ * synchronous parsing is enabled, parses the target side in a second pass.
+ *
+ * Each instance keeps its own list of feature functions: those implementing
+ * {@link SourceDependentFF} are cloned at construction time, all others are shared with the
+ * caller.
+ *
+ * NOTE(review): earlier documentation said that many sentences can be decoded via translateAll()
+ * with an InputHandler and that DecoderFactory launches the threads; none of these is referenced
+ * in this class -- confirm against the callers.
+ *
+ * @author Matt Post post@cs.jhu.edu
+ * @author Zhifei Li, zhifei.work@gmail.com
+ */
+public class DecoderThread extends Thread {
+  private static final Logger LOG = LoggerFactory.getLogger(DecoderThread.class);
+
+  private final JoshuaConfiguration joshuaConfiguration;
+  /*
+   * these variables may be the same across all threads (e.g., just copy from DecoderFactory), or
+   * differ from thread to thread
+   */
+  private final List<Grammar> allGrammars;
+  private final List<FeatureFunction> featureFunctions;
+
+
+  // ===============================================================
+  // Constructor
+  // ===============================================================
+
+  /**
+   * Creates a decoder thread.
+   *
+   * @param grammars the grammars to decode with; the list is stored by reference, not copied
+   * @param weights not read by this constructor
+   * @param featureFunctions the feature functions to score with; instances of
+   *        {@link SourceDependentFF} are cloned, all others are reused as-is
+   * @param joshuaConfiguration the decoder configuration
+   * @throws IOException declared on the signature; NOTE(review): nothing in the visible body
+   *         obviously throws it -- confirm whether callers rely on it
+   */
+  public DecoderThread(List<Grammar> grammars, FeatureVector weights,
+      List<FeatureFunction> featureFunctions, JoshuaConfiguration joshuaConfiguration) throws IOException {
+
+    this.joshuaConfiguration = joshuaConfiguration;
+    this.allGrammars = grammars;
+
+    this.featureFunctions = new ArrayList<FeatureFunction>(featureFunctions.size());
+    for (FeatureFunction ff : featureFunctions) {
+      if (ff instanceof SourceDependentFF) {
+        this.featureFunctions.add(((SourceDependentFF) ff).clone());
+      } else {
+        this.featureFunctions.add(ff);
+      }
+    }
+  }
+
+  // ===============================================================
+  // Methods
+  // ===============================================================
+
+  @Override
+  public void run() {
+    // Nothing to do but wait.
+  }
+
+  /**
+   * Translate a sentence.
+   *
+   * Empty sentences, and sentences whose search ran out of memory, yield a Translation built
+   * with a {@code null} hypergraph. When {@code joshuaConfiguration.parse} is set and the first
+   * pass succeeded, a grammar is extracted from the first-pass hypergraph and the sentence's
+   * target side is parsed with it; the Translation then wraps that second hypergraph.
+   *
+   * @param sentence The sentence to be translated.
+   * @return the sentence {@link org.apache.joshua.decoder.Translation}
+   */
+  public Translation translate(Sentence sentence) {
+
+    LOG.info("Input {}: {}", sentence.id(), sentence.fullSource());
+
+    if (sentence.target() != null) {
+      LOG.info("Input {}: Constraining to target sentence '{}'",
+          sentence.id(), sentence.target());
+    }
+
+    // skip blank sentences
+    if (sentence.isEmpty()) {
+      LOG.info("Translation {}: Translation took 0 seconds", sentence.id());
+      return new Translation(sentence, null, featureFunctions, joshuaConfiguration);
+    }
+
+    long startTime = System.currentTimeMillis();
+
+    Grammar[] grammars = allGrammars.toArray(new Grammar[0]);
+
+    if (joshuaConfiguration.segment_oovs) {
+      sentence.segmentOOVs(grammars);
+    }
+
+    HyperGraph hypergraph = search(sentence, grammars);
+
+    float seconds = (System.currentTimeMillis() - startTime) / 1000.0f;
+    LOG.info("Input {}: Translation took {} seconds", sentence.id(), seconds);
+    LOG.info("Input {}: Memory used is {} MB", sentence.id(), usedMemoryMb());
+
+    /* Return the translation unless we're doing synchronous parsing. */
+    if (!joshuaConfiguration.parse || hypergraph == null) {
+      return new Translation(sentence, hypergraph, featureFunctions, joshuaConfiguration);
+    }
+
+    /*****************************************************************************************/
+
+    /*
+     * Synchronous parsing.
+     *
+     * Step 1. Traverse the hypergraph to create a grammar for the second-pass parse.
+     */
+    Grammar newGrammar = getGrammarFromHyperGraph(joshuaConfiguration.goal_symbol, hypergraph);
+    newGrammar.sortGrammar(this.featureFunctions);
+    long sortTime = System.currentTimeMillis();
+    LOG.info("Sentence {}: New grammar has {} rules.", sentence.id(),
+        newGrammar.getNumRules());
+
+    /* Step 2. Create a new chart and parse with the instantiated grammar. */
+    Grammar[] newGrammarArray = new Grammar[] { newGrammar };
+    Sentence targetSentence = new Sentence(sentence.target(), sentence.id(), joshuaConfiguration);
+    Chart chart = new Chart(targetSentence, featureFunctions, newGrammarArray, "GOAL",joshuaConfiguration);
+    int goalSymbol = GrammarBuilderWalkerFunction.goalSymbol(hypergraph);
+    String goalSymbolString = Vocabulary.word(goalSymbol);
+    LOG.info("Sentence {}: goal symbol is {} ({}).", sentence.id(),
+        goalSymbolString, goalSymbol);
+    chart.setGoalSymbolID(goalSymbol);
+
+    /* Parsing */
+    HyperGraph englishParse = chart.expand();
+    long secondParseTime = System.currentTimeMillis();
+    // Divide as float, like the first pass: integer division reported sub-second runs as 0.
+    LOG.info("Sentence {}: Finished second chart expansion ({} seconds).",
+        sentence.id(), (secondParseTime - sortTime) / 1000.0f);
+    LOG.info("Sentence {} total time: {} seconds.\n", sentence.id(),
+        (secondParseTime - startTime) / 1000.0f);
+    LOG.info("Memory used after sentence {} is {} MB", sentence.id(), usedMemoryMb());
+    return new Translation(sentence, englishParse, featureFunctions, joshuaConfiguration); // or do something else
+  }
+
+  /**
+   * Runs the first-pass search for a sentence and returns the resulting hypergraph.
+   *
+   * Joshua supports (as of September 2014) both phrase-based and hierarchical decoding. Here
+   * we build the appropriate chart. The output of both systems is a hypergraph, which is then
+   * used for further processing (e.g., k-best extraction).
+   *
+   * @param sentence the sentence to decode
+   * @param grammars the grammars to decode with
+   * @return the hypergraph, or {@code null} if the search ran out of memory
+   */
+  private HyperGraph search(Sentence sentence, Grammar[] grammars) {
+    try {
+      if (joshuaConfiguration.search_algorithm.equals("stack")) {
+        Stacks stacks = new Stacks(sentence, this.featureFunctions, grammars, joshuaConfiguration);
+        return stacks.search();
+      }
+
+      /* Seeding: the chart only sees the grammars, not the factories */
+      Chart chart = new Chart(sentence, this.featureFunctions, grammars,
+          joshuaConfiguration.goal_symbol, joshuaConfiguration);
+
+      return (joshuaConfiguration.use_dot_chart)
+          ? chart.expand()
+          : chart.expandSansDotChart();
+
+    } catch (java.lang.OutOfMemoryError e) {
+      // Deliberate best effort: log and report no hypergraph instead of propagating the error.
+      LOG.error("Input {}: out of memory", sentence.id());
+      return null;
+    }
+  }
+
+  /**
+   * Computes the JVM's total memory minus its free memory, divided by 1,000,000.
+   *
+   * @return the memory in use, in (decimal) megabytes
+   */
+  private static double usedMemoryMb() {
+    Runtime runtime = Runtime.getRuntime();
+    return (runtime.totalMemory() - runtime.freeMemory()) / 1000000.0;
+  }
+
+  /**
+   * Walks a hypergraph from its goal node with a {@link GrammarBuilderWalkerFunction} and returns
+   * the grammar that the walker function produced.
+   *
+   * @param goal the goal symbol handed to the walker function
+   * @param hg the hypergraph to walk
+   * @return the grammar built during the walk
+   */
+  private Grammar getGrammarFromHyperGraph(String goal, HyperGraph hg) {
+    GrammarBuilderWalkerFunction f = new GrammarBuilderWalkerFunction(goal,joshuaConfiguration);
+    ForestWalker walker = new ForestWalker();
+    walker.walk(hg.goalNode, f);
+    return f.getGrammar();
+  }
+}