Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:42 UTC
[31/60] [partial] incubator-joshua git commit: maven multi-module layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/metrics/PrecisMinusSourceBLEU.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/metrics/PrecisMinusSourceBLEU.java b/joshua-core/src/main/java/org/apache/joshua/metrics/PrecisMinusSourceBLEU.java
new file mode 100644
index 0000000..bfe15d0
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/metrics/PrecisMinusSourceBLEU.java
@@ -0,0 +1,182 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.metrics;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+
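+/**
+ * Combined metric: the Precis score of a candidate minus a weighted BLEU score computed
+ * against the source side (see score()), so that candidates which merely echo the source
+ * are penalized.
+ */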
+public class PrecisMinusSourceBLEU extends EvaluationMetric {
+
+ private Precis myPrecis;
+ private SourceBLEU mySourceBLEU;
+
+ private double bleuWeight;
+
+ private int precisCount;
+ private int sourceBleuCount;
+
+ public PrecisMinusSourceBLEU(String[] options) {
+ // Automatically deactivate Levenshtein penalty for Precis.
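+    // options[5] is read here as the weight on the source-BLEU term, then zeroed before
+    // the options are handed to Precis.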
+ bleuWeight = Double.parseDouble(options[5]);
+ options[5] = "0";
+
+ myPrecis = new Precis(options);
+ mySourceBLEU =
+ new SourceBLEU(Integer.parseInt(options[0]), options[1], Integer.parseInt(options[2]),
+ false);
+
+ initialize();
+ }
+
+ protected void initialize() {
+ metricName = "PRECIS-SRC_BLEU";
+ toBeMinimized = false;
+ precisCount = myPrecis.suffStatsCount;
+ sourceBleuCount = mySourceBLEU.suffStatsCount;
+ suffStatsCount = precisCount + sourceBleuCount;
+ }
+
+ public double bestPossibleScore() {
+ return 1.0;
+ }
+
+ public double worstPossibleScore() {
+ return -1.0;
+ }
+
+ public int[] suffStats(String cand_str, int i) {
+ return null;
+ }
+
+ public int[][] suffStats(String[] cand_strings, int[] cand_indices) {
+ int candCount = cand_strings.length;
+ if (cand_indices.length != candCount) {
+ System.out.println("Array lengths mismatch in suffStats(String[],int[]); returning null.");
+ return null;
+ }
+
+ int[][] stats = new int[candCount][suffStatsCount];
+
+ int[][] precis_stats = myPrecis.suffStats(cand_strings, cand_indices);
+ int[][] source_bleu_stats = mySourceBLEU.suffStats(cand_strings, cand_indices);
+
+ for (int d = 0; d < candCount; ++d) {
+ int s = 0;
+ for (int s_T = 0; s_T < precisCount; s_T++) {
+ stats[d][s] = precis_stats[d][s_T];
+ ++s;
+ }
+ for (int s_B = 0; s_B < sourceBleuCount; s_B++) {
+ stats[d][s] = source_bleu_stats[d][s_B];
+ ++s;
+ }
+ }
+ return stats;
+ }
+
+ public void createSuffStatsFile(String cand_strings_fileName, String cand_indices_fileName,
+ String outputFileName, int maxBatchSize) {
+ try {
+ myPrecis.createSuffStatsFile(cand_strings_fileName, cand_indices_fileName, outputFileName
+ + ".PRECIS", maxBatchSize);
+ mySourceBLEU.createSuffStatsFile(cand_strings_fileName, cand_indices_fileName, outputFileName
+ + ".SRC_BLEU", maxBatchSize);
+
+ PrintWriter outFile = new PrintWriter(outputFileName);
+
+ FileInputStream inStream_Precis = new FileInputStream(outputFileName + ".PRECIS");
+ BufferedReader inFile_Precis =
+ new BufferedReader(new InputStreamReader(inStream_Precis, "utf8"));
+
+ FileInputStream inStream_SourceBLEU = new FileInputStream(outputFileName + ".SRC_BLEU");
+ BufferedReader inFile_SourceBLEU =
+ new BufferedReader(new InputStreamReader(inStream_SourceBLEU, "utf8"));
+
+ String line_Precis = inFile_Precis.readLine();
+ String line_SourceBLEU = inFile_SourceBLEU.readLine();
+
+ // combine the two files into one
+ while (line_Precis != null) {
+ outFile.println(line_Precis + " " + line_SourceBLEU);
+ line_Precis = inFile_Precis.readLine();
+ line_SourceBLEU = inFile_SourceBLEU.readLine();
+ }
+
+ inFile_Precis.close();
+ inFile_SourceBLEU.close();
+ outFile.close();
+
+ File fd;
+ fd = new File(outputFileName + ".PRECIS");
+ if (fd.exists()) fd.delete();
+ fd = new File(outputFileName + ".SRC_BLEU");
+ if (fd.exists()) fd.delete();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public double score(int[] stats) {
+ if (stats.length != suffStatsCount) {
+ throw new RuntimeException("Mismatch between stats.length and suffStatsCount (" + stats.length
+ + " vs. " + suffStatsCount + ") in PrecisMinusSourceBLEU.score(int[])");
+ }
+
+ double sc = 0.0;
+
+ int[] stats_Precis = new int[precisCount];
+ int[] stats_SourceBLEU = new int[sourceBleuCount];
+ for (int s = 0; s < precisCount; ++s) {
+ stats_Precis[s] = stats[s];
+ }
+ for (int s = 0; s < sourceBleuCount; ++s) {
+ stats_SourceBLEU[s] = stats[s + precisCount];
+ }
+
+ double sc_T = myPrecis.score(stats_Precis);
+ double sc_B = mySourceBLEU.score(stats_SourceBLEU);
+
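+    // Final score: Precis score minus the weighted source-side BLEU score.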
+ sc = sc_T - (bleuWeight * sc_B);
+
+ return sc;
+ }
+
+ public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
+ int[] stats_Precis = new int[precisCount];
+ int[] stats_SourceBLEU = new int[sourceBleuCount];
+ for (int s = 0; s < precisCount; ++s) {
+ stats_Precis[s] = stats[s];
+ }
+ for (int s = 0; s < sourceBleuCount; ++s) {
+ stats_SourceBLEU[s] = stats[s + precisCount];
+ }
+
+ System.out.println("---PRECIS---");
+ myPrecis.printDetailedScore_fromStats(stats_Precis, oneLiner);
+ System.out.println("---SRC_BLEU---");
+ mySourceBLEU.printDetailedScore_fromStats(stats_SourceBLEU, oneLiner);
+ System.out.println("---------");
+ System.out.println(" => " + metricName + " = " + f4.format(score(stats)));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/metrics/SARI.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/metrics/SARI.java b/joshua-core/src/main/java/org/apache/joshua/metrics/SARI.java
new file mode 100644
index 0000000..129e4af
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/metrics/SARI.java
@@ -0,0 +1,681 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.joshua.metrics;
+
+// Note: PROCore.java (text normalization function) and EvaluationMetric were changed as well.
+
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.logging.Logger;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+
+/***
+ * Implementation of the SARI metric for text-to-text correction.
+ *
+ * \@article{xu2016optimizing,
+ * title={Optimizing statistical machine translation for text simplification},
+ * author={Xu, Wei and Napoles, Courtney and Pavlick, Ellie and Chen, Quanze and Callison-Burch, Chris},
+ * journal={Transactions of the Association for Computational Linguistics},
+ * volume={4},
+ * year={2016}}
+ *
+ * @author Wei Xu
+ */
+public class SARI extends EvaluationMetric {
+ private static final Logger logger = Logger.getLogger(SARI.class.getName());
+
+ // The maximum n-gram we care about
+ protected int maxGramLength;
+ protected String[] srcSentences;
+ protected double[] weights;
+ protected HashMap<String, Integer>[][] refNgramCounts;
+ protected HashMap<String, Integer>[][] srcNgramCounts;
+
+  /*
+   * You already have access to these data members of the parent class (EvaluationMetric):
+   *   int numSentences;        // number of sentences in the MERT set
+   *   int refsPerSen;          // number of references per sentence
+   *   String[][] refSentences; // refSentences[i][r] stores the r'th reference of the i'th
+   *                            // source sentence (both indices are 0-based)
+   */
+
+ public SARI(String[] Metric_options) {
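+    // Metric_options[0]: maximum n-gram order; Metric_options[1]: path to the file of source sentences.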
+ int mxGrmLn = Integer.parseInt(Metric_options[0]);
+ if (mxGrmLn >= 1) {
+ maxGramLength = mxGrmLn;
+ } else {
+ logger.severe("Maximum gram length must be positive");
+ System.exit(1);
+ }
+
+ try {
+ loadSources(Metric_options[1]);
+ } catch (IOException e) {
+ logger.severe("Error loading the source sentences from " + Metric_options[1]);
+ System.exit(1);
+ }
+
+ initialize(); // set the data members of the metric
+
+ }
+
+ protected void initialize() {
+ metricName = "SARI";
+ toBeMinimized = false;
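+    // Layout: one block of StatIndex.values().length counters per n-gram order, plus one extra slot.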
+ suffStatsCount = StatIndex.values().length * maxGramLength + 1;
+
+ set_weightsArray();
+ set_refNgramCounts();
+ set_srcNgramCounts();
+
+ }
+
+ public double bestPossibleScore() {
+ return 1.0;
+ }
+
+ public double worstPossibleScore() {
+ return 0.0;
+ }
+
+ /**
+   * Sets the n-gram weights to uniform (1/maxGramLength for each n-gram order).
+ */
+ protected void set_weightsArray() {
+ weights = new double[1 + maxGramLength];
+ for (int n = 1; n <= maxGramLength; ++n) {
+ weights[n] = 1.0 / maxGramLength;
+ }
+ }
+
+ /**
+   * Computes, for each sentence, the n-gram counts summed over all references (stored in
+   * <code>refNgramCounts</code>); suffStats uses these reference counts to compute the
+   * add/keep/delete statistics.
+ */
+ protected void set_refNgramCounts() {
+    @SuppressWarnings("unchecked")
+    HashMap<String, Integer>[][] temp_HMA = new HashMap[numSentences][maxGramLength];
+ refNgramCounts = temp_HMA;
+
+ String gram = "";
+ int oldCount = 0, nextCount = 0;
+
+ for (int i = 0; i < numSentences; ++i) {
+ refNgramCounts[i] = getNgramCountsArray(refSentences[i][0]);
+ // initialize to ngramCounts[n] of the first reference translation...
+
+ // ...and update as necessary from the other reference translations
+ for (int r = 1; r < refsPerSen; ++r) {
+
+ HashMap<String, Integer>[] nextNgramCounts = getNgramCountsArray(refSentences[i][r]);
+
+ for (int n = 1; n <= maxGramLength; ++n) {
+
+ Iterator<String> it = (nextNgramCounts[n].keySet()).iterator();
+
+ while (it.hasNext()) {
+ gram = it.next();
+ nextCount = nextNgramCounts[n].get(gram);
+
+ if (refNgramCounts[i][n].containsKey(gram)) { // update if necessary
+ oldCount = refNgramCounts[i][n].get(gram);
+ refNgramCounts[i][n].put(gram, oldCount + nextCount);
+ } else { // add it
+ refNgramCounts[i][n].put(gram, nextCount);
+ }
+
+ }
+
+ } // for (n)
+
+ } // for (r)
+
+ } // for (i)
+
+ }
+
+ protected void set_srcNgramCounts() {
+    @SuppressWarnings("unchecked")
+    HashMap<String, Integer>[][] temp_HMA = new HashMap[numSentences][maxGramLength];
+ srcNgramCounts = temp_HMA;
+
+ for (int i = 0; i < numSentences; ++i) {
+ srcNgramCounts[i] = getNgramCountsArray(srcSentences[i]);
+ } // for (i)
+ }
+
+ // set contents of stats[] here!
+ public int[] suffStats(String cand_str, int i) {
+ int[] stats = new int[suffStatsCount];
+
+ HashMap<String, Integer>[] candNgramCounts = getNgramCountsArray(cand_str);
+
+ for (int n = 1; n <= maxGramLength; ++n) {
+
+ // ADD OPERATIONS
+ HashMap cand_sub_src = substractHashMap(candNgramCounts[n], srcNgramCounts[i][n]);
+ HashMap cand_and_ref_sub_src = intersectHashMap(cand_sub_src, refNgramCounts[i][n]);
+ HashMap ref_sub_src = substractHashMap(refNgramCounts[i][n], srcNgramCounts[i][n]);
+
+ stats[StatIndex.values().length * (n - 1)
+ + StatIndex.ADDBOTH.ordinal()] = cand_and_ref_sub_src.keySet().size();
+ stats[StatIndex.values().length * (n - 1) + StatIndex.ADDCAND.ordinal()] = cand_sub_src
+ .keySet().size();
+ stats[StatIndex.values().length * (n - 1) + StatIndex.ADDREF.ordinal()] = ref_sub_src.keySet()
+ .size();
+
+ // System.out.println("src_and_cand_sub_ref" + cand_and_ref_sub_src +
+ // cand_and_ref_sub_src.keySet().size());
+ // System.out.println("cand_sub_src" + cand_sub_src + cand_sub_src.keySet().size());
+ // System.out.println("ref_sub_src" + ref_sub_src + ref_sub_src.keySet().size());
+
+ // DELETION OPERATIONS
+ HashMap src_sub_cand = substractHashMap(srcNgramCounts[i][n], candNgramCounts[n],
+ this.refsPerSen, this.refsPerSen);
+ HashMap src_sub_ref = substractHashMap(srcNgramCounts[i][n], refNgramCounts[i][n],
+ this.refsPerSen, 1);
+ HashMap src_sub_cand_sub_ref = intersectHashMap(src_sub_cand, src_sub_ref, 1, 1);
+
+ stats[StatIndex.values().length * (n - 1) + StatIndex.DELBOTH.ordinal()] = sumHashMapByValues(
+ src_sub_cand_sub_ref);
+ stats[StatIndex.values().length * (n - 1) + StatIndex.DELCAND.ordinal()] = sumHashMapByValues(
+ src_sub_cand);
+ stats[StatIndex.values().length * (n - 1) + StatIndex.DELREF.ordinal()] = sumHashMapByValues(
+ src_sub_ref);
+
+ // System.out.println("src_sub_cand_sub_ref" + src_sub_cand_sub_ref +
+ // sumHashMapByValues(src_sub_cand_sub_ref));
+ // System.out.println("src_sub_cand" + src_sub_cand + sumHashMapByValues(src_sub_cand));
+ // System.out.println("src_sub_ref" + src_sub_ref + sumHashMapByValues(src_sub_ref));
+
+ stats[StatIndex.values().length * (n - 1) + StatIndex.DELREF.ordinal()] = src_sub_ref.keySet()
+ .size() * this.refsPerSen;
+
+ // KEEP OPERATIONS
+ HashMap src_and_cand = intersectHashMap(srcNgramCounts[i][n], candNgramCounts[n],
+ this.refsPerSen, this.refsPerSen);
+ HashMap src_and_ref = intersectHashMap(srcNgramCounts[i][n], refNgramCounts[i][n],
+ this.refsPerSen, 1);
+ HashMap src_and_cand_and_ref = intersectHashMap(src_and_cand, src_and_ref, 1, 1);
+
+ stats[StatIndex.values().length * (n - 1)
+ + StatIndex.KEEPBOTH.ordinal()] = sumHashMapByValues(src_and_cand_and_ref);
+ stats[StatIndex.values().length * (n - 1)
+ + StatIndex.KEEPCAND.ordinal()] = sumHashMapByValues(src_and_cand);
+ stats[StatIndex.values().length * (n - 1) + StatIndex.KEEPREF.ordinal()] = sumHashMapByValues(
+ src_and_ref);
+
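+      // Fractional keep counts are packed into the int stats array by scaling by 1,000,000;
+      // score() divides the same factor back out when computing prec_keep_n.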
+ stats[StatIndex.values().length * (n - 1) + StatIndex.KEEPBOTH.ordinal()] = (int) (1000000
+ * sumHashMapByDoubleValues(divideHashMap(src_and_cand_and_ref, src_and_cand)));
+ stats[StatIndex.values().length * (n - 1)
+ + StatIndex.KEEPCAND.ordinal()] = (int) sumHashMapByDoubleValues(
+ divideHashMap(src_and_cand_and_ref, src_and_ref));
+ stats[StatIndex.values().length * (n - 1) + StatIndex.KEEPREF.ordinal()] = src_and_ref
+ .keySet().size();
+
+ // System.out.println("src_and_cand_and_ref" + src_and_cand_and_ref);
+ // System.out.println("src_and_cand" + src_and_cand);
+ // System.out.println("src_and_ref" + src_and_ref);
+
+ // stats[StatIndex.values().length * (n - 1) + StatIndex.KEEPBOTH2.ordinal()] = (int)
+ // sumHashMapByDoubleValues(divideHashMap(src_and_cand_and_ref,src_and_ref)) * 100000000 /
+ // src_and_ref.keySet().size() ;
+ // stats[StatIndex.values().length * (n - 1) + StatIndex.KEEPREF.ordinal()] =
+ // src_and_ref.keySet().size() * 8;
+
+ // System.out.println("src_and_cand_and_ref" + src_and_cand_and_ref);
+ // System.out.println("src_and_cand" + src_and_cand);
+ // System.out.println("divide" + divideHashMap(src_and_cand_and_ref,src_and_cand));
+ // System.out.println(sumHashMapByDoubleValues(divideHashMap(src_and_cand_and_ref,src_and_cand)));
+
+ }
+
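+    // The block below recomputes the n = 1 statistics solely for the commented-out debug output.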
+ int n = 1;
+
+ // System.out.println("CAND: " + candNgramCounts[n]);
+ // System.out.println("SRC: " + srcNgramCounts[i][n]);
+ // System.out.println("REF: " + refNgramCounts[i][n]);
+
+ HashMap src_and_cand = intersectHashMap(srcNgramCounts[i][n], candNgramCounts[n],
+ this.refsPerSen, this.refsPerSen);
+ HashMap src_and_ref = intersectHashMap(srcNgramCounts[i][n], refNgramCounts[i][n],
+ this.refsPerSen, 1);
+ HashMap src_and_cand_and_ref = intersectHashMap(src_and_cand, src_and_ref, 1, 1);
+ // System.out.println("SRC&CAND&REF : " + src_and_cand_and_ref);
+
+ HashMap cand_sub_src = substractHashMap(candNgramCounts[n], srcNgramCounts[i][n]);
+ HashMap cand_and_ref_sub_src = intersectHashMap(cand_sub_src, refNgramCounts[i][n]);
+ // System.out.println("CAND&REF-SRC : " + cand_and_ref_sub_src);
+
+ HashMap src_sub_cand = substractHashMap(srcNgramCounts[i][n], candNgramCounts[n],
+ this.refsPerSen, this.refsPerSen);
+ HashMap src_sub_ref = substractHashMap(srcNgramCounts[i][n], refNgramCounts[i][n],
+ this.refsPerSen, 1);
+ HashMap src_sub_cand_sub_ref = intersectHashMap(src_sub_cand, src_sub_ref, 1, 1);
+ // System.out.println("SRC-REF-CAND : " + src_sub_cand_sub_ref);
+
+ // System.out.println("DEBUG:" + Arrays.toString(stats));
+ // System.out.println("REF-SRC: " + substractHashMap(refNgramCounts[i], srcNgramCounts[i][0],
+ // (double)refsPerSen));
+
+ return stats;
+ }
+
+ public double score(int[] stats) {
+ if (stats.length != suffStatsCount) {
+      throw new RuntimeException("Mismatch between stats.length and suffStatsCount ("
+          + stats.length + " vs. " + suffStatsCount + ") in SARI.score(int[])");
+ }
+
+ double sc = 0.0;
+
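+    // For each n-gram order, SARI combines three components: the F1 of added n-grams,
+    // the precision of deleted n-grams, and the F1 of kept n-grams; the final score is
+    // their weighted sum divided by 3.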
+ for (int n = 1; n <= maxGramLength; ++n) {
+
+ int addCandCorrectNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.ADDBOTH.ordinal()];
+ int addCandTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.ADDCAND.ordinal()];
+ int addRefTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.ADDREF.ordinal()];
+
+ double prec_add_n = 0.0;
+ if (addCandTotalNgram > 0) {
+ prec_add_n = addCandCorrectNgram / (double) addCandTotalNgram;
+ }
+
+ double recall_add_n = 0.0;
+ if (addRefTotalNgram > 0) {
+ recall_add_n = addCandCorrectNgram / (double) addRefTotalNgram;
+ }
+
+ // System.out.println("\nDEBUG-SARI:" + addCandCorrectNgram + " " + addCandTotalNgram + " " +
+ // addRefTotalNgram);
+
+ double f1_add_n = meanHarmonic(prec_add_n, recall_add_n);
+
+ sc += weights[n] * f1_add_n;
+
+ int delCandCorrectNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.DELBOTH.ordinal()];
+ int delCandTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.DELCAND.ordinal()];
+ int delRefTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.DELREF.ordinal()];
+
+ double prec_del_n = 0.0;
+ if (delCandTotalNgram > 0) {
+ prec_del_n = delCandCorrectNgram / (double) delCandTotalNgram;
+ }
+
+ double recall_del_n = 0.0;
+ if (delRefTotalNgram > 0) {
+ recall_del_n = delCandCorrectNgram / (double) delRefTotalNgram;
+ }
+
+ // System.out.println("\nDEBUG-SARI:" + delCandCorrectNgram + " " + delRefTotalNgram);
+
+ double f1_del_n = meanHarmonic(prec_del_n, recall_del_n);
+
+ // sc += weights[n] * f1_del_n;
+ sc += weights[n] * prec_del_n;
+
+ int keepCandCorrectNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.KEEPBOTH.ordinal()];
+ // int keepCandCorrectNgram2 = stats[StatIndex.values().length * (n - 1) +
+ // StatIndex.KEEPBOTH2.ordinal()];
+ int keepCandTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.KEEPCAND.ordinal()];
+ int keepRefTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.KEEPREF.ordinal()];
+
+ double prec_keep_n = 0.0;
+ if (keepCandTotalNgram > 0) {
+ prec_keep_n = keepCandCorrectNgram / (double) (1000000 * keepCandTotalNgram);
+ }
+
+ double recall_keep_n = 0.0;
+ if (keepRefTotalNgram > 0) {
+ recall_keep_n = keepCandTotalNgram / (double) keepRefTotalNgram;
+ }
+
+ // System.out.println("\nDEBUG-SARI-KEEP: " + n + " " + keepCandCorrectNgram + " " +
+ // keepCandTotalNgram + " " + keepRefTotalNgram);
+
+ double f1_keep_n = meanHarmonic(prec_keep_n, recall_keep_n);
+
+ sc += weights[n] * f1_keep_n;
+
+ // System.out.println("\nDEBUG-SARI: " + n + " " + prec_add_n + " " + recall_add_n + " " +
+ // prec_del_n + " " + recall_del_n + " " + prec_keep_n + " " + recall_keep_n);
+
+ // System.out.println("\nDEBUG-SARI-KEEP: " + n + " " + keepCandCorrectNgram + " " +
+ // keepCandTotalNgram + " " + keepRefTotalNgram);
+ }
+
+ sc = sc / 3.0;
+
+ return sc;
+ }
+
+ public double meanHarmonic(double precision, double recall) {
+
+ if (precision > 0 && recall > 0) {
+ return (2.0 * precision * recall) / (precision + recall);
+ }
+ return 0.0;
+ }
+
+ public void loadSources(String filepath) throws IOException {
+ srcSentences = new String[numSentences];
+ // BufferedReader br = new BufferedReader(new FileReader(filepath));
+ InputStream inStream = new FileInputStream(new File(filepath));
+ BufferedReader br = new BufferedReader(new InputStreamReader(inStream, "utf8"));
+
+ String line;
+ int i = 0;
+ while (i < numSentences && (line = br.readLine()) != null) {
+ srcSentences[i] = line.trim();
+ i++;
+ }
+ br.close();
+ }
+
+ public double sumHashMapByDoubleValues(HashMap<String, Double> counter) {
+ double sumcounts = 0;
+
+ for (Map.Entry<String, Double> e : counter.entrySet()) {
+ sumcounts += (double) e.getValue();
+ }
+
+ return sumcounts;
+ }
+
+ public int sumHashMapByValues(HashMap<String, Integer> counter) {
+ int sumcounts = 0;
+
+ for (Map.Entry<String, Integer> e : counter.entrySet()) {
+ sumcounts += (int) e.getValue();
+ }
+
+ return sumcounts;
+ }
+
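+  // Set-style difference: n-grams present in counter1 but absent from counter2, each mapped to 1.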
+ public HashMap<String, Integer> substractHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2) {
+ HashMap<String, Integer> newcounter = new HashMap<String, Integer>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ if (count2 == 0) {
+ newcounter.put(ngram, 1);
+ }
+ }
+
+ return newcounter;
+ }
+
+ // HashMap result = counter1*ratio1 - counter2*ratio2
+ public HashMap<String, Integer> substractHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2, int ratio1, int ratio2) {
+ HashMap<String, Integer> newcounter = new HashMap<String, Integer>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ int newcount = count1 * ratio1 - count2 * ratio2;
+ if (newcount > 0) {
+ newcounter.put(ngram, newcount);
+ }
+ }
+
+ return newcounter;
+ }
+
+ public HashMap<String, Double> divideHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2) {
+ HashMap<String, Double> newcounter = new HashMap<String, Double>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ if (count2 != 0) {
+ newcounter.put(ngram, (double) count1 / (double) count2);
+ }
+ }
+
+ return newcounter;
+ }
+
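+  // Set-style intersection: n-grams present in both counters, each mapped to 1.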
+ public HashMap<String, Integer> intersectHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2) {
+ HashMap<String, Integer> newcounter = new HashMap<String, Integer>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ if (count2 > 0) {
+ newcounter.put(ngram, 1);
+ }
+ }
+
+ return newcounter;
+ }
+
+ // HashMap result = (counter1*ratio1) & (counter2*ratio2)
+ public HashMap<String, Integer> intersectHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2, int ratio1, int ratio2) {
+ HashMap<String, Integer> newcounter = new HashMap<String, Integer>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ int newcount = Math.min(count1 * ratio1, count2 * ratio2);
+ if (newcount > 0) {
+ newcounter.put(ngram, newcount);
+ }
+ }
+
+ return newcounter;
+ }
+
+ protected int wordCount(String cand_str) {
+ if (!cand_str.equals("")) {
+ return cand_str.split("\\s+").length;
+ } else {
+ return 0;
+ }
+ }
+
+ public HashMap<String, Integer>[] getNgramCountsArray(String cand_str) {
+ if (!cand_str.equals("")) {
+ return getNgramCountsArray(cand_str.split("\\s+"));
+ } else {
+ return getNgramCountsArray(new String[0]);
+ }
+ }
+
+ public HashMap<String, Integer>[] getNgramCountsArray(String[] words) {
+ @SuppressWarnings("unchecked")
+ HashMap<String, Integer>[] ngramCountsArray = new HashMap[1 + maxGramLength];
+ ngramCountsArray[0] = null;
+ for (int n = 1; n <= maxGramLength; ++n) {
+ ngramCountsArray[n] = new HashMap<String, Integer>();
+ }
+
+ int len = words.length;
+ String gram;
+ int st = 0;
+
+ for (; st <= len - maxGramLength; ++st) {
+
+ gram = words[st];
+ if (ngramCountsArray[1].containsKey(gram)) {
+ int oldCount = ngramCountsArray[1].get(gram);
+ ngramCountsArray[1].put(gram, oldCount + 1);
+ } else {
+ ngramCountsArray[1].put(gram, 1);
+ }
+
+ for (int n = 2; n <= maxGramLength; ++n) {
+ gram = gram + " " + words[st + n - 1];
+ if (ngramCountsArray[n].containsKey(gram)) {
+ int oldCount = ngramCountsArray[n].get(gram);
+ ngramCountsArray[n].put(gram, oldCount + 1);
+ } else {
+ ngramCountsArray[n].put(gram, 1);
+ }
+ } // for (n)
+
+ } // for (st)
+
+ // now st is either len-maxGramLength+1 or zero (if above loop never entered, which
+ // happens with sentences that have fewer than maxGramLength words)
+
+ for (; st < len; ++st) {
+
+ gram = words[st];
+ if (ngramCountsArray[1].containsKey(gram)) {
+ int oldCount = ngramCountsArray[1].get(gram);
+ ngramCountsArray[1].put(gram, oldCount + 1);
+ } else {
+ ngramCountsArray[1].put(gram, 1);
+ }
+
+ int n = 2;
+ for (int fin = st + 1; fin < len; ++fin) {
+ gram = gram + " " + words[st + n - 1];
+
+ if (ngramCountsArray[n].containsKey(gram)) {
+ int oldCount = ngramCountsArray[n].get(gram);
+ ngramCountsArray[n].put(gram, oldCount + 1);
+ } else {
+ ngramCountsArray[n].put(gram, 1);
+ }
+ ++n;
+ } // for (fin)
+
+ } // for (st)
+
+ return ngramCountsArray;
+
+ }
+
+ public HashMap<String, Integer> getNgramCountsAll(String cand_str) {
+ if (!cand_str.equals("")) {
+ return getNgramCountsAll(cand_str.split("\\s+"));
+ } else {
+ return getNgramCountsAll(new String[0]);
+ }
+ }
+
+ public HashMap<String, Integer> getNgramCountsAll(String[] words) {
+ HashMap<String, Integer> ngramCountsAll = new HashMap<String, Integer>();
+
+ int len = words.length;
+ String gram;
+ int st = 0;
+
+ for (; st <= len - maxGramLength; ++st) {
+
+ gram = words[st];
+ if (ngramCountsAll.containsKey(gram)) {
+ int oldCount = ngramCountsAll.get(gram);
+ ngramCountsAll.put(gram, oldCount + 1);
+ } else {
+ ngramCountsAll.put(gram, 1);
+ }
+
+ for (int n = 2; n <= maxGramLength; ++n) {
+ gram = gram + " " + words[st + n - 1];
+ if (ngramCountsAll.containsKey(gram)) {
+ int oldCount = ngramCountsAll.get(gram);
+ ngramCountsAll.put(gram, oldCount + 1);
+ } else {
+ ngramCountsAll.put(gram, 1);
+ }
+ } // for (n)
+
+ } // for (st)
+
+ // now st is either len-maxGramLength+1 or zero (if above loop never entered, which
+ // happens with sentences that have fewer than maxGramLength words)
+
+ for (; st < len; ++st) {
+
+ gram = words[st];
+ if (ngramCountsAll.containsKey(gram)) {
+ int oldCount = ngramCountsAll.get(gram);
+ ngramCountsAll.put(gram, oldCount + 1);
+ } else {
+ ngramCountsAll.put(gram, 1);
+ }
+
+ int n = 2;
+ for (int fin = st + 1; fin < len; ++fin) {
+ gram = gram + " " + words[st + n - 1];
+
+ if (ngramCountsAll.containsKey(gram)) {
+ int oldCount = ngramCountsAll.get(gram);
+ ngramCountsAll.put(gram, oldCount + 1);
+ } else {
+ ngramCountsAll.put(gram, 1);
+ }
+ ++n;
+ } // for (fin)
+
+ } // for (st)
+
+ return ngramCountsAll;
+
+ }
+
+ public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
+ System.out.println(metricName + " = " + score(stats));
+
+ }
+
+ private enum StatIndex {
+ KEEPBOTH, KEEPCAND, KEEPREF, DELBOTH, DELCAND, DELREF, ADDBOTH, ADDCAND, ADDREF, KEEPBOTH2
+ };
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/metrics/SourceBLEU.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/metrics/SourceBLEU.java b/joshua-core/src/main/java/org/apache/joshua/metrics/SourceBLEU.java
new file mode 100644
index 0000000..f594954
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/metrics/SourceBLEU.java
@@ -0,0 +1,107 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.metrics;
+
+import java.util.HashMap;
+
+public class SourceBLEU extends BLEU {
+ // We assume that the source for the paraphrasing run is
+ // part of the set of references
+ private int sourceReferenceIndex;
+
+ private int[] sourceWordCount;
+ private boolean useBrevityPenalty;
+
+ public SourceBLEU() {
+ super();
+ this.sourceReferenceIndex = 0;
+ this.useBrevityPenalty = true;
+ initialize();
+ }
+
+ public SourceBLEU(String[] options) {
+ super(options);
+ this.sourceReferenceIndex = Integer.parseInt(options[2]);
+ this.useBrevityPenalty = Boolean.parseBoolean(options[3]);
+ initialize();
+ }
+
+ public SourceBLEU(int num_references, String method, int source_index, boolean use_brevity_penalty) {
+ super(num_references, method);
+ this.sourceReferenceIndex = source_index;
+ this.useBrevityPenalty = use_brevity_penalty;
+ initialize();
+ }
+
+ protected void initialize() {
+ metricName = "SRC_BLEU";
+ toBeMinimized = true;
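+    // Two counters per n-gram order (matched and total n-grams), plus the candidate length
+    // and the length used for the brevity penalty.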
+ suffStatsCount = 2 * getMaxGramLength() + 2;
+
+ set_weightsArray();
+ set_maxNgramCounts();
+ }
+
+ public double bestPossibleScore() {
+ return 0.0;
+ }
+
+ public double worstPossibleScore() {
+ return 1.0;
+ }
+
+ protected void set_maxNgramCounts() {
+ @SuppressWarnings("unchecked")
+ HashMap<String, Integer>[] temp_HMA = new HashMap[numSentences];
+ maxNgramCounts = temp_HMA;
+ sourceWordCount = new int[numSentences];
+
+ for (int i = 0; i < numSentences; ++i) {
+ sourceWordCount[i] = wordCount(refSentences[i][sourceReferenceIndex]);
+ maxNgramCounts[i] = getNgramCountsAll(refSentences[i][sourceReferenceIndex]);
+ }
+ }
+
+ public int[] suffStats(String cand_str, int i) {
+ int[] stats = new int[suffStatsCount];
+
+ String[] candidate_words;
+ if (!cand_str.equals(""))
+ candidate_words = cand_str.split("\\s+");
+ else
+ candidate_words = new String[0];
+
+ set_prec_suffStats(stats, candidate_words, i);
+ if (this.useBrevityPenalty)
+ stats[suffStatsCount - 1] = effLength(candidate_words.length, i);
+ else
+ stats[suffStatsCount - 1] = candidate_words.length;
+ stats[suffStatsCount - 2] = candidate_words.length;
+
+ return stats;
+ }
+
+ public int effLength(int candLength, int i) {
+ return sourceWordCount[i];
+ }
+
+ public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
+ System.out.println(String.format("SRC_BLEU = %.4f", score(stats)));
+ }
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/metrics/TER.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/metrics/TER.java b/joshua-core/src/main/java/org/apache/joshua/metrics/TER.java
new file mode 100644
index 0000000..0dcf9d9
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/metrics/TER.java
@@ -0,0 +1,460 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.metrics;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Executors;
+import java.util.concurrent.Semaphore;
+
+import org.apache.joshua.util.StreamGobbler;
+
+public class TER extends EvaluationMetric {
+ private boolean caseSensitive;
+ private boolean withPunctuation;
+ private int beamWidth;
+ private int maxShiftDist;
+ private String tercomJarFileName;
+ private int numScoringThreads;
+
+ public TER(String[] Metric_options) {
+ // M_o[0]: case sensitivity, case/nocase
+ // M_o[1]: with-punctuation, punc/nopunc
+ // M_o[2]: beam width, positive integer
+ // M_o[3]: maximum shift distance, positive integer
+ // M_o[4]: filename of tercom jar file
+ // M_o[5]: number of threads to use for TER scoring (= number of tercom processes launched)
+
+ // for 0-3, default values in tercom-0.7.25 are: nocase, punc, 20, 50
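+    // Example (hypothetical values): {"nocase", "punc", "20", "50", "/path/to/tercom-0.7.25.jar", "4"}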
+
+ if (Metric_options[0].equals("case")) {
+ caseSensitive = true;
+ } else if (Metric_options[0].equals("nocase")) {
+ caseSensitive = false;
+ } else {
+ String msg = "Unknown case sensitivity string " + Metric_options[0]
+ + ". Should be one of case or nocase.";
+ throw new RuntimeException(msg);
+ }
+
+ if (Metric_options[1].equals("punc")) {
+ withPunctuation = true;
+ } else if (Metric_options[1].equals("nopunc")) {
+ withPunctuation = false;
+ } else {
+ String msg = "Unknown with-punctuation string " + Metric_options[1]
+ + ". Should be one of punc or nopunc.";
+ throw new RuntimeException(msg);
+ }
+
+ beamWidth = Integer.parseInt(Metric_options[2]);
+ if (beamWidth < 1) {
+ throw new RuntimeException("Beam width must be positive");
+ }
+
+ maxShiftDist = Integer.parseInt(Metric_options[3]);
+ if (maxShiftDist < 1) {
+ throw new RuntimeException("Maximum shift distance must be positive");
+ }
+
+ tercomJarFileName = Metric_options[4];
+
+ if (tercomJarFileName == null || tercomJarFileName.equals("")) {
+ throw new RuntimeException("Problem processing tercom's jar filename");
+ } else {
+ File checker = new File(tercomJarFileName);
+ if (!checker.exists()) {
+ String msg = "Could not find tercom jar file " + tercomJarFileName
+            + " (please make sure you use the full path in the filename)";
+ throw new RuntimeException(msg);
+ }
+ }
+
+ numScoringThreads = Integer.parseInt(Metric_options[5]);
+ if (numScoringThreads < 1) {
+ throw new RuntimeException("Number of TER scoring threads must be positive");
+ }
+
+
+ TercomRunner.set_TercomParams(caseSensitive, withPunctuation, beamWidth, maxShiftDist,
+ tercomJarFileName);
+
+
+ initialize(); // set the data members of the metric
+ }
+
+ protected void initialize() {
+ metricName = "TER";
+ toBeMinimized = true;
+ suffStatsCount = 2;
+ }
+
+ public double bestPossibleScore() {
+ return 0.0;
+ }
+
+ public double worstPossibleScore() {
+    return Double.POSITIVE_INFINITY;
+ }
+
+ public int[] suffStats(String cand_str, int i) {
+ // this method should never be used when the metric is TER,
+ // because TER.java overrides createSuffStatsFile below,
+ // which is the only method that calls suffStats(String,int).
+ return null;
+ }
+
+ public int[][] suffStats(String[] cand_strings, int[] cand_indices) {
+ // calculate sufficient statistics for each sentence in an arbitrary set of candidates
+
+ int candCount = cand_strings.length;
+ if (cand_indices.length != candCount) {
+ System.out.println("Array lengths mismatch in suffStats(String[],int[]); returning null.");
+ return null;
+ }
+
+ int[][] stats = new int[candCount][suffStatsCount];
+
+ try {
+
+ // 1) Create input files for tercom
+
+ // 1a) Create hypothesis file
+ FileOutputStream outStream = new FileOutputStream("hyp.txt.TER", false); // false: don't
+ // append
+ OutputStreamWriter outStreamWriter = new OutputStreamWriter(outStream, "utf8");
+ BufferedWriter outFile = new BufferedWriter(outStreamWriter);
+
+ for (int d = 0; d < candCount; ++d) {
+ writeLine(cand_strings[d] + " (ID" + d + ")", outFile);
+ }
+
+ outFile.close();
+
+ // 1b) Create reference file
+ outStream = new FileOutputStream("ref.txt.TER", false); // false: don't append
+ outStreamWriter = new OutputStreamWriter(outStream, "utf8");
+ outFile = new BufferedWriter(outStreamWriter);
+
+ for (int d = 0; d < candCount; ++d) {
+ for (int r = 0; r < refsPerSen; ++r) {
+ writeLine(refSentences[cand_indices[d]][r] + " (ID" + d + ")", outFile);
+ }
+ }
+
+ outFile.close();
+
+ // 2) Launch tercom as an external process
+
+ runTercom("ref.txt.TER", "hyp.txt.TER", "TER_out", 500);
+
+ // 3) Read SS from output file produced by tercom.7.25.jar
+
+ BufferedReader inFile = new BufferedReader(new FileReader("TER_out.ter"));
+ String line = "";
+
+ line = inFile.readLine(); // skip hyp line
+ line = inFile.readLine(); // skip ref line
+
+ for (int d = 0; d < candCount; ++d) {
+ line = inFile.readLine(); // read info
+ String[] strA = line.split("\\s+");
+
+ stats[d][0] = (int) Double.parseDouble(strA[1]);
+ stats[d][1] = (int) Double.parseDouble(strA[2]);
+ }
+
+ inFile.close();
+
+ // 4) Delete TER files
+
+ File fd;
+ fd = new File("hyp.txt.TER");
+ if (fd.exists()) fd.delete();
+ fd = new File("ref.txt.TER");
+ if (fd.exists()) fd.delete();
+ fd = new File("TER_out.ter");
+ if (fd.exists()) fd.delete();
+
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ return stats;
+ }
+
+ public void createSuffStatsFile(String cand_strings_fileName, String cand_indices_fileName,
+ String outputFileName, int maxBatchSize) {
+
+ try {
+ int batchCount = 0;
+
+ FileInputStream inStream_cands = new FileInputStream(cand_strings_fileName);
+ BufferedReader inFile_cands =
+ new BufferedReader(new InputStreamReader(inStream_cands, "utf8"));
+
+ FileInputStream inStream_indices = new FileInputStream(cand_indices_fileName);
+ BufferedReader inFile_indices =
+ new BufferedReader(new InputStreamReader(inStream_indices, "utf8"));
+
+ while (true) {
+ ++batchCount;
+ int readCount =
+ createTercomHypFile(inFile_cands, tmpDirPrefix + "hyp.txt.TER.batch" + batchCount,
+ 10000);
+ createTercomRefFile(inFile_indices, tmpDirPrefix + "ref.txt.TER.batch" + batchCount, 10000);
+
+ if (readCount == 0) {
+ --batchCount;
+ break;
+ } else if (readCount < 10000) {
+ break;
+ }
+ }
+
+ // score the batchCount batches of candidates, in parallel, across numThreads threads
+ ExecutorService pool = Executors.newFixedThreadPool(numScoringThreads);
+ Semaphore blocker = new Semaphore(0);
+
+ for (int b = 1; b <= batchCount; ++b) {
+ pool.execute(new TercomRunner(blocker, tmpDirPrefix + "ref.txt.TER.batch" + b, tmpDirPrefix
+ + "hyp.txt.TER.batch" + b, tmpDirPrefix + "TER_out.batch" + b, 500));
+ // Each thread scores the candidates, creating a tercom output file,
+ // and then deletes the .hyp. and .ref. files, which are not needed
+ // for other batches.
+ }
+
+ pool.shutdown();
+
+ try {
+ blocker.acquire(batchCount);
+ } catch (java.lang.InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+
+ PrintWriter outFile = new PrintWriter(outputFileName);
+ for (int b = 1; b <= batchCount; ++b) {
+ copySS(tmpDirPrefix + "TER_out.batch" + b + ".ter", outFile);
+ File fd;
+ fd = new File(tmpDirPrefix + "TER_out.batch" + b + ".ter");
+ if (fd.exists()) fd.delete();
+ // .hyp. and .ref. already deleted by individual threads
+ }
+ outFile.close();
+
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ }
+
+ public int createTercomHypFile(BufferedReader inFile_cands, String hypFileName, int numCands) {
+ // returns # lines read
+
+ int readCount = 0;
+
+ try {
+ FileOutputStream outStream = new FileOutputStream(hypFileName, false); // false: don't append
+ OutputStreamWriter outStreamWriter = new OutputStreamWriter(outStream, "utf8");
+ BufferedWriter outFile = new BufferedWriter(outStreamWriter);
+
+ String line_cand = "";
+
+ if (numCands > 0) {
+ for (int d = 0; d < numCands; ++d) {
+ line_cand = inFile_cands.readLine();
+ if (line_cand != null) {
+ ++readCount;
+ writeLine(line_cand + " (ID" + d + ")", outFile);
+ } else {
+ break;
+ }
+ }
+ } else {
+ line_cand = inFile_cands.readLine();
+ int d = -1;
+ while (line_cand != null) {
+ ++readCount;
+ ++d;
+ writeLine(line_cand + " (ID" + d + ")", outFile);
+ line_cand = inFile_cands.readLine();
+ }
+ }
+
+ outFile.close();
+
+ } catch (IOException e) {
+ throw new RuntimeException("IOException in TER.createTercomHypFile(...): " + e.getMessage(), e);
+ }
+
+ return readCount;
+
+ }
+
+ public int createTercomRefFile(BufferedReader inFile_indices, String refFileName, int numIndices) {
+ // returns # lines read
+
+ int readCount = 0;
+
+ try {
+ FileOutputStream outStream = new FileOutputStream(refFileName, false); // false: don't append
+ OutputStreamWriter outStreamWriter = new OutputStreamWriter(outStream, "utf8");
+ BufferedWriter outFile = new BufferedWriter(outStreamWriter);
+
+ String line_index = "";
+
+ if (numIndices > 0) {
+ for (int d = 0; d < numIndices; ++d) {
+ line_index = inFile_indices.readLine();
+ if (line_index != null) {
+ ++readCount;
+ int index = Integer.parseInt(line_index);
+ for (int r = 0; r < refsPerSen; ++r) {
+ writeLine(refSentences[index][r] + " (ID" + d + ")", outFile);
+ }
+ } else {
+ break;
+ }
+ }
+ } else {
+ line_index = inFile_indices.readLine();
+ int d = -1;
+ while (line_index != null) {
+ ++readCount;
+ ++d;
+ int index = Integer.parseInt(line_index);
+ for (int r = 0; r < refsPerSen; ++r) {
+ writeLine(refSentences[index][r] + " (ID" + d + ")", outFile);
+ }
+ line_index = inFile_indices.readLine();
+ }
+ }
+
+ outFile.close();
+
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ return readCount;
+
+ }
+
+ public int runTercom(String refFileName, String hypFileName, String outFileNamePrefix, int memSize) {
+ int exitValue = -1;
+
+ try {
+
+ String cmd_str =
+ "java -Xmx" + memSize + "m -Dfile.encoding=utf8 -jar " + tercomJarFileName + " -r "
+ + refFileName + " -h " + hypFileName + " -o ter -n " + outFileNamePrefix;
+ cmd_str += " -b " + beamWidth;
+ cmd_str += " -d " + maxShiftDist;
+ if (caseSensitive) {
+ cmd_str += " -s";
+ }
+ if (!withPunctuation) {
+ cmd_str += " -P";
+ }
+ /*
+ * From tercom's README: -s case sensitivity, optional, default is insensitive -P no
+ * punctuations, default is with punctuations.
+ */
+
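+      // tercom writes its per-segment results to <outFileNamePrefix>.ter; callers (e.g. suffStats)
+      // parse that file afterwards.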
+ Runtime rt = Runtime.getRuntime();
+ Process p = rt.exec(cmd_str);
+
+ StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream(), 0);
+ StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), 0);
+
+ errorGobbler.start();
+ outputGobbler.start();
+
+ exitValue = p.waitFor();
+
+ } catch (IOException | InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+
+ return exitValue;
+
+ }
+
+ public void copySS(String inputFileName, PrintWriter outFile) {
+ try {
+ BufferedReader inFile = new BufferedReader(new FileReader(inputFileName));
+ String line = "";
+
+ line = inFile.readLine(); // skip hyp line
+ line = inFile.readLine(); // skip ref line
+
+ line = inFile.readLine(); // read info for first line
+
+ while (line != null) {
+ String[] strA = line.split("\\s+");
+ outFile
+ .println((int) Double.parseDouble(strA[1]) + " " + (int) Double.parseDouble(strA[2]));
+ line = inFile.readLine(); // read info for next line
+ }
+
+ inFile.close();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ public double score(int[] stats) {
+ if (stats.length != suffStatsCount) {
+ throw new RuntimeException("Mismatch between stats.length and suffStatsCount (" + stats.length
+ + " vs. " + suffStatsCount + ") in TER.score(int[])");
+ }
+
+ double sc = 0.0;
+
+ sc = stats[0] / (double) stats[1];
+
+ return sc;
+ }
+
+ public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
+ if (oneLiner) {
+ System.out.println("TER = " + stats[0] + " / " + stats[1] + " = " + f4.format(score(stats)));
+ } else {
+ System.out.println("# edits = " + stats[0]);
+ System.out.println("Reference length = " + stats[1]);
+ System.out.println("TER = " + stats[0] + " / " + stats[1] + " = " + f4.format(score(stats)));
+ }
+ }
+
+ private void writeLine(String line, BufferedWriter writer) throws IOException {
+ writer.write(line, 0, line.length());
+ writer.newLine();
+ writer.flush();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/metrics/TERMinusBLEU.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/metrics/TERMinusBLEU.java b/joshua-core/src/main/java/org/apache/joshua/metrics/TERMinusBLEU.java
new file mode 100644
index 0000000..bd40140
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/metrics/TERMinusBLEU.java
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.metrics;
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.PrintWriter;
+
+public class TERMinusBLEU extends EvaluationMetric {
+ // individual components
+ private TER myTER;
+ private BLEU myBLEU;
+ private int suffStatsCount_TER;
+ private int suffStatsCount_BLEU;
+
+ public TERMinusBLEU(String[] Metric_options) {
+ // M_o[0]: case sensitivity, case/nocase
+ // M_o[1]: with-punctuation, punc/nopunc
+ // M_o[2]: beam width, positive integer
+ // M_o[3]: maximum shift distance, positive integer
+ // M_o[4]: filename of tercom jar file
+ // M_o[5]: number of threads to use for TER scoring (= number of tercom processes launched)
+ // M_o[6]: maximum gram length, positive integer
+ // M_o[7]: effective length calculation method, closest/shortest/average
+
+ // for 0-3, default values in tercom-0.7.25 are: nocase, punc, 20, 50
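+    // Example (hypothetical values): {"nocase", "punc", "20", "50", "/path/to/tercom-0.7.25.jar", "4", "4", "closest"}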
+
+ myTER = new TER(Metric_options);
+ myBLEU = new BLEU(Integer.parseInt(Metric_options[6]), Metric_options[7]);
+
+ initialize(); // set the data members of the metric
+ }
+
+ protected void initialize() {
+ metricName = "TER-BLEU";
+ toBeMinimized = true;
+ suffStatsCount_TER = myTER.get_suffStatsCount();
+ suffStatsCount_BLEU = myBLEU.get_suffStatsCount();
+ suffStatsCount = suffStatsCount_TER + suffStatsCount_BLEU;
+ }
+
+ public double bestPossibleScore() {
+ return -1.0;
+ }
+
+ public double worstPossibleScore() {
+    return Double.POSITIVE_INFINITY;
+ }
+
+ public int[] suffStats(String cand_str, int i) {
+ // this method should never be used when the metric is TER-BLEU,
+ // because TERMinusBLEU.java overrides suffStats(String[],int[]) below,
+    // which is the only method that calls suffStats(String,int).
+ return null;
+ }
+
+ public int[][] suffStats(String[] cand_strings, int[] cand_indices) {
+ // calculate sufficient statistics for each sentence in an arbitrary set of candidates
+
+ int candCount = cand_strings.length;
+ if (cand_indices.length != candCount) {
+ System.out.println("Array lengths mismatch in suffStats(String[],int[]); returning null.");
+ return null;
+ }
+
+ int[][] stats = new int[candCount][suffStatsCount];
+ // size candCount x suffStatsCount
+ // = candCount x (suffStatsCount_TER + suffStatsCount_BLEU)
+
+ int[][] stats_TER = myTER.suffStats(cand_strings, cand_indices);
+ // size candCount x suffStatsCount_TER
+ int[][] stats_BLEU = myBLEU.suffStats(cand_strings, cand_indices);
+ // size candCount x suffStatsCount_BLEU
+
+ for (int d = 0; d < candCount; ++d) {
+ int s = 0;
+ for (int s_T = 0; s_T < suffStatsCount_TER; ++s_T) {
+ stats[d][s] = stats_TER[d][s_T];
+ ++s;
+ }
+
+ for (int s_B = 0; s_B < suffStatsCount_BLEU; ++s_B) {
+ stats[d][s] = stats_BLEU[d][s_B];
+ ++s;
+ }
+ }
+
+ return stats;
+
+ }
+
+ public void createSuffStatsFile(String cand_strings_fileName, String cand_indices_fileName,
+ String outputFileName, int maxBatchSize) {
+ try {
+ myTER.createSuffStatsFile(cand_strings_fileName, cand_indices_fileName, outputFileName
+ + ".TER", maxBatchSize);
+ myBLEU.createSuffStatsFile(cand_strings_fileName, cand_indices_fileName, outputFileName
+ + ".BLEU", maxBatchSize);
+
+ PrintWriter outFile = new PrintWriter(outputFileName);
+
+ FileInputStream inStream_TER = new FileInputStream(outputFileName + ".TER");
+ BufferedReader inFile_TER = new BufferedReader(new InputStreamReader(inStream_TER, "utf8"));
+
+ FileInputStream inStream_BLEU = new FileInputStream(outputFileName + ".BLEU");
+ BufferedReader inFile_BLEU = new BufferedReader(new InputStreamReader(inStream_BLEU, "utf8"));
+
+ String line_TER = inFile_TER.readLine();
+ String line_BLEU = inFile_BLEU.readLine();
+
+ // combine the two files into one
+ while (line_TER != null) {
+ outFile.println(line_TER + " " + line_BLEU);
+ line_TER = inFile_TER.readLine();
+ line_BLEU = inFile_BLEU.readLine();
+ }
+
+ inFile_TER.close();
+ inFile_BLEU.close();
+ outFile.close();
+
+ File fd;
+ fd = new File(outputFileName + ".TER");
+ if (fd.exists()) fd.delete();
+ fd = new File(outputFileName + ".BLEU");
+ if (fd.exists()) fd.delete();
+ } catch (IOException e) {
+      throw new RuntimeException("IOException in TERMinusBLEU.createSuffStatsFile(...): " + e.getMessage(), e);
+ }
+ }
+
+ public double score(int[] stats) {
+ if (stats.length != suffStatsCount) {
+ throw new RuntimeException("Mismatch between stats.length and suffStatsCount (" + stats.length
+ + " vs. " + suffStatsCount + ") in TERMinusBLEU.score(int[])");
+ }
+
+ double sc = 0.0;
+
+ int[] stats_TER = new int[suffStatsCount_TER];
+ int[] stats_BLEU = new int[suffStatsCount_BLEU];
+ for (int s = 0; s < suffStatsCount_TER; ++s) {
+ stats_TER[s] = stats[s];
+ }
+ for (int s = 0; s < suffStatsCount_BLEU; ++s) {
+ stats_BLEU[s] = stats[s + suffStatsCount_TER];
+ }
+
+ double sc_T = myTER.score(stats_TER);
+ double sc_B = myBLEU.score(stats_BLEU);
+
+ sc = sc_T - sc_B;
+
+ return sc;
+ }
+
+ public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
+ int[] stats_TER = new int[suffStatsCount_TER];
+ int[] stats_BLEU = new int[suffStatsCount_BLEU];
+ for (int s = 0; s < suffStatsCount_TER; ++s) {
+ stats_TER[s] = stats[s];
+ }
+ for (int s = 0; s < suffStatsCount_BLEU; ++s) {
+ stats_BLEU[s] = stats[s + suffStatsCount_TER];
+ }
+
+ System.out.println("---TER---");
+ myTER.printDetailedScore_fromStats(stats_TER, oneLiner);
+ System.out.println("---BLEU---");
+ myBLEU.printDetailedScore_fromStats(stats_BLEU, oneLiner);
+ System.out.println("---------");
+ System.out.println(" => " + metricName + " = " + f4.format(score(stats)));
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/metrics/TercomRunner.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/metrics/TercomRunner.java b/joshua-core/src/main/java/org/apache/joshua/metrics/TercomRunner.java
new file mode 100644
index 0000000..d7eeae5
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/metrics/TercomRunner.java
@@ -0,0 +1,115 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.metrics;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.concurrent.Semaphore;
+
+import org.apache.joshua.util.StreamGobbler;
+
+
+public class TercomRunner implements Runnable {
+ /* non-static data members */
+ private Semaphore blocker;
+
+ private String refFileName;
+ private String hypFileName;
+ private String outFileNamePrefix;
+ private int memSize;
+
+ /* static data members */
+ private static boolean caseSensitive;
+ private static boolean withPunctuation;
+ private static int beamWidth;
+ private static int maxShiftDist;
+ private static String tercomJarFileName;
+
+ public static void set_TercomParams(boolean in_caseSensitive, boolean in_withPunctuation,
+ int in_beamWidth, int in_maxShiftDist, String in_tercomJarFileName) {
+ caseSensitive = in_caseSensitive;
+ withPunctuation = in_withPunctuation;
+ beamWidth = in_beamWidth;
+ maxShiftDist = in_maxShiftDist;
+ tercomJarFileName = in_tercomJarFileName;
+ }
+
+ public TercomRunner(Semaphore in_blocker, String in_refFileName, String in_hypFileName,
+ String in_outFileNamePrefix, int in_memSize) {
+ blocker = in_blocker;
+ refFileName = in_refFileName;
+ hypFileName = in_hypFileName;
+ outFileNamePrefix = in_outFileNamePrefix;
+ memSize = in_memSize;
+ }
+
+ private void real_run() {
+
+ try {
+
+ String cmd_str =
+ "java -Xmx" + memSize + "m -Dfile.encoding=utf8 -jar " + tercomJarFileName + " -r "
+ + refFileName + " -h " + hypFileName + " -o ter -n " + outFileNamePrefix;
+ cmd_str += " -b " + beamWidth;
+ cmd_str += " -d " + maxShiftDist;
+ if (caseSensitive) {
+ cmd_str += " -s";
+ }
+ if (!withPunctuation) {
+ cmd_str += " -P";
+ }
+ /*
+ * From tercom's README: -s case sensitivity, optional, default is insensitive -P no
+ * punctuation, default is with punctuation.
+ */
+
+ Runtime rt = Runtime.getRuntime();
+ Process p = rt.exec(cmd_str);
+
+ StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream(), 0);
+ StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), 0);
+
+ errorGobbler.start();
+ outputGobbler.start();
+
+ p.waitFor();
+
+ File fd;
+ fd = new File(hypFileName);
+ if (fd.exists()) fd.delete();
+ fd = new File(refFileName);
+ if (fd.exists()) fd.delete();
+
+ } catch (IOException | InterruptedException e) {
+ throw new RuntimeException(e);
+ }
+
+ blocker.release();
+
+ }
+
+ public void run() {
+ try {
+ real_run();
+ } catch (Exception e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+}
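For orientation, TercomRunner is driven by first setting the shared tercom options once via set_TercomParams and then handing each reference/hypothesis file pair to its own thread, waiting on the Semaphore until every external tercom process has finished. The following is a minimal sketch of that flow, not part of the commit: the jar path, file names, and run count are made-up placeholders.

    import java.util.concurrent.Semaphore;

    import org.apache.joshua.metrics.TercomRunner;

    public class TercomRunnerSketch {
      public static void main(String[] args) throws InterruptedException {
        // Shared tercom options: case-insensitive, with punctuation, beam width 20,
        // max shift distance 50, and a placeholder path to the tercom jar.
        TercomRunner.set_TercomParams(false, true, 20, 50, "/path/to/tercom.jar");

        int runs = 2;
        Semaphore blocker = new Semaphore(0);

        // One runner (and one external tercom process) per hypothesis/reference pair.
        for (int b = 0; b < runs; ++b) {
          new Thread(new TercomRunner(blocker,
              "ref." + b + ".txt", "hyp." + b + ".txt", "tercom.out." + b, 1024)).start();
        }

        // Each runner releases one permit when its tercom run completes.
        blocker.acquire(runs);
      }
    }

Note that real_run() deletes the hypothesis and reference files once tercom has finished, so a caller should treat them as temporary copies.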
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/metrics/ZeroOneLoss.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/metrics/ZeroOneLoss.java b/joshua-core/src/main/java/org/apache/joshua/metrics/ZeroOneLoss.java
new file mode 100644
index 0000000..aee6bcc
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/metrics/ZeroOneLoss.java
@@ -0,0 +1,88 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.metrics;
+
+public class ZeroOneLoss extends EvaluationMetric {
+ public ZeroOneLoss() {
+ initialize();
+ }
+
+ public ZeroOneLoss(String[] ZOL_options) {
+ this();
+ }
+
+ protected void initialize() {
+ metricName = "01LOSS";
+ toBeMinimized = true;
+ suffStatsCount = 2;
+ }
+
+ public double bestPossibleScore() {
+ return 0.0;
+ }
+
+ public double worstPossibleScore() {
+ return 1.0;
+ }
+
+ public int[] suffStats(String cand_str, int i) {
+ int[] stats = new int[suffStatsCount];
+
+ boolean matchFound = false;
+
+ for (int r = 0; r < refsPerSen; ++r) {
+ if (cand_str.equals(refSentences[i][r])) {
+ matchFound = true;
+ break;
+ }
+ }
+
+ if (matchFound) {
+ stats[0] = 1;
+ } else {
+ stats[0] = 0;
+ }
+
+ stats[1] = 1;
+
+ return stats;
+ }
+
+ public double score(int[] stats) {
+ if (stats.length != suffStatsCount) {
+ throw new RuntimeException("Mismatch between stats.length and suffStatsCount (" + stats.length
+ + " vs. " + suffStatsCount + ") in ZeroOneLoss.score(int[])");
+ }
+
+ return 1.0 - (stats[0] / (double) stats[1]);
+ }
+
+ public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
+ if (oneLiner) {
+ System.out.println("01LOSS = 1.0 - " + stats[0] + "/" + stats[1] + " = "
+ + f4.format(1.0 - (stats[0] / (double) stats[1])));
+ } else {
+ System.out.println("# correct = " + stats[0]);
+ System.out.println("# sentences = " + stats[1]);
+ System.out.println("01LOSS = 1.0 - " + stats[0] + "/" + stats[1] + " = "
+ + f4.format(1.0 - (stats[0] / (double) stats[1])));
+ }
+ }
+
+}
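Since ZeroOneLoss keeps only two sufficient statistics per sentence (stats[0] = 1 if the candidate exactly matches some reference, stats[1] = 1 always), corpus-level scoring is just the summed counters plugged into 1.0 - correct/total. A tiny worked example with hypothetical counts:

    public class ZeroOneLossExample {
      public static void main(String[] args) {
        // Hypothetical corpus of 3 sentences, 2 of whose candidates matched a reference exactly.
        int correct = 2;    // sum of stats[0] over all sentences
        int sentences = 3;  // sum of stats[1] over all sentences

        // Same formula as ZeroOneLoss.score(int[]): lower is better, 0.0 is perfect.
        double loss = 1.0 - (correct / (double) sentences);
        System.out.println("01LOSS = " + loss);  // prints 0.3333...
      }
    }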
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/mira/MIRA.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/mira/MIRA.java b/joshua-core/src/main/java/org/apache/joshua/mira/MIRA.java
new file mode 100755
index 0000000..fb1f5e2
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/mira/MIRA.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.mira;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.util.FileUtility;
+import org.apache.joshua.util.StreamGobbler;
+
+public class MIRA {
+ public static void main(String[] args) throws Exception {
+ JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration();
+ boolean external = false; // should each MIRA iteration be launched externally?
+
+ if (args.length == 1) {
+ if (args[0].equals("-h")) {
+ printMIRAUsage(args.length, true);
+ System.exit(2);
+ } else {
+ external = false;
+ }
+ } else if (args.length == 3) {
+ external = true;
+ } else {
+ printMIRAUsage(args.length, false);
+ System.exit(1);
+ }
+
+ if (!external) {
+ MIRACore myMIRA = new MIRACore(args[0], joshuaConfiguration);
+ myMIRA.run_MIRA(); // optimize lambda[]
+ myMIRA.finish();
+ } else {
+
+ int maxMem = Integer.parseInt(args[1]);
+ String configFileName = args[2];
+ String stateFileName = FileUtility.dirname(configFileName) + "/MIRA.temp.state";
+ String cp = System.getProperty("java.class.path");
+ boolean done = false;
+ int iteration = 0;
+
+ while (!done) {
+ ++iteration;
+ Runtime rt = Runtime.getRuntime();
+ Process p =
+ rt.exec("java -Xmx" + maxMem + "m -cp " + cp + " org.apache.joshua.mira.MIRACore " + configFileName
+ + " " + stateFileName + " " + iteration);
+ /*
+ * BufferedReader br_i = new BufferedReader(new InputStreamReader(p.getInputStream()));
+ * BufferedReader br_e = new BufferedReader(new InputStreamReader(p.getErrorStream()));
+ * String dummy_line = null; while ((dummy_line = br_i.readLine()) != null) {
+ * System.out.println(dummy_line); } while ((dummy_line = br_e.readLine()) != null) {
+ * System.out.println(dummy_line); }
+ */
+ StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream(), 1);
+ StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), 1);
+
+ errorGobbler.start();
+ outputGobbler.start();
+
+ int status = p.waitFor();
+
+ if (status == 90) {
+ done = true;
+ } else if (status == 91) {
+ done = false;
+ } else {
+ System.out.println("MIRA exiting prematurely (MIRACore returned " + status + ")...");
+ break;
+ }
+ }
+ }
+
+ System.exit(0);
+
+ } // main(String[] args)
+
+ public static void printMIRAUsage(int argsLen, boolean detailed) {
+ if (!detailed) {
+ println("Oops, you provided " + argsLen + " args!");
+ println("");
+ println("Usage:");
+ println(" MIRA -maxMem maxMemoryInMB MIRA_configFile");
+ println("");
+ println("Where -maxMem specifies the maximum amount of memory (in MB) MIRA is");
+ println("allowed to use when performing its calculations (no memroy is needed while");
+ println("the decoder is running),");
+ println("and the config file contains any subset of MIRA's 20-some parameters,");
+ println("one per line. Run MIRA -h for more details on those parameters.");
+ } else {
+ println("Usage:");
+ println(" MIRA -maxMem maxMemoryInMB MIRA_configFile");
+ println("");
+ println("Where -maxMem specifies the maximum amount of memory (in MB) MIRA is");
+ println("allowed to use when performing its calculations (no memroy is needed while");
+ println("the decoder is running),");
+ println("and the config file contains any subset of MIRA's 20-some parameters,");
+ println("one per line. Those parameters, and their default values, are:");
+ println("");
+ println("Relevant files:");
+ println(" -dir dirPrefix: working directory\n [[default: null string (i.e. they are in the current directory)]]");
+ println(" -s sourceFile: source sentences (foreign sentences) of the MIRA dataset\n [[default: null string (i.e. file name is not needed by MIRA)]]");
+ println(" -r refFile: target sentences (reference translations) of the MIRA dataset\n [[default: reference.txt]]");
+ println(" -rps refsPerSen: number of reference translations per sentence\n [[default: 1]]");
+ //println(" -txtNrm textNormMethod: how should text be normalized?\n (0) don't normalize text,\n or (1) \"NIST-style\", and also rejoin 're, *'s, n't, etc,\n or (2) apply 1 and also rejoin dashes between letters,\n or (3) apply 1 and also drop non-ASCII characters,\n or (4) apply 1+2+3\n [[default: 1]]");
+ println(" -p paramsFile: file containing parameter names, initial values, and ranges\n [[default: params.txt]]");
+ //println(" -docInfo documentInfoFile: file informing MIRA which document each\n sentence belongs to\n [[default: null string (i.e. all sentences are in one 'document')]]");
+ println(" -fin finalLambda: file name for final lambda[] values\n [[default: null string (i.e. no such file will be created)]]");
+ println("");
+ println("MIRA specs:");
+ println(" -m metricName metric options: name of evaluation metric and its options\n [[default: BLEU 4 closest]]");
+ println(" -maxIt maxMIRAIts: maximum number of MIRA iterations\n [[default: 20]]");
+ println(" -prevIt prevMIRAIts: maximum number of previous MIRA iterations to\n construct candidate sets from\n [[default: 20]]");
+ println(" -minIt minMIRAIts: number of iterations before considering an early exit\n [[default: 5]]");
+ println(" -stopIt stopMinIts: some early stopping criterion must be satisfied in\n stopMinIts *consecutive* iterations before an early exit\n [[default: 3]]");
+ println(" -stopSig sigValue: early MIRA exit if no weight changes by more than sigValue\n [[default: -1 (i.e. this criterion is never investigated)]]");
+ //println(" -thrCnt threadCount: number of threads to run in parallel when optimizing\n [[default: 1]]");
+ println(" -save saveInter: save intermediate cfg files (1) or decoder outputs (2)\n or both (3) or neither (0)\n [[default: 3]]");
+ println(" -compress compressFiles: should MIRA compress the files it produces (1)\n or not (0)\n [[default: 0]]");
+ //println(" -ipi initsPerIt: number of intermediate initial points per iteration\n [[default: 20]]");
+ //println(" -opi oncePerIt: modify a parameter only once per iteration (1) or not (0)\n [[default: 0]]");
+ //println(" -rand randInit: choose initial point randomly (1) or from paramsFile (0)\n [[default: 0]]");
+ //println(" -seed seed: seed used to initialize random number generator\n [[default: time (i.e. value returned by System.currentTimeMillis()]]");
+ // println(" -ud useDisk: reliance on disk (0-2; higher value => more reliance)\n [[default: 2]]");
+ println("");
+ println("Decoder specs:");
+ println(" -cmd commandFile: name of file containing commands to run the decoder\n [[default: null string (i.e. decoder is a JoshuaDecoder object)]]");
+ println(" -passIt passIterationToDecoder: should iteration number be passed\n to command file (1) or not (0)\n [[default: 0]]");
+ println(" -decOut decoderOutFile: name of the output file produced by the decoder\n [[default: output.nbest]]");
+ println(" -decExit validExit: value returned by decoder to indicate success\n [[default: 0]]");
+ println(" -dcfg decConfigFile: name of decoder config file\n [[default: dec_cfg.txt]]");
+ println(" -N N: size of N-best list (per sentence) generated in each MIRA iteration\n [[default: 100]]");
+ println("");
+ println("Output specs:");
+ println(" -v verbosity: MIRA verbosity level (0-2; higher value => more verbose)\n [[default: 1]]");
+ println(" -decV decVerbosity: should decoder output be printed (1) or ignored (0)\n [[default: 0]]");
+ println("");
+ }
+ }
+
+ private static void println(Object obj) {
+ System.out.println(obj);
+ }
+
+}
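Concretely, the two modes handled by main() above correspond to invocations along these lines (the classpath and file names are illustrative, not taken from the commit):

    # single-process mode: one argument, the MIRA config file
    java -cp joshua-core.jar org.apache.joshua.mira.MIRA mira_config.txt

    # external mode: each iteration runs MIRACore in its own JVM capped at 4096 MB;
    # exit status 90 ends the loop, 91 triggers another iteration
    java -cp joshua-core.jar org.apache.joshua.mira.MIRA -maxMem 4096 mira_config.txt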