You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:26:53 UTC
[37/66] [partial] incubator-joshua git commit: JOSHUA-252 Make it
possible to use Maven to build Joshua
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/lattice/NodeIdentifierComparator.java
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/NodeIdentifierComparator.java b/src/joshua/lattice/NodeIdentifierComparator.java
deleted file mode 100644
index 40e50b8..0000000
--- a/src/joshua/lattice/NodeIdentifierComparator.java
+++ /dev/null
@@ -1,41 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.lattice;
-
-import java.io.Serializable;
-import java.util.Comparator;
-
-/**
- * Compares nodes based only on the natural order of their integer identifiers.
- *
- * @author Lane Schwartz
- */
-public class NodeIdentifierComparator implements Comparator<Node<?>>, Serializable {
-
- private static final long serialVersionUID = 1L;
-
- /* See Javadoc for java.util.Comparator#compare */
- public int compare(Node<?> o1, Node<?> o2) {
- if (o1.id() < o2.id())
- return -1;
- else if (o1.id() == o2.id())
- return 0;
- return 1;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/lattice/package.html
----------------------------------------------------------------------
diff --git a/src/joshua/lattice/package.html b/src/joshua/lattice/package.html
deleted file mode 100644
index a479be8..0000000
--- a/src/joshua/lattice/package.html
+++ /dev/null
@@ -1,18 +0,0 @@
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
-<html>
-<head></head>
-<body bgcolor="white">
-
-<!--
-##### THIS IS THE TEMPLATE FOR THE PACKAGE DOC COMMENTS. #####
-##### TYPE YOUR PACKAGE COMMENTS HERE. BEGIN WITH A #####
-##### ONE-SENTENCE SUMMARY STARTING WITH A VERB LIKE: #####
--->
-
-Provides implementations of lattice and related data structures.
-
-
-<!-- Put @see and @since tags down here. -->
-
-</body>
-</html>
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/metrics/BLEU.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/BLEU.java b/src/joshua/metrics/BLEU.java
deleted file mode 100644
index 95c6cee..0000000
--- a/src/joshua/metrics/BLEU.java
+++ /dev/null
@@ -1,540 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.metrics;
-
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.logging.Logger;
-
-public class BLEU extends EvaluationMetric {
- private static final Logger logger = Logger.getLogger(BLEU.class.getName());
-
- // The maximum n-gram we care about
- protected int maxGramLength;
- protected EffectiveLengthMethod effLengthMethod;
- // 1: closest, 2: shortest, 3: average
- // protected HashMap[][] maxNgramCounts;
-
- protected HashMap<String, Integer>[] maxNgramCounts;
- protected int[][] refWordCount;
- protected double[] weights;
-
- public BLEU() {
- this(4, "closest");
- }
-
- public BLEU(String[] BLEU_options) {
- this(Integer.parseInt(BLEU_options[0]), BLEU_options[1]);
- }
-
- public BLEU(int mxGrmLn, String methodStr) {
- if (mxGrmLn >= 1) {
- maxGramLength = mxGrmLn;
- } else {
- logger.severe("Maximum gram length must be positive");
- System.exit(1);
- }
-
- if (methodStr.equals("closest")) {
- effLengthMethod = EffectiveLengthMethod.CLOSEST;
- } else if (methodStr.equals("shortest")) {
- effLengthMethod = EffectiveLengthMethod.SHORTEST;
- // } else if (methodStr.equals("average")) {
- // effLengthMethod = EffectiveLengthMethod.AVERAGE;
- } else {
- logger.severe("Unknown effective length method string " + methodStr + ".");
- // System.out.println("Should be one of closest, shortest, or average.");
- logger.severe("Should be one of closest or shortest.");
- System.exit(1);
- }
-
- initialize();
- }
-
- protected void initialize() {
- metricName = "BLEU";
- toBeMinimized = false;
- suffStatsCount = 2 * maxGramLength + 2;
- // 2 per gram length for its precision, and 2 for length info
- set_weightsArray();
- set_maxNgramCounts();
- }
-
- @Override
- public double bestPossibleScore() {
- return 1.0;
- }
-
- @Override
- public double worstPossibleScore() {
- return 0.0;
- }
-
- /**
- * Sets the BLEU weights for each n-gram level to uniform.
- */
- protected void set_weightsArray() {
- weights = new double[1 + maxGramLength];
- for (int n = 1; n <= maxGramLength; ++n) {
- weights[n] = 1.0 / maxGramLength;
- }
- }
-
- /**
- * Computes the maximum ngram counts for each sentence (storing them in
- * <code>maxNgramCounts</code>), which are used for clipping n-gram counts.
- */
- protected void set_maxNgramCounts() {
- @SuppressWarnings("unchecked")
- HashMap<String, Integer>[] temp_HMA = new HashMap[numSentences];
- maxNgramCounts = temp_HMA;
-
- String gram = "";
- int oldCount = 0, nextCount = 0;
-
- for (int i = 0; i < numSentences; ++i) {
- maxNgramCounts[i] = getNgramCountsAll(refSentences[i][0]);
- // initialize to ngramCounts[n] of the first reference translation...
-
- // ...and update as necessary from the other reference translations
- for (int r = 1; r < refsPerSen; ++r) {
- HashMap<String, Integer> nextNgramCounts = getNgramCountsAll(refSentences[i][r]);
- for (Map.Entry<String, Integer> entry : nextNgramCounts.entrySet()) {
- gram = entry.getKey();
- nextCount = entry.getValue();
-
- if (maxNgramCounts[i].containsKey(gram)) { // update if necessary
- oldCount = maxNgramCounts[i].get(gram);
- if (nextCount > oldCount) {
- maxNgramCounts[i].put(gram, nextCount);
- }
- } else { // add it
- maxNgramCounts[i].put(gram, nextCount);
- }
-
- }
-
- } // for (r)
-
- } // for (i)
-
- // For efficiency, calculate the reference lengths, which will be used in effLength...
-
- refWordCount = new int[numSentences][refsPerSen];
- for (int i = 0; i < numSentences; ++i) {
- for (int r = 0; r < refsPerSen; ++r) {
- refWordCount[i][r] = wordCount(refSentences[i][r]);
- }
- }
- }
-
- /**
- * Computes the BLEU sufficient statistics on a hypothesis.
- */
- public int[] suffStats(String cand_str, int i) {
- int[] stats = new int[suffStatsCount];
-
- // int wordCount = words.length;
- // for (int j = 0; j < wordCount; ++j) { words[j] = words[j].intern(); }
-
- if (!cand_str.equals("")) {
- String[] words = cand_str.split("\\s+");
- set_prec_suffStats(stats, words, i);
- stats[suffStatsCount - 2] = words.length;
- stats[suffStatsCount - 1] = effLength(words.length, i);
- } else {
- String[] words = new String[0];
- set_prec_suffStats(stats, words, i);
- stats[suffStatsCount - 2] = 0;
- stats[suffStatsCount - 1] = effLength(0, i);
- }
-
- return stats;
- }
-
- /**
- * Computes the precision sufficient statistics, clipping counts.
- *
- * @param stats
- * @param words
- * @param i
- */
- public void set_prec_suffStats(int[] stats, String[] words, int i) {
- HashMap<String, Integer>[] candCountsArray = getNgramCountsArray(words);
-
- for (int n = 1; n <= maxGramLength; ++n) {
-
- int correctGramCount = 0;
- String gram = "";
- int candGramCount = 0, maxRefGramCount = 0, clippedCount = 0;
-
- Iterator<String> it = (candCountsArray[n].keySet()).iterator();
-
- while (it.hasNext()) {
- // for each n-gram type in the candidate
- gram = it.next();
- candGramCount = candCountsArray[n].get(gram);
- // if (maxNgramCounts[i][n].containsKey(gram)) {
- // maxRefGramCount = maxNgramCounts[i][n].get(gram);
- if (maxNgramCounts[i].containsKey(gram)) {
- maxRefGramCount = maxNgramCounts[i].get(gram);
- } else {
- maxRefGramCount = 0;
- }
-
- clippedCount = Math.min(candGramCount, maxRefGramCount);
- correctGramCount += clippedCount;
- }
-
- stats[2 * (n - 1)] = correctGramCount;
- stats[2 * (n - 1) + 1] = Math.max(words.length - (n - 1), 0); // total gram count
-
- } // for (n)
- }
-
- public int effLength(int candLength, int i) {
- if (effLengthMethod == EffectiveLengthMethod.CLOSEST) { // closest
-
- int closestRefLength = refWordCount[i][0];
- int minDiff = Math.abs(candLength - closestRefLength);
-
- for (int r = 1; r < refsPerSen; ++r) {
- int nextRefLength = refWordCount[i][r];
- int nextDiff = Math.abs(candLength - nextRefLength);
-
- if (nextDiff < minDiff) {
- closestRefLength = nextRefLength;
- minDiff = nextDiff;
- } else if (nextDiff == minDiff && nextRefLength < closestRefLength) {
- closestRefLength = nextRefLength;
- minDiff = nextDiff;
- }
- }
-
- return closestRefLength;
-
- } else if (effLengthMethod == EffectiveLengthMethod.SHORTEST) { // shortest
-
- int shortestRefLength = refWordCount[i][0];
-
- for (int r = 1; r < refsPerSen; ++r) {
- int nextRefLength = refWordCount[i][r];
- if (nextRefLength < shortestRefLength) {
- shortestRefLength = nextRefLength;
- }
- }
-
- return shortestRefLength;
-
- }
- /*
- * // commented out because it needs sufficient statistics to be doubles else { // average
- *
- * int totalRefLength = refWordCount[i][0];
- *
- * for (int r = 1; r < refsPerSen; ++r) { totalRefLength += refWordCount[i][r]; }
- *
- * return totalRefLength/(double)refsPerSen;
- *
- * }
- */
- return candLength; // should never get here anyway
-
- }
-
- public double score(int[] stats) {
- if (stats.length != suffStatsCount) {
- logger.severe("Mismatch between stats.length and suffStatsCount (" + stats.length + " vs. "
- + suffStatsCount + ") in BLEU.score(int[])");
- System.exit(2);
- }
-
- double BLEUsum = 0.0;
- double smooth_addition = 1.0; // following bleu-1.04.pl
- double c_len = stats[suffStatsCount - 2];
- double r_len = stats[suffStatsCount - 1];
-
- double correctGramCount, totalGramCount;
-
- for (int n = 1; n <= maxGramLength; ++n) {
- correctGramCount = stats[2 * (n - 1)];
- totalGramCount = stats[2 * (n - 1) + 1];
-
- double prec_n;
- if (totalGramCount > 0) {
- prec_n = correctGramCount / totalGramCount;
- } else {
- prec_n = 1; // following bleu-1.04.pl ???????
- }
-
- if (prec_n == 0) {
- smooth_addition *= 0.5;
- prec_n = smooth_addition / (c_len - n + 1);
- // isn't c_len-n+1 just totalGramCount ???????
- }
-
- BLEUsum += weights[n] * Math.log(prec_n);
-
- }
-
- double BP = 1.0;
- if (c_len < r_len)
- BP = Math.exp(1 - (r_len / c_len));
- // if c_len >= r_len, no penalty applies
-
- return BP * Math.exp(BLEUsum);
-
- }
-
- public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
- double BLEUsum = 0.0;
- double smooth_addition = 1.0; // following bleu-1.04.pl
- double c_len = stats[suffStatsCount - 2];
- double r_len = stats[suffStatsCount - 1];
-
- double correctGramCount, totalGramCount;
-
- if (oneLiner) {
- System.out.print("Precisions: ");
- }
-
- for (int n = 1; n <= maxGramLength; ++n) {
- correctGramCount = stats[2 * (n - 1)];
- totalGramCount = stats[2 * (n - 1) + 1];
-
- double prec_n;
- if (totalGramCount > 0) {
- prec_n = correctGramCount / totalGramCount;
- } else {
- prec_n = 1; // following bleu-1.04.pl ???????
- }
-
- if (prec_n > 0) {
- if (totalGramCount > 0) {
- if (oneLiner) {
- System.out.print(n + "=" + f4.format(prec_n) + ", ");
- } else {
- System.out.println("BLEU_precision(" + n + ") = " + (int) correctGramCount + " / "
- + (int) totalGramCount + " = " + f4.format(prec_n));
- }
- } else {
- if (oneLiner) {
- System.out.print(n + "=N/A, ");
- } else {
- System.out
- .println("BLEU_precision(" + n + ") = N/A (candidate has no " + n + "-grams)");
- }
- }
- } else {
- smooth_addition *= 0.5;
- prec_n = smooth_addition / (c_len - n + 1);
- // isn't c_len-n+1 just totalGramCount ???????
-
- if (oneLiner) {
- System.out.print(n + "~" + f4.format(prec_n) + ", ");
- } else {
- System.out.println("BLEU_precision(" + n + ") = " + (int) correctGramCount + " / "
- + (int) totalGramCount + " ==smoothed==> " + f4.format(prec_n));
- }
- }
-
- BLEUsum += weights[n] * Math.log(prec_n);
-
- }
-
- if (oneLiner) {
- System.out.print("(overall=" + f4.format(Math.exp(BLEUsum)) + "), ");
- } else {
- System.out.println("BLEU_precision = " + f4.format(Math.exp(BLEUsum)));
- System.out.println("");
- }
-
- double BP = 1.0;
- if (c_len < r_len)
- BP = Math.exp(1 - (r_len / c_len));
- // if c_len >= r_len, no penalty applies
-
- if (oneLiner) {
- System.out.print("BP=" + f4.format(BP) + ", ");
- } else {
- System.out.println("Length of candidate corpus = " + (int) c_len);
- System.out.println("Effective length of reference corpus = " + (int) r_len);
- System.out.println("BLEU_BP = " + f4.format(BP));
- System.out.println("");
- }
-
- System.out.println(" => BLEU = " + f4.format(BP * Math.exp(BLEUsum)));
- }
-
- protected int wordCount(String cand_str) {
- if (!cand_str.equals("")) {
- return cand_str.split("\\s+").length;
- } else {
- return 0;
- }
- }
-
- public HashMap<String, Integer>[] getNgramCountsArray(String cand_str) {
- if (!cand_str.equals("")) {
- return getNgramCountsArray(cand_str.split("\\s+"));
- } else {
- return getNgramCountsArray(new String[0]);
- }
- }
-
- public HashMap<String, Integer>[] getNgramCountsArray(String[] words) {
- @SuppressWarnings("unchecked")
- HashMap<String, Integer>[] ngramCountsArray = new HashMap[1 + maxGramLength];
- ngramCountsArray[0] = null;
- for (int n = 1; n <= maxGramLength; ++n) {
- ngramCountsArray[n] = new HashMap<String, Integer>();
- }
-
- int len = words.length;
- String gram;
- int st = 0;
-
- for (; st <= len - maxGramLength; ++st) {
-
- gram = words[st];
- if (ngramCountsArray[1].containsKey(gram)) {
- int oldCount = ngramCountsArray[1].get(gram);
- ngramCountsArray[1].put(gram, oldCount + 1);
- } else {
- ngramCountsArray[1].put(gram, 1);
- }
-
- for (int n = 2; n <= maxGramLength; ++n) {
- gram = gram + " " + words[st + n - 1];
- if (ngramCountsArray[n].containsKey(gram)) {
- int oldCount = ngramCountsArray[n].get(gram);
- ngramCountsArray[n].put(gram, oldCount + 1);
- } else {
- ngramCountsArray[n].put(gram, 1);
- }
- } // for (n)
-
- } // for (st)
-
- // now st is either len-maxGramLength+1 or zero (if above loop never entered, which
- // happens with sentences that have fewer than maxGramLength words)
-
- for (; st < len; ++st) {
-
- gram = words[st];
- if (ngramCountsArray[1].containsKey(gram)) {
- int oldCount = ngramCountsArray[1].get(gram);
- ngramCountsArray[1].put(gram, oldCount + 1);
- } else {
- ngramCountsArray[1].put(gram, 1);
- }
-
- int n = 2;
- for (int fin = st + 1; fin < len; ++fin) {
- gram = gram + " " + words[st + n - 1];
-
- if (ngramCountsArray[n].containsKey(gram)) {
- int oldCount = ngramCountsArray[n].get(gram);
- ngramCountsArray[n].put(gram, oldCount + 1);
- } else {
- ngramCountsArray[n].put(gram, 1);
- }
- ++n;
- } // for (fin)
-
- } // for (st)
-
- return ngramCountsArray;
-
- }
-
- public HashMap<String, Integer> getNgramCountsAll(String cand_str) {
- if (!cand_str.equals("")) {
- return getNgramCountsAll(cand_str.split("\\s+"));
- } else {
- return getNgramCountsAll(new String[0]);
- }
- }
-
- public HashMap<String, Integer> getNgramCountsAll(String[] words) {
- HashMap<String, Integer> ngramCountsAll = new HashMap<String, Integer>();
-
- int len = words.length;
- String gram;
- int st = 0;
-
- for (; st <= len - maxGramLength; ++st) {
-
- gram = words[st];
- if (ngramCountsAll.containsKey(gram)) {
- int oldCount = ngramCountsAll.get(gram);
- ngramCountsAll.put(gram, oldCount + 1);
- } else {
- ngramCountsAll.put(gram, 1);
- }
-
- for (int n = 2; n <= maxGramLength; ++n) {
- gram = gram + " " + words[st + n - 1];
- if (ngramCountsAll.containsKey(gram)) {
- int oldCount = ngramCountsAll.get(gram);
- ngramCountsAll.put(gram, oldCount + 1);
- } else {
- ngramCountsAll.put(gram, 1);
- }
- } // for (n)
-
- } // for (st)
-
- // now st is either len-maxGramLength+1 or zero (if above loop never entered, which
- // happens with sentences that have fewer than maxGramLength words)
-
- for (; st < len; ++st) {
-
- gram = words[st];
- if (ngramCountsAll.containsKey(gram)) {
- int oldCount = ngramCountsAll.get(gram);
- ngramCountsAll.put(gram, oldCount + 1);
- } else {
- ngramCountsAll.put(gram, 1);
- }
-
- int n = 2;
- for (int fin = st + 1; fin < len; ++fin) {
- gram = gram + " " + words[st + n - 1];
-
- if (ngramCountsAll.containsKey(gram)) {
- int oldCount = ngramCountsAll.get(gram);
- ngramCountsAll.put(gram, oldCount + 1);
- } else {
- ngramCountsAll.put(gram, 1);
- }
- ++n;
- } // for (fin)
-
- } // for (st)
-
- return ngramCountsAll;
-
- }
-
- enum EffectiveLengthMethod {
- CLOSEST, SHORTEST, AVERAGE
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/metrics/BLEU_SBP.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/BLEU_SBP.java b/src/joshua/metrics/BLEU_SBP.java
deleted file mode 100644
index e58256b..0000000
--- a/src/joshua/metrics/BLEU_SBP.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.metrics;
-
-public class BLEU_SBP extends BLEU {
- // constructors
- public BLEU_SBP() {
- super();
- }
-
- public BLEU_SBP(String[] BLEU_SBP_options) {
- super(BLEU_SBP_options);
- }
-
- public BLEU_SBP(int mxGrmLn, String methodStr) {
- super(mxGrmLn, methodStr);
- }
-
-
-
- public int[] suffStats(String cand_str, int i) {
- int[] stats = new int[suffStatsCount];
- stats[0] = 1;
-
- String[] words = cand_str.split("\\s+");
-
- // int wordCount = words.length;
- // for (int j = 0; j < wordCount; ++j) { words[j] = words[j].intern(); }
-
- set_prec_suffStats(stats, words, i);
-
- // the only place where BLEU_SBP differs from BLEU /* ~~~ */
- /* ~~~ */
- // stats[maxGramLength+1] = words.length;
- // stats[maxGramLength+2] = effLength(words.length,i);
- /* ~~~ */
-
- /* ~~~ */
- int effectiveLength = effLength(words.length, i);
- stats[maxGramLength + 1] = Math.min(words.length, effectiveLength);
- stats[maxGramLength + 2] = effectiveLength;
- /* ~~~ */
-
- return stats;
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/metrics/EvaluationMetric.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/EvaluationMetric.java b/src/joshua/metrics/EvaluationMetric.java
deleted file mode 100644
index 4dd9fbd..0000000
--- a/src/joshua/metrics/EvaluationMetric.java
+++ /dev/null
@@ -1,399 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.metrics;
-
-import java.io.BufferedReader;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.PrintWriter;
-import java.text.DecimalFormat;
-import java.util.Arrays;
-import java.util.TreeMap;
-
-public abstract class EvaluationMetric {
- /* static data members */
- private static TreeMap<String, Integer> metricOptionCount; // maps metric names -> number of
- // options for that metric
- protected static int numSentences; // number of sentences in the MERT set
- protected static int numDocuments; // number of documents in the MERT set
- protected static int refsPerSen;
- protected static String[][] refSentences;
- protected final static DecimalFormat f0 = new DecimalFormat("###0");
- protected final static DecimalFormat f4 = new DecimalFormat("###0.0000");
- protected static String tmpDirPrefix;
-
- /* non-static data members */
- protected int suffStatsCount; // number of sufficient statistics
- protected String metricName; // name of the metric
- protected boolean toBeMinimized;
-
- // is this a metric that should be minimized?
- // e.g. toBeMinimized = true for 01LOSS, WER, TER
- // toBeMinimized = false for BLEU
-
- /* static (=> also non-abstract) methods */
- public static void set_knownMetrics() {
- metricOptionCount = new TreeMap<String, Integer>();
-
- metricOptionCount.put("BLEU", 2);
- // the "BLEU" metric expects an options array of length 2
- metricOptionCount.put("BLEU_SBP", 2);
- // the "BLEU_SBP" metric expects an options array of length 2
- metricOptionCount.put("01LOSS", 0);
- // the "01LOSS" metric expects an options array of length 0
- metricOptionCount.put("TER", 6);
- // the "TER" metric expects an options array of length 6
- // metricOptionCount.put("METEOR",4);
- // the "METEOR" metric expects an options array of length 4
- // metricOptionCount.put("RYPT",5);
- // the "RYPT" metric expects an options array of length 5
- metricOptionCount.put("TER-BLEU", 8);
- // the "TER-BLEU" metric expects an options array of length 8
- // metricOptionCount.put("WER",0);
- // the "WER" metric expects an options array of length 0
- metricOptionCount.put("MC_BLEU", 4);
- metricOptionCount.put("PRECIS", 6);
- metricOptionCount.put("SRC_BLEU", 4);
- metricOptionCount.put("PRECIS-SRC_BLEU", 6);
- metricOptionCount.put("GL_BLEU", 3);
- }
-
- public static EvaluationMetric getMetric(String metricName, String[] metricOptions) {
- EvaluationMetric retMetric = null;
-
- if (metricName.equals("BLEU")) {
- retMetric = new BLEU(metricOptions); // the "BLEU" metric corresponds to the BLEU class
- } else if (metricName.equals("BLEU_SBP")) {
- retMetric = new BLEU_SBP(metricOptions); // the "BLEU_SBP" metric corresponds to the BLEU_SBP
- // class
- } else if (metricName.equals("01LOSS")) {
- retMetric = new ZeroOneLoss(metricOptions); // the "01LOSS" metric corresponds to the
- // ZeroOneLoss class
- } else if (metricName.equals("TER")) {
- retMetric = new TER(metricOptions); // the "TER" metric corresponds to the TER class
- // } else if (metricName.equals("METEOR")) {
- // retMetric = new METEOR(metricOptions); // the "METEOR" metric corresponds to the METEOR
- // class
- // } else if (metricName.equals("RYPT")) {
- // retMetric = new RYPT(metricOptions); // the "RYPT" metric corresponds to the RYPT class
- } else if (metricName.equals("TER-BLEU")) {
- retMetric = new TERMinusBLEU(metricOptions); // the "TER-BLEU" metric corresponds to the
- // TERMinusBLEU class
- // } else if (metricName.equals("WER")) {
- // retMetric = new WordErrorRate(metricOptions); // the "WER" metric corresponds to the
- // WordErrorRate class
- } else if (metricName.equals("MC_BLEU")) {
- retMetric = new MinimumChangeBLEU(metricOptions); // the "MC_BLEU" metric corresponds to the
- // MinimumChangeBLEU class
- } else if (metricName.equals("PRECIS")) {
- retMetric = new Precis(metricOptions);
- } else if (metricName.equals("SRC_BLEU")) {
- retMetric = new SourceBLEU(metricOptions);
- } else if (metricName.equals("PRECIS-SRC_BLEU")) {
- retMetric = new PrecisMinusSourceBLEU(metricOptions);
- } else if (metricName.equals("GL_BLEU")) {
- retMetric = new GradeLevelBLEU(metricOptions); // the "GL_BLEU" metric corresponds to the
- // GradeLevelBLEU class
- }
- return retMetric;
- }
-
- public static void set_numSentences(int x) {
- numSentences = x;
- }
-
- public static void set_numDocuments(int x) {
- numDocuments = x;
- }
-
- public static void set_refsPerSen(int x) {
- refsPerSen = x;
- }
-
- public static void set_tmpDirPrefix(String S) {
- tmpDirPrefix = S;
- }
-
- public static void set_refSentences(String[][] refs) {
- refSentences = new String[numSentences][refsPerSen];
- for (int i = 0; i < numSentences; ++i) {
- for (int r = 0; r < refsPerSen; ++r) {
- refSentences[i][r] = refs[i][r];
- }
- }
- }
-
- public static boolean knownMetricName(String name) {
- return metricOptionCount.containsKey(name);
- }
-
- public static int metricOptionCount(String name) {
- return metricOptionCount.get(name);
- }
-
- /* non-abstract, non-static methods */
- public int get_suffStatsCount() {
- return suffStatsCount;
- }
-
- public String get_metricName() {
- return metricName;
- }
-
- public boolean getToBeMinimized() {
- return toBeMinimized;
- }
-
- public boolean isBetter(double x, double y) {
- // return true if x is better than y
- if (toBeMinimized) {
- return (x < y);
- } else {
- return (x > y);
- }
- }
-
- public double score(String cand_str, int i) {
- String[] SA = new String[1];
- SA[0] = cand_str;
- int[] IA = new int[1];
- IA[0] = i;
-
- int[][] SS = suffStats(SA, IA);
-
- int[] stats = new int[suffStatsCount];
- for (int s = 0; s < suffStatsCount; ++s) {
- stats[s] = SS[0][s];
- }
-
- return score(stats);
- }
-
- public double score(String[] topCand_str) {
- int[] stats = suffStats(topCand_str);
- return score(stats);
- }
-
- public int[] suffStats(String[] topCand_str) {
- int[] IA = new int[numSentences];
- for (int i = 0; i < numSentences; ++i) {
- IA[i] = i;
- }
-
- int[][] SS = suffStats(topCand_str, IA);
-
- int[] totStats = new int[suffStatsCount];
- for (int s = 0; s < suffStatsCount; ++s) {
- totStats[s] = 0;
- for (int i = 0; i < numSentences; ++i) {
- totStats[s] += SS[i][s];
- }
- }
-
- return totStats;
- }
-
- /**
- * Calculates sufficient statistics on each sentence in the corpus, returning them as arrays.
- *
- * @param cand_strings
- * @param cand_indices
- * @return
- */
- public int[][] suffStats(String[] cand_strings, int[] cand_indices) {
-
- int candCount = cand_strings.length;
- if (cand_indices.length != candCount) {
- System.out.println("Array lengths mismatch in suffStats(String[],int[]); returning null.");
- return null;
- }
-
- int[][] stats = new int[candCount][suffStatsCount];
-
- for (int d = 0; d < candCount; ++d) {
- int[] currStats = suffStats(cand_strings[d], cand_indices[d]);
-
- for (int s = 0; s < suffStatsCount; ++s) {
- stats[d][s] = currStats[s];
- }
- } // for (d)
-
- return stats;
- }
-
- public void createSuffStatsFile(String cand_strings_fileName, String cand_indices_fileName,
- String outputFileName, int maxBatchSize) {
- // similar to the above suffStats(String[], int[])
-
- try {
- FileInputStream inStream_cands = new FileInputStream(cand_strings_fileName);
- BufferedReader inFile_cands =
- new BufferedReader(new InputStreamReader(inStream_cands, "utf8"));
-
- FileInputStream inStream_indices = new FileInputStream(cand_indices_fileName);
- BufferedReader inFile_indices =
- new BufferedReader(new InputStreamReader(inStream_indices, "utf8"));
-
- PrintWriter outFile = new PrintWriter(outputFileName);
-
- String[] cand_strings = new String[maxBatchSize];
- int[] cand_indices = new int[maxBatchSize];
-
- String line_cand = inFile_cands.readLine();
- String line_index = inFile_indices.readLine();
-
- while (line_cand != null) {
- int size = 0;
- while (line_cand != null) {
- cand_strings[size] = line_cand;
- cand_indices[size] = Integer.parseInt(line_index);
- ++size; // now size is how many were read for this current batch
- if (size == maxBatchSize) break;
-
- line_cand = inFile_cands.readLine();
- line_index = inFile_indices.readLine();
- }
-
- if (size < maxBatchSize) { // last batch, and smaller than maxBatchSize
- String[] cand_strings_temp = new String[size];
- int[] cand_indices_temp = new int[size];
- for (int d = 0; d < size; ++d) {
- cand_strings_temp[d] = cand_strings[d];
- cand_indices_temp[d] = cand_indices[d];
- }
- cand_strings = cand_strings_temp;
- cand_indices = cand_indices_temp;
- }
-
- int[][] SS = suffStats(cand_strings, cand_indices);
- for (int d = 0; d < size; ++d) {
- StringBuilder stats_str = new StringBuilder();
-
- for (int s = 0; s < suffStatsCount - 1; ++s) {
- stats_str.append(SS[d][s]).append(" ");
- }
- stats_str.append(SS[d][suffStatsCount - 1]);
-
- outFile.println(stats_str);
- }
-
- line_cand = inFile_cands.readLine();
- line_index = inFile_indices.readLine();
- }
-
- inFile_cands.close();
- inFile_indices.close();
- outFile.close();
-
- } catch (IOException e) {
- System.err.println("IOException in EvaluationMetric.createSuffStatsFile(...): "
- + e.getMessage());
- System.exit(99902);
- }
-
- }
-
- public void printDetailedScore(String[] topCand_str, boolean oneLiner) {
- int[] stats = suffStats(topCand_str);
- printDetailedScore_fromStats(stats, oneLiner);
- }
-
- public double score(int[][] stats) {
- // returns an average of document scores (aka the document-level score, as opposed to
- // corpus-level score)
- // stats[][] is indexed [doc][s]
-
- double retVal = 0.0;
- for (int doc = 0; doc < numDocuments; ++doc) {
- retVal += score(stats[doc]);
- }
- return retVal / numDocuments;
- }
-
- public double score(int[][] stats, int firstRank, int lastRank) {
- // returns an average of document scores, restricted to the documents
- // ranked firstRank-lastRank, inclusive (ranks are 1-indexed, even though the docs are
- // 0-indexed)
-
- double[] scores = docScores(stats);
-
- Arrays.sort(scores);
- // sorts into ascending order
-
- double retVal = 0.0;
-
- if (toBeMinimized) {
- // scores[0] is rank 1, scores[numDocuments-1] is rank numDocuments
- // => scores[j] is rank j+1
- // => rank r is scores[r-1]
- for (int j = firstRank - 1; j < lastRank; ++j) {
- retVal += scores[j];
- }
- } else {
- // scores[numDocuments-1] is rank 1, scores[0] is rank numDocuments
- // => scores[j] is rank numDocuments-j
- // => rank r is scores[numDocuments-r]
- for (int j = numDocuments - firstRank; j >= numDocuments - lastRank; --j) {
- retVal += scores[j];
- }
- }
-
- return retVal / (lastRank - firstRank + 1);
-
- }
-
- public double[] docScores(int[][] stats) {
- // returns an array of document scores
- // stats[][] is indexed [doc][s]
-
- double[] scores = new double[numDocuments];
- for (int doc = 0; doc < numDocuments; ++doc) {
- scores[doc] = score(stats[doc]);
- }
- return scores;
- }
-
- public void printDetailedScore_fromStats(int[][] stats, String[] docNames) {
- // prints individual document scores
- // stats[][] is indexed [doc][s]
-
- for (int doc = 0; doc < numDocuments; ++doc) {
- if (docNames == null) {
- System.out.print("Document #" + doc + ": ");
- } else {
- System.out.print(docNames[doc] + ": ");
- }
- printDetailedScore_fromStats(stats[doc], true);
- }
- }
-
- /* abstract (=> also non-static) methods */
- protected abstract void initialize();
-
- public abstract double bestPossibleScore();
-
- public abstract double worstPossibleScore();
-
- public abstract int[] suffStats(String cand_str, int i);
-
- public abstract double score(int[] stats);
-
- public abstract void printDetailedScore_fromStats(int[] stats, boolean oneLiner);
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/metrics/GradeLevelBLEU.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/GradeLevelBLEU.java b/src/joshua/metrics/GradeLevelBLEU.java
deleted file mode 100644
index 06efa8b..0000000
--- a/src/joshua/metrics/GradeLevelBLEU.java
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.metrics;
-
-import java.io.BufferedReader;
-import java.io.FileReader;
-import java.io.IOException;
-import java.text.DecimalFormat;
-import java.util.logging.Logger;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
-
-public class GradeLevelBLEU extends BLEU {
- private static final Logger logger = Logger.getLogger(GradeLevelBLEU.class.getName());
-
- // syllable pattern matches /C*V+/
- private static final Pattern syllable = Pattern.compile("([^aeiouy]*[aeiouy]+)");
- private static final Pattern silentE = Pattern.compile("[^aeiou]e$");
- private static final int SOURCE = 0, CANDIDATE = 1, REFERENCE = 2;
- private int srcIndex = 1, sentCountIndex;
- private SourceBLEU srcBLEU;
- private double targetGL = 9.87; // tune.simp avg GL = 9.8704 (tune.en =
- // 14.0785
- private double alpha = 0.9;
- private boolean useTarget = true;
- private boolean useBLEUplus = true;
-
- // Creates the metric with the field defaults declared above (target grade
- // level, alpha, useTarget, useBLEUplus); only the superclass constructor runs.
- // NOTE(review): unlike the String[] constructor this one never assigns
- // srcBLEU, although useBLEUplus defaults to true -- confirm it is unused.
- public GradeLevelBLEU() {
- super();
- }
-
- // target == 0 : use the default target
- // target > 0 : use that target
- // target < 0 : use source GL for target
- public GradeLevelBLEU(String[] options) {
- super();
- // there are 3 arguments: target GL, alpha, and source path
- // the BLEU options are assumed to be "4 closest"
- if (Double.parseDouble(options[0]) > 0)
- targetGL = Double.parseDouble(options[0]);
- else if (Double.parseDouble(options[0]) < 0) useTarget = false;
- if (Double.parseDouble(options[1]) > 0) alpha = Double.parseDouble(options[1]);
- try {
- loadSources(options[2]);
- } catch (IOException e) {
- logger.severe("Error loading the source sentences from " + options[2]);
- System.exit(1);
- }
- if (useBLEUplus) srcBLEU = new SourceBLEU(4, "closest", srcIndex, true);
- initialize();
- }
-
- // hacky way to add the source sentence as the last reference sentence (in
- // accordance with SourceBLEU)
- //
- // Reads at most numSentences lines from filepath and, for each line read,
- // fills one row of a widened reference table: the existing refsPerSen
- // references followed by the trimmed source line.
- //
- // NOTE(review): the widened table is only a local variable; it is never
- // stored back into refSentences, so as written this method has no lasting
- // effect. Left unchanged because the intended field updates (refSentences,
- // refsPerSen) cannot be confirmed from this file alone.
- //
- // Throws IOException if the file cannot be opened or read.
- public void loadSources(String filepath) throws IOException {
-   String[][] newRefSentences = new String[numSentences][refsPerSen + 1];
-   BufferedReader br = new BufferedReader(new FileReader(filepath));
-   try {
-     String line;
-     int i = 0;
-     while (i < numSentences && (line = br.readLine()) != null) {
-       for (int r = 0; r < refsPerSen; ++r) {
-         newRefSentences[i][r] = refSentences[i][r];
-       }
-       newRefSentences[i][refsPerSen] = line.trim();
-       i++;
-     }
-   } finally {
-     // release the file handle even when a read fails part-way through
-     br.close();
-   }
- }
-
- /**
-  * Sets the metric name, direction, effective-length method and the layout
-  * of the sufficient statistics, then builds the weight and n-gram tables.
-  * The statistics hold 4 * maxGramLength n-gram slots followed by 7 extra
-  * slots; the first extra slot is the sentence count.
-  */
- public void initialize() {
-   metricName = "GL_BLEU";
-   toBeMinimized = false;
-   effLengthMethod = EffectiveLengthMethod.SHORTEST;
-   final int ngramSlots = 4 * maxGramLength;
-   sentCountIndex = ngramSlots;
-   suffStatsCount = ngramSlots + 7;
-   set_weightsArray();
-   set_maxNgramCounts();
- }
-
- // Computes the sufficient statistics of one candidate for sentence i.
- // Layout of the returned array (suffStatsCount == 4 * maxGramLength + 7):
- //   from index 0                  : BLEU precision stats (set_prec_suffStats)
- //   from index 2 * maxGramLength  : stats returned by srcBLEU.suffStats
- //   sentCountIndex                : number of sentences (always 1 here)
- //   syllableLength(SOURCE|CANDIDATE|REFERENCE) : syllable counts
- //   tokenLength(SOURCE|CANDIDATE|REFERENCE)    : token counts
- public int[] suffStats(String cand_str, int i) {
- int[] stats = new int[suffStatsCount];
-
- String[] candidate_tokens = null;
-
- if (!cand_str.equals("")) {
- candidate_tokens = cand_str.split("\\s+");
- } else {
- candidate_tokens = new String[0];
- // NOTE(review): both values written here are overwritten by the
- // "token length" assignments further down.
- stats[tokenLength(CANDIDATE)] = 0;
- stats[tokenLength(REFERENCE)] = effLength(0, i);
- }
- // set the BLEU stats
- set_prec_suffStats(stats, candidate_tokens, i);
-
- // set source BLEU stats
- // NOTE(review): if srcBLEU.suffStats returns more than 2 * maxGramLength
- // entries, the surplus lands on sentCountIndex and the following slots,
- // which are overwritten below -- confirm the expected length.
- if (useBLEUplus) {
- int[] src_prec_suffStats = srcBLEU.suffStats(cand_str, i);
- for (int j = 0; j < src_prec_suffStats.length; j++) {
- stats[2 * maxGramLength + j] = src_prec_suffStats[j];
- }
- }
-
- // now set the readability stats
- // reference 0 is used as "the" reference; the source sentence is read from
- // reference slot srcIndex
- String[] reference_tokens = refSentences[i][0].split("\\s+");
- String[] source_tokens = refSentences[i][srcIndex].split("\\s+");
-
- // set the number of sentences (necessary to calculate GL)
- stats[sentCountIndex] = 1;
- // token length
- stats[tokenLength(CANDIDATE)] = candidate_tokens.length;
- stats[tokenLength(REFERENCE)] = reference_tokens.length;
- stats[tokenLength(SOURCE)] = source_tokens.length;
-
- // syllable length
- stats[syllableLength(CANDIDATE)] = countTotalSyllables(candidate_tokens);
- stats[syllableLength(REFERENCE)] = countTotalSyllables(reference_tokens);
- stats[syllableLength(SOURCE)] = countTotalSyllables(source_tokens);
-
- return stats;
- }
-
- // create methods for accessing the indices to reduce possible human error
- private int tokenLength(int whichSentence) {
- return suffStatsCount - 3 + whichSentence;
- }
-
- private int syllableLength(int whichSentence) {
- return suffStatsCount - 6 + whichSentence;
- }
-
- /**
-  * Sums the syllable counts of all words of a tokenized "sentence".
-  *
-  * @param ss the words of the sentence
-  * @return total of countSyllables over every word (0 for an empty array)
-  */
- public int countTotalSyllables(String[] ss) {
-   int total = 0;
-   for (int w = 0; w < ss.length; w++) {
-     total += countSyllables(ss[w]);
-   }
-   return total;
- }
-
- /**
-  * Counts the syllables of a single "word".
-  *
-  * A lone hyphen counts as one syllable. A hyphenated word is split at the
-  * hyphens and the parts are counted separately. Otherwise the count is the
-  * number of consonant*-vowel+ groups, minus one for a trailing silent e,
-  * and never less than one -- so punctuation and the like are not free.
-  *
-  * @param s the word to inspect
-  * @return the syllable count as described above
-  */
- public int countSyllables(String s) {
-   if (s.equals("-")) {
-     return 1;
-   }
-   if (s.contains("-")) {
-     final String[] parts = s.split("-");
-     int total = 0;
-     for (int p = 0; p < parts.length; p++) {
-       total += countSyllables(parts[p]);
-     }
-     return total;
-   }
-
-   int found = 0;
-   final Matcher groups = syllable.matcher(s);
-   while (groups.find()) {
-     found++;
-   }
-   // a word ending in a silent e has one syllable fewer
-   if (silentE.matcher(s).find()) {
-     found--;
-   }
-   return (found <= 0) ? 1 : found;
- }
-
- /**
-  * Scores one sufficient-statistics array: BLEU (optionally combined with
-  * the source BLEU via BLEU_plus) multiplied by the readability penalty.
-  *
-  * The penalty compares the candidate grade level against targetGL when
-  * useTarget is set, and against the source grade level otherwise.
-  *
-  * Logs an error and exits with status 2 when stats has the wrong length.
-  *
-  * @param stats array produced by suffStats, of length suffStatsCount
-  * @return readability penalty times the (possibly combined) BLEU score
-  */
- public double score(int[] stats) {
-   if (stats.length != suffStatsCount) {
-     logger.severe("Mismatch between stats.length and suffStatsCount (" + stats.length + " vs. "
-         + suffStatsCount + ") in BLEU.score(int[])");
-     System.exit(2);
-   }
-   double BLEUscore = super.score(stats);
-   double candGL =
-       gradeLevel(stats[tokenLength(CANDIDATE)], stats[syllableLength(CANDIDATE)],
-           stats[sentCountIndex]);
-   double readabilityPenalty = 1;
-
-   if (useTarget) {
-     readabilityPenalty = getReadabilityPenalty(candGL, targetGL);
-   } else {
-     double srcGL =
-         gradeLevel(stats[tokenLength(SOURCE)], stats[syllableLength(SOURCE)],
-             stats[sentCountIndex]);
-     readabilityPenalty = getReadabilityPenalty(candGL, srcGL);
-   }
-
-   if (useBLEUplus) {
-     // Rebuild a BLEU-shaped stats array for the source BLEU: the n-gram
-     // counts stored from index 2 * maxGramLength, followed by the candidate
-     // length and the source length. The array needs two slots beyond the
-     // n-gram counts; the previous code allocated none and wrote past the end.
-     // presumably srcBLEU expects the plain BLEU layout (2 * maxGramLength + 2)
-     final int ngramSlots = 2 * maxGramLength;
-     int[] srcStats = new int[ngramSlots + 2];
-     System.arraycopy(stats, ngramSlots, srcStats, 0, ngramSlots);
-     srcStats[ngramSlots] = stats[tokenLength(CANDIDATE)];
-     srcStats[ngramSlots + 1] = stats[tokenLength(SOURCE)];
-     double srcBLEUscore = srcBLEU.score(srcStats);
-     BLEUscore = BLEU_plus(BLEUscore, srcBLEUscore);
-   }
-   return readabilityPenalty * BLEUscore;
- }
-
- /**
-  * Flesch-Kincaid Grade Level
-  * (http://en.wikipedia.org/wiki/Flesch-Kincaid_readability_test),
-  * clamped so that it is never negative.
-  *
-  * @param numWords number of words
-  * @param numSyllables number of syllables
-  * @param numSentences number of sentences
-  * @return 0.39 * words/sentences + 11.8 * syllables/words - 15.19, or 0 if
-  *         that value is negative
-  */
- public double gradeLevel(int numWords, int numSyllables, int numSentences) {
-   final double sentenceLengthTerm = 0.39 * numWords / numSentences;
-   final double wordLengthTerm = 11.8 * numSyllables / numWords;
-   final double level = sentenceLengthTerm + wordLengthTerm - 15.19;
-   return (level < 0) ? 0 : level;
- }
-
- // calculate BLEU+ (per submitted paper CCB reviewed)
- private double BLEU_plus(double bleu_ref, double bleu_src) {
- return alpha * bleu_ref - (1 - alpha) * bleu_src;
- }
-
- // All-or-nothing penalty: 1.0 when this_gl is strictly below target_gl,
- // 0.0 in every other case.
- private double getReadabilityPenalty(double this_gl, double target_gl) {
-   return (this_gl < target_gl) ? 1.0 : 0.0;
- }
-
- /**
-  * Prints the combined score together with its components: reference BLEU,
-  * source BLEU, their BLEU_plus combination, the grade levels of candidate,
-  * source and reference, and the readability penalty.
-  *
-  * @param stats array produced by suffStats, of length suffStatsCount
-  * @param oneLiner true to print everything on one line, false to print
-  *        one component per line
-  */
- public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
-   DecimalFormat df = new DecimalFormat("#.###");
-   double source_gl =
-       gradeLevel(stats[tokenLength(SOURCE)], stats[syllableLength(SOURCE)], stats[sentCountIndex]);
-   double cand_gl =
-       gradeLevel(stats[tokenLength(CANDIDATE)], stats[syllableLength(CANDIDATE)],
-           stats[sentCountIndex]);
-   double ref_gl =
-       gradeLevel(stats[tokenLength(REFERENCE)], stats[syllableLength(REFERENCE)],
-           stats[sentCountIndex]);
-   double penalty = 1;
-   double bleu_ref = super.score(stats);
-
-   // The source BLEU must be scored on its own BLEU-shaped array (n-gram
-   // counts, candidate length, source length), not on the full GL_BLEU array.
-   // presumably srcBLEU expects the plain BLEU layout (2 * maxGramLength + 2)
-   final int ngramSlots = 2 * maxGramLength;
-   int[] srcStats = new int[ngramSlots + 2];
-   System.arraycopy(stats, ngramSlots, srcStats, 0, ngramSlots);
-   srcStats[ngramSlots] = stats[tokenLength(CANDIDATE)];
-   srcStats[ngramSlots + 1] = stats[tokenLength(SOURCE)];
-   double bleu_src = srcBLEU.score(srcStats);
-   double bleu_plus = BLEU_plus(bleu_ref, bleu_src);
-
-   if (useTarget)
-     penalty = getReadabilityPenalty(cand_gl, targetGL);
-   else
-     penalty = getReadabilityPenalty(cand_gl, source_gl);
-
-   if (oneLiner) {
-     System.out.print("GL_BLEU=" + df.format(score(stats)));
-     System.out.print(" BLEU=" + df.format(bleu_ref));
-     System.out.print(" BLEU_src=" + df.format(bleu_src));
-     System.out.print(" iBLEU=" + df.format(bleu_plus));
-     System.out.print(" GL_cand=" + df.format(cand_gl));
-     System.out.print(" GL_src=" + df.format(source_gl));
-     System.out.print(" GL_ref=" + df.format(ref_gl));
-     System.out.print(" Read_penalty=" + df.format(penalty));
-     System.out.println();
-   } else {
-     System.out.println("GL_BLEU = " + df.format(score(stats)));
-     System.out.println("BLEU = " + df.format(bleu_ref));
-     System.out.println("BLEU_src = " + df.format(bleu_src));
-     System.out.println("iBLEU = " + df.format(bleu_plus));
-     System.out.println("GL_cand = " + df.format(cand_gl));
-     System.out.println("GL_src = " + df.format(source_gl));
-     System.out.println("GL_ref = " + df.format(ref_gl));
-     System.out.println("Read penalty = " + df.format(penalty));
-   }
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/metrics/METEOR.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/METEOR.java b/src/joshua/metrics/METEOR.java
deleted file mode 100644
index d94599b..0000000
--- a/src/joshua/metrics/METEOR.java
+++ /dev/null
@@ -1,243 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.metrics;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.OutputStreamWriter;
-
-import joshua.util.StreamGobbler;
-
-
-public class METEOR extends EvaluationMetric {
- protected String targetLanguage;
- protected boolean normalize;
- protected boolean keepPunctuation;
- private int maxComputations;
-
- /**
-  * Configures the metric from four options:
-  *   Metric_options[0]: -l language, one of {en,cz,fr,de,es}
-  *   Metric_options[1]: -normalize, one of {norm_yes,norm_no}
-  *   Metric_options[2]: -keepPunctuation, one of {keepPunc,removePunc}
-  *                      (the historical spelling "removePunk" is accepted too)
-  *   Metric_options[3]: maxComputations, positive integer
-  *
-  * Default in meteor v0.8: en, norm_no, removePunc.
-  *
-  * Prints a message and exits (status 1 for an unknown option string,
-  * status 2 for a non-positive maxComputations) when an option is invalid.
-  */
- public METEOR(String[] Metric_options) {
-   String language = Metric_options[0];
-   if (language.equals("en") || language.equals("cz") || language.equals("fr")
-       || language.equals("de") || language.equals("es")) {
-     targetLanguage = language;
-   } else {
-     System.out.println("Unknown language string " + language + ".");
-     System.out.println("Should be one of {en,cz,fr,de,es}.");
-     System.exit(1);
-   }
-
-   if (Metric_options[1].equals("norm_yes")) {
-     normalize = true;
-   } else if (Metric_options[1].equals("norm_no")) {
-     normalize = false;
-   } else {
-     System.out.println("Unknown normalize string " + Metric_options[1] + ".");
-     System.out.println("Should be one of norm_yes or norm_no.");
-     System.exit(1);
-   }
-
-   // The punctuation flag lives in option 2; the previous code tested
-   // option 1 here, so "remove" could never be selected.
-   String punctuation = Metric_options[2];
-   if (punctuation.equals("keepPunc")) {
-     keepPunctuation = true;
-   } else if (punctuation.equals("removePunc") || punctuation.equals("removePunk")) {
-     keepPunctuation = false;
-   } else {
-     System.out.println("Unknown keepPunctuation string " + punctuation + ".");
-     System.out.println("Should be one of keepPunc or removePunc.");
-     System.exit(1);
-   }
-
-   maxComputations = Integer.parseInt(Metric_options[3]);
-   if (maxComputations < 1) {
-     System.out.println("Maximum computations must be positive");
-     System.exit(2);
-   }
-
-   initialize(); // set the data members of the metric
- }
-
- // METEOR is maximized and is described by five sufficient statistics.
- protected void initialize() {
-   suffStatsCount = 5;
-   toBeMinimized = false;
-   metricName = "METEOR";
- }
-
- // Best score the metric can report; the metric is maximized
- // (toBeMinimized is false), so this is the upper bound.
- public double bestPossibleScore() {
- return 1.0;
- }
-
- // Worst score the metric can report, i.e. the lower bound.
- public double worstPossibleScore() {
- return 0.0;
- }
-
- // Per-candidate statistics are not supported for METEOR; always returns null.
- public int[] suffStats(String cand_str, int i) {
- // this method should never be used when the metric is METEOR,
- // because METEOR.java overrides suffStats(String[],int[]) below,
- // which is the only method that calls suffStats(String,int).
- return null;
- }
-
- public int[][] suffStats(String[] cand_strings, int[] cand_indices) {
- // calculate sufficient statistics for each sentence in an arbitrary set of candidates
-
- int candCount = cand_strings.length;
- if (cand_indices.length != candCount) {
- System.out.println("Array lengths mismatch in suffStats(String[],int[]); returning null.");
- return null;
- }
-
- int[][] stats = new int[candCount][suffStatsCount];
-
- try {
-
- // 1) Create input files for meteor
-
- // 1a) Create hypothesis file
- FileOutputStream outStream = new FileOutputStream("hyp.txt.METEOR", false); // false: don't
- // append
- OutputStreamWriter outStreamWriter = new OutputStreamWriter(outStream, "utf8");
- BufferedWriter outFile = new BufferedWriter(outStreamWriter);
-
- for (int d = 0; d < candCount; ++d) {
- writeLine(cand_strings[d], outFile);
- }
-
- outFile.close();
-
- // 1b) Create reference file
- outStream = new FileOutputStream("ref.txt.METEOR", false); // false: don't append
- outStreamWriter = new OutputStreamWriter(outStream, "utf8");
- outFile = new BufferedWriter(outStreamWriter);
-
- for (int d = 0; d < candCount; ++d) {
- for (int r = 0; r < refsPerSen; ++r) {
- writeLine(refSentences[cand_indices[d]][r], outFile);
- }
- }
-
- outFile.close();
-
- // 2) Launch meteor as an external process
-
- String cmd_str = "./meteor hyp.txt.METEOR ref.txt.METEOR";
- cmd_str += " -l " + targetLanguage;
- cmd_str += " -r " + refsPerSen;
- if (normalize) {
- cmd_str += " -normalize";
- }
- if (keepPunctuation) {
- cmd_str += " -keepPunctuation";
- }
- cmd_str += " -ssOut";
-
- Runtime rt = Runtime.getRuntime();
- Process p = rt.exec(cmd_str);
-
- StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream(), 0);
- StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), 0);
-
- errorGobbler.start();
- outputGobbler.start();
-
- @SuppressWarnings("unused")
- int exitValue = p.waitFor();
-
-
- // 3) Read SS from output file produced by meteor
-
- BufferedReader inFile = new BufferedReader(new FileReader("TER_out.ter"));
- String line = "";
-
- line = inFile.readLine(); // skip hyp line
- line = inFile.readLine(); // skip ref line
-
- for (int d = 0; d < candCount; ++d) {
- line = inFile.readLine(); // read info
- String[] strA = line.split("\\s+");
-
- stats[d][0] = (int) Double.parseDouble(strA[0]);
- stats[d][1] = (int) Double.parseDouble(strA[1]);
- stats[d][2] = (int) Double.parseDouble(strA[2]);
- stats[d][3] = (int) Double.parseDouble(strA[3]);
- stats[d][4] = (int) Double.parseDouble(strA[4]);
- }
-
- inFile.close();
- } catch (IOException e) {
- System.err.println("IOException in METEOR.suffStats(String[],int[]): " + e.getMessage());
- System.exit(99902);
- } catch (InterruptedException e) {
- System.err.println("InterruptedException in METEOR.suffStats(String[],int[]): "
- + e.getMessage());
- System.exit(99903);
- }
-
- return stats;
- }
-
- // Meant to turn the five sufficient statistics into a METEOR score.
- // NOTE(review): the computation was never filled in (see "sc = ???"), so
- // this currently returns 0.0 for every valid input.
- // Prints a message and exits with status 1 when stats has the wrong length.
- public double score(int[] stats) {
- if (stats.length != suffStatsCount) {
- System.out.println("Mismatch between stats.length and suffStatsCount (" + stats.length
- + " vs. " + suffStatsCount + ") in METEOR.score(int[])");
- System.exit(1);
- }
-
- double sc = 0.0;
-
- // sc = ???
-
- return sc;
- }
-
- /**
-  * Prints the five sufficient statistics and the resulting score.
-  *
-  * @param stats the five METEOR statistics: matches, test length, reference
-  *        length, chunks, length cost
-  * @param oneLiner true for a single "METEOR = METEOR(...) = score" line,
-  *        false for one labelled value per line
-  */
- public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
-   if (oneLiner) {
-     // the closing parenthesis after the last statistic was missing before
-     System.out.println("METEOR = METEOR(" + stats[0] + "," + stats[1] + "," + stats[2] + ","
-         + stats[3] + "," + stats[4] + ") = " + score(stats));
-   } else {
-     System.out.println("# matches = " + stats[0]);
-     System.out.println("test length = " + stats[1]);
-     System.out.println("ref length = " + stats[2]);
-     System.out.println("# chunks = " + stats[3]);
-     System.out.println("length cost = " + stats[4]);
-     System.out.println("METEOR = " + score(stats));
-   }
- }
-
- private void writeLine(String line, BufferedWriter writer) throws IOException {
- writer.write(line, 0, line.length());
- writer.newLine();
- writer.flush();
- }
-
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/metrics/MinimumChangeBLEU.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/MinimumChangeBLEU.java b/src/joshua/metrics/MinimumChangeBLEU.java
deleted file mode 100644
index fa764c3..0000000
--- a/src/joshua/metrics/MinimumChangeBLEU.java
+++ /dev/null
@@ -1,221 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.metrics;
-
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.logging.Logger;
-
-import joshua.util.Algorithms;
-
-public class MinimumChangeBLEU extends BLEU {
- private static final Logger logger = Logger.getLogger(MinimumChangeBLEU.class.getName());
-
- // we assume that the source for the paraphrasing run is
- // part of the set of references
- private int sourceReferenceIndex;
- private double thresholdWER;
-
-
- // Default configuration: reference 0 is the source sentence and the WER
- // threshold is 0.3.
- public MinimumChangeBLEU() {
-   super();
-   sourceReferenceIndex = 0;
-   thresholdWER = 0.3;
-   initialize();
- }
-
-
- public MinimumChangeBLEU(String[] options) {
- super(options);
- this.sourceReferenceIndex = Integer.parseInt(options[2]);
- this.thresholdWER = Double.parseDouble(options[3]);
- initialize();
- }
-
-
- protected void initialize() {
- metricName = "MC_BLEU";
- toBeMinimized = false;
- // adding 1 to the sufficient stats for regular BLEU
- suffStatsCount = 2 * maxGramLength + 3;
-
- set_weightsArray();
- set_maxNgramCounts();
- }
-
-
- // Builds, for every sentence, the map from n-gram to the largest count it
- // has in any reference other than the source reference, and caches the word
- // count of each of those references in refWordCount.
- protected void set_maxNgramCounts() {
- @SuppressWarnings("unchecked")
- HashMap<String, Integer>[] temp_HMA = new HashMap[numSentences];
- maxNgramCounts = temp_HMA;
-
- String gram = "";
- int oldCount = 0, nextCount = 0;
-
- for (int i = 0; i < numSentences; ++i) {
- // update counts as necessary from the reference translations
- for (int r = 0; r < refsPerSen; ++r) {
- // skip source reference
- if (r == this.sourceReferenceIndex) continue;
- // the first usable reference seeds the map; later ones are merged in,
- // keeping the larger count for each n-gram
- if (maxNgramCounts[i] == null) {
- maxNgramCounts[i] = getNgramCountsAll(refSentences[i][r]);
- } else {
- HashMap<String, Integer> nextNgramCounts = getNgramCountsAll(refSentences[i][r]);
- for (Map.Entry<String, Integer> entry : nextNgramCounts.entrySet()) {
- gram = entry.getKey();
- nextCount = entry.getValue();
-
- if (maxNgramCounts[i].containsKey(gram)) {
- oldCount = maxNgramCounts[i].get(gram);
- if (nextCount > oldCount) {
- maxNgramCounts[i].put(gram, nextCount);
- }
- } else { // add it
- maxNgramCounts[i].put(gram, nextCount);
- }
- }
- }
- } // for (r)
- } // for (i)
-
- // for efficiency, calculate the reference lengths, which will be used
- // in effLength...
- // (the entry of the source reference is skipped and so stays 0)
- refWordCount = new int[numSentences][refsPerSen];
- for (int i = 0; i < numSentences; ++i) {
- for (int r = 0; r < refsPerSen; ++r) {
- if (r == this.sourceReferenceIndex) continue;
- refWordCount[i][r] = wordCount(refSentences[i][r]);
- }
- }
- }
-
-
- public int[] suffStats(String cand_str, int i) {
- int[] stats = new int[suffStatsCount];
-
- String[] candidate_words;
- if (!cand_str.equals(""))
- candidate_words = cand_str.split("\\s+");
- else
- candidate_words = new String[0];
-
- // dropping "_OOV" marker
- for (int j = 0; j < candidate_words.length; j++) {
- if (candidate_words[j].endsWith("_OOV"))
- candidate_words[j] = candidate_words[j].substring(0, candidate_words[j].length() - 4);
- }
-
- set_prec_suffStats(stats, candidate_words, i);
- String[] source_words = refSentences[i][sourceReferenceIndex].split("\\s+");
- stats[suffStatsCount - 1] = Algorithms.levenshtein(candidate_words, source_words);
- stats[suffStatsCount - 2] = effLength(candidate_words.length, i);
- stats[suffStatsCount - 3] = candidate_words.length;
-
- return stats;
- }
-
-
- /**
-  * Returns the effective reference length for sentence i, ignoring the
-  * source reference.
-  *
-  * CLOSEST picks the reference length nearest to candLength, preferring the
-  * shorter one on a tie. SHORTEST picks the smallest reference length.
-  * When every reference is the source reference the sentinel values
-  * (Integer.MIN_VALUE for CLOSEST, Integer.MAX_VALUE for SHORTEST) are
-  * returned unchanged.
-  *
-  * @param candLength number of words in the candidate
-  * @param i index of the sentence
-  * @return the effective reference length, or candLength for any other
-  *         effective-length method
-  */
- public int effLength(int candLength, int i) {
-   if (effLengthMethod == EffectiveLengthMethod.CLOSEST) {
-     int closestRefLength = Integer.MIN_VALUE;
-     // Start from the largest possible difference. Deriving it from the
-     // sentinel with Math.abs(candLength - Integer.MIN_VALUE) overflows and,
-     // for candLength == 0, yields a negative "difference" that no reference
-     // could ever beat.
-     int minDiff = Integer.MAX_VALUE;
-
-     for (int r = 0; r < refsPerSen; ++r) {
-       if (r == this.sourceReferenceIndex) continue;
-       int nextRefLength = refWordCount[i][r];
-       int nextDiff = Math.abs(candLength - nextRefLength);
-
-       if (nextDiff < minDiff) {
-         closestRefLength = nextRefLength;
-         minDiff = nextDiff;
-       } else if (nextDiff == minDiff && nextRefLength < closestRefLength) {
-         // equally close: prefer the shorter reference
-         closestRefLength = nextRefLength;
-         minDiff = nextDiff;
-       }
-     }
-     return closestRefLength;
-   } else if (effLengthMethod == EffectiveLengthMethod.SHORTEST) {
-     int shortestRefLength = Integer.MAX_VALUE;
-
-     for (int r = 0; r < refsPerSen; ++r) {
-       if (r == this.sourceReferenceIndex) continue;
-
-       int nextRefLength = refWordCount[i][r];
-       if (nextRefLength < shortestRefLength) {
-         shortestRefLength = nextRefLength;
-       }
-     }
-     return shortestRefLength;
-   }
-
-   return candLength; // should never get here anyway
- }
-
-
- // Scores one statistics array: smoothed BLEU (geometric mean of the n-gram
- // precisions times the brevity penalty) multiplied by a WER-based penalty.
- // The penalty is 1.0 once the word error rate against the source reaches
- // thresholdWER and shrinks linearly towards 0 below it.
- // Logs an error and exits with status 2 when stats has the wrong length.
- public double score(int[] stats) {
- if (stats.length != suffStatsCount) {
- logger.severe("Mismatch between stats.length and " + "suffStatsCount (" + stats.length
- + " vs. " + suffStatsCount + ") in BLEU.score(int[])");
- System.exit(2);
- }
-
- double accuracy = 0.0;
- double smooth_addition = 1.0; // following bleu-1.04.pl
- // the last three stats are candidate length, effective reference length
- // and the Levenshtein distance to the source (see suffStats)
- double c_len = stats[suffStatsCount - 3];
- double r_len = stats[suffStatsCount - 2];
-
- // c_len is a double, so this is a floating-point division
- // NOTE(review): an empty candidate (c_len == 0) gives Infinity or NaN here
- double wer = stats[suffStatsCount - 1] / c_len;
- double wer_penalty = (wer >= thresholdWER) ? 1.0 : (wer / thresholdWER);
-
- double correctGramCount, totalGramCount;
-
- for (int n = 1; n <= maxGramLength; ++n) {
- // stats holds (correct, total) pairs, one pair per n-gram order
- correctGramCount = stats[2 * (n - 1)];
- totalGramCount = stats[2 * (n - 1) + 1];
-
- double prec_n;
- if (totalGramCount > 0) {
- prec_n = correctGramCount / totalGramCount;
- } else {
- prec_n = 1; // following bleu-1.04.pl ???????
- }
-
- // smoothing: a zero precision is replaced by a value that halves with
- // every further zero, so the logarithm below stays finite
- if (prec_n == 0) {
- smooth_addition *= 0.5;
- prec_n = smooth_addition / (c_len - n + 1);
- // isn't c_len-n+1 just totalGramCount ???????
- }
- // weights is indexed by n-gram order, i.e. from 1
- accuracy += weights[n] * Math.log(prec_n);
- }
- double brevity_penalty = 1.0;
- if (c_len < r_len) brevity_penalty = Math.exp(1 - (r_len / c_len));
-
- return wer_penalty * brevity_penalty * Math.exp(accuracy);
- }
-
-
- /**
-  * Prints the WER penalty and the final MC_BLEU score for one stats array.
-  *
-  * @param stats array produced by suffStats, of length suffStatsCount
-  * @param oneLiner ignored; the output always takes two lines
-  */
- public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
-   // Divide as doubles, exactly as score() does. Dividing the two int stats
-   // directly truncates the rate to a whole number and throws an
-   // ArithmeticException for an empty candidate.
-   double c_len = stats[suffStatsCount - 3];
-   double wer = stats[suffStatsCount - 1] / c_len;
-   double wer_penalty = (wer >= thresholdWER) ? 1.0d : (wer / thresholdWER);
-
-   System.out.println("WER_penalty = " + wer_penalty);
-   System.out.println("MC_BLEU= " + score(stats));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/metrics/NewMetric.java.template
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/NewMetric.java.template b/src/joshua/metrics/NewMetric.java.template
deleted file mode 100644
index 3b8ed83..0000000
--- a/src/joshua/metrics/NewMetric.java.template
+++ /dev/null
@@ -1,134 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.metrics;
-import java.math.*;
-import java.util.*;
-import java.io.*;
-
-***(1)***
-public class __new_metric_CLASS_name__ extends EvaluationMetric
-{
- /********************************************
- private data members for this error metric
- ********************************************/
-
- ***(2)***
- private ;
- private ;
- private ;
-
- /*
- You already have access to these data members of the parent
- class (EvaluationMetric):
- int numSentences;
- number of sentences in the MERT set
- int refsPerSen;
- number of references per sentence
- String[][] refSentences;
- refSentences[i][r] stores the r'th reference of the i'th
- source sentence (both indices are 0-based)
- */
- /********************************************
- ********************************************/
-
- public constructorNameMustMatchClassName(String[] Metric_options)
- {
-
- ***(3)***
-
- //
- //
- // process the Metric_options array
- //
- //
-
- initialize(); // set the data members of the metric
- }
-
- protected void initialize()
- {
- ***(4)***
- metricName = "XXXXXXXX"; <- pick a metric name
- toBeMinimized = true/false; <- should it be minimized?
- suffStatsCount = ???; <- how many SS does the metric need?
-
- ***(5)***
- /* here you make calls to any methods that set the data members */
- /* here you make calls to any methods that set the data members */
- /* here you make calls to any methods that set the data members */
- }
-
- ***(6)***
- public double bestPossibleScore() { return ???; }
- --> what's the best score of the metric? <--
- public double worstPossibleScore() { return ???; }
- --> what's the worst score of the metric? <--
-
- ***(7)***
- /* here you define any methods that set the data members */
- /* here you define any methods that set the data members */
- /* here you define any methods that set the data members */
-
- ***(8)***
- public int[] suffStats(String cand_str, int i) throws Exception
- {
- int[] stats = new int[suffStatsCount];
-
- //
- //
- // set contents of stats[] here!
- //
- //
-
- return stats;
- }
-
- ***(9a)***
- public double score(int[] stats)
- {
- if (stats.length != suffStatsCount) {
- System.out.println("Mismatch between stats.length and suffStatsCount (" + stats.length + " vs. " + suffStatsCount + ") in NewMetric.score(int[])");
- System.exit(1);
- }
-
- double sc = 0.0;
-
- //
- //
- // set sc here!
- //
- //
-
- return sc;
- }
-
- ***(9b)***
- public void printDetailedScore_fromStats(int[] stats, boolean oneLiner)
- {
- System.out.println(metricName + " = " + score(stats));
-
- //
- //
- // optional (for debugging purposes)
- //
- //
- }
-
-}
-
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/metrics/Precis.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/Precis.java b/src/joshua/metrics/Precis.java
deleted file mode 100644
index 82f4106..0000000
--- a/src/joshua/metrics/Precis.java
+++ /dev/null
@@ -1,332 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.metrics;
-
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.Map;
-import java.util.logging.Logger;
-
-import joshua.util.Algorithms;
-
-// The metric re-uses most of the BLEU code
-public class Precis extends BLEU {
- private static final Logger logger = Logger.getLogger(Precis.class.getName());
-
- private static final double REF_CR = -1.0;
-
- // We assume that the source for the paraphrasing run is
- // part of the set of references, this is its index.
- private int sourceReferenceIndex;
-
- // A global target compression rate to achieve. When it is set to the
- // REF_CR sentinel (-1.0), scoring instead aims for the compression rate of
- // the reference relative to the source, as found in the statistics.
- private double targetCompressionRate;
-
- // Are we optimizing for character-based compression (as opposed
- // to token-based)?
- private boolean characterBased;
-
- // Weight for factoring in Levenshtein distance to source as a penalty for
- // insufficient change.
- private double similarityWeight;
-
- // Default configuration: source is reference 0, target compression rate 0,
- // token-based compression and no similarity penalty.
- public Precis() {
-   super();
-   similarityWeight = 0;
-   characterBased = false;
-   targetCompressionRate = 0;
-   sourceReferenceIndex = 0;
-   initialize();
- }
-
- // Builds the metric from a ZMERT-style option array. The first two entries
- // are BLEU's (the whole array is handed to the BLEU constructor); Precis
- // reads four of its own:
- //   options[2]  index of the source sentence within the reference set
- //   options[3]  target compression rate in [0, 1], or "ref" to aim for the
- //               reference's compression rate instead
- //   options[4]  "chars" or "words" (character- vs. token-based compression)
- //   options[5]  weight of the similarity (Levenshtein) penalty, in [0, 1]
- // The total option count is registered with ZMERT in EvaluationMetric.
- //
- // Throws IllegalArgumentException when fewer than six options are supplied,
- // RuntimeException when an option is out of range or unknown, and
- // NumberFormatException when a numeric option cannot be parsed.
- public Precis(String[] options) {
-   super(options);
-
-   // Fail with a readable message instead of a bare ArrayIndexOutOfBoundsException.
-   if (options.length < 6) {
-     throw new IllegalArgumentException("Precis expects 6 options but got " + options.length);
-   }
-
-   sourceReferenceIndex = Integer.parseInt(options[2]);
-
-   if ("ref".equals(options[3])) {
-     targetCompressionRate = REF_CR;
-   } else {
-     targetCompressionRate = Double.parseDouble(options[3]);
-     if (targetCompressionRate > 1 || targetCompressionRate < 0) {
-       throw new RuntimeException("Invalid compression ratio requested: " + options[3]);
-     }
-   }
-
-   if ("chars".equals(options[4])) {
-     characterBased = true;
-   } else if ("words".equals(options[4])) {
-     characterBased = false;
-   } else {
-     throw new RuntimeException("Unknown compression style: " + options[4]);
-   }
-
-   similarityWeight = Double.parseDouble(options[5]);
-   if (similarityWeight < 0 || similarityWeight > 1) {
-     throw new RuntimeException("Source penalty out of bounds: " + options[5]);
-   }
-
-   // Re-run initialization now that characterBased is known, since it
-   // determines the size of the statistics array.
-   initialize();
- }
-
- // Sets the metric's name, its optimization direction and the size of the
- // statistics array, then prepares the n-gram weights and the reference
- // n-gram counts.
- //
- // Layout of the statistics array, as filled in by suffStats:
- //   2 * maxGramLength   n-gram precision counts
- //   + 3                 candidate, effective reference and source length in tokens
- //   + 3                 the same three lengths in characters (character-based mode only)
- //   + 1                 Levenshtein distance to the source (always the last slot)
- //
- // This is also the place to make room for additional statistics of your own.
- protected void initialize() {
-   toBeMinimized = false;
-   metricName = "PRECIS";
-
-   int statsCount = 2 * maxGramLength + 4;
-   if (characterBased) {
-     statsCount += 3;
-   }
-   suffStatsCount = statsCount;
-
-   set_weightsArray();
-   set_maxNgramCounts();
- }
-
- // Builds, per sentence, the maximum count of every n-gram over the references
- // and caches the word count of every reference. Unlike BLEU, the source
- // sentence (which is stored among the references) is left out of the n-gram
- // counts -- whether it should be is up for debate.
- protected void set_maxNgramCounts() {
-   @SuppressWarnings("unchecked")
-   HashMap<String, Integer>[] countsPerSentence = new HashMap[numSentences];
-   maxNgramCounts = countsPerSentence;
-
-   for (int i = 0; i < numSentences; ++i) {
-     for (int r = 0; r < refsPerSen; ++r) {
-       if (r == sourceReferenceIndex) {
-         continue; // the source does not contribute n-grams
-       }
-
-       HashMap<String, Integer> refCounts = getNgramCountsAll(refSentences[i][r]);
-       if (maxNgramCounts[i] == null) {
-         // first real reference for this sentence: adopt its counts as-is
-         maxNgramCounts[i] = refCounts;
-         continue;
-       }
-
-       // later references can only raise the stored maximum
-       for (Map.Entry<String, Integer> entry : refCounts.entrySet()) {
-         Integer known = maxNgramCounts[i].get(entry.getKey());
-         if (known == null || entry.getValue() > known) {
-           maxNgramCounts[i].put(entry.getKey(), entry.getValue());
-         }
-       }
-     }
-   }
-
-   // Reference lengths are precomputed for efficiency; effLength uses them.
-   // The source reference is included here.
-   refWordCount = new int[numSentences][refsPerSen];
-   for (int i = 0; i < numSentences; ++i) {
-     for (int r = 0; r < refsPerSen; ++r) {
-       refWordCount[i][r] = wordCount(refSentences[i][r]);
-     }
-   }
- }
-
- // Computes the sufficient statistics of candidate cand_str for sentence i:
- // BLEU's n-gram and length statistics, the source length in tokens, the
- // character-level lengths (character-based mode only) and, when the
- // similarity penalty is active, the Levenshtein distance to the source.
- public int[] suffStats(String cand_str, int i) {
-   int[] stats = new int[suffStatsCount];
-
-   // An empty candidate has no tokens; splitting "" would yield one empty token.
-   String[] tokens = cand_str.equals("") ? new String[0] : cand_str.split("\\s+");
-
-   // n-gram precision statistics
-   set_prec_suffStats(stats, tokens, i);
-
-   // candidate and effective reference length, as in BLEU
-   int lengthBase = 2 * maxGramLength;
-   stats[lengthBase] = tokens.length;
-   stats[lengthBase + 1] = effLength(tokens.length, i);
-
-   // source length in tokens
-   int sourceTokens = refWordCount[i][sourceReferenceIndex];
-   stats[lengthBase + 2] = sourceTokens;
-
-   if (characterBased) {
-     // Character lengths are taken as string length - token count + 1.
-     int candidateChars = cand_str.length() - tokens.length + 1;
-     stats[suffStatsCount - 4] = candidateChars;
-     stats[suffStatsCount - 3] = effLength(candidateChars, i, true);
-     stats[suffStatsCount - 2] =
-         refSentences[i][sourceReferenceIndex].length() - sourceTokens + 1;
-   }
-
-   // The edit distance is only computed when it will actually be used.
-   if (similarityWeight > 0) {
-     String[] sourceWords = refSentences[i][sourceReferenceIndex].split("\\s+");
-     stats[suffStatsCount - 1] = Algorithms.levenshtein(tokens, sourceWords);
-   }
-
-   return stats;
- }
-
- // Effective reference length, in tokens, for a candidate of candLength tokens.
- public int effLength(int candLength, int i) {
-   return effLength(candLength, i, false);
- }
-
- // Effective reference length for sentence i, ignoring the source reference.
- //
- //   candLength        length of the candidate, in the unit selected below
- //   i                 sentence index
- //   character_length  true to measure references in characters (string
- //                     length - word count + 1), false to measure in tokens
- //
- // With the CLOSEST method this is the reference length nearest to candLength
- // (the shorter one on ties); with SHORTEST it is the smallest reference
- // length. If the source is the only reference, CLOSEST yields
- // Integer.MIN_VALUE and SHORTEST yields Integer.MAX_VALUE. For any other
- // method candLength itself is returned.
- public int effLength(int candLength, int i, boolean character_length) {
-   if (effLengthMethod == EffectiveLengthMethod.CLOSEST) {
-     int closestRefLength = Integer.MIN_VALUE;
-     // Start from the largest possible difference. Deriving the start value from
-     // Math.abs(candLength - Integer.MIN_VALUE) overflows; for candLength == 0 it
-     // produced a negative "difference" that no reference could ever beat.
-     int minDiff = Integer.MAX_VALUE;
-
-     for (int r = 0; r < refsPerSen; ++r) {
-       if (r == sourceReferenceIndex) continue;
-
-       int nextRefLength = precisRefLength(i, r, character_length);
-       int nextDiff = Math.abs(candLength - nextRefLength);
-
-       // closer wins; on equal distance the shorter reference wins
-       if (nextDiff < minDiff || (nextDiff == minDiff && nextRefLength < closestRefLength)) {
-         closestRefLength = nextRefLength;
-         minDiff = nextDiff;
-       }
-     }
-     return closestRefLength;
-   } else if (effLengthMethod == EffectiveLengthMethod.SHORTEST) {
-     int shortestRefLength = Integer.MAX_VALUE;
-
-     for (int r = 0; r < refsPerSen; ++r) {
-       if (r == sourceReferenceIndex) continue;
-
-       int nextRefLength = precisRefLength(i, r, character_length);
-       if (nextRefLength < shortestRefLength) {
-         shortestRefLength = nextRefLength;
-       }
-     }
-     return shortestRefLength;
-   }
-
-   return candLength; // should never get here anyway
- }
-
- // Length of reference r of sentence i, in characters or in tokens.
- private int precisRefLength(int i, int r, boolean character_length) {
-   return character_length
-       ? refSentences[i][r].length() - refWordCount[i][r] + 1
-       : refWordCount[i][r];
- }
-
- // Calculates the PRECIS score from stats: verbosity penalty * brevity penalty * BLEU-style n-gram precision, minus the similarity penalty.
- public double score(int[] stats) {
- if (stats.length != suffStatsCount) {
- logger.severe("Mismatch between stats.length and suffStatsCount (" + stats.length + " vs. "
- + suffStatsCount + ") in Precis.score(int[])");
- System.exit(2); // a wrongly sized stats array terminates the JVM with status 2
- }
-
- double accuracy = 0.0;
- double smooth_addition = 1.0; // following bleu-1.04.pl
- // Lengths in tokens; these slots directly follow the 2 * maxGramLength n-gram counts.
- double cnd_len = stats[2 * maxGramLength];
- double ref_len = stats[2 * maxGramLength + 1];
- double src_len = stats[2 * maxGramLength + 2];
- double compression_cnd_len = stats[suffStatsCount - 4]; // same slot as cnd_len when not character-based
- double compression_ref_len = stats[suffStatsCount - 3]; // same slot as ref_len when not character-based
- double compression_src_len = stats[suffStatsCount - 2]; // same slot as src_len when not character-based
- double src_lev = stats[suffStatsCount - 1]; // Levenshtein distance to the source (last slot)
- // NOTE(review): a source length of 0 makes the ratios below NaN or infinite -- confirm empty sources cannot occur.
- double compression_ratio = compression_cnd_len / compression_src_len;
-
- double verbosity_penalty =
- getVerbosityPenalty(compression_ratio, (targetCompressionRate == REF_CR
- ? compression_ref_len / compression_src_len
- : targetCompressionRate));
-
- // this part matches BLEU: a weighted sum of log n-gram precisions
- double correctGramCount, totalGramCount;
- for (int n = 1; n <= maxGramLength; ++n) {
- correctGramCount = stats[2 * (n - 1)];
- totalGramCount = stats[2 * (n - 1) + 1];
- double prec_n;
- if (totalGramCount > 0) {
- prec_n = correctGramCount / totalGramCount;
- } else {
- prec_n = 1; // no n-grams of this order: counted as perfect precision
- }
- if (prec_n == 0) { // smooth a zero precision; the pseudo-count halves each time it is needed
- smooth_addition *= 0.5;
- prec_n = smooth_addition / (cnd_len - n + 1);
- }
- accuracy += weights[n] * Math.log(prec_n);
- }
- double brevity_penalty = 1.0;
- double similarity_penalty = similarityWeight * Math.max(0, 1 - src_lev / src_len);
- // Only candidates shorter than the effective reference length are penalized for brevity.
- if (cnd_len < ref_len) brevity_penalty = Math.exp(1 - (ref_len / cnd_len));
-
- // We add on our penalties on top of BLEU: two multiplicative factors, then the similarity penalty is subtracted.
- return verbosity_penalty * brevity_penalty * Math.exp(accuracy) - similarity_penalty;
- }
-
- // Prints a short breakdown of the score to standard output: the similarity,
- // verbosity and brevity penalties followed by the final PRECIS score. Used by
- // the JoshuaEval tool. The oneLiner flag is ignored.
- public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
-   int lengthBase = 2 * maxGramLength;
-   double candTokens = stats[lengthBase];
-   double refTokens = stats[lengthBase + 1];
-   double srcTokens = stats[lengthBase + 2];
-   double candCompressed = stats[suffStatsCount - 4];
-   double refCompressed = stats[suffStatsCount - 3];
-   double srcCompressed = stats[suffStatsCount - 2];
-   double levenshtein = stats[suffStatsCount - 1];
-
-   double brevity = (candTokens < refTokens) ? Math.exp(1 - (refTokens / candTokens)) : 1;
-   double achievedRate = candCompressed / srcCompressed;
-   double similarity = Math.max(0, 1 - levenshtein / srcTokens);
-
-   // aim for the reference's compression rate unless a fixed target was configured
-   double targetRate =
-       (targetCompressionRate == REF_CR) ? refCompressed / srcCompressed : targetCompressionRate;
-   double verbosity = getVerbosityPenalty(achievedRate, targetRate);
-
-   System.out.println(
-       String.format("Similarity Penalty = %.2f * %.4f", similarityWeight, similarity));
-   System.out.println(String.format("Verbosity Penalty = %.4f", verbosity));
-   System.out.println(String.format("Brevity Penalty = %.4f", brevity));
-   System.out.println(String.format("Precis = %.4f", score(stats)));
- }
-
- // Returns the score multiplier for the achieved compression rate cr given
- // the target rate target_rate: 1.0 when cr is at or below the target,
- // otherwise an exponential fall-off, exp(5 * (target_rate - cr)), so that
- // not compressing enough is costly.
- protected static double getVerbosityPenalty(double cr, double target_rate) {
-   boolean compressedEnough = cr <= target_rate;
-   // A linear fall-off, (1 - cr) / (1 - target_rate), doesn't penalize
-   // insufficient compressions hard enough.
-   return compressedEnough ? 1.0 : Math.exp(5 * (target_rate - cr));
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/metrics/PrecisMinusSourceBLEU.java
----------------------------------------------------------------------
diff --git a/src/joshua/metrics/PrecisMinusSourceBLEU.java b/src/joshua/metrics/PrecisMinusSourceBLEU.java
deleted file mode 100644
index f56f8cb..0000000
--- a/src/joshua/metrics/PrecisMinusSourceBLEU.java
+++ /dev/null
@@ -1,184 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.metrics;
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.PrintWriter;
-
-public class PrecisMinusSourceBLEU extends EvaluationMetric {
-
- private Precis myPrecis;
- private SourceBLEU mySourceBLEU;
-
- private double bleuWeight;
-
- private int precisCount;
- private int sourceBleuCount;
-
- // Builds the combined metric from a Precis-style option array. options[5],
- // which Precis would read as its similarity weight, is used here as the
- // weight of the SourceBLEU term instead. SourceBLEU is configured from
- // options[0], options[1] and options[2].
- //
- // Throws NumberFormatException when a numeric option cannot be parsed and
- // ArrayIndexOutOfBoundsException when fewer than six options are given.
- public PrecisMinusSourceBLEU(String[] options) {
-   bleuWeight = Double.parseDouble(options[5]);
-
-   // Automatically deactivate Levenshtein penalty for Precis. This is done on
-   // a copy so that the caller's option array is not modified.
-   String[] precisOptions = options.clone();
-   precisOptions[5] = "0";
-
-   myPrecis = new Precis(precisOptions);
-   mySourceBLEU =
-       new SourceBLEU(Integer.parseInt(options[0]), options[1], Integer.parseInt(options[2]),
-           false);
-
-   initialize();
- }
-
- // Names the metric, marks it as one to be maximized, and sizes the combined
- // statistics array as the Precis statistics followed by the SourceBLEU ones.
- protected void initialize() {
-   toBeMinimized = false;
-   metricName = "PRECIS-SRC_BLEU";
-
-   sourceBleuCount = mySourceBLEU.suffStatsCount;
-   precisCount = myPrecis.suffStatsCount;
-   suffStatsCount = precisCount + sourceBleuCount;
- }
-
- public double bestPossibleScore() {
- return 1.0; // presumably a Precis score of 1 with a SourceBLEU term of 0 -- TODO confirm
- }
-
- public double worstPossibleScore() {
- return -1.0; // presumably assumes bleuWeight * SourceBLEU never exceeds 1 -- TODO confirm
- }
-
- public int[] suffStats(String cand_str, int i) {
- return null; // single-candidate statistics are not provided; this class overrides suffStats(String[], int[]) instead
- }
-
- // Computes the statistics of all candidates in one go: every row holds the
- // Precis statistics of a candidate followed by its SourceBLEU statistics.
- // Returns null (after printing a message) when the two arrays differ in length.
- public int[][] suffStats(String[] cand_strings, int[] cand_indices) {
-   int candCount = cand_strings.length;
-   if (candCount != cand_indices.length) {
-     System.out.println("Array lengths mismatch in suffStats(String[],int[]); returning null.");
-     return null;
-   }
-
-   int[][] combined = new int[candCount][suffStatsCount];
-
-   int[][] precisPart = myPrecis.suffStats(cand_strings, cand_indices);
-   int[][] sourceBleuPart = mySourceBLEU.suffStats(cand_strings, cand_indices);
-
-   for (int d = 0; d < candCount; ++d) {
-     System.arraycopy(precisPart[d], 0, combined[d], 0, precisCount);
-     System.arraycopy(sourceBleuPart[d], 0, combined[d], precisCount, sourceBleuCount);
-   }
-   return combined;
- }
-
- public void createSuffStatsFile(String cand_strings_fileName, String cand_indices_fileName,
- String outputFileName, int maxBatchSize) {
- try {
- myPrecis.createSuffStatsFile(cand_strings_fileName, cand_indices_fileName, outputFileName
- + ".PRECIS", maxBatchSize); // step 1: each sub-metric writes its own temporary stats file
- mySourceBLEU.createSuffStatsFile(cand_strings_fileName, cand_indices_fileName, outputFileName
- + ".SRC_BLEU", maxBatchSize);
- // NOTE(review): the writer uses the platform default charset while both readers use "utf8".
- PrintWriter outFile = new PrintWriter(outputFileName);
-
- FileInputStream inStream_Precis = new FileInputStream(outputFileName + ".PRECIS");
- BufferedReader inFile_Precis =
- new BufferedReader(new InputStreamReader(inStream_Precis, "utf8"));
-
- FileInputStream inStream_SourceBLEU = new FileInputStream(outputFileName + ".SRC_BLEU");
- BufferedReader inFile_SourceBLEU =
- new BufferedReader(new InputStreamReader(inStream_SourceBLEU, "utf8"));
-
- String line_Precis = inFile_Precis.readLine();
- String line_SourceBLEU = inFile_SourceBLEU.readLine();
- // NOTE(review): only the Precis file drives the loop; a shorter SRC_BLEU file would contribute the text "null".
- // combine the two files into one: "<precis stats> <source-bleu stats>" per line
- while (line_Precis != null) {
- outFile.println(line_Precis + " " + line_SourceBLEU);
- line_Precis = inFile_Precis.readLine();
- line_SourceBLEU = inFile_SourceBLEU.readLine();
- }
-
- inFile_Precis.close();
- inFile_SourceBLEU.close();
- outFile.close();
- // step 3: remove the two temporary files
- File fd;
- fd = new File(outputFileName + ".PRECIS");
- if (fd.exists()) fd.delete();
- fd = new File(outputFileName + ".SRC_BLEU");
- if (fd.exists()) fd.delete();
- } catch (IOException e) {
- System.err.println("IOException: " + e.getMessage());
- System.exit(99902); // NOTE(review): exit statuses are commonly truncated to 8 bits, so 99902 may not be what the parent process sees
- }
- }
-
- // Scores a combined statistics array: the Precis score of its first
- // precisCount entries minus bleuWeight times the SourceBLEU score of the
- // remaining sourceBleuCount entries. A wrongly sized array terminates the
- // JVM with status 1.
- public double score(int[] stats) {
-   if (stats.length != suffStatsCount) {
-     System.out.println("Mismatch between stats.length and suffStatsCount (" + stats.length
-         + " vs. " + suffStatsCount + ") in PrecisMinusSourceBLEU.score(int[])");
-     System.exit(1);
-   }
-
-   // split the combined array back into the two metrics' own statistics
-   int[] precisStats = new int[precisCount];
-   int[] sourceBleuStats = new int[sourceBleuCount];
-   System.arraycopy(stats, 0, precisStats, 0, precisCount);
-   System.arraycopy(stats, precisCount, sourceBleuStats, 0, sourceBleuCount);
-
-   double precisScore = myPrecis.score(precisStats);
-   double sourceBleuScore = mySourceBLEU.score(sourceBleuStats);
-
-   return precisScore - (bleuWeight * sourceBleuScore);
- }
-
- // Prints the detailed Precis and SourceBLEU scores of a combined statistics
- // array, each under its own heading, followed by the combined score.
- public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
-   // split the combined array back into the two metrics' own statistics
-   int[] precisStats = new int[precisCount];
-   int[] sourceBleuStats = new int[sourceBleuCount];
-   System.arraycopy(stats, 0, precisStats, 0, precisCount);
-   System.arraycopy(stats, precisCount, sourceBleuStats, 0, sourceBleuCount);
-
-   System.out.println("---PRECIS---");
-   myPrecis.printDetailedScore_fromStats(precisStats, oneLiner);
-
-   System.out.println("---SRC_BLEU---");
-   mySourceBLEU.printDetailedScore_fromStats(sourceBleuStats, oneLiner);
-
-   System.out.println("---------");
-   System.out.println(" => " + metricName + " = " + f4.format(score(stats)));
- }
-
-}