You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/08 12:47:03 UTC
[2/4] incubator-joshua git commit: Added SARI metric from cdnapoles
Added SARI metric from cdnapoles
@article{xu2016optimizing,
title={Optimizing statistical machine translation for text simplification},
author={Xu, Wei and Napoles, Courtney and Pavlick, Ellie and Chen, Quanze and Callison-Burch, Chris},
journal={Transactions of the Association for Computational Linguistics},
volume={4},
year={2016}}
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/7e2aeb0d
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/7e2aeb0d
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/7e2aeb0d
Branch: refs/heads/master
Commit: 7e2aeb0d0d78a26e53c7f3d3f84bd2200b996626
Parents: 07df598
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Jun 7 23:19:36 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Jun 7 23:19:36 2016 -0400
----------------------------------------------------------------------
.../apache/joshua/metrics/EvaluationMetric.java | 9 +-
.../java/org/apache/joshua/metrics/SARI.java | 681 +++++++++++++++++++
2 files changed, 689 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7e2aeb0d/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java b/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java
index 08efdeb..9a8786c 100644
--- a/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java
+++ b/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java
@@ -27,6 +27,9 @@ import java.text.DecimalFormat;
import java.util.Arrays;
import java.util.TreeMap;
+/***
+ * @author Omar Zaidan
+ */
public abstract class EvaluationMetric {
/* static data members */
private static TreeMap<String, Integer> metricOptionCount; // maps metric names -> number of
@@ -73,6 +76,7 @@ public abstract class EvaluationMetric {
metricOptionCount.put("SRC_BLEU", 4);
metricOptionCount.put("PRECIS-SRC_BLEU", 6);
metricOptionCount.put("GL_BLEU", 3);
+ metricOptionCount.put("SARI", 2); // options: max-ngram source-path
}
public static EvaluationMetric getMetric(String metricName, String[] metricOptions) {
@@ -111,7 +115,10 @@ public abstract class EvaluationMetric {
} else if (metricName.equals("GL_BLEU")) {
retMetric = new GradeLevelBLEU(metricOptions); // the "GL_BLEU" metric corresponds to the
// GradeLevelBLEU class
- }
+ } else if (metricName.equals("SARI")) {
+ retMetric = new SARI(metricOptions);
+ }
+
return retMetric;
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/7e2aeb0d/src/main/java/org/apache/joshua/metrics/SARI.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/metrics/SARI.java b/src/main/java/org/apache/joshua/metrics/SARI.java
new file mode 100644
index 0000000..55d87e0
--- /dev/null
+++ b/src/main/java/org/apache/joshua/metrics/SARI.java
@@ -0,0 +1,681 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more contributor license
+ * agreements. See the NOTICE file distributed with this work for additional information regarding
+ * copyright ownership. The ASF licenses this file to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance with the License. You may obtain a
+ * copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+package org.apache.joshua.metrics;
+
+// Changed PROCore.java (text normalization function) and EvaluationMetric too
+
+import java.util.Map;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.logging.Logger;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.InputStream;
+
+/***
+ * Implementation of the SARI metric for text-to-text correction.
+ *
+ * @article{xu2016optimizing,
+ * title={Optimizing statistical machine translation for text simplification},
+ * author={Xu, Wei and Napoles, Courtney and Pavlick, Ellie and Chen, Quanze and Callison-Burch, Chris},
+ * journal={Transactions of the Association for Computational Linguistics},
+ * volume={4},
+ * year={2016}}
+ *
+ * @author Courtney Napoles
+ */
+public class SARI extends EvaluationMetric {
+ private static final Logger logger = Logger.getLogger(SARI.class.getName());
+
+ // The maximum n-gram we care about
+ protected int maxGramLength;
+ protected String[] srcSentences;
+ protected double[] weights;
+ protected HashMap<String, Integer>[][] refNgramCounts;
+ protected HashMap<String, Integer>[][] srcNgramCounts;
+
+ /*
+ * You already have access to these data members of the parent class (EvaluationMetric): int
+ * numSentences; number of sentences in the MERT set int refsPerSen; number of references per
+ * sentence String[][] refSentences; refSentences[i][r] stores the r'th reference of the i'th
+ * source sentence (both indices are 0-based)
+ */
+
+ public SARI(String[] Metric_options) {
+ int mxGrmLn = Integer.parseInt(Metric_options[0]);
+ if (mxGrmLn >= 1) {
+ maxGramLength = mxGrmLn;
+ } else {
+ logger.severe("Maximum gram length must be positive");
+ System.exit(1);
+ }
+
+ try {
+ loadSources(Metric_options[1]);
+ } catch (IOException e) {
+ logger.severe("Error loading the source sentences from " + Metric_options[1]);
+ System.exit(1);
+ }
+
+ initialize(); // set the data members of the metric
+
+ }
+
  /**
   * Sets the metric's data members: its name, its optimization direction,
   * the size of the sufficient-statistics vector (one slice of
   * StatIndex.values().length slots per n-gram order, plus one spare slot),
   * and the cached weight / n-gram-count tables.
   */
  protected void initialize() {
    metricName = "SARI";
    toBeMinimized = false; // higher SARI is better
    suffStatsCount = StatIndex.values().length * maxGramLength + 1;

    set_weightsArray();
    set_refNgramCounts();
    set_srcNgramCounts();

  }
+
+ public double bestPossibleScore() {
+ return 1.0;
+ }
+
+ public double worstPossibleScore() {
+ return 0.0;
+ }
+
  /**
   * Sets the per-order n-gram weights to uniform (1/maxGramLength each).
   * Index 0 is unused; weights[n] applies to n-grams of order n.
   */
  protected void set_weightsArray() {
    weights = new double[1 + maxGramLength];
    for (int n = 1; n <= maxGramLength; ++n) {
      weights[n] = 1.0 / maxGramLength;
    }
  }
+
+ /**
+ * Computes the sum of ngram counts in references for each sentence (storing them in
+ * <code>refNgramCounts</code>), which are used for clipping n-gram counts.
+ */
+ protected void set_refNgramCounts() {
+ @SuppressWarnings("unchecked")
+
+ HashMap<String, Integer>[][] temp_HMA = new HashMap[numSentences][maxGramLength];
+ refNgramCounts = temp_HMA;
+
+ String gram = "";
+ int oldCount = 0, nextCount = 0;
+
+ for (int i = 0; i < numSentences; ++i) {
+ refNgramCounts[i] = getNgramCountsArray(refSentences[i][0]);
+ // initialize to ngramCounts[n] of the first reference translation...
+
+ // ...and update as necessary from the other reference translations
+ for (int r = 1; r < refsPerSen; ++r) {
+
+ HashMap<String, Integer>[] nextNgramCounts = getNgramCountsArray(refSentences[i][r]);
+
+ for (int n = 1; n <= maxGramLength; ++n) {
+
+ Iterator<String> it = (nextNgramCounts[n].keySet()).iterator();
+
+ while (it.hasNext()) {
+ gram = it.next();
+ nextCount = nextNgramCounts[n].get(gram);
+
+ if (refNgramCounts[i][n].containsKey(gram)) { // update if necessary
+ oldCount = refNgramCounts[i][n].get(gram);
+ refNgramCounts[i][n].put(gram, oldCount + nextCount);
+ } else { // add it
+ refNgramCounts[i][n].put(gram, nextCount);
+ }
+
+ }
+
+ } // for (n)
+
+ } // for (r)
+
+ } // for (i)
+
+ }
+
+ protected void set_srcNgramCounts() {
+ @SuppressWarnings("unchecked")
+
+ HashMap<String, Integer>[][] temp_HMA = new HashMap[numSentences][maxGramLength];
+ srcNgramCounts = temp_HMA;
+
+ for (int i = 0; i < numSentences; ++i) {
+ srcNgramCounts[i] = getNgramCountsArray(srcSentences[i]);
+ } // for (i)
+ }
+
  /**
   * Computes the sufficient statistics of candidate sentence cand_str against
   * source sentence i and its references. For each n-gram order n, one slice of
   * StatIndex.values().length slots is filled with counts for the three SARI
   * operations (ADD, DELETE, KEEP) that score(int[]) later combines into
   * precisions/recalls.
   *
   * NOTE(review): several slots are written twice in this method (DELREF,
   * KEEPBOTH, KEEPCAND); only the second write survives. The earlier writes
   * look like a previous formulation left in place -- confirm before
   * refactoring. KEEPBOTH is stored scaled by 1,000,000 so that a fractional
   * sum fits in an int slot; score(int[]) divides the scaling back out.
   */
  // set contents of stats[] here!
  public int[] suffStats(String cand_str, int i) {
    int[] stats = new int[suffStatsCount];

    HashMap<String, Integer>[] candNgramCounts = getNgramCountsArray(cand_str);

    for (int n = 1; n <= maxGramLength; ++n) {

      // ADD OPERATIONS: n-grams in the candidate (or references) but not the source.
      HashMap cand_sub_src = substractHashMap(candNgramCounts[n], srcNgramCounts[i][n]);
      HashMap cand_and_ref_sub_src = intersectHashMap(cand_sub_src, refNgramCounts[i][n]);
      HashMap ref_sub_src = substractHashMap(refNgramCounts[i][n], srcNgramCounts[i][n]);

      stats[StatIndex.values().length * (n - 1)
          + StatIndex.ADDBOTH.ordinal()] = cand_and_ref_sub_src.keySet().size();
      stats[StatIndex.values().length * (n - 1) + StatIndex.ADDCAND.ordinal()] = cand_sub_src
          .keySet().size();
      stats[StatIndex.values().length * (n - 1) + StatIndex.ADDREF.ordinal()] = ref_sub_src.keySet()
          .size();

      // DELETION OPERATIONS: source n-grams dropped by the candidate / references.
      // Counts are scaled by refsPerSen so candidate and reference sides are comparable.
      HashMap src_sub_cand = substractHashMap(srcNgramCounts[i][n], candNgramCounts[n],
          this.refsPerSen, this.refsPerSen);
      HashMap src_sub_ref = substractHashMap(srcNgramCounts[i][n], refNgramCounts[i][n],
          this.refsPerSen, 1);
      HashMap src_sub_cand_sub_ref = intersectHashMap(src_sub_cand, src_sub_ref, 1, 1);

      stats[StatIndex.values().length * (n - 1) + StatIndex.DELBOTH.ordinal()] = sumHashMapByValues(
          src_sub_cand_sub_ref);
      stats[StatIndex.values().length * (n - 1) + StatIndex.DELCAND.ordinal()] = sumHashMapByValues(
          src_sub_cand);
      stats[StatIndex.values().length * (n - 1) + StatIndex.DELREF.ordinal()] = sumHashMapByValues(
          src_sub_ref);

      // NOTE(review): overwrites the DELREF value stored just above.
      stats[StatIndex.values().length * (n - 1) + StatIndex.DELREF.ordinal()] = src_sub_ref.keySet()
          .size() * this.refsPerSen;

      // KEEP OPERATIONS: source n-grams retained by the candidate / references.
      HashMap src_and_cand = intersectHashMap(srcNgramCounts[i][n], candNgramCounts[n],
          this.refsPerSen, this.refsPerSen);
      HashMap src_and_ref = intersectHashMap(srcNgramCounts[i][n], refNgramCounts[i][n],
          this.refsPerSen, 1);
      HashMap src_and_cand_and_ref = intersectHashMap(src_and_cand, src_and_ref, 1, 1);

      stats[StatIndex.values().length * (n - 1)
          + StatIndex.KEEPBOTH.ordinal()] = sumHashMapByValues(src_and_cand_and_ref);
      stats[StatIndex.values().length * (n - 1)
          + StatIndex.KEEPCAND.ordinal()] = sumHashMapByValues(src_and_cand);
      stats[StatIndex.values().length * (n - 1) + StatIndex.KEEPREF.ordinal()] = sumHashMapByValues(
          src_and_ref);

      // NOTE(review): overwrites KEEPBOTH and KEEPCAND stored just above with
      // fractional (ratio-based) versions; KEEPBOTH is scaled by 1e6 to survive
      // the int cast.
      stats[StatIndex.values().length * (n - 1) + StatIndex.KEEPBOTH.ordinal()] = (int) (1000000
          * sumHashMapByDoubleValues(divideHashMap(src_and_cand_and_ref, src_and_cand)));
      stats[StatIndex.values().length * (n - 1)
          + StatIndex.KEEPCAND.ordinal()] = (int) sumHashMapByDoubleValues(
          divideHashMap(src_and_cand_and_ref, src_and_ref));
      stats[StatIndex.values().length * (n - 1) + StatIndex.KEEPREF.ordinal()] = src_and_ref
          .keySet().size();

    }

    int n = 1;

    // NOTE(review): everything below recomputes unigram maps into locals that
    // are never read (the helper methods are side-effect free) -- this appears
    // to be leftover debugging scaffolding and does not affect stats[].
    HashMap src_and_cand = intersectHashMap(srcNgramCounts[i][n], candNgramCounts[n],
        this.refsPerSen, this.refsPerSen);
    HashMap src_and_ref = intersectHashMap(srcNgramCounts[i][n], refNgramCounts[i][n],
        this.refsPerSen, 1);
    HashMap src_and_cand_and_ref = intersectHashMap(src_and_cand, src_and_ref, 1, 1);

    HashMap cand_sub_src = substractHashMap(candNgramCounts[n], srcNgramCounts[i][n]);
    HashMap cand_and_ref_sub_src = intersectHashMap(cand_sub_src, refNgramCounts[i][n]);

    HashMap src_sub_cand = substractHashMap(srcNgramCounts[i][n], candNgramCounts[n],
        this.refsPerSen, this.refsPerSen);
    HashMap src_sub_ref = substractHashMap(srcNgramCounts[i][n], refNgramCounts[i][n],
        this.refsPerSen, 1);
    HashMap src_sub_cand_sub_ref = intersectHashMap(src_sub_cand, src_sub_ref, 1, 1);

    return stats;
  }
+
+ public double score(int[] stats) {
+ if (stats.length != suffStatsCount) {
+ System.out.println("Mismatch between stats.length and suffStatsCount (" + stats.length
+ + " vs. " + suffStatsCount + ") in NewMetric.score(int[])");
+ System.exit(1);
+ }
+
+ double sc = 0.0;
+
+ for (int n = 1; n <= maxGramLength; ++n) {
+
+ int addCandCorrectNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.ADDBOTH.ordinal()];
+ int addCandTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.ADDCAND.ordinal()];
+ int addRefTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.ADDREF.ordinal()];
+
+ double prec_add_n = 0.0;
+ if (addCandTotalNgram > 0) {
+ prec_add_n = addCandCorrectNgram / (double) addCandTotalNgram;
+ }
+
+ double recall_add_n = 0.0;
+ if (addRefTotalNgram > 0) {
+ recall_add_n = addCandCorrectNgram / (double) addRefTotalNgram;
+ }
+
+ // System.out.println("\nDEBUG-SARI:" + addCandCorrectNgram + " " + addCandTotalNgram + " " +
+ // addRefTotalNgram);
+
+ double f1_add_n = meanHarmonic(prec_add_n, recall_add_n);
+
+ sc += weights[n] * f1_add_n;
+
+ int delCandCorrectNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.DELBOTH.ordinal()];
+ int delCandTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.DELCAND.ordinal()];
+ int delRefTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.DELREF.ordinal()];
+
+ double prec_del_n = 0.0;
+ if (delCandTotalNgram > 0) {
+ prec_del_n = delCandCorrectNgram / (double) delCandTotalNgram;
+ }
+
+ double recall_del_n = 0.0;
+ if (delRefTotalNgram > 0) {
+ recall_del_n = delCandCorrectNgram / (double) delRefTotalNgram;
+ }
+
+ // System.out.println("\nDEBUG-SARI:" + delCandCorrectNgram + " " + delRefTotalNgram);
+
+ double f1_del_n = meanHarmonic(prec_del_n, recall_del_n);
+
+ // sc += weights[n] * f1_del_n;
+ sc += weights[n] * prec_del_n;
+
+ int keepCandCorrectNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.KEEPBOTH.ordinal()];
+ // int keepCandCorrectNgram2 = stats[StatIndex.values().length * (n - 1) +
+ // StatIndex.KEEPBOTH2.ordinal()];
+ int keepCandTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.KEEPCAND.ordinal()];
+ int keepRefTotalNgram = stats[StatIndex.values().length * (n - 1)
+ + StatIndex.KEEPREF.ordinal()];
+
+ double prec_keep_n = 0.0;
+ if (keepCandTotalNgram > 0) {
+ prec_keep_n = keepCandCorrectNgram / (double) (1000000 * keepCandTotalNgram);
+ }
+
+ double recall_keep_n = 0.0;
+ if (keepRefTotalNgram > 0) {
+ recall_keep_n = keepCandTotalNgram / (double) keepRefTotalNgram;
+ }
+
+ // System.out.println("\nDEBUG-SARI-KEEP: " + n + " " + keepCandCorrectNgram + " " +
+ // keepCandTotalNgram + " " + keepRefTotalNgram);
+
+ double f1_keep_n = meanHarmonic(prec_keep_n, recall_keep_n);
+
+ sc += weights[n] * f1_keep_n;
+
+ // System.out.println("\nDEBUG-SARI: " + n + " " + prec_add_n + " " + recall_add_n + " " +
+ // prec_del_n + " " + recall_del_n + " " + prec_keep_n + " " + recall_keep_n);
+
+ // System.out.println("\nDEBUG-SARI-KEEP: " + n + " " + keepCandCorrectNgram + " " +
+ // keepCandTotalNgram + " " + keepRefTotalNgram);
+ }
+
+ sc = sc / 3.0;
+ //
+ //
+ // set sc here!
+ //
+ //
+
+ return sc;
+ }
+
+ public double meanHarmonic(double precision, double recall) {
+
+ if (precision > 0 && recall > 0) {
+ return (2.0 * precision * recall) / (precision + recall);
+ }
+ return 0.0;
+ }
+
+ public void loadSources(String filepath) throws IOException {
+ srcSentences = new String[numSentences];
+ // BufferedReader br = new BufferedReader(new FileReader(filepath));
+ InputStream inStream = new FileInputStream(new File(filepath));
+ BufferedReader br = new BufferedReader(new InputStreamReader(inStream, "utf8"));
+
+ String line;
+ int i = 0;
+ while (i < numSentences && (line = br.readLine()) != null) {
+ srcSentences[i] = line.trim();
+ i++;
+ }
+ br.close();
+ }
+
+ public double sumHashMapByDoubleValues(HashMap<String, Double> counter) {
+ double sumcounts = 0;
+
+ for (Map.Entry<String, Double> e : counter.entrySet()) {
+ sumcounts += (double) e.getValue();
+ }
+
+ return sumcounts;
+ }
+
+ public int sumHashMapByValues(HashMap<String, Integer> counter) {
+ int sumcounts = 0;
+
+ for (Map.Entry<String, Integer> e : counter.entrySet()) {
+ sumcounts += (int) e.getValue();
+ }
+
+ return sumcounts;
+ }
+
+ public HashMap<String, Integer> substractHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2) {
+ HashMap<String, Integer> newcounter = new HashMap<String, Integer>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ if (count2 == 0) {
+ newcounter.put(ngram, 1);
+ }
+ }
+
+ return newcounter;
+ }
+
+ // HashMap result = counter1*ratio1 - counter2*ratio2
+ public HashMap<String, Integer> substractHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2, int ratio1, int ratio2) {
+ HashMap<String, Integer> newcounter = new HashMap<String, Integer>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ int newcount = count1 * ratio1 - count2 * ratio2;
+ if (newcount > 0) {
+ newcounter.put(ngram, newcount);
+ }
+ }
+
+ return newcounter;
+ }
+
+ public HashMap<String, Double> divideHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2) {
+ HashMap<String, Double> newcounter = new HashMap<String, Double>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ if (count2 != 0) {
+ newcounter.put(ngram, (double) count1 / (double) count2);
+ }
+ }
+
+ return newcounter;
+ }
+
+ public HashMap<String, Integer> intersectHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2) {
+ HashMap<String, Integer> newcounter = new HashMap<String, Integer>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ if (count2 > 0) {
+ newcounter.put(ngram, 1);
+ }
+ }
+
+ return newcounter;
+ }
+
+ // HashMap result = (counter1*ratio1) & (counter2*ratio2)
+ public HashMap<String, Integer> intersectHashMap(HashMap<String, Integer> counter1,
+ HashMap<String, Integer> counter2, int ratio1, int ratio2) {
+ HashMap<String, Integer> newcounter = new HashMap<String, Integer>();
+
+ for (Map.Entry<String, Integer> e : counter1.entrySet()) {
+ String ngram = e.getKey();
+ int count1 = e.getValue();
+ int count2 = counter2.containsKey(ngram) ? counter2.get(ngram) : 0;
+ int newcount = Math.min(count1 * ratio1, count2 * ratio2);
+ if (newcount > 0) {
+ newcounter.put(ngram, newcount);
+ }
+ }
+
+ return newcounter;
+ }
+
+ protected int wordCount(String cand_str) {
+ if (!cand_str.equals("")) {
+ return cand_str.split("\\s+").length;
+ } else {
+ return 0;
+ }
+ }
+
+ public HashMap<String, Integer>[] getNgramCountsArray(String cand_str) {
+ if (!cand_str.equals("")) {
+ return getNgramCountsArray(cand_str.split("\\s+"));
+ } else {
+ return getNgramCountsArray(new String[0]);
+ }
+ }
+
+ public HashMap<String, Integer>[] getNgramCountsArray(String[] words) {
+ @SuppressWarnings("unchecked")
+ HashMap<String, Integer>[] ngramCountsArray = new HashMap[1 + maxGramLength];
+ ngramCountsArray[0] = null;
+ for (int n = 1; n <= maxGramLength; ++n) {
+ ngramCountsArray[n] = new HashMap<String, Integer>();
+ }
+
+ int len = words.length;
+ String gram;
+ int st = 0;
+
+ for (; st <= len - maxGramLength; ++st) {
+
+ gram = words[st];
+ if (ngramCountsArray[1].containsKey(gram)) {
+ int oldCount = ngramCountsArray[1].get(gram);
+ ngramCountsArray[1].put(gram, oldCount + 1);
+ } else {
+ ngramCountsArray[1].put(gram, 1);
+ }
+
+ for (int n = 2; n <= maxGramLength; ++n) {
+ gram = gram + " " + words[st + n - 1];
+ if (ngramCountsArray[n].containsKey(gram)) {
+ int oldCount = ngramCountsArray[n].get(gram);
+ ngramCountsArray[n].put(gram, oldCount + 1);
+ } else {
+ ngramCountsArray[n].put(gram, 1);
+ }
+ } // for (n)
+
+ } // for (st)
+
+ // now st is either len-maxGramLength+1 or zero (if above loop never entered, which
+ // happens with sentences that have fewer than maxGramLength words)
+
+ for (; st < len; ++st) {
+
+ gram = words[st];
+ if (ngramCountsArray[1].containsKey(gram)) {
+ int oldCount = ngramCountsArray[1].get(gram);
+ ngramCountsArray[1].put(gram, oldCount + 1);
+ } else {
+ ngramCountsArray[1].put(gram, 1);
+ }
+
+ int n = 2;
+ for (int fin = st + 1; fin < len; ++fin) {
+ gram = gram + " " + words[st + n - 1];
+
+ if (ngramCountsArray[n].containsKey(gram)) {
+ int oldCount = ngramCountsArray[n].get(gram);
+ ngramCountsArray[n].put(gram, oldCount + 1);
+ } else {
+ ngramCountsArray[n].put(gram, 1);
+ }
+ ++n;
+ } // for (fin)
+
+ } // for (st)
+
+ return ngramCountsArray;
+
+ }
+
+ public HashMap<String, Integer> getNgramCountsAll(String cand_str) {
+ if (!cand_str.equals("")) {
+ return getNgramCountsAll(cand_str.split("\\s+"));
+ } else {
+ return getNgramCountsAll(new String[0]);
+ }
+ }
+
+ public HashMap<String, Integer> getNgramCountsAll(String[] words) {
+ HashMap<String, Integer> ngramCountsAll = new HashMap<String, Integer>();
+
+ int len = words.length;
+ String gram;
+ int st = 0;
+
+ for (; st <= len - maxGramLength; ++st) {
+
+ gram = words[st];
+ if (ngramCountsAll.containsKey(gram)) {
+ int oldCount = ngramCountsAll.get(gram);
+ ngramCountsAll.put(gram, oldCount + 1);
+ } else {
+ ngramCountsAll.put(gram, 1);
+ }
+
+ for (int n = 2; n <= maxGramLength; ++n) {
+ gram = gram + " " + words[st + n - 1];
+ if (ngramCountsAll.containsKey(gram)) {
+ int oldCount = ngramCountsAll.get(gram);
+ ngramCountsAll.put(gram, oldCount + 1);
+ } else {
+ ngramCountsAll.put(gram, 1);
+ }
+ } // for (n)
+
+ } // for (st)
+
+ // now st is either len-maxGramLength+1 or zero (if above loop never entered, which
+ // happens with sentences that have fewer than maxGramLength words)
+
+ for (; st < len; ++st) {
+
+ gram = words[st];
+ if (ngramCountsAll.containsKey(gram)) {
+ int oldCount = ngramCountsAll.get(gram);
+ ngramCountsAll.put(gram, oldCount + 1);
+ } else {
+ ngramCountsAll.put(gram, 1);
+ }
+
+ int n = 2;
+ for (int fin = st + 1; fin < len; ++fin) {
+ gram = gram + " " + words[st + n - 1];
+
+ if (ngramCountsAll.containsKey(gram)) {
+ int oldCount = ngramCountsAll.get(gram);
+ ngramCountsAll.put(gram, oldCount + 1);
+ } else {
+ ngramCountsAll.put(gram, 1);
+ }
+ ++n;
+ } // for (fin)
+
+ } // for (st)
+
+ return ngramCountsAll;
+
+ }
+
+ public void printDetailedScore_fromStats(int[] stats, boolean oneLiner) {
+ System.out.println(metricName + " = " + score(stats));
+
+ // for (Map.Entry<String, Integer> entry : refNgramCounts.) {
+ // System.out.println(entry.getKey()+" : "+ entry.getValue());
+ // }
+ //
+ //
+ // optional (for debugging purposes)
+ //
+ //
+ }
+
  /**
   * Indexes into each per-order slice of the sufficient-statistics array.
   * The ordinal values are used for offset arithmetic in suffStats() and
   * score(), so the declaration order must not change. KEEPBOTH2 is currently
   * referenced only in commented-out code.
   */
  private enum StatIndex {
    KEEPBOTH, KEEPCAND, KEEPREF, DELBOTH, DELCAND, DELREF, ADDBOTH, ADDCAND, ADDREF, KEEPBOTH2
  };
+
+}