You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/22 03:08:12 UTC
[1/4] incubator-joshua git commit: Adding the CHRF metric for tuning
Repository: incubator-joshua
Updated Branches:
refs/heads/master 55e88d1fc -> c01ce779c
Adding the CHRF metric for tuning
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/52c83d59
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/52c83d59
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/52c83d59
Branch: refs/heads/master
Commit: 52c83d5930c74e15186c5710b9b47b97d2a9230b
Parents: 55e88d1
Author: antot <an...@riseup.net>
Authored: Tue Jun 21 17:12:40 2016 +0100
Committer: antot <an...@riseup.net>
Committed: Tue Jun 21 17:12:40 2016 +0100
----------------------------------------------------------------------
.../java/org/apache/joshua/metrics/CHRF.java | 302 +++++++++++++++++++
.../apache/joshua/metrics/EvaluationMetric.java | 6 +-
2 files changed, 307 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/52c83d59/src/main/java/org/apache/joshua/metrics/CHRF.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/metrics/CHRF.java b/src/main/java/org/apache/joshua/metrics/CHRF.java
new file mode 100644
index 0000000..f490f58
--- /dev/null
+++ b/src/main/java/org/apache/joshua/metrics/CHRF.java
@@ -0,0 +1,302 @@
+/*
+ * Copyright 2016 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.joshua.metrics;
+
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.logging.Logger;
+
+
+/**
+ *
+ * An implementation of the chrF evaluation metric for tuning.
+ * It is based on the original code by Maja Popovic [1] with the following main modifications:
+ * - Adapted to extend Joshua's EvaluationMetric class
+ * - Use of a length penalty to prevent chrF to prefer too long (with beta>1) or too short (with beta<1) translations
+ * - Use of hash tables for efficient n-gram matching
+ *
+ * If you use this metric in your research please cite [2]
+ *
+ * [1] Maja Popovic. 2015. chrF: character n-gram F-score for automatic MT evaluation.
+ * In Proceedings of the Tenth Workshop on Statistical Machine Translation. Lisbon, Portugal, pages 392\u2013395.
+ * [2] Víctor Sánchez Cartagena and Antonio Toral. 2016.
+ * Abu-MaTran at WMT 2016 Translation Task: Deep Learning, Morphological Segmentation and Tuning on Character Sequences.
+ * In Proceedings of the First Conference on Machine Translation (WMT16). Berlin, Germany.
+
+ * @author Antonio Toral
+ */
+public class CHRF extends EvaluationMetric {
+ private static final Logger logger = Logger.getLogger(CHRF.class.getName());
+
+ protected double beta = 3;
+ protected double factor;
+ protected int maxGramLength = 6; // The maximum n-gram we care about
+ //private double[] nGramWeights; //TODO to weight them differently
+
+ //private String metricName;
+ //private boolean toBeMinimized;
+ //private int suffStatsCount;
+
+
+ public CHRF()
+ {
+ this(3, 6);
+ }
+
+ public CHRF(String[] CHRF_options)
+ {
+ // Process the metric options array: CHRF_options[0] is parsed as beta (a double)
+ // and CHRF_options[1] as the maximum n-gram length (an int). Range checking of
+ // both values is done by the (double, int) constructor this call delegates to.
+ // A malformed number makes parseDouble/parseInt throw NumberFormatException.
+ this(Double.parseDouble(CHRF_options[0]), Integer.parseInt(CHRF_options[1]));
+ }
+
+ /**
+  * Creates a chrF metric with an explicit recall weight and maximum n-gram order.
+  * If either argument is out of range, a severe message is logged and the JVM
+  * is terminated with exit status 1.
+  *
+  * @param bt beta; must be greater than 0
+  * @param mxGrmLn maximum character n-gram length; must be at least 1
+  */
+ public CHRF(double bt, int mxGrmLn){
+   // Written as !(bt > 0) so that NaN is rejected as well
+   if (!(bt > 0)) {
+     logger.severe("Beta must be positive");
+     System.exit(1);
+   } else {
+     beta = bt;
+   }
+
+   if (mxGrmLn < 1) {
+     logger.severe("Maximum gram length must be positive");
+     System.exit(1);
+   } else {
+     maxGramLength = mxGrmLn;
+   }
+
+   // Derive the remaining data members from beta and maxGramLength
+   initialize();
+ }
+
+ /**
+  * Sets the data members of the metric that depend on beta and maxGramLength.
+  */
+ protected void initialize()
+ {
+   // beta squared is the weight used in the F-score formula of score()
+   factor = Math.pow(beta, 2);
+   // suffStats() stores four counts for every n-gram order
+   suffStatsCount = 4 * maxGramLength;
+   toBeMinimized = false;
+   metricName = "CHRF";
+ }
+
+ /** Returns the highest value score() can produce: 100. */
+ public double bestPossibleScore()
+ {
+   return 100.0;
+ }
+
+ /** Returns the lowest value score() can produce: 0. */
+ public double worstPossibleScore()
+ {
+   return 0.0;
+ }
+
+ /**
+  * Removes all whitespace from a sentence, so that character n-grams are later
+  * extracted from the concatenation of its words.
+  *
+  * @param s the sentence to process
+  * @return {@code s} with every run of whitespace (regex {@code \s+}) deleted
+  */
+ protected String separateCharacters(String s)
+ {
+   // Deleting the whitespace runs directly yields the same string as splitting on
+   // them and re-joining the pieces, but avoids building the result with
+   // repeated String concatenation (quadratic in the sentence length).
+   return s.replaceAll("\\s+", "");
+ }
+
+
+ /**
+  * Counts the character n-grams of a string for every order from 1 to maxGramLength.
+  *
+  * @param s the string to extract n-grams from
+  * @return an array indexed by n-gram order; entry n maps each substring of
+  *         length n of {@code s} to the number of positions at which it occurs.
+  *         Entry 0 is unused and is null.
+  */
+ protected HashMap<String, Integer>[] getGrams(String s)
+ {
+   // Slot 0 is never filled, so orders can be used directly as indices
+   HashMap<String, Integer>[] counts = new HashMap[1 + maxGramLength];
+
+   for (int order = 1; order <= maxGramLength; order++) {
+     HashMap<String, Integer> table = new HashMap<String, Integer>();
+     int lastStart = s.length() - order; // negative when s is shorter than the order
+     for (int start = 0; start <= lastStart; start++) {
+       String piece = s.substring(start, start + order);
+       Integer seen = table.get(piece);
+       table.put(piece, seen == null ? 1 : seen + 1);
+     }
+     counts[order] = table;
+   }
+
+   return counts;
+ }
+
+
+ /**
+  * Compares the n-gram counts of one string against those of another.
+  * suffStats() calls it in both directions: once to collect the precision
+  * counts and once, with the arguments swapped, for the recall counts.
+  *
+  * @param ref n-gram counts to match against
+  * @param cand n-gram counts being checked
+  * @return a two-element array: [0] is the total number of n-gram occurrences
+  *         in {@code cand}; [1] is the number of those occurrences that have no
+  *         counterpart in {@code ref} (n-grams missing from {@code ref} count
+  *         in full, the others only by their surplus over the count in {@code ref})
+  */
+ protected int[] candRefErrors(HashMap<String, Integer> ref, HashMap<String, Integer> cand)
+ {
+   int total = 0;
+   int errors = 0;
+
+   for (String gram : cand.keySet()) {
+     int candCount = cand.get(gram);
+     total += candCount;
+
+     Integer refCount = ref.get(gram);
+     if (refCount == null) {
+       // n-gram absent from ref: every occurrence is an error
+       errors += candCount;
+     } else if (candCount > refCount) {
+       // only the occurrences exceeding the count in ref are errors
+       errors += candCount - refCount;
+     }
+   }
+
+   return new int[] {total, errors};
+ }
+
+ /**
+  * Computes the sufficient statistics of a candidate translation against the
+  * first reference of sentence {@code i} (only refSentences[i][0] is read).
+  *
+  * @param cand_str the candidate translation
+  * @param i index of the sentence in refSentences
+  * @return an array of length suffStatsCount. For each order n from 1 to
+  *         maxGramLength, the four slots starting at 4*(n-1) hold, in order:
+  *         the candidate n-gram count, the candidate n-grams not matched in the
+  *         reference, the reference n-gram count, and the reference n-grams not
+  *         matched in the candidate.
+  */
+ public int[] suffStats(String cand_str, int i) //throws Exception
+ {
+   int[] stats = new int[suffStatsCount];
+
+   //TODO check unicode chars correctly split
+   String candChars = separateCharacters(cand_str);
+   String refChars = separateCharacters(refSentences[i][0]);
+
+   HashMap<String, Integer>[] candGrams = getGrams(candChars);
+   HashMap<String, Integer>[] refGrams = getGrams(refChars);
+
+   for (int n = 1, base = 0; n <= maxGramLength; n++, base += 4) {
+     // precision side: candidate n-grams checked against the reference
+     int[] precisionCounts = candRefErrors(refGrams[n], candGrams[n]);
+     // recall side: reference n-grams checked against the candidate
+     int[] recallCounts = candRefErrors(candGrams[n], refGrams[n]);
+
+     stats[base] = precisionCounts[0];     //cand_grams
+     stats[base + 1] = precisionCounts[1]; //errors (precision)
+     stats[base + 2] = recallCounts[0];    //ref_grams
+     stats[base + 3] = recallCounts[1];    //errors (recall)
+   }
+
+   return stats;
+ }
+
+
+ /**
+  * Computes the chrF score from sufficient statistics laid out as produced by
+  * suffStats(): four counts per n-gram order (n-grams and errors for the
+  * precision side, then n-grams and errors for the recall side).
+  *
+  * For every order, precision and recall are percentages of matched n-grams
+  * and are combined into an F-score weighted by factor (beta squared). The
+  * per-order F-scores are averaged with equal weights and the average is
+  * multiplied by a length penalty.
+  *
+  * If {@code stats} does not have suffStatsCount entries, a severe message is
+  * logged and the JVM is terminated with exit status 1.
+  *
+  * @param stats sufficient statistics; must have suffStatsCount entries
+  * @return the score, between 0 and 100
+  */
+ public double score(int[] stats)
+ {
+   if (stats.length != suffStatsCount) {
+     logger.severe("Mismatch between stats.length and suffStatsCount (" + stats.length + " vs. " + suffStatsCount + ") in CHRF.score(int[])");
+     System.exit(1);
+   }
+
+   double totalF = 0;
+
+   for (int n = 1; n <= maxGramLength; n++) {
+     int base = 4 * (n - 1);
+     int precisionNgrams = stats[base];
+     int precisionErrors = stats[base + 1];
+     int recallNgrams = stats[base + 2];
+     int recallErrors = stats[base + 3];
+
+     // 100.0 keeps the multiplication in double, so large corpus-level counts cannot overflow an int
+     double precision = (precisionNgrams != 0) ? 100 - 100.0 * precisionErrors / precisionNgrams : 0;
+     double recall = (recallNgrams != 0) ? 100 - 100.0 * recallErrors / recallNgrams : 0;
+
+     // When both are 0 the F-score formula would divide by zero
+     double f = 0;
+     if (precision != 0 || recall != 0) {
+       f = (1 + factor) * recall * precision / (factor * precision + recall);
+     }
+
+     totalF += (1 / (double) maxGramLength) * f;
+   }
+
+   // Length penalty. stats[0] and stats[2] are the unigram counts of the
+   // candidate and the reference, i.e. their lengths in characters. A zero
+   // denominator leaves the penalty at 1 instead of producing NaN (0/0) when
+   // both strings are empty.
+   double lp = 1;
+   int candLength = stats[0];
+   int refLength = stats[2];
+   if (beta > 1) { //penalise long translations
+     if (candLength != 0) {
+       lp = Math.min(1, refLength / (double) candLength);
+     }
+   } else if (beta < 1) { //penalise short translations
+     if (refLength != 0) {
+       lp = Math.min(1, candLength / (double) refLength);
+     }
+   }
+
+   return totalF * lp;
+ }
+
+
+ /**
+  * Prints the metric name and the score computed from the given statistics
+  * to standard output, on a single line.
+  *
+  * @param stats sufficient statistics, passed unchanged to score()
+  * @param oneLiner not used by this implementation
+  */
+ public void printDetailedScore_fromStats(int[] stats, boolean oneLiner)
+ {
+   double value = score(stats);
+   System.out.println(metricName + " = " + value);
+ }
+
+}
+
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/52c83d59/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java b/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java
index 9a8786c..9ac77f1 100644
--- a/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java
+++ b/src/main/java/org/apache/joshua/metrics/EvaluationMetric.java
@@ -77,6 +77,7 @@ public abstract class EvaluationMetric {
metricOptionCount.put("PRECIS-SRC_BLEU", 6);
metricOptionCount.put("GL_BLEU", 3);
metricOptionCount.put("SARI", 2); // options: max-ngram source-path
+ metricOptionCount.put("CHRF", 2); // options: beta (how much to weight recall vs precision) and max-ngram
}
public static EvaluationMetric getMetric(String metricName, String[] metricOptions) {
@@ -117,7 +118,10 @@ public abstract class EvaluationMetric {
// GradeLevelBLEU class
} else if (metricName.equals("SARI")) {
retMetric = new SARI(metricOptions);
- }
+
+ } else if (metricName.equals("CHRF")) {
+ retMetric = new CHRF(metricOptions);
+ }
return retMetric;
}
[3/4] incubator-joshua git commit: don't return formatted string;
updated test case
Posted by mj...@apache.org.
don't return formatted string; updated test case
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/6bf0c848
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/6bf0c848
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/6bf0c848
Branch: refs/heads/master
Commit: 6bf0c8489dcae5d9152770b048409a2689c0526d
Parents: 32a5003
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Jun 21 23:03:19 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Jun 21 23:03:19 2016 -0400
----------------------------------------------------------------------
src/main/java/org/apache/joshua/decoder/io/JSONMessage.java | 2 +-
src/test/resources/server/http/expected | 3 ++-
2 files changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6bf0c848/src/main/java/org/apache/joshua/decoder/io/JSONMessage.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/io/JSONMessage.java b/src/main/java/org/apache/joshua/decoder/io/JSONMessage.java
index 9c3899e..6ec3e57 100644
--- a/src/main/java/org/apache/joshua/decoder/io/JSONMessage.java
+++ b/src/main/java/org/apache/joshua/decoder/io/JSONMessage.java
@@ -77,7 +77,7 @@ public class JSONMessage {
TranslationItem item = addTranslation(viterbi);
for (StructuredTranslation hyp: translation.getStructuredTranslations()) {
- String text = hyp.getFormattedTranslationString();
+ String text = hyp.getTranslationString();
float score = hyp.getTranslationScore();
item.addHypothesis(text, score);
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/6bf0c848/src/test/resources/server/http/expected
----------------------------------------------------------------------
diff --git a/src/test/resources/server/http/expected b/src/test/resources/server/http/expected
index 11ea273..d0a254b 100644
--- a/src/test/resources/server/http/expected
+++ b/src/test/resources/server/http/expected
@@ -11,5 +11,6 @@
]
}
]
- }
+ },
+ "metadata": []
}
[4/4] incubator-joshua git commit: Reverted commit
6d2213a20b74432fc7cb131c732f7507b74053e9,
removed FeatureVector from StructuredTranslation
Posted by mj...@apache.org.
Reverted commit 6d2213a20b74432fc7cb131c732f7507b74053e9, removed FeatureVector from StructuredTranslation
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/c01ce779
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/c01ce779
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/c01ce779
Branch: refs/heads/master
Commit: c01ce779c18da9f4eb8411d2a22a3f429e51d2f9
Parents: 6bf0c84
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Jun 21 23:08:06 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Jun 21 23:08:06 2016 -0400
----------------------------------------------------------------------
.../joshua/decoder/StructuredTranslation.java | 18 +++++-------------
.../decoder/StructuredTranslationFactory.java | 6 +++---
2 files changed, 8 insertions(+), 16 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/c01ce779/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java b/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
index 2faacf2..fb97ee7 100644
--- a/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
+++ b/src/main/java/org/apache/joshua/decoder/StructuredTranslation.java
@@ -45,7 +45,7 @@ public class StructuredTranslation {
private final List<String> translationTokens;
private final float translationScore;
private final List<List<Integer>> translationWordAlignments;
- private final FeatureVector translationFeatures;
+ private final Map<String,Float> translationFeatures;
private final float extractionTime;
public StructuredTranslation(
@@ -54,7 +54,7 @@ public class StructuredTranslation {
final List<String> translationTokens,
final float translationScore,
final List<List<Integer>> translationWordAlignments,
- final FeatureVector translationFeatures,
+ final Map<String,Float> translationFeatures,
final float extractionTime) {
this.sourceSentence = sourceSentence;
this.translationString = translationString;
@@ -83,20 +83,12 @@ public class StructuredTranslation {
}
/**
- * Produces the translation formatted according to the value of {@value JoshuaConfiguration.output_format}.
- * Also includes formatting options such as {@value JoshuaConfiguration.project_case}.
+ * Returns the output string formatted according to {@value JoshuaConfiguration.output_format}.
*
* @return
*/
public String getFormattedTranslationString() {
- JoshuaConfiguration config = sourceSentence.config;
- String outputString = config.outputFormat
- .replace("%s", getTranslationString())
- .replace("%S", DeNormalize.processSingleLine(maybeProjectCase(getTranslationString())))
- .replace("%i", Integer.toString(getSentenceId()))
- .replace("%f", config.moses ? translationFeatures.mosesString() : translationFeatures.toString())
- .replace("%c", String.format("%.3f", getTranslationScore()));
- return outputString;
+ throw new RuntimeException("Not yet implemented");
}
public List<String> getTranslationTokens() {
@@ -116,7 +108,7 @@ public class StructuredTranslation {
}
public Map<String,Float> getTranslationFeatures() {
- return translationFeatures.getMap();
+ return translationFeatures;
}
/**
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/c01ce779/src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java b/src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
index 4389135..916a5a7 100644
--- a/src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
+++ b/src/main/java/org/apache/joshua/decoder/StructuredTranslationFactory.java
@@ -62,7 +62,7 @@ public class StructuredTranslationFactory {
extractTranslationTokens(translationString),
extractTranslationScore(hypergraph),
getViterbiWordAlignmentList(hypergraph),
- getViterbiFeatures(hypergraph, featureFunctions, sourceSentence),
+ getViterbiFeatures(hypergraph, featureFunctions, sourceSentence).getMap(),
(System.currentTimeMillis() - startTime) / 1000.0f);
}
@@ -73,7 +73,7 @@ public class StructuredTranslationFactory {
*/
public static StructuredTranslation fromEmptyOutput(final Sentence sourceSentence) {
return new StructuredTranslation(
- sourceSentence, "", emptyList(), 0, emptyList(), new FeatureVector(), 0f);
+ sourceSentence, "", emptyList(), 0, emptyList(), emptyMap(), 0f);
}
/**
@@ -93,7 +93,7 @@ public class StructuredTranslationFactory {
extractTranslationTokens(translationString),
derivationState.getModelCost(),
derivationState.getWordAlignmentList(),
- derivationState.getFeatures(),
+ derivationState.getFeatures().getMap(),
(System.currentTimeMillis() - startTime) / 1000.0f);
}
[2/4] incubator-joshua git commit: Adding the CHRF metric for tuning
Posted by mj...@apache.org.
Adding the CHRF metric for tuning
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/32a5003e
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/32a5003e
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/32a5003e
Branch: refs/heads/master
Commit: 32a5003e671bfd781a5bf774cc48d72443fc2c1e
Parents: 52c83d5
Author: antot <an...@riseup.net>
Authored: Tue Jun 21 17:24:34 2016 +0100
Committer: antot <an...@riseup.net>
Committed: Tue Jun 21 17:24:34 2016 +0100
----------------------------------------------------------------------
src/main/java/org/apache/joshua/metrics/CHRF.java | 12 +++++++++---
1 file changed, 9 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/32a5003e/src/main/java/org/apache/joshua/metrics/CHRF.java
----------------------------------------------------------------------
diff --git a/src/main/java/org/apache/joshua/metrics/CHRF.java b/src/main/java/org/apache/joshua/metrics/CHRF.java
index f490f58..f02fc5f 100644
--- a/src/main/java/org/apache/joshua/metrics/CHRF.java
+++ b/src/main/java/org/apache/joshua/metrics/CHRF.java
@@ -28,7 +28,13 @@ import java.util.logging.Logger;
* - Use of a length penalty to prevent chrF to prefer too long (with beta>1) or too short (with beta<1) translations
* - Use of hash tables for efficient n-gram matching
*
- * If you use this metric in your research please cite [2]
+ * The metric has 2 parameters:
+ * - Beta. It assigns beta times more weight to recall than to precision. By default 1.
+ * Although for evaluation the best correlation was found with beta=3, we've found the
+ * best results for tuning so far with beta=1
+ * - Max-ngram. Maximum n-gram length (characters). By default 6.
+ *
+ * If you use this metric in your research please cite [2].
*
* [1] Maja Popovic. 2015. chrF: character n-gram F-score for automatic MT evaluation.
* In Proceedings of the Tenth Workshop on Statistical Machine Translation. Lisbon, Portugal, pages 392\u2013395.
@@ -41,7 +47,7 @@ import java.util.logging.Logger;
public class CHRF extends EvaluationMetric {
private static final Logger logger = Logger.getLogger(CHRF.class.getName());
- protected double beta = 3;
+ protected double beta = 1;
protected double factor;
protected int maxGramLength = 6; // The maximum n-gram we care about
//private double[] nGramWeights; //TODO to weight them differently
@@ -53,7 +59,7 @@ public class CHRF extends EvaluationMetric {
public CHRF()
{
- this(3, 6);
+ this(1, 6);
}
public CHRF(String[] CHRF_options)