You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:41 UTC
[30/60] [partial] incubator-joshua git commit: maven multi-module
layout 1st commit: moving files into joshua-core
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/mira/MIRACore.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/mira/MIRACore.java b/joshua-core/src/main/java/org/apache/joshua/mira/MIRACore.java
new file mode 100755
index 0000000..42dd995
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/mira/MIRACore.java
@@ -0,0 +1,3112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.mira;
+
+import java.io.BufferedReader;
+import java.io.BufferedWriter;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileNotFoundException;
+import java.io.FileOutputStream;
+import java.io.FileReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import java.text.DecimalFormat;
+import java.util.ArrayList;
+import java.util.Date;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Random;
+import java.util.Scanner;
+import java.util.TreeSet;
+import java.util.Vector;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.zip.GZIPInputStream;
+import java.util.zip.GZIPOutputStream;
+
+import org.apache.joshua.decoder.Decoder;
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.metrics.EvaluationMetric;
+import org.apache.joshua.util.StreamGobbler;
+import org.apache.joshua.corpus.Vocabulary;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This code was originally written by Yuan Cao, who copied the MERT code to produce this file.
+ */
+
+public class MIRACore {
+
+ private static final Logger LOG = LoggerFactory.getLogger(MIRACore.class);
+
+ private final JoshuaConfiguration joshuaConfiguration;
+ private TreeSet<Integer>[] indicesOfInterest_all;
+
+ private final static DecimalFormat f4 = new DecimalFormat("###0.0000");
+ private final Runtime myRuntime = Runtime.getRuntime();
+
+ private final static double NegInf = (-1.0 / 0.0);
+ private final static double PosInf = (+1.0 / 0.0);
+ private final static double epsilon = 1.0 / 1000000;
+
+ private int progress;
+
+ private int verbosity; // anything of priority <= verbosity will be printed
+ // (lower value for priority means more important)
+
+ private Random randGen;
+ private int generatedRands;
+
+ private int numSentences;
+ // number of sentences in the dev set
+ // (aka the "MERT training" set)
+
+ private int numDocuments;
+ // number of documents in the dev set
+ // this should be 1, unless doing doc-level optimization
+
+ private int[] docOfSentence;
+ // docOfSentence[i] stores which document contains the i'th sentence.
+ // docOfSentence is 0-indexed, as are the documents (i.e. first doc is indexed 0)
+
+ private int[] docSubsetInfo;
+ // stores information regarding which subset of the documents are evaluated
+ // [0]: method (0-6)
+ // [1]: first (1-indexed)
+ // [2]: last (1-indexed)
+ // [3]: size
+ // [4]: center
+ // [5]: arg1
+ // [6]: arg2
+ // [1-6] are 0 for method 0, [6] is 0 for methods 1-4 as well
+ // only [1] and [2] are needed for optimization. The rest are only needed for an output message.
+
+ private int refsPerSen;
+ // number of reference translations per sentence
+
+ private int textNormMethod;
+ // 0: no normalization, 1: "NIST-style" tokenization, and also rejoin 'm, 're, *'s, 've, 'll, 'd,
+ // and n't,
+ // 2: apply 1 and also rejoin dashes between letters, 3: apply 1 and also drop non-ASCII
+ // characters
+ // 4: apply 1+2+3
+
+ private int numParams;
+ // total number of firing features
+ // this number may increase overtime as new n-best lists are decoded
+ // initially it is equal to the # of params in the parameter config file
+ private int numParamsOld;
+ // number of features before observing the new features fired in the current iteration
+
+ private double[] normalizationOptions;
+ // How should a lambda[] vector be normalized (before decoding)?
+ // nO[0] = 0: no normalization
+ // nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1]
+ // nO[0] = 2: scale so that the maximum absolute value is nO[1]
+ // nO[0] = 3: scale so that the minimum absolute value is nO[1]
+ // nO[0] = 4: scale so that the L-nO[1] norm equals nO[2]
+
+ /* *********************************************************** */
+ /* NOTE: indexing starts at 1 in the following few arrays: */
+ /* *********************************************************** */
+
+ // private double[] lambda;
+ private ArrayList<Double> lambda = new ArrayList<Double>();
+ // the current weight vector. NOTE: indexing starts at 1.
+ private ArrayList<Double> bestLambda = new ArrayList<Double>();
+ // the best weight vector across all iterations
+
+ private boolean[] isOptimizable;
+ // isOptimizable[c] = true iff lambda[c] should be optimized
+
+ private double[] minRandValue;
+ private double[] maxRandValue;
+ // when choosing a random value for the lambda[c] parameter, it will be
+ // chosen from the [minRandValue[c],maxRandValue[c]] range.
+ // (*) minRandValue and maxRandValue must be real values, but not -Inf or +Inf
+
+ private double[] defaultLambda;
+ // "default" parameter values; simply the values read in the parameter file
+ // USED FOR NON-OPTIMIZABLE (FIXED) FEATURES
+
+ /* *********************************************************** */
+ /* *********************************************************** */
+
+ private Decoder myDecoder;
+ // COMMENT OUT if decoder is not Joshua
+
+ private String decoderCommand;
+ // the command that runs the decoder; read from decoderCommandFileName
+
+ private int decVerbosity;
+ // verbosity level for decoder output. If 0, decoder output is ignored.
+ // If 1, decoder output is printed.
+
+ private int validDecoderExitValue;
+ // return value from running the decoder command that indicates success
+
+ private int numOptThreads;
+ // number of threads to run things in parallel
+
+ private int saveInterFiles;
+ // 0: nothing, 1: only configs, 2: only n-bests, 3: both configs and n-bests
+
+ private int compressFiles;
+ // should MIRA gzip the large files? If 0, no compression takes place.
+ // If 1, compression is performed on: decoder output files, temp sents files,
+ // and temp feats files.
+
+ private int sizeOfNBest;
+ // size of N-best list generated by decoder at each iteration
+ // (aka simply N, but N is a bad variable name)
+
+ private long seed;
+ // seed used to create random number generators
+
+ private boolean randInit;
+ // if true, parameters are initialized randomly. If false, parameters
+ // are initialized using values from parameter file.
+
+ private int maxMERTIterations, minMERTIterations, prevMERTIterations;
+ // max: maximum number of MERT iterations
+ // min: minimum number of MERT iterations before an early MERT exit
+ // prev: number of previous MERT iterations from which to consider candidates (in addition to
+ // the candidates from the current iteration)
+
+ private double stopSigValue;
+ // early MERT exit if no weight changes by more than stopSigValue
+ // (but see minMERTIterations above and stopMinIts below)
+
+ private int stopMinIts;
+ // some early stopping criterion must be satisfied in stopMinIts *consecutive* iterations
+ // before an early exit (but see minMERTIterations above)
+
+ private boolean oneModificationPerIteration;
+ // if true, each MERT iteration performs at most one parameter modification.
+ // If false, a new MERT iteration starts (i.e. a new N-best list is
+ // generated) only after the previous iteration reaches a local maximum.
+
+ private String metricName;
+ // name of evaluation metric optimized by MERT
+
+ private String metricName_display;
+ // name of evaluation metric optimized by MERT, possibly with "doc-level " prefixed
+
+ private String[] metricOptions;
+ // options for the evaluation metric (e.g. for BLEU, maxGramLength and effLengthMethod)
+
+ private EvaluationMetric evalMetric;
+ // the evaluation metric used by MERT
+
+ private int suffStatsCount;
+ // number of sufficient statistics for the evaluation metric
+
+ private String tmpDirPrefix;
+ // prefix for the MIRA.temp.* files
+
+ private boolean passIterationToDecoder;
+ // should the iteration number be passed as an argument to decoderCommandFileName?
+
+ // used by mira
+ private boolean needShuffle = true; // shuffle the training sentences or not
+ private boolean needAvg = true; // average the weihgts or not?
+ private boolean runPercep = false; // run perceptron instead of mira
+ private boolean usePseudoBleu = true; // need to use pseudo corpus to compute bleu?
+ private boolean returnBest = false; // return the best weight during tuning
+ private boolean needScale = true; // need scaling?
+ private String trainingMode;
+ private int oraSelectMode = 1;
+ private int predSelectMode = 1;
+ private int miraIter = 1;
+ private int batchSize = 1;
+ private double C = 0.01; // relaxation coefficient
+ private double R = 0.99; // corpus decay when pseudo corpus is used for bleu computation
+ // private double sentForScale = 0.15; //percentage of sentences for scale factor estimation
+ private double scoreRatio = 5.0; // sclale so that model_score/metric_score = scoreratio
+ private double prevMetricScore = 0; // final metric score of the previous iteration, used only
+ // when returnBest = true
+
+ private String dirPrefix; // where are all these files located?
+ private String paramsFileName, docInfoFileName, finalLambdaFileName;
+ private String sourceFileName, refFileName, decoderOutFileName;
+ private String decoderConfigFileName, decoderCommandFileName;
+ private String fakeFileNameTemplate, fakeFileNamePrefix, fakeFileNameSuffix;
+
+ // e.g. output.it[1-x].someOldRun would be specified as:
+ // output.it?.someOldRun
+ // and we'd have prefix = "output.it" and suffix = ".sameOldRun"
+
+ // private int useDisk;
+
+ public MIRACore(JoshuaConfiguration joshuaConfiguration) {
+ this.joshuaConfiguration = joshuaConfiguration;
+ }
+
+ public MIRACore(String[] args, JoshuaConfiguration joshuaConfiguration) {
+ this.joshuaConfiguration = joshuaConfiguration;
+ EvaluationMetric.set_knownMetrics();
+ processArgsArray(args);
+ initialize(0);
+ }
+
+ public MIRACore(String configFileName, JoshuaConfiguration joshuaConfiguration) {
+ this.joshuaConfiguration = joshuaConfiguration;
+ EvaluationMetric.set_knownMetrics();
+ processArgsArray(cfgFileToArgsArray(configFileName));
+ initialize(0);
+ }
+
+ private void initialize(int randsToSkip) {
+ println("NegInf: " + NegInf + ", PosInf: " + PosInf + ", epsilon: " + epsilon, 4);
+
+ randGen = new Random(seed);
+ for (int r = 1; r <= randsToSkip; ++r) {
+ randGen.nextDouble();
+ }
+ generatedRands = randsToSkip;
+
+ if (randsToSkip == 0) {
+ println("----------------------------------------------------", 1);
+ println("Initializing...", 1);
+ println("----------------------------------------------------", 1);
+ println("", 1);
+
+ println("Random number generator initialized using seed: " + seed, 1);
+ println("", 1);
+ }
+
+ // count the total num of sentences to be decoded, reffilename is the combined reference file
+ // name(auto generated)
+ numSentences = countLines(refFileName) / refsPerSen;
+
+ // ??
+ processDocInfo();
+ // sets numDocuments and docOfSentence[]
+
+ if (numDocuments > 1)
+ metricName_display = "doc-level " + metricName;
+
+ // ??
+ set_docSubsetInfo(docSubsetInfo);
+
+ // count the number of initial features
+ numParams = countNonEmptyLines(paramsFileName) - 1;
+ numParamsOld = numParams;
+
+ // read parameter config file
+ try {
+ // read dense parameter names
+ BufferedReader inFile_names = new BufferedReader(new FileReader(paramsFileName));
+
+ for (int c = 1; c <= numParams; ++c) {
+ String line = "";
+ while (line != null && line.length() == 0) { // skip empty lines
+ line = inFile_names.readLine();
+ }
+
+ // save feature names
+ String paramName = (line.substring(0, line.indexOf("|||"))).trim();
+ Vocabulary.id(paramName);
+ // System.err.println(String.format("VOCAB(%s) = %d", paramName, id));
+ }
+
+ inFile_names.close();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ // the parameter file contains one line per parameter
+ // and one line for the normalization method
+ // indexing starts at 1 in these arrays
+ for (int p = 0; p <= numParams; ++p)
+ lambda.add(new Double(0));
+ bestLambda.add(new Double(0));
+ // why only lambda is a list? because the size of lambda
+ // may increase over time, but other arrays are specified in
+ // the param config file, only used for initialization
+ isOptimizable = new boolean[1 + numParams];
+ minRandValue = new double[1 + numParams];
+ maxRandValue = new double[1 + numParams];
+ defaultLambda = new double[1 + numParams];
+ normalizationOptions = new double[3];
+
+ // read initial param values
+ processParamFile();
+ // sets the arrays declared just above
+
+ // SentenceInfo.createV(); // uncomment ONLY IF using vocabulary implementation of SentenceInfo
+
+ String[][] refSentences = new String[numSentences][refsPerSen];
+
+ try {
+
+ // read in reference sentences
+ InputStream inStream_refs = new FileInputStream(new File(refFileName));
+ BufferedReader inFile_refs = new BufferedReader(new InputStreamReader(inStream_refs, "utf8"));
+
+ for (int i = 0; i < numSentences; ++i) {
+ for (int r = 0; r < refsPerSen; ++r) {
+ // read the rth reference translation for the ith sentence
+ refSentences[i][r] = inFile_refs.readLine();
+ }
+ }
+
+ inFile_refs.close();
+
+ // normalize reference sentences
+ for (int i = 0; i < numSentences; ++i) {
+ for (int r = 0; r < refsPerSen; ++r) {
+ // normalize the rth reference translation for the ith sentence
+ refSentences[i][r] = normalize(refSentences[i][r], textNormMethod);
+ }
+ }
+
+ // read in decoder command, if any
+ decoderCommand = null;
+ if (decoderCommandFileName != null) {
+ if (fileExists(decoderCommandFileName)) {
+ BufferedReader inFile_comm = new BufferedReader(new FileReader(decoderCommandFileName));
+ decoderCommand = inFile_comm.readLine(); // READ IN DECODE COMMAND
+ inFile_comm.close();
+ }
+ }
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ // set static data members for the EvaluationMetric class
+ EvaluationMetric.set_numSentences(numSentences);
+ EvaluationMetric.set_numDocuments(numDocuments);
+ EvaluationMetric.set_refsPerSen(refsPerSen);
+ EvaluationMetric.set_refSentences(refSentences);
+ EvaluationMetric.set_tmpDirPrefix(tmpDirPrefix);
+
+ evalMetric = EvaluationMetric.getMetric(metricName, metricOptions);
+ // used only if returnBest = true
+ prevMetricScore = evalMetric.getToBeMinimized() ? PosInf : NegInf;
+
+ // length of sufficient statistics
+ // for bleu: suffstatscount=8 (2*ngram+2)
+ suffStatsCount = evalMetric.get_suffStatsCount();
+
+ // set static data members for the IntermediateOptimizer class
+ /*
+ * IntermediateOptimizer.set_MERTparams(numSentences, numDocuments, docOfSentence,
+ * docSubsetInfo, numParams, normalizationOptions, isOptimizable oneModificationPerIteration,
+ * evalMetric, tmpDirPrefix, verbosity);
+ */
+
+ // print info
+ if (randsToSkip == 0) { // i.e. first iteration
+ println("Number of sentences: " + numSentences, 1);
+ println("Number of documents: " + numDocuments, 1);
+ println("Optimizing " + metricName_display, 1);
+
+ /*
+ * print("docSubsetInfo: {", 1); for (int f = 0; f < 6; ++f) print(docSubsetInfo[f] + ", ",
+ * 1); println(docSubsetInfo[6] + "}", 1);
+ */
+
+ println("Number of initial features: " + numParams, 1);
+ print("Initial feature names: {", 1);
+
+ for (int c = 1; c <= numParams; ++c)
+ print("\"" + Vocabulary.word(c) + "\"", 1);
+ println("}", 1);
+ println("", 1);
+
+ // TODO just print the correct info
+ println("c Default value\tOptimizable?\tRand. val. range", 1);
+
+ for (int c = 1; c <= numParams; ++c) {
+ print(c + " " + f4.format(lambda.get(c).doubleValue()) + "\t\t", 1);
+
+ if (!isOptimizable[c]) {
+ println(" No", 1);
+ } else {
+ print(" Yes\t\t", 1);
+ print(" [" + minRandValue[c] + "," + maxRandValue[c] + "]", 1);
+ println("", 1);
+ }
+ }
+
+ println("", 1);
+ print("Weight vector normalization method: ", 1);
+ if (normalizationOptions[0] == 0) {
+ println("none.", 1);
+ } else if (normalizationOptions[0] == 1) {
+ println(
+ "weights will be scaled so that the \""
+ + Vocabulary.word((int) normalizationOptions[2])
+ + "\" weight has an absolute value of " + normalizationOptions[1] + ".", 1);
+ } else if (normalizationOptions[0] == 2) {
+ println("weights will be scaled so that the maximum absolute value is "
+ + normalizationOptions[1] + ".", 1);
+ } else if (normalizationOptions[0] == 3) {
+ println("weights will be scaled so that the minimum absolute value is "
+ + normalizationOptions[1] + ".", 1);
+ } else if (normalizationOptions[0] == 4) {
+ println("weights will be scaled so that the L-" + normalizationOptions[1] + " norm is "
+ + normalizationOptions[2] + ".", 1);
+ }
+
+ println("", 1);
+
+ println("----------------------------------------------------", 1);
+ println("", 1);
+
+ // rename original config file so it doesn't get overwritten
+ // (original name will be restored in finish())
+ renameFile(decoderConfigFileName, decoderConfigFileName + ".MIRA.orig");
+ } // if (randsToSkip == 0)
+
+ // by default, load joshua decoder
+ if (decoderCommand == null && fakeFileNameTemplate == null) {
+ println("Loading Joshua decoder...", 1);
+ myDecoder = new Decoder(joshuaConfiguration, decoderConfigFileName + ".MIRA.orig");
+ println("...finished loading @ " + (new Date()), 1);
+ println("");
+ } else {
+ myDecoder = null;
+ }
+
+ @SuppressWarnings("unchecked")
+ TreeSet<Integer>[] temp_TSA = new TreeSet[numSentences];
+ indicesOfInterest_all = temp_TSA;
+
+ for (int i = 0; i < numSentences; ++i) {
+ indicesOfInterest_all[i] = new TreeSet<Integer>();
+ }
+ } // void initialize(...)
+
+ // -------------------------
+
+ public void run_MIRA() {
+ run_MIRA(minMERTIterations, maxMERTIterations, prevMERTIterations);
+ }
+
+ public void run_MIRA(int minIts, int maxIts, int prevIts) {
+ // FIRST, CLEAN ALL PREVIOUS TEMP FILES
+ String dir;
+ int k = tmpDirPrefix.lastIndexOf("/");
+ if (k >= 0) {
+ dir = tmpDirPrefix.substring(0, k + 1);
+ } else {
+ dir = "./";
+ }
+ String files;
+ File folder = new File(dir);
+
+ if (folder.exists()) {
+ File[] listOfFiles = folder.listFiles();
+
+ for (int i = 0; i < listOfFiles.length; i++) {
+ if (listOfFiles[i].isFile()) {
+ files = listOfFiles[i].getName();
+ if (files.startsWith("MIRA.temp")) {
+ deleteFile(files);
+ }
+ }
+ }
+ }
+
+ println("----------------------------------------------------", 1);
+ println("MIRA run started @ " + (new Date()), 1);
+ // printMemoryUsage();
+ println("----------------------------------------------------", 1);
+ println("", 1);
+
+ // if no default lambda is provided
+ if (randInit) {
+ println("Initializing lambda[] randomly.", 1);
+ // initialize optimizable parameters randomly (sampling uniformly from
+ // that parameter's random value range)
+ lambda = randomLambda();
+ }
+
+ println("Initial lambda[]: " + lambdaToString(lambda), 1);
+ println("", 1);
+
+ int[] maxIndex = new int[numSentences];
+
+ // HashMap<Integer,int[]>[] suffStats_array = new HashMap[numSentences];
+ // suffStats_array[i] maps candidates of interest for sentence i to an array
+ // storing the sufficient statistics for that candidate
+
+ int earlyStop = 0;
+ // number of consecutive iteration an early stopping criterion was satisfied
+
+ for (int iteration = 1;; ++iteration) {
+
+ // what does "A" contain?
+ // retA[0]: FINAL_score
+ // retA[1]: earlyStop
+ // retA[2]: should this be the last iteration?
+ double[] A = run_single_iteration(iteration, minIts, maxIts, prevIts, earlyStop, maxIndex);
+ if (A != null) {
+ earlyStop = (int) A[1];
+ if (A[2] == 1)
+ break;
+ } else {
+ break;
+ }
+
+ } // for (iteration)
+
+ println("", 1);
+
+ println("----------------------------------------------------", 1);
+ println("MIRA run ended @ " + (new Date()), 1);
+ // printMemoryUsage();
+ println("----------------------------------------------------", 1);
+ println("", 1);
+ if (!returnBest)
+ println("FINAL lambda: " + lambdaToString(lambda), 1);
+ // + " (" + metricName_display + ": " + FINAL_score + ")",1);
+ else
+ println("BEST lambda: " + lambdaToString(lambda), 1);
+
+ // delete intermediate .temp.*.it* decoder output files
+ for (int iteration = 1; iteration <= maxIts; ++iteration) {
+ if (compressFiles == 1) {
+ deleteFile(tmpDirPrefix + "temp.sents.it" + iteration + ".gz");
+ deleteFile(tmpDirPrefix + "temp.feats.it" + iteration + ".gz");
+ if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".copy.gz")) {
+ deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + ".copy.gz");
+ } else {
+ deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + ".gz");
+ }
+ } else {
+ deleteFile(tmpDirPrefix + "temp.sents.it" + iteration);
+ deleteFile(tmpDirPrefix + "temp.feats.it" + iteration);
+ if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".copy")) {
+ deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + ".copy");
+ } else {
+ deleteFile(tmpDirPrefix + "temp.stats.it" + iteration);
+ }
+ }
+ }
+ } // void run_MIRA(int maxIts)
+
+ // this is the key function!
+ @SuppressWarnings("unchecked")
+ public double[] run_single_iteration(int iteration, int minIts, int maxIts, int prevIts,
+ int earlyStop, int[] maxIndex) {
+ double FINAL_score = 0;
+
+ double[] retA = new double[3];
+ // retA[0]: FINAL_score
+ // retA[1]: earlyStop
+ // retA[2]: should this be the last iteration?
+
+ boolean done = false;
+ retA[2] = 1; // will only be made 0 if we don't break from the following loop
+
+ // save feats and stats for all candidates(old & new)
+ HashMap<String, String>[] feat_hash = new HashMap[numSentences];
+ for (int i = 0; i < numSentences; i++)
+ feat_hash[i] = new HashMap<String, String>();
+
+ HashMap<String, String>[] stats_hash = new HashMap[numSentences];
+ for (int i = 0; i < numSentences; i++)
+ stats_hash[i] = new HashMap<String, String>();
+
+ while (!done) { // NOTE: this "loop" will only be carried out once
+ println("--- Starting MIRA iteration #" + iteration + " @ " + (new Date()) + " ---", 1);
+
+ // printMemoryUsage();
+
+ /******************************/
+ // CREATE DECODER CONFIG FILE //
+ /******************************/
+
+ createConfigFile(lambda, decoderConfigFileName, decoderConfigFileName + ".MIRA.orig");
+ // i.e. use the original config file as a template
+
+ /***************/
+ // RUN DECODER //
+ /***************/
+
+ if (iteration == 1) {
+ println("Decoding using initial weight vector " + lambdaToString(lambda), 1);
+ } else {
+ println("Redecoding using weight vector " + lambdaToString(lambda), 1);
+ }
+
+ // generate the n-best file after decoding
+ String[] decRunResult = run_decoder(iteration); // iteration passed in case fake decoder will
+ // be used
+ // [0] name of file to be processed
+ // [1] indicates how the output file was obtained:
+ // 1: external decoder
+ // 2: fake decoder
+ // 3: internal decoder
+
+ if (!decRunResult[1].equals("2")) {
+ println("...finished decoding @ " + (new Date()), 1);
+ }
+
+ checkFile(decRunResult[0]);
+
+ /************* END OF DECODING **************/
+
+ println("Producing temp files for iteration " + iteration, 3);
+
+ produceTempFiles(decRunResult[0], iteration);
+
+ // save intermedidate output files
+ // save joshua.config.mira.it*
+ if (saveInterFiles == 1 || saveInterFiles == 3) { // make copy of intermediate config file
+ if (!copyFile(decoderConfigFileName, decoderConfigFileName + ".MIRA.it" + iteration)) {
+ println("Warning: attempt to make copy of decoder config file (to create"
+ + decoderConfigFileName + ".MIRA.it" + iteration + ") was unsuccessful!", 1);
+ }
+ }
+
+ // save output.nest.MIRA.it*
+ if (saveInterFiles == 2 || saveInterFiles == 3) { // make copy of intermediate decoder output
+ // file...
+
+ if (!decRunResult[1].equals("2")) { // ...but only if no fake decoder
+ if (!decRunResult[0].endsWith(".gz")) {
+ if (!copyFile(decRunResult[0], decRunResult[0] + ".MIRA.it" + iteration)) {
+ println("Warning: attempt to make copy of decoder output file (to create"
+ + decRunResult[0] + ".MIRA.it" + iteration + ") was unsuccessful!", 1);
+ }
+ } else {
+ String prefix = decRunResult[0].substring(0, decRunResult[0].length() - 3);
+ if (!copyFile(prefix + ".gz", prefix + ".MIRA.it" + iteration + ".gz")) {
+ println("Warning: attempt to make copy of decoder output file (to create" + prefix
+ + ".MIRA.it" + iteration + ".gz" + ") was unsuccessful!", 1);
+ }
+ }
+
+ if (compressFiles == 1 && !decRunResult[0].endsWith(".gz")) {
+ gzipFile(decRunResult[0] + ".MIRA.it" + iteration);
+ }
+ } // if (!fake)
+ }
+
+ // ------------- end of saving .mira.it* files ---------------
+
+ int[] candCount = new int[numSentences];
+ int[] lastUsedIndex = new int[numSentences];
+
+ ConcurrentHashMap<Integer, int[]>[] suffStats_array = new ConcurrentHashMap[numSentences];
+ for (int i = 0; i < numSentences; ++i) {
+ candCount[i] = 0;
+ lastUsedIndex[i] = -1;
+ // suffStats_array[i].clear();
+ suffStats_array[i] = new ConcurrentHashMap<Integer, int[]>();
+ }
+
+ // initLambda[0] is not used!
+ double[] initialLambda = new double[1 + numParams];
+ for (int i = 1; i <= numParams; ++i)
+ initialLambda[i] = lambda.get(i);
+
+ // the "score" in initialScore refers to that
+ // assigned by the evaluation metric)
+
+ // you may consider all candidates from iter 1, or from iter (iteration-prevIts) to current
+ // iteration
+ int firstIt = Math.max(1, iteration - prevIts);
+ // i.e. only process candidates from the current iteration and candidates
+ // from up to prevIts previous iterations.
+ println("Reading candidate translations from iterations " + firstIt + "-" + iteration, 1);
+ println("(and computing " + metricName
+ + " sufficient statistics for previously unseen candidates)", 1);
+ print(" Progress: ");
+
+ int[] newCandidatesAdded = new int[1 + iteration];
+ for (int it = 1; it <= iteration; ++it)
+ newCandidatesAdded[it] = 0;
+
+ try {
+ // read temp files from all past iterations
+ // 3 types of temp files:
+ // 1. output hypo at iter i
+ // 2. feature value of each hypo at iter i
+ // 3. suff stats of each hypo at iter i
+
+ // each inFile corresponds to the output of an iteration
+ // (index 0 is not used; no corresponding index for the current iteration)
+ BufferedReader[] inFile_sents = new BufferedReader[iteration];
+ BufferedReader[] inFile_feats = new BufferedReader[iteration];
+ BufferedReader[] inFile_stats = new BufferedReader[iteration];
+
+ // temp file(array) from previous iterations
+ for (int it = firstIt; it < iteration; ++it) {
+ InputStream inStream_sents, inStream_feats, inStream_stats;
+ if (compressFiles == 0) {
+ inStream_sents = new FileInputStream(tmpDirPrefix + "temp.sents.it" + it);
+ inStream_feats = new FileInputStream(tmpDirPrefix + "temp.feats.it" + it);
+ inStream_stats = new FileInputStream(tmpDirPrefix + "temp.stats.it" + it);
+ } else {
+ inStream_sents = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.sents.it"
+ + it + ".gz"));
+ inStream_feats = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.feats.it"
+ + it + ".gz"));
+ inStream_stats = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.stats.it"
+ + it + ".gz"));
+ }
+
+ inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8"));
+ inFile_feats[it] = new BufferedReader(new InputStreamReader(inStream_feats, "utf8"));
+ inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8"));
+ }
+
+ InputStream inStream_sentsCurrIt, inStream_featsCurrIt, inStream_statsCurrIt;
+ // temp file for current iteration!
+ if (compressFiles == 0) {
+ inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration);
+ inStream_featsCurrIt = new FileInputStream(tmpDirPrefix + "temp.feats.it" + iteration);
+ } else {
+ inStream_sentsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix
+ + "temp.sents.it" + iteration + ".gz"));
+ inStream_featsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix
+ + "temp.feats.it" + iteration + ".gz"));
+ }
+
+ BufferedReader inFile_sentsCurrIt = new BufferedReader(new InputStreamReader(
+ inStream_sentsCurrIt, "utf8"));
+ BufferedReader inFile_featsCurrIt = new BufferedReader(new InputStreamReader(
+ inStream_featsCurrIt, "utf8"));
+
+ BufferedReader inFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below
+ // is set to true
+ PrintWriter outFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below is
+ // set to false
+
+ // just to check if temp.stat.it.iteration exists
+ boolean statsCurrIt_exists = false;
+
+ if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration)) {
+ inStream_statsCurrIt = new FileInputStream(tmpDirPrefix + "temp.stats.it" + iteration);
+ inFile_statsCurrIt = new BufferedReader(new InputStreamReader(inStream_statsCurrIt,
+ "utf8"));
+ statsCurrIt_exists = true;
+ copyFile(tmpDirPrefix + "temp.stats.it" + iteration, tmpDirPrefix + "temp.stats.it"
+ + iteration + ".copy");
+ } else if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".gz")) {
+ inStream_statsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix
+ + "temp.stats.it" + iteration + ".gz"));
+ inFile_statsCurrIt = new BufferedReader(new InputStreamReader(inStream_statsCurrIt,
+ "utf8"));
+ statsCurrIt_exists = true;
+ copyFile(tmpDirPrefix + "temp.stats.it" + iteration + ".gz", tmpDirPrefix
+ + "temp.stats.it" + iteration + ".copy.gz");
+ } else {
+ outFile_statsCurrIt = new PrintWriter(tmpDirPrefix + "temp.stats.it" + iteration);
+ }
+
+ // output the 4^th temp file: *.temp.stats.merged
+ PrintWriter outFile_statsMerged = new PrintWriter(tmpDirPrefix + "temp.stats.merged");
+ // write sufficient statistics from all the sentences
+ // from the output files into a single file
+ PrintWriter outFile_statsMergedKnown = new PrintWriter(tmpDirPrefix
+ + "temp.stats.mergedKnown");
+ // write sufficient statistics from all the sentences
+ // from the output files into a single file
+
+ // output the 5^th 6^th temp file, but will be deleted at the end of the function
+ FileOutputStream outStream_unknownCands = new FileOutputStream(tmpDirPrefix
+ + "temp.currIt.unknownCands", false);
+ OutputStreamWriter outStreamWriter_unknownCands = new OutputStreamWriter(
+ outStream_unknownCands, "utf8");
+ BufferedWriter outFile_unknownCands = new BufferedWriter(outStreamWriter_unknownCands);
+
+ PrintWriter outFile_unknownIndices = new PrintWriter(tmpDirPrefix
+ + "temp.currIt.unknownIndices");
+
+ String sents_str, feats_str, stats_str;
+
+ // BUG: this assumes a candidate string cannot be produced for two
+ // different source sentences, which is not necessarily true
+ // (It's not actually a bug, but only because existingCandStats gets
+ // cleared before moving to the next source sentence.)
+ // FIX: should be made an array, indexed by i
+ HashMap<String, String> existingCandStats = new HashMap<String, String>();
+ // VERY IMPORTANT:
+ // A CANDIDATE X MAY HAVE APPEARED IN ITER 1 AND ITER 3,
+ // BUT IF THE USER SPECIFIED TO CONSIDER ITERATIONS FROM ONLY ITER 2, THEN
+ // X IS NOT A "REPEATED" CANDIDATE IN ITER 3. THEREFORE WE WANT TO KEEP THE
+ // SUFF STATS FOR EACH CANDIDATE(TO SAVE COMPUTATION IN THE FUTURE)
+
+ // Stores precalculated sufficient statistics for candidates, in case
+ // the same candidate is seen again. (SS stored as a String.)
+ // Q: Why do we care? If we see the same candidate again, aren't we going
+ // to ignore it? So, why do we care about the SS of this repeat candidate?
+ // A: A "repeat" candidate may not be a repeat candidate in later
+ // iterations if the user specifies a value for prevMERTIterations
+ // that causes MERT to skip candidates from early iterations.
+
+ double[] currFeatVal = new double[1 + numParams];
+ String[] featVal_str;
+
+ int totalCandidateCount = 0;
+
+ // new candidate size for each sentence
+ int[] sizeUnknown_currIt = new int[numSentences];
+
+ for (int i = 0; i < numSentences; ++i) {
+ // process candidates from previous iterations
+ // low efficiency? for each iteration, it reads in all previous iteration outputs
+ // therefore a lot of overlapping jobs
+ // this is an easy implementation to deal with the situation in which user only specified
+ // "previt" and hopes to consider only the previous previt
+ // iterations, then for each iteration the existing candidates will be different
+ for (int it = firstIt; it < iteration; ++it) {
+ // Why up to but *excluding* iteration?
+ // Because the last iteration is handled a little differently, since
+ // the SS must be calculated (and the corresponding file created),
+ // which is not true for previous iterations.
+
+ for (int n = 0; n <= sizeOfNBest; ++n) {
+ // note that in all temp files, "||||||" is a separator between 2 n-best lists
+
+ // Why up to and *including* sizeOfNBest?
+ // So that it would read the "||||||" separator even if there is
+ // a complete list of sizeOfNBest candidates.
+
+ // for the nth candidate for the ith sentence, read the sentence, feature values,
+ // and sufficient statistics from the various temp files
+
+ // read one line of temp.sent, temp.feat, temp.stats from iteration it
+ sents_str = inFile_sents[it].readLine();
+ feats_str = inFile_feats[it].readLine();
+ stats_str = inFile_stats[it].readLine();
+
+ if (sents_str.equals("||||||")) {
+ n = sizeOfNBest + 1; // move on to the next n-best list
+ } else if (!existingCandStats.containsKey(sents_str)) // if this candidate does not
+ // exist
+ {
+ outFile_statsMergedKnown.println(stats_str);
+
+ // save feats & stats
+ feat_hash[i].put(sents_str, feats_str);
+ stats_hash[i].put(sents_str, stats_str);
+
+ // extract feature value
+ featVal_str = feats_str.split("\\s+");
+
+ existingCandStats.put(sents_str, stats_str);
+ candCount[i] += 1;
+ newCandidatesAdded[it] += 1;
+
+ } // if unseen candidate
+ } // for (n)
+ } // for (it)
+
+ outFile_statsMergedKnown.println("||||||");
+
+ // ---------- end of processing previous iterations ----------
+ // ---------- now start processing new candidates ----------
+
+ // now process the candidates of the current iteration
+ // now determine the new candidates of the current iteration
+
+ /*
+ * remember: BufferedReader inFile_sentsCurrIt BufferedReader inFile_featsCurrIt
+ * PrintWriter outFile_statsCurrIt
+ */
+
+ String[] sentsCurrIt_currSrcSent = new String[sizeOfNBest + 1];
+
+ Vector<String> unknownCands_V = new Vector<String>();
+ // which candidates (of the i'th source sentence) have not been seen before
+ // this iteration?
+
+ for (int n = 0; n <= sizeOfNBest; ++n) {
+ // Why up to and *including* sizeOfNBest?
+ // So that it would read the "||||||" separator even if there is
+ // a complete list of sizeOfNBest candidates.
+
+ // for the nth candidate for the ith sentence, read the sentence,
+ // and store it in the sentsCurrIt_currSrcSent array
+
+ sents_str = inFile_sentsCurrIt.readLine(); // read one candidate from the current
+ // iteration
+ sentsCurrIt_currSrcSent[n] = sents_str; // Note: possibly "||||||"
+
+ if (sents_str.equals("||||||")) {
+ n = sizeOfNBest + 1;
+ } else if (!existingCandStats.containsKey(sents_str)) {
+ unknownCands_V.add(sents_str); // NEW CANDIDATE FROM THIS ITERATION
+ writeLine(sents_str, outFile_unknownCands);
+ outFile_unknownIndices.println(i); // INDEX OF THE NEW CANDIDATES
+ newCandidatesAdded[iteration] += 1;
+ existingCandStats.put(sents_str, "U"); // i.e. unknown
+ // we add sents_str to avoid duplicate entries in unknownCands_V
+ }
+ } // for (n)
+
+ // only compute suff stats for new candidates
+ // now unknownCands_V has the candidates for which we need to calculate
+ // sufficient statistics (for the i'th source sentence)
+ int sizeUnknown = unknownCands_V.size();
+ sizeUnknown_currIt[i] = sizeUnknown;
+
+ existingCandStats.clear();
+
+ } // for (i) each sentence
+
+ // ---------- end of merging candidates stats from previous iterations
+ // and finding new candidates ------------
+
+ /*
+ * int[][] newSuffStats = null; if (!statsCurrIt_exists && sizeUnknown > 0) { newSuffStats =
+ * evalMetric.suffStats(unknownCands, indices); }
+ */
+
+ outFile_statsMergedKnown.close();
+ outFile_unknownCands.close();
+ outFile_unknownIndices.close();
+
+ // want to re-open all temp files and start from scratch again?
+ for (int it = firstIt; it < iteration; ++it) // previous iterations temp files
+ {
+ inFile_sents[it].close();
+ inFile_stats[it].close();
+
+ InputStream inStream_sents, inStream_stats;
+ if (compressFiles == 0) {
+ inStream_sents = new FileInputStream(tmpDirPrefix + "temp.sents.it" + it);
+ inStream_stats = new FileInputStream(tmpDirPrefix + "temp.stats.it" + it);
+ } else {
+ inStream_sents = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.sents.it"
+ + it + ".gz"));
+ inStream_stats = new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.stats.it"
+ + it + ".gz"));
+ }
+
+ inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8"));
+ inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8"));
+ }
+
+ inFile_sentsCurrIt.close();
+ // current iteration temp files
+ if (compressFiles == 0) {
+ inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration);
+ } else {
+ inStream_sentsCurrIt = new GZIPInputStream(new FileInputStream(tmpDirPrefix
+ + "temp.sents.it" + iteration + ".gz"));
+ }
+ inFile_sentsCurrIt = new BufferedReader(new InputStreamReader(inStream_sentsCurrIt, "utf8"));
+
+ // calculate SS for unseen candidates and write them to file
+ FileInputStream inStream_statsCurrIt_unknown = null;
+ BufferedReader inFile_statsCurrIt_unknown = null;
+
+ if (!statsCurrIt_exists && newCandidatesAdded[iteration] > 0) {
+ // create the file...
+ evalMetric.createSuffStatsFile(tmpDirPrefix + "temp.currIt.unknownCands", tmpDirPrefix
+ + "temp.currIt.unknownIndices", tmpDirPrefix + "temp.stats.unknown", sizeOfNBest);
+
+ // ...and open it
+ inStream_statsCurrIt_unknown = new FileInputStream(tmpDirPrefix + "temp.stats.unknown");
+ inFile_statsCurrIt_unknown = new BufferedReader(new InputStreamReader(
+ inStream_statsCurrIt_unknown, "utf8"));
+ }
+
+ // open mergedKnown file
+ // newly created by the big loop above
+ FileInputStream instream_statsMergedKnown = new FileInputStream(tmpDirPrefix
+ + "temp.stats.mergedKnown");
+ BufferedReader inFile_statsMergedKnown = new BufferedReader(new InputStreamReader(
+ instream_statsMergedKnown, "utf8"));
+
+ // num of features before observing new firing features from this iteration
+ numParamsOld = numParams;
+
+ for (int i = 0; i < numSentences; ++i) {
+ // reprocess candidates from previous iterations
+ for (int it = firstIt; it < iteration; ++it) {
+ for (int n = 0; n <= sizeOfNBest; ++n) {
+ sents_str = inFile_sents[it].readLine();
+ stats_str = inFile_stats[it].readLine();
+
+ if (sents_str.equals("||||||")) {
+ n = sizeOfNBest + 1;
+ } else if (!existingCandStats.containsKey(sents_str)) {
+ existingCandStats.put(sents_str, stats_str);
+ } // if unseen candidate
+ } // for (n)
+ } // for (it)
+
+ // copy relevant portion from mergedKnown to the merged file
+ String line_mergedKnown = inFile_statsMergedKnown.readLine();
+ while (!line_mergedKnown.equals("||||||")) {
+ outFile_statsMerged.println(line_mergedKnown);
+ line_mergedKnown = inFile_statsMergedKnown.readLine();
+ }
+
+ int[] stats = new int[suffStatsCount];
+
+ for (int n = 0; n <= sizeOfNBest; ++n) {
+ sents_str = inFile_sentsCurrIt.readLine();
+ feats_str = inFile_featsCurrIt.readLine();
+
+ if (sents_str.equals("||||||")) {
+ n = sizeOfNBest + 1;
+ } else if (!existingCandStats.containsKey(sents_str)) {
+
+ if (!statsCurrIt_exists) {
+ stats_str = inFile_statsCurrIt_unknown.readLine();
+
+ String[] temp_stats = stats_str.split("\\s+");
+ for (int s = 0; s < suffStatsCount; ++s) {
+ stats[s] = Integer.parseInt(temp_stats[s]);
+ }
+
+ outFile_statsCurrIt.println(stats_str);
+ } else {
+ stats_str = inFile_statsCurrIt.readLine();
+
+ String[] temp_stats = stats_str.split("\\s+");
+ for (int s = 0; s < suffStatsCount; ++s) {
+ stats[s] = Integer.parseInt(temp_stats[s]);
+ }
+ }
+
+ outFile_statsMerged.println(stats_str);
+
+ // save feats & stats
+ // System.out.println(sents_str+" "+feats_str);
+
+ feat_hash[i].put(sents_str, feats_str);
+ stats_hash[i].put(sents_str, stats_str);
+
+ featVal_str = feats_str.split("\\s+");
+
+ if (feats_str.indexOf('=') != -1) {
+ for (String featurePair : featVal_str) {
+ String[] pair = featurePair.split("=");
+ String name = pair[0];
+ Double value = Double.parseDouble(pair[1]);
+ int featId = Vocabulary.id(name);
+
+ // need to identify newly fired feats here
+ // in this case currFeatVal is not given the value
+ // of the new feat, since the corresponding weight is
+ // initialized as zero anyway
+ if (featId > numParams) {
+ ++numParams;
+ lambda.add(new Double(0));
+ }
+ }
+ }
+ existingCandStats.put(sents_str, stats_str);
+ candCount[i] += 1;
+
+ // newCandidatesAdded[iteration] += 1;
+ // moved to code above detecting new candidates
+ } else {
+ if (statsCurrIt_exists)
+ inFile_statsCurrIt.readLine();
+ else {
+ // write SS to outFile_statsCurrIt
+ stats_str = existingCandStats.get(sents_str);
+ outFile_statsCurrIt.println(stats_str);
+ }
+ }
+
+ } // for (n)
+
+ // now d = sizeUnknown_currIt[i] - 1
+
+ if (statsCurrIt_exists)
+ inFile_statsCurrIt.readLine();
+ else
+ outFile_statsCurrIt.println("||||||");
+
+ existingCandStats.clear();
+ totalCandidateCount += candCount[i];
+
+ // output sentence progress
+ if ((i + 1) % 500 == 0) {
+ print((i + 1) + "\n" + " ", 1);
+ } else if ((i + 1) % 100 == 0) {
+ print("+", 1);
+ } else if ((i + 1) % 25 == 0) {
+ print(".", 1);
+ }
+
+ } // for (i)
+
+ inFile_statsMergedKnown.close();
+ outFile_statsMerged.close();
+
+ // for testing
+ /*
+ * int total_sent = 0; for( int i=0; i<numSentences; i++ ) {
+ * System.out.println(feat_hash[i].size()+" "+candCount[i]); total_sent +=
+ * feat_hash[i].size(); feat_hash[i].clear(); }
+ * System.out.println("----------------total sent: "+total_sent); total_sent = 0; for( int
+ * i=0; i<numSentences; i++ ) { System.out.println(stats_hash[i].size()+" "+candCount[i]);
+ * total_sent += stats_hash[i].size(); stats_hash[i].clear(); }
+ * System.out.println("*****************total sent: "+total_sent);
+ */
+
+ println("", 1); // finish progress line
+
+ for (int it = firstIt; it < iteration; ++it) {
+ inFile_sents[it].close();
+ inFile_feats[it].close();
+ inFile_stats[it].close();
+ }
+
+ inFile_sentsCurrIt.close();
+ inFile_featsCurrIt.close();
+ if (statsCurrIt_exists)
+ inFile_statsCurrIt.close();
+ else
+ outFile_statsCurrIt.close();
+
+ if (compressFiles == 1 && !statsCurrIt_exists) {
+ gzipFile(tmpDirPrefix + "temp.stats.it" + iteration);
+ }
+
+ // clear temp files
+ deleteFile(tmpDirPrefix + "temp.currIt.unknownCands");
+ deleteFile(tmpDirPrefix + "temp.currIt.unknownIndices");
+ deleteFile(tmpDirPrefix + "temp.stats.unknown");
+ deleteFile(tmpDirPrefix + "temp.stats.mergedKnown");
+
+ // cleanupMemory();
+
+ println("Processed " + totalCandidateCount + " distinct candidates " + "(about "
+ + totalCandidateCount / numSentences + " per sentence):", 1);
+ for (int it = firstIt; it <= iteration; ++it) {
+ println("newCandidatesAdded[it=" + it + "] = " + newCandidatesAdded[it] + " (about "
+ + newCandidatesAdded[it] / numSentences + " per sentence)", 1);
+ }
+
+ println("", 1);
+
+ println("Number of features observed so far: " + numParams);
+ println("", 1);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ // n-best list converges
+ if (newCandidatesAdded[iteration] == 0) {
+ if (!oneModificationPerIteration) {
+ println("No new candidates added in this iteration; exiting MIRA.", 1);
+ println("", 1);
+ println("--- MIRA iteration #" + iteration + " ending @ " + (new Date()) + " ---", 1);
+ println("", 1);
+ deleteFile(tmpDirPrefix + "temp.stats.merged");
+
+ if (returnBest) {
+ // note that bestLambda.size() <= lambda.size()
+ for (int p = 1; p < bestLambda.size(); ++p)
+ lambda.set(p, bestLambda.get(p));
+ // and set the rest of lambda to be 0
+ for (int p = 0; p < lambda.size() - bestLambda.size(); ++p)
+ lambda.set(p + bestLambda.size(), new Double(0));
+ }
+
+ return null; // this means that the old values should be kept by the caller
+ } else {
+ println("Note: No new candidates added in this iteration.", 1);
+ }
+ }
+
+ /************* start optimization **************/
+
+ /*
+ * for( int v=1; v<initialLambda[1].length; v++ ) System.out.print(initialLambda[1][v]+" ");
+ * System.exit(0);
+ */
+
+ Optimizer.sentNum = numSentences; // total number of training sentences
+ Optimizer.needShuffle = needShuffle;
+ Optimizer.miraIter = miraIter;
+ Optimizer.oraSelectMode = oraSelectMode;
+ Optimizer.predSelectMode = predSelectMode;
+ Optimizer.runPercep = runPercep;
+ Optimizer.C = C;
+ Optimizer.needAvg = needAvg;
+ // Optimizer.sentForScale = sentForScale;
+ Optimizer.scoreRatio = scoreRatio;
+ Optimizer.evalMetric = evalMetric;
+ Optimizer.normalizationOptions = normalizationOptions;
+ Optimizer.needScale = needScale;
+ Optimizer.batchSize = batchSize;
+
+ // if need to use bleu stats history
+ if (iteration == 1) {
+ if (evalMetric.get_metricName().equals("BLEU") && usePseudoBleu) {
+ Optimizer.initBleuHistory(numSentences, evalMetric.get_suffStatsCount());
+ Optimizer.usePseudoBleu = usePseudoBleu;
+ Optimizer.R = R;
+ }
+ if (evalMetric.get_metricName().equals("TER-BLEU") && usePseudoBleu) {
+ Optimizer.initBleuHistory(numSentences, evalMetric.get_suffStatsCount() - 2); // Stats
+ // count of
+ // TER=2
+ Optimizer.usePseudoBleu = usePseudoBleu;
+ Optimizer.R = R;
+ }
+ }
+
+ Vector<String> output = new Vector<String>();
+
+ // note: initialLambda[] has length = numParamsOld
+ // augmented with new feature weights, initial values are 0
+ double[] initialLambdaNew = new double[1 + numParams];
+ System.arraycopy(initialLambda, 1, initialLambdaNew, 1, numParamsOld);
+
+ // finalLambda[] has length = numParams (considering new features)
+ double[] finalLambda = new double[1 + numParams];
+
+ Optimizer opt = new Optimizer(output, isOptimizable, initialLambdaNew, feat_hash, stats_hash);
+ finalLambda = opt.runOptimizer();
+
+ if (returnBest) {
+ double metricScore = opt.getMetricScore();
+ if (!evalMetric.getToBeMinimized()) {
+ if (metricScore > prevMetricScore) {
+ prevMetricScore = metricScore;
+ for (int p = 1; p < bestLambda.size(); ++p)
+ bestLambda.set(p, finalLambda[p]);
+ if (1 + numParams > bestLambda.size()) {
+ for (int p = bestLambda.size(); p <= numParams; ++p)
+ bestLambda.add(p, finalLambda[p]);
+ }
+ }
+ } else {
+ if (metricScore < prevMetricScore) {
+ prevMetricScore = metricScore;
+ for (int p = 1; p < bestLambda.size(); ++p)
+ bestLambda.set(p, finalLambda[p]);
+ if (1 + numParams > bestLambda.size()) {
+ for (int p = bestLambda.size(); p <= numParams; ++p)
+ bestLambda.add(p, finalLambda[p]);
+ }
+ }
+ }
+ }
+
+ // System.out.println(finalLambda.length);
+ // for( int i=0; i<finalLambda.length-1; i++ )
+ // System.out.println(finalLambda[i+1]);
+
+ /************* end optimization **************/
+
+ for (int i = 0; i < output.size(); i++)
+ println(output.get(i));
+
+ // check if any parameter has been updated
+ boolean anyParamChanged = false;
+ boolean anyParamChangedSignificantly = false;
+
+ for (int c = 1; c <= numParams; ++c) {
+ if (finalLambda[c] != lambda.get(c)) {
+ anyParamChanged = true;
+ }
+ if (Math.abs(finalLambda[c] - lambda.get(c)) > stopSigValue) {
+ anyParamChangedSignificantly = true;
+ }
+ }
+
+ // System.arraycopy(finalLambda,1,lambda,1,numParams);
+
+ println("--- MIRA iteration #" + iteration + " ending @ " + (new Date()) + " ---", 1);
+ println("", 1);
+
+ if (!anyParamChanged) {
+ println("No parameter value changed in this iteration; exiting MIRA.", 1);
+ println("", 1);
+ break; // exit for (iteration) loop preemptively
+ }
+
+ // was an early stopping criterion satisfied?
+ boolean critSatisfied = false;
+ if (!anyParamChangedSignificantly && stopSigValue >= 0) {
+ println("Note: No parameter value changed significantly " + "(i.e. by more than "
+ + stopSigValue + ") in this iteration.", 1);
+ critSatisfied = true;
+ }
+
+ if (critSatisfied) {
+ ++earlyStop;
+ println("", 1);
+ } else {
+ earlyStop = 0;
+ }
+
+ // if min number of iterations executed, investigate if early exit should happen
+ if (iteration >= minIts && earlyStop >= stopMinIts) {
+ println("Some early stopping criteria has been observed " + "in " + stopMinIts
+ + " consecutive iterations; exiting MIRA.", 1);
+ println("", 1);
+
+ if (returnBest) {
+ for (int f = 1; f <= bestLambda.size() - 1; ++f)
+ lambda.set(f, bestLambda.get(f));
+ } else {
+ for (int f = 1; f <= numParams; ++f)
+ lambda.set(f, finalLambda[f]);
+ }
+
+ break; // exit for (iteration) loop preemptively
+ }
+
+ // if max number of iterations executed, exit
+ if (iteration >= maxIts) {
+ println("Maximum number of MIRA iterations reached; exiting MIRA.", 1);
+ println("", 1);
+
+ if (returnBest) {
+ for (int f = 1; f <= bestLambda.size() - 1; ++f)
+ lambda.set(f, bestLambda.get(f));
+ } else {
+ for (int f = 1; f <= numParams; ++f)
+ lambda.set(f, finalLambda[f]);
+ }
+
+ break; // exit for (iteration) loop
+ }
+
+ // use the new wt vector to decode the next iteration
+ // (interpolation with previous wt vector)
+ double interCoef = 1.0; // no interpolation for now
+ for (int i = 1; i <= numParams; i++)
+ lambda.set(i, interCoef * finalLambda[i] + (1 - interCoef) * lambda.get(i).doubleValue());
+
+ println("Next iteration will decode with lambda: " + lambdaToString(lambda), 1);
+ println("", 1);
+
+ // printMemoryUsage();
+ for (int i = 0; i < numSentences; ++i) {
+ suffStats_array[i].clear();
+ }
+ // cleanupMemory();
+ // println("",2);
+
+ retA[2] = 0; // i.e. this should NOT be the last iteration
+ done = true;
+
+ } // while (!done) // NOTE: this "loop" will only be carried out once
+
+ // delete .temp.stats.merged file, since it is not needed in the next
+ // iteration (it will be recreated from scratch)
+ deleteFile(tmpDirPrefix + "temp.stats.merged");
+
+ retA[0] = FINAL_score;
+ retA[1] = earlyStop;
+ return retA;
+
+ } // run_single_iteration
+
+  /**
+   * Renders the first few lambda values as a human-readable string, e.g.
+   * {@code {(listing the first 3 lambdas)0.1000, 0.2000, 0.3000}}.
+   * At most the first 15 features are printed.
+   *
+   * @param lambdaA 1-indexed weight vector (index 0 is unused, as elsewhere in this class)
+   * @return a brace-delimited, comma-separated preview of the weights
+   */
+  private String lambdaToString(ArrayList<Double> lambdaA) {
+    // print at most the first 15 features
+    int featToPrint = numParams > 15 ? 15 : numParams;
+
+    StringBuilder retStr = new StringBuilder("{");
+    retStr.append("(listing the first ").append(featToPrint).append(" lambdas)");
+    for (int c = 1; c <= featToPrint - 1; ++c) {
+      retStr.append(String.format("%.4f", lambdaA.get(c).doubleValue())).append(", ");
+    }
+    // BUG FIX: this used to read lambdaA.get(numParams), which printed the LAST
+    // feature's weight (not the featToPrint-th) whenever numParams > 15,
+    // contradicting the "(listing the first N lambdas)" label.
+    retStr.append(String.format("%.4f", lambdaA.get(featToPrint).doubleValue())).append("}");
+
+    return retStr.toString();
+  }
+
+  /**
+   * Produces the n-best list for this iteration, either by reusing a pre-computed
+   * ("fake") decoder output file or by launching the external decoder command.
+   *
+   * @param iteration the current MIRA iteration (may be passed to the decoder)
+   * @return a 2-element array: [0] = name of the n-best output file,
+   *         [1] = how it was obtained ("1": external decoder, "2": fake decoder,
+   *         "3": internal decoder)
+   */
+  private String[] run_decoder(int iteration) {
+    String[] retSA = new String[2];
+
+    String fakeFileName = fakeFileNamePrefix + iteration + fakeFileNameSuffix;
+    boolean useFakeDecoder = (fakeFileNameTemplate != null) && fileExists(fakeFileName);
+
+    if (useFakeDecoder) {
+      // A canned n-best list exists for this iteration; skip decoding entirely.
+      println("Not running decoder; using " + fakeFileName + " instead.", 1);
+      retSA[0] = fakeFileName;
+      retSA[1] = "2";
+    } else {
+      println("Running external decoder...", 1);
+
+      try {
+        ArrayList<String> cmd = new ArrayList<String>();
+        cmd.add(decoderCommandFileName);
+        if (passIterationToDecoder) {
+          cmd.add(Integer.toString(iteration));
+        }
+
+        ProcessBuilder pb = new ProcessBuilder(cmd);
+        pb.redirectErrorStream(true); // fold the subprocess's stderr into its stdout
+        Process p = pb.start();
+
+        // drain (and optionally echo) the sub-command's output so it cannot block
+        new StreamGobbler(p.getInputStream(), decVerbosity).start();
+
+        int decStatus = p.waitFor();
+        if (decStatus != validDecoderExitValue) {
+          throw new RuntimeException("Call to decoder returned " + decStatus + "; was expecting "
+              + validDecoderExitValue + ".");
+        }
+      } catch (IOException | InterruptedException e) {
+        throw new RuntimeException(e);
+      }
+
+      retSA[0] = decoderOutFileName;
+      retSA[1] = "1";
+    }
+
+    return retSA;
+  }
+
+  /**
+   * Splits the decoder's combined n-best output into two aligned temp files:
+   * temp.sents.it&lt;iteration&gt; (normalized candidate strings) and
+   * temp.feats.it&lt;iteration&gt; (feature-value strings). In both files the
+   * marker "||||||" terminates each source sentence's n-best list.
+   *
+   * @param nbestFileName decoder output (plain or .gz); each line is
+   *                      "i ||| candidate words ||| feat values [ ||| ... ]"
+   * @param iteration current iteration, used to name the temp files
+   */
+  private void produceTempFiles(String nbestFileName, int iteration) {
+    try {
+      String sentsFileName = tmpDirPrefix + "temp.sents.it" + iteration;
+      String featsFileName = tmpDirPrefix + "temp.feats.it" + iteration;
+
+      InputStream inStream_nbest;
+      if (nbestFileName.endsWith(".gz")) {
+        inStream_nbest = new GZIPInputStream(new FileInputStream(nbestFileName));
+      } else {
+        inStream_nbest = new FileInputStream(nbestFileName);
+      }
+
+      // FIX: try-with-resources guarantees the reader and both writers are closed
+      // even if parsing throws (the previous version leaked them on exception).
+      try (BufferedReader inFile_nbest = new BufferedReader(
+              new InputStreamReader(inStream_nbest, "utf8"));
+          BufferedWriter outFile_sents = new BufferedWriter(new OutputStreamWriter(
+              new FileOutputStream(sentsFileName, false), "utf8"));
+          PrintWriter outFile_feats = new PrintWriter(featsFileName)) {
+
+        String line;
+        String candidate_str = "";
+        String feats_str = "";
+
+        int i = 0; // index of the source sentence whose candidates we expect next
+        int n = 0; // candidate index within the current n-best list
+        line = inFile_nbest.readLine();
+
+        while (line != null) {
+
+          /*
+           * line format:
+           *
+           * i ||| words of candidate translation . ||| feat-1_val feat-2_val ... feat-numParams_val
+           * .*
+           */
+
+          // in a well formed file, we'd find the nth candidate for the ith sentence
+          int read_i = Integer.parseInt((line.substring(0, line.indexOf("|||"))).trim());
+
+          if (read_i != i) {
+            // the previous sentence's list ended early; emit its separator
+            // NOTE(review): this only advances i by one, so a sentence with ZERO
+            // candidates would desynchronize the files — presumably the decoder
+            // always emits at least one candidate per sentence; confirm.
+            writeLine("||||||", outFile_sents);
+            outFile_feats.println("||||||");
+            n = 0;
+            ++i;
+          }
+
+          line = (line.substring(line.indexOf("|||") + 3)).trim(); // get rid of initial text
+
+          candidate_str = (line.substring(0, line.indexOf("|||"))).trim();
+          feats_str = (line.substring(line.indexOf("|||") + 3)).trim();
+          // get rid of candidate string
+
+          // drop anything after the feature section (e.g. a trailing score field)
+          int junk_i = feats_str.indexOf("|||");
+          if (junk_i >= 0) {
+            feats_str = (feats_str.substring(0, junk_i)).trim();
+          }
+
+          writeLine(normalize(candidate_str, textNormMethod), outFile_sents);
+          outFile_feats.println(feats_str);
+
+          ++n;
+          if (n == sizeOfNBest) {
+            // complete list of sizeOfNBest candidates: close it out
+            writeLine("||||||", outFile_sents);
+            outFile_feats.println("||||||");
+            n = 0;
+            ++i;
+          }
+
+          line = inFile_nbest.readLine();
+        }
+
+        if (i != numSentences) { // last sentence had too few candidates
+          writeLine("||||||", outFile_sents);
+          outFile_feats.println("||||||");
+        }
+      }
+
+      if (compressFiles == 1) {
+        gzipFile(sentsFileName);
+        gzipFile(featsFileName);
+      }
+
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+
+  }
+
+  /**
+   * Creates cfgFileName from templateFileName, replacing the weight on each
+   * template line that starts with a known feature name by the corresponding
+   * value in params[], and appending weights for features that never appeared
+   * in the template. Features with (near-)zero weight are omitted entirely.
+   *
+   * @param params 1-indexed feature weights
+   * @param cfgFileName config file to create
+   * @param templateFileName template config file to copy from
+   */
+  private void createConfigFile(ArrayList<Double> params, String cfgFileName,
+      String templateFileName) {
+    // FIX: try-with-resources closes both files even if an I/O error occurs
+    // mid-copy (the previous version leaked them on exception). Also removed
+    // the unused inFeatDefFile/outFeatDefFile locals.
+    try (BufferedReader inFile = new BufferedReader(new FileReader(templateFileName));
+        PrintWriter outFile = new PrintWriter(cfgFileName)) {
+
+      int origFeatNum = 0; // number of template lines matched to a known feature
+
+      String line = inFile.readLine();
+      while (line != null) {
+        // does this template line begin with a known feature name?
+        int c_match = -1;
+        for (int c = 1; c <= numParams; ++c) {
+          if (line.startsWith(Vocabulary.word(c) + " ")) {
+            c_match = c;
+            ++origFeatNum;
+            break;
+          }
+        }
+
+        if (c_match == -1) {
+          outFile.println(line); // not a feature line; copy verbatim
+        } else {
+          // drop features whose weight is effectively zero
+          if (Math.abs(params.get(c_match).doubleValue()) > 1e-20)
+            outFile.println(Vocabulary.word(c_match) + " " + params.get(c_match));
+        }
+
+        line = inFile.readLine();
+      }
+
+      // now append weights of new features (those absent from the template)
+      for (int c = origFeatNum + 1; c <= numParams; ++c) {
+        if (Math.abs(params.get(c).doubleValue()) > 1e-20)
+          outFile.println(Vocabulary.word(c) + " " + params.get(c));
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Parses the ZMERT-format params file: one line per feature giving its name,
+   * default weight, Opt/Fix flag and value ranges, followed by a
+   * "normalization = ..." line.
+   *
+   * Side effects: fills lambda, defaultLambda, isOptimizable,
+   * minRandValue/maxRandValue and normalizationOptions.
+   *
+   * @throws RuntimeException if the file is missing or malformed
+   */
+  private void processParamFile() {
+    // process parameter file
+    Scanner inFile_init = null;
+    try {
+      inFile_init = new Scanner(new FileReader(paramsFileName));
+    } catch (FileNotFoundException e) {
+      throw new RuntimeException(e);
+    }
+
+    // FIX: the Scanner (and its underlying file handle) used to leak whenever
+    // parsing threw; the finally block below guarantees it is released.
+    try {
+      String dummy = "";
+
+      // initialize lambda[] and other related arrays
+      for (int c = 1; c <= numParams; ++c) {
+        // skip parameter name (everything up to and including the "|||" separator)
+        while (!dummy.equals("|||")) {
+          dummy = inFile_init.next();
+        }
+
+        // read default value
+        lambda.set(c, inFile_init.nextDouble());
+        defaultLambda[c] = lambda.get(c).doubleValue();
+
+        // read isOptimizable
+        dummy = inFile_init.next();
+        if (dummy.equals("Opt")) {
+          isOptimizable[c] = true;
+        } else if (dummy.equals("Fix")) {
+          isOptimizable[c] = false;
+        } else {
+          throw new RuntimeException(
+              "Unknown isOptimizable string " + dummy + " (must be either Opt or Fix)");
+        }
+
+        if (!isOptimizable[c]) { // skip next four values
+          dummy = inFile_init.next();
+          dummy = inFile_init.next();
+          dummy = inFile_init.next();
+          dummy = inFile_init.next();
+        } else {
+          // the next two values are not used, only to be consistent with ZMERT's params file format
+          dummy = inFile_init.next();
+          dummy = inFile_init.next();
+          // set minRandValue[c] and maxRandValue[c] (range for random values)
+          dummy = inFile_init.next();
+          if (dummy.equals("-Inf") || dummy.equals("+Inf")) {
+            throw new RuntimeException("minRandValue[" + c + "] cannot be -Inf or +Inf!");
+          } else {
+            minRandValue[c] = Double.parseDouble(dummy);
+          }
+
+          dummy = inFile_init.next();
+          if (dummy.equals("-Inf") || dummy.equals("+Inf")) {
+            throw new RuntimeException("maxRandValue[" + c + "] cannot be -Inf or +Inf!");
+          } else {
+            maxRandValue[c] = Double.parseDouble(dummy);
+          }
+
+          // check for illogical values
+          if (minRandValue[c] > maxRandValue[c]) {
+            throw new RuntimeException("minRandValue[" + c + "]=" + minRandValue[c]
+                + " > " + maxRandValue[c] + "=maxRandValue[" + c + "]!");
+          }
+
+          // check for odd values
+          if (minRandValue[c] == maxRandValue[c]) {
+            println("Warning: lambda[" + c + "] has " + "minRandValue = maxRandValue = "
+                + minRandValue[c] + ".", 1);
+          }
+        } // if (!isOptimizable[c])
+      }
+
+      // set normalizationOptions[]: skip blank lines to reach the
+      // "normalization = ..." line
+      String origLine = "";
+      while (origLine != null && origLine.length() == 0) {
+        origLine = inFile_init.nextLine();
+      }
+
+      // How should a lambda[] vector be normalized (before decoding)?
+      // nO[0] = 0: no normalization
+      // nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1]
+      // nO[0] = 2: scale so that the maximum absolute value is nO[1]
+      // nO[0] = 3: scale so that the minimum absolute value is nO[1]
+      // nO[0] = 4: scale so that the L-nO[1] norm equals nO[2]
+
+      // normalization = none
+      // normalization = absval 1 lm
+      // normalization = maxabsval 1
+      // normalization = minabsval 1
+      // normalization = LNorm 2 1
+
+      dummy = (origLine.substring(origLine.indexOf("=") + 1)).trim();
+      String[] dummyA = dummy.split("\\s+");
+
+      if (dummyA[0].equals("none")) {
+        normalizationOptions[0] = 0;
+      } else if (dummyA[0].equals("absval")) {
+        normalizationOptions[0] = 1;
+        normalizationOptions[1] = Double.parseDouble(dummyA[1]);
+        String pName = dummyA[2];
+        for (int i = 3; i < dummyA.length; ++i) { // in case parameter name has multiple words
+          pName = pName + " " + dummyA[i];
+        }
+        normalizationOptions[2] = Vocabulary.id(pName);
+
+        if (normalizationOptions[1] <= 0) {
+          throw new RuntimeException("Value for the absval normalization method must be positive.");
+        }
+        if (normalizationOptions[2] == 0) {
+          // BUG FIX: the message used to print normalizationOptions[2] (always 0
+          // in this branch) rather than the unrecognized feature name itself.
+          throw new RuntimeException("Unrecognized feature name " + pName
+              + " for absval normalization method.");
+        }
+      } else if (dummyA[0].equals("maxabsval")) {
+        normalizationOptions[0] = 2;
+        normalizationOptions[1] = Double.parseDouble(dummyA[1]);
+        if (normalizationOptions[1] <= 0) {
+          throw new RuntimeException("Value for the maxabsval normalization method must be positive.");
+        }
+      } else if (dummyA[0].equals("minabsval")) {
+        normalizationOptions[0] = 3;
+        normalizationOptions[1] = Double.parseDouble(dummyA[1]);
+        if (normalizationOptions[1] <= 0) {
+          throw new RuntimeException("Value for the minabsval normalization method must be positive.");
+        }
+      } else if (dummyA[0].equals("LNorm")) {
+        normalizationOptions[0] = 4;
+        normalizationOptions[1] = Double.parseDouble(dummyA[1]);
+        normalizationOptions[2] = Double.parseDouble(dummyA[2]);
+        if (normalizationOptions[1] <= 0 || normalizationOptions[2] <= 0) {
+          throw new RuntimeException("Both values for the LNorm normalization method must be"
+              + " positive.");
+        }
+      } else {
+        // BUG FIX: the list of accepted methods was missing "minabsval", even
+        // though that method is handled above.
+        throw new RuntimeException("Unrecognized normalization method " + dummyA[0] + "; "
+            + "must be one of none, absval, maxabsval, minabsval, and LNorm.");
+      } // if (dummyA[0])
+    } finally {
+      inFile_init.close();
+    }
+  } // processParamFile()
+
+  /**
+   * Sets numDocuments and docOfSentence[] (the 0-indexed document each source
+   * sentence belongs to), either trivially (no docInfo file: one document) or
+   * by parsing the docInfo file, which may come in 4 formats:
+   *
+   *  1) List of numbers, one per document, indicating # sentences in each document.
+   *  2) List of "docName size" pairs, one per document.
+   *  3) List of docName's, one per sentence, indicating which document each
+   *     sentence belongs to.
+   *  4) List of docName_number's, one per sentence, indicating which document
+   *     each sentence belongs to and its order in that document (can also use
+   *     '-' instead of '_').
+   *
+   * @throws RuntimeException if the docInfo file has more non-empty lines than
+   *         there are sentences (previously this case was silently ignored,
+   *         leaving numDocuments and docOfSentence[] inconsistent)
+   */
+  private void processDocInfo() {
+    docOfSentence = new int[numSentences];
+
+    if (docInfoFileName == null) {
+      // no document info: treat the whole corpus as a single document
+      for (int i = 0; i < numSentences; ++i)
+        docOfSentence[i] = 0;
+      numDocuments = 1;
+      return;
+    }
+
+    try {
+      int docInfoSize = countNonEmptyLines(docInfoFileName);
+
+      if (docInfoSize < numSentences) { // format #1 or #2
+        readDocSizes(docInfoSize);
+      } else if (docInfoSize == numSentences) { // format #3 or #4
+        readDocNames();
+      } else { // badly formatted
+        // FIX: fail fast instead of silently doing nothing
+        throw new RuntimeException("Badly formatted docInfo file " + docInfoFileName + ": it has "
+            + docInfoSize + " non-empty lines but there are only " + numSentences + " sentences.");
+      }
+    } catch (IOException e) {
+      throw new RuntimeException(e);
+    }
+  }
+
+  /** Formats #1/#2: one line per document giving (optionally a name and) its sentence count. */
+  private void readDocSizes(int docInfoSize) throws IOException {
+    numDocuments = docInfoSize;
+    int i = 0;
+
+    try (BufferedReader inFile = new BufferedReader(new FileReader(docInfoFileName))) {
+      String line = inFile.readLine();
+      boolean format1 = (!(line.contains(" "))); // format #1 lines are a bare count
+
+      for (int doc = 0; doc < numDocuments; ++doc) {
+        if (doc != 0)
+          line = inFile.readLine();
+
+        int docSize = 0;
+        if (format1) {
+          docSize = Integer.parseInt(line);
+        } else {
+          docSize = Integer.parseInt(line.split("\\s+")[1]);
+        }
+
+        for (int i2 = 1; i2 <= docSize; ++i2) {
+          docOfSentence[i] = doc;
+          ++i;
+        }
+      }
+      // now i == numSentences
+    }
+  }
+
+  /** Formats #3/#4: one line per sentence naming the document it belongs to. */
+  private void readDocNames() throws IOException {
+    // First pass: format #3 has duplicate lines (one plain docName per sentence);
+    // format #4 lines are unique because they carry a per-sentence suffix.
+    boolean format3 = false;
+    HashSet<String> seenStrings = new HashSet<String>();
+    try (BufferedReader inFile = new BufferedReader(new FileReader(docInfoFileName))) {
+      for (int i = 0; i < numSentences; ++i) {
+        String line = inFile.readLine();
+        if (seenStrings.contains(line))
+          format3 = true;
+        seenStrings.add(line);
+      }
+    }
+
+    HashSet<String> seenDocNames = new HashSet<String>();
+    // maps a document name to the order (0-indexed) in which it was seen
+    HashMap<String, Integer> docOrder = new HashMap<String, Integer>();
+
+    // Second pass: assign each sentence the index of its document
+    try (BufferedReader inFile = new BufferedReader(new FileReader(docInfoFileName))) {
+      for (int i = 0; i < numSentences; ++i) {
+        String line = inFile.readLine();
+
+        String docName = "";
+        if (format3) {
+          docName = line;
+        } else {
+          // format #4: strip the trailing _number / -number suffix
+          int sep_i = Math.max(line.lastIndexOf('_'), line.lastIndexOf('-'));
+          docName = line.substring(0, sep_i);
+        }
+
+        if (!seenDocNames.contains(docName)) {
+          seenDocNames.add(docName);
+          docOrder.put(docName, seenDocNames.size() - 1);
+        }
+
+        docOfSentence[i] = docOrder.get(docName);
+      }
+    }
+
+    numDocuments = seenDocNames.size();
+  }
+
+ private boolean copyFile(String origFileName, String newFileName) {
+ try {
+ File inputFile = new File(origFileName);
+ File outputFile = new File(newFileName);
+
+ InputStream in = new FileInputStream(inputFile);
+ OutputStream out = new FileOutputStream(outputFile);
+
+ byte[] buffer = new byte[1024];
+ int len;
+ while ((len = in.read(buffer)) > 0) {
+ out.write(buffer, 0, len);
+ }
+ in.close();
+ out.close();
+
+ /*
+ * InputStream inStream = new FileInputStream(new File(origFileName)); BufferedReader inFile =
+ * new BufferedReader(new InputStreamReader(inStream, "utf8"));
+ *
+ * FileOutputStream outStream = new FileOutputStream(newFileName, false); OutputStreamWriter
+ * outStreamWriter = new OutputStreamWriter(outStream, "utf8"); BufferedWriter outFile = new
+ * BufferedWriter(outStreamWriter);
+ *
+ * String line; while(inFile.ready()) { line = inFile.readLine(); writeLine(line, outFile); }
+ *
+ * inFile.close(); outFile.close();
+ */
+ return true;
+ } catch (IOException e) {
+ LOG.error(e.getMessage(), e);
+ return false;
+ }
+ }
+
+ private void renameFile(String origFileName, String newFileName) {
+ if (fileExists(origFileName)) {
+ deleteFile(newFileName);
+ File oldFile = new File(origFileName);
+ File newFile = new File(newFileName);
+ if (!oldFile.renameTo(newFile)) {
+ println("Warning: attempt to rename " + origFileName + " to " + newFileName
+ + " was unsuccessful!", 1);
+ }
+ } else {
+ println("Warning: file " + origFileName + " does not exist! (in MIRACore.renameFile)", 1);
+ }
+ }
+
+ private void deleteFile(String fileName) {
+ if (fileExists(fileName)) {
+ File fd = new File(fileName);
+ if (!fd.delete()) {
+ println("Warning: attempt to delete " + fileName + " was unsuccessful!", 1);
+ }
+ }
+ }
+
+ private void writeLine(String line, BufferedWriter writer) throws IOException {
+ writer.write(line, 0, line.length());
+ writer.newLine();
+ writer.flush();
+ }
+
+ // need to re-write to handle different forms of lambda
+ public void finish() {
+ if (myDecoder != null) {
+ myDecoder.cleanUp();
+ }
+
+ // create config file with final values
+ createConfigFile(lambda, decoderConfigFileName + ".MIRA.final", decoderConfigFileName
+ + ".MIRA.orig");
+
+ // delete current decoder config file and decoder output
+ deleteFile(decoderConfigFileName);
+ deleteFile(decoderOutFileName);
+
+ // restore original name for config file (name was changed
+ // in initialize() so it doesn't get overwritten)
+ renameFile(decoderConfigFileName + ".MIRA.orig", decoderConfigFileName);
+
+ if (finalLambdaFileName != null) {
+ try {
+ PrintWriter outFile_lambdas = new PrintWriter(finalLambdaFileName);
+ for (int c = 1; c <= numParams; ++c) {
+ outFile_lambdas.println(Vocabulary.word(c) + " ||| " + lambda.get(c).doubleValue());
+ }
+ outFile_lambdas.close();
+
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ }
+
+ private String[] cfgFileToArgsArray(String fileName) {
+ checkFile(fileName);
+
+ Vector<String> argsVector = new Vector<String>();
+
+ BufferedReader inFile = null;
+ try {
+ inFile = new BufferedReader(new FileReader(fileName));
+ String line, origLine;
+ do {
+ line = inFile.readLine();
+ origLine = line; // for error reporting purposes
+
+ if (line != null && line.length() > 0 && line.charAt(0) != '#') {
+
+ if (line.indexOf("#") != -1) { // discard comment
+ line = line.substring(0, line.indexOf("#"));
+ }
+
+ line = line.trim();
+
+ // now line should look like "-xxx XXX"
+
+ /*
+ * OBSOLETE MODIFICATION //SPECIAL HANDLING FOR MIRA CLASSIFIER PARAMETERS String[] paramA
+ * = line.split("\\s+");
+ *
+ * if( paramA[0].equals("-classifierParams") ) { String classifierParam = ""; for(int p=1;
+ * p<=paramA.length-1; p++) classifierParam += paramA[p]+" ";
+ *
+ * if(paramA.length>=2) { String[] tmpParamA = new String[2]; tmpParamA[0] = paramA[0];
+ * tmpParamA[1] = classifierParam; paramA = tmpParamA; } else {
+ * println("Malformed line in config file:"); println(origLine); System.exit(70); } }//END
+ * MODIFICATION
+ */
+
+ // cmu modification(from meteor for zmert)
+ // Parse args
+ ArrayList<String> argList = new ArrayList<String>();
+ StringBuilder arg = new StringBuilder();
+ boolean quoted = false;
+ for (int i = 0; i < line.length(); i++) {
+ if (Character.isWhitespace(line.charAt(i))) {
+ if (quoted)
+ arg.append(line.charAt(i));
+ else if (arg.length() > 0) {
+ argList.add(arg.toString());
+ arg = new StringBuilder();
+ }
+ } else if (line.charAt(i) == '\'') {
+ if (quoted) {
+ argList.add(arg.toString());
+ arg = new StringBuilder();
+ }
+ quoted = !quoted;
+ } else
+ arg.append(line.charAt(i));
+ }
+ if (arg.length() > 0)
+ argList.add(arg.toString());
+ // Create paramA
+ String[] paramA = new String[argList.size()];
+ for (int i = 0; i < paramA.length; paramA[i] = argList.get(i++))
+ ;
+ // END CMU MODIFICATION
+
+ if (paramA.length == 2 && paramA[0].charAt(0) == '-') {
+ argsVector.add(paramA[0]);
+ argsVector.add(paramA[1]);
+ } else if (paramA.length > 2 && (paramA[0].equals("-m") || paramA[0].equals("-docSet"))) {
+ // -m (metricName), -docSet are allowed to have extra optinos
+ for (int opt = 0; opt < paramA.length; ++opt) {
+ argsVector.add(paramA[opt]);
+ }
+ } else {
+ throw new RuntimeException("Malformed line in config file:" + origLine);
+ }
+
+ }
+ } while (line != null);
+
+ inFile.close();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+
+ String[] argsArray = new String[argsVector.size()];
+
+ for (int i = 0; i < argsVector.size(); ++i) {
+ argsArray[i] = argsVector.elementAt(i);
+ }
+
+ return argsArray;
+ }
+
// Convenience overload: processes the args array as a first-time invocation
  // (firstTime = true).
  private void processArgsArray(String[] args) {
    processArgsArray(args, true);
  }
+
+ private void processArgsArray(String[] args, boolean firstTime) {
+ /* set default values */
+ // Relevant files
+ dirPrefix = null;
+ sourceFileName = null;
+ refFileName = "reference.txt";
+ refsPerSen = 1;
+ textNormMethod = 1;
+ paramsFileName = "params.txt";
+ docInfoFileName = null;
+ finalLambdaFileName = null;
+ // MERT specs
+ metricName = "BLEU";
+ metricName_display = metricName;
+ metricOptions = new String[2];
+ metricOptions[0] = "4";
+ metricOptions[1] = "closest";
+ docSubsetInfo = new int[7];
+ docSubsetInfo[0] = 0;
+ maxMERTIterations = 20;
+ prevMERTIterations = 20;
+ minMERTIterations = 5;
+ stopMinIts = 3;
+ stopSigValue = -1;
+ //
+ // /* possibly other early stopping criteria here */
+ //
+ numOptThreads = 1;
+ saveInterFiles = 3;
+ compressFiles = 0;
+ oneModificationPerIteration = false;
+ randInit = false;
+ seed = System.currentTimeMillis();
+ // useDisk = 2;
+ // Decoder specs
+ decoderCommandFileName = null;
+ passIterationToDecoder = false;
+ decoderOutFileName = "output.nbest";
+ validDecoderExitValue = 0;
+ decoderConfigFileName = "dec_cfg.txt";
+ sizeOfNBest = 100;
+ fakeFileNameTemplate = null;
+ fakeFileNamePrefix = null;
+ fakeFileNameSuffix = null;
+ // Output specs
+ verbosity = 1;
+ decVerbosity = 0;
+
+ int i = 0;
+
+ while (i < args.length) {
+ String option = args[i];
+ // Relevant files
+ if (option.equals("-dir")) {
+ dirPrefix = args[i + 1];
+ } else if (option.equals("-s")) {
+ sourceFileName = args[i + 1];
+ } else if (option.equals("-r")) {
+ refFileName = args[i + 1];
+ } else if (option.equals("-rps")) {
+ refsPerSen = Integer.parseInt(args[i + 1]);
+ if (refsPerSen < 1) {
+ throw new RuntimeException("refsPerSen must be positive.");
+ }
+ } else if (option.equals("-txtNrm")) {
+ textNormMethod = Integer.parseInt(args[i + 1]);
+ if (textNormMethod < 0 || textNormMethod > 4) {
+ throw new RuntimeException("textNormMethod should be between 0 and 4");
+ }
+ } else if (option.equals("-p")) {
+ paramsFileName = args[i + 1];
+ } else if (option.equals("-docInfo")) {
+ docInfoFileName = args[i + 1];
+ } else if (option.equals("-fin")) {
+ finalLambdaFileName = args[i + 1];
+ // MERT specs
+ } else if (option.equals("-m")) {
+ metricName = args[i + 1];
+ metricName_display = metricName;
+ if (EvaluationMetric.knownMetricName(metricName)) {
+ int optionCount = EvaluationMetric.metricOptionCount(metricName);
+ metricOptions = new String[optionCount];
+ for (int opt = 0; opt < optionCount; ++opt) {
+ metricOptions[opt] = args[i + opt + 2];
+ }
+ i += optionCount;
+ } else {
+ throw new RuntimeException("Unknown metric name " + metricName + ".");
+ }
+ } else if (option.equals("-docSet")) {
+ String method = args[i + 1];
+
+ if (method.equals("all")) {
+ docSubsetInfo[0] = 0;
+ i += 0;
+ } else if (method.equals("bottom")) {
+ String a = args[i + 2];
+ if (a.endsWith("d")) {
+ docSubsetInfo[0] = 1;
+ a = a.substring(0, a.indexOf("d"));
+ } else {
+ docSubsetInfo[0] = 2;
+ a = a.substring(0, a.indexOf("%"));
+ }
+ docSubsetInfo[5] = Integer.parseInt(a);
+ i += 1;
+ } else if (method.equals("top")) {
+ String a = args[i + 2];
+ if (a.endsWith("d")) {
+ docSubsetInfo[0] = 3;
+ a = a.substring(0, a.indexOf("d"));
+ } else {
+ docSubsetInfo[0] = 4;
+ a = a.substring(0, a.indexOf("%"));
+ }
+ docSubsetInfo[5] = Integer.parseInt(a);
+ i += 1;
+ } else if (method.equals("window")) {
+ String a1 = args[i + 2];
+ a1 = a1.substring(0, a1.indexOf("d")); // size of window
+ String a2 = args[i + 4];
+ if (a2.indexOf("p") > 0) {
+ docSubsetInfo[0] = 5;
+ a2 = a2.substring(0, a2.indexOf("p"));
+ } else {
+ docSubsetInfo[0] = 6;
+ a2 = a2.substring(0, a2.indexOf("r"));
+ }
+ docSubsetInfo[5] = Integer.parseInt(a1);
+ docSubsetInfo[6] = Integer.parseInt(a2);
+ i += 3;
+ } else {
+ throw new RuntimeException("Unknown docSet method " + method + ".");
+ }
+ } else if (option.equals("-maxIt")) {
+ maxMERTIterations = Integer.parseInt(args[i + 1]);
+ if (maxMERTIterations < 1) {
+ throw new RuntimeException("maxIt must be positive.");
+ }
+ } else if (option.equals("-minIt")) {
+ minMERTIterations = Integer.parseInt(args[i + 1]);
+ if (minMERTIterations < 1) {
+ throw new RuntimeException("minIt must be positive.");
+ }
+ } else if (option.equals("-prevIt")) {
+ prevMERTIterations = Integer.parseInt(args[i + 1]);
+ if (prevMERTIterations < 0) {
+ throw new RuntimeException("prevIt must be non-negative.");
+ }
+ } else if (option.equals("-stopIt")) {
+ stopMinIts = Integer.parseInt(args[i + 1]);
+ if (stopMinIts < 1) {
+ throw new RuntimeException("stopIts must be positive.");
+ }
+ } else if (option.equals("-stopSig")) {
+ stopSigValue = Double.parseDouble(args[i + 1]);
+ }
+ //
+ // /* possibly other early stopping criteria here */
+ //
+ else if (option.equals("-thrCnt")) {
+ numOptThreads = Integer.parseInt(args[i + 1]);
+ if (numOptThreads < 1) {
+ throw new RuntimeException("threadCount must be positive.");
+ }
+ } else if (option.equals("-save")) {
+ saveInterFiles = Integer.parseInt(args[i + 1]);
+ if (saveInterFiles < 0 || saveInterFiles > 3) {
+ throw new RuntimeException("save should be between 0 and 3");
+ }
+ } else if (option.equals("-compress")) {
+ compressFiles = Integer.parseInt(args[i + 1]);
+ if (compressFiles < 0 || compressFiles > 1) {
+ throw new RuntimeException("compressFiles should be either 0 or 1");
+ }
+ } else if (option.equals("-opi")) {
+ int opi = Integer.parseInt(args[i + 1]);
+ if (opi == 1) {
+ oneModificationPerIteration = true;
+ } else if (opi == 0) {
+ oneModificationPerIteration = false;
+ } else {
+ throw new RuntimeException("oncePerIt must be either 0 or 1.");
+ }
+ } else if (option.equals("-rand")) {
+ int rand = Integer.parseInt(args[i + 1]);
+ if (rand == 1) {
+ randInit = true;
+ } else if (rand == 0) {
+ randInit = false;
+ } else {
+ throw new RuntimeException("randInit must be either 0 or 1.");
+ }
+ } else if (option.equals("-seed")) {
+ if (args[i + 1].equals("time")) {
+ seed = System.currentTimeMillis();
+ } else {
+ seed = Long.parseLong(args[i + 1]);
+ }
+ }
+ /*
+ * else if (option.equals("-ud")) { useDisk = Integer.parseInt(args[i+1]); if (useDisk < 0 ||
+ * useDisk > 2) { println("useDisk should be between 0 and 2"); System.exit(10); } }
+
<TRUNCATED>