You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by le...@apache.org on 2016/05/16 06:26:40 UTC
[24/66] [partial] incubator-joshua git commit: JOSHUA-252 Make it
possible to use Maven to build Joshua
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8cdbc4b8/src/joshua/zmert/MertCore.java
----------------------------------------------------------------------
diff --git a/src/joshua/zmert/MertCore.java b/src/joshua/zmert/MertCore.java
deleted file mode 100644
index 0e96347..0000000
--- a/src/joshua/zmert/MertCore.java
+++ /dev/null
@@ -1,3268 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing,
- * software distributed under the License is distributed on an
- * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
- * KIND, either express or implied. See the License for the
- * specific language governing permissions and limitations
- * under the License.
- */
-package joshua.zmert;
-
-import java.io.BufferedReader;
-import java.io.BufferedWriter;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileNotFoundException;
-import java.io.FileOutputStream;
-import java.io.FileReader;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.InputStreamReader;
-import java.io.ObjectInputStream;
-import java.io.ObjectOutputStream;
-import java.io.OutputStream;
-import java.io.OutputStreamWriter;
-import java.io.PrintWriter;
-import java.text.DecimalFormat;
-import java.util.ArrayList;
-import java.util.Date;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Random;
-import java.util.Scanner;
-import java.util.TreeSet;
-import java.util.Vector;
-import java.util.concurrent.ConcurrentHashMap;
-import java.util.concurrent.ExecutorService;
-import java.util.concurrent.Executors;
-import java.util.concurrent.Semaphore;
-import java.util.zip.GZIPInputStream;
-import java.util.zip.GZIPOutputStream;
-
-import joshua.decoder.Decoder;
-import joshua.decoder.JoshuaConfiguration;
-import joshua.metrics.EvaluationMetric;
-import joshua.util.StreamGobbler;
-
-/**
- * This code was originally written by Omar Zaidan. In September of 2012, it was augmented to support
- * a sparse feature implementation.
- *
- * @author Omar Zaidan
- */
-
-public class MertCore {
- private final JoshuaConfiguration joshuaConfiguration; // set by every constructor; handed to the Joshua Decoder in initialize()
- private TreeSet<Integer>[] indicesOfInterest_all; // one (initially empty) TreeSet per dev-set sentence; allocated in initialize()
-
- private final static DecimalFormat f4 = new DecimalFormat("###0.0000"); // prints values with four decimal places
- private final Runtime myRuntime = Runtime.getRuntime();
-
- private final static double NegInf = (-1.0 / 0.0); // evaluates to Double.NEGATIVE_INFINITY
- private final static double PosInf = (+1.0 / 0.0); // evaluates to Double.POSITIVE_INFINITY
- private final static double epsilon = 1.0 / 1000000; // i.e. 1e-6
-
- private int verbosity; // anything of priority <= verbosity will be printed
- // (lower value for priority means more important)
-
- private Random randGen; // created from seed in initialize()
- private int generatedRands; // initialize() sets this to randsToSkip; presumably the number of random values drawn so far (TODO confirm)
-
- private int numSentences;
- // number of sentences in the dev set
- // (aka the "MERT training" set)
-
- private int numDocuments;
- // number of documents in the dev set
- // this should be 1, unless doing doc-level optimization
-
- private int[] docOfSentence;
- // docOfSentence[i] stores which document contains the i'th sentence.
- // docOfSentence is 0-indexed, as are the documents (i.e. first doc is indexed 0)
-
- private int[] docSubsetInfo;
- // stores information regarding which subset of the documents are evaluated
- // [0]: method (0-6)
- // [1]: first (1-indexed)
- // [2]: last (1-indexed)
- // [3]: size
- // [4]: center
- // [5]: arg1
- // [6]: arg2
- // [1-6] are 0 for method 0, [6] is 0 for methods 1-4 as well
- // only [1] and [2] are needed for optimization. The rest are only needed for an output message.
-
- private int refsPerSen;
- // number of reference translations per sentence
-
- private int textNormMethod;
- // 0: no normalization
- // 1: "NIST-style" tokenization, and also rejoin 'm, 're, *'s, 've, 'll, 'd, and n't
- // 2: apply 1 and also rejoin dashes between letters
- // 3: apply 1 and also drop non-ASCII characters
- // 4: apply 1+2+3
-
- private int numParams;
- // number of features for the log-linear model
-
- private double[] normalizationOptions;
- // How should a lambda[] vector be normalized (before decoding)? (nO = normalizationOptions)
- // nO[0] = 0: no normalization
- // nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1] (NOTE(review): the summary printed by initialize() reads nO[1] as the parameter and nO[2] as the value; confirm which is right)
- // nO[0] = 2: scale so that the maximum absolute value is nO[1]
- // nO[0] = 3: scale so that the minimum absolute value is nO[1]
- // nO[0] = 4: scale so that the L-nO[1] norm equals nO[2]
-
- /* *********************************************************** */
- /* NOTE: indexing starts at 1 in the following few arrays: */
- /* *********************************************************** */
-
- private String[] paramNames;
- // feature names, needed to read/create config file (allocated with 1 + numParams entries in initialize())
-
- private double[] lambda;
- // the current weight vector. NOTE: indexing starts at 1.
-
- private boolean[] isOptimizable;
- // isOptimizable[c] = true iff lambda[c] should be optimized
-
- private double[] minThValue;
- private double[] maxThValue;
- // when investigating thresholds along the lambda[c] dimension, only values
- // in the [minThValue[c],maxThValue[c]] range will be considered.
- // (*) minThValue and maxThValue can be real values as well as -Infinity and +Infinity
- // (coded as -Inf and +Inf, respectively, in an input file)
-
- private double[] minRandValue;
- private double[] maxRandValue;
- // when choosing a random value for the lambda[c] parameter, it will be
- // chosen from the [minRandValue[c],maxRandValue[c]] range.
- // (*) minRandValue and maxRandValue must be real values, but not -Inf or +Inf
-
- private int damianos_method; // 0: extra initial points come from randomLambda(); otherwise from randomPerturbation()
- private double damianos_param; // forwarded to randomPerturbation()
- private double damianos_mult; // forwarded to randomPerturbation()
-
- private double[] defaultLambda;
- // "default" parameter values; simply the values read in the parameter file
-
- /* *********************************************************** */
- /* *********************************************************** */
-
- private Decoder myDecoder;
- // COMMENT OUT if decoder is not Joshua (initialize() leaves this null when a decoder command or fake file template is in use)
-
- private String decoderCommand;
- // the command that runs the decoder; read from decoderCommandFileName
-
- private int decVerbosity;
- // verbosity level for decoder output. If 0, decoder output is ignored.
- // If 1, decoder output is printed.
-
- private int validDecoderExitValue;
- // return value from running the decoder command that indicates success
-
- private int numOptThreads;
- // number of threads to run things in parallel
-
- private int saveInterFiles;
- // 0: nothing, 1: only configs, 2: only n-bests, 3: both configs and n-bests
-
- private int compressFiles;
- // should Z-MERT gzip the large files? If 0, no compression takes place.
- // If 1, compression is performed on: decoder output files, temp sents files,
- // and temp feats files.
-
- private int sizeOfNBest;
- // size of N-best list generated by decoder at each iteration
- // (aka simply N, but N is a bad variable name)
-
- private long seed;
- // seed used to create random number generators
-
- private boolean randInit;
- // if true, parameters are initialized randomly. If false, parameters
- // are initialized using values from parameter file.
-
- private int initsPerIt;
- // number of intermediate initial points per iteration
-
- private int maxMERTIterations, minMERTIterations, prevMERTIterations;
- // max: maximum number of MERT iterations
- // min: minimum number of MERT iterations before an early MERT exit
- // prev: number of previous MERT iterations from which to consider candidates (in addition to
- // the candidates from the current iteration)
-
- private double stopSigValue;
- // early MERT exit if no weight changes by more than stopSigValue
- // (but see minMERTIterations above and stopMinIts below)
-
- private int stopMinIts;
- // some early stopping criterion must be satisfied in stopMinIts *consecutive* iterations
- // before an early exit (but see minMERTIterations above)
-
- private boolean oneModificationPerIteration;
- // if true, each MERT iteration performs at most one parameter modification.
- // If false, a new MERT iteration starts (i.e. a new N-best list is
- // generated) only after the previous iteration reaches a local maximum.
-
- private String metricName;
- // name of evaluation metric optimized by MERT
-
- private String metricName_display;
- // name of evaluation metric optimized by MERT, possibly with "doc-level " prefixed
-
- private String[] metricOptions;
- // options for the evaluation metric (e.g. for BLEU, maxGramLength and effLengthMethod)
-
- private EvaluationMetric evalMetric;
- // the evaluation metric used by MERT
-
- private int suffStatsCount;
- // number of sufficient statistics for the evaluation metric
-
- private String tmpDirPrefix;
- // prefix for the ZMERT.temp.* files
-
- private boolean passIterationToDecoder;
- // should the iteration number be passed as an argument to decoderCommandFileName?
- // If true, iteration number is passed. If false, launch with no arguments.
-
- private String dirPrefix; // where are all these files located?
- private String paramsFileName, docInfoFileName, finalLambdaFileName;
- private String sourceFileName, refFileName, decoderOutFileName;
- private String decoderConfigFileName, decoderCommandFileName;
- private String fakeFileNameTemplate, fakeFileNamePrefix, fakeFileNameSuffix;
-
- // e.g. output.it[1-x].someOldRun would be specified as:
- // output.it?.someOldRun
- // and we'd have prefix = "output.it" and suffix = ".someOldRun"
-
- // private int useDisk;
-
- /**
-  * Creates a MertCore that only stores the decoder configuration. No arguments are processed
-  * and initialize() is not run by this constructor.
-  *
-  * @param joshuaConfiguration configuration object kept for later use by the decoder
-  */
- public MertCore(JoshuaConfiguration joshuaConfiguration) {
-   this.joshuaConfiguration = joshuaConfiguration;
- }
-
- /**
-  * Creates a fully initialized MertCore from an argument array.
-  *
-  * @param args arguments handed to processArgsArray()
-  * @param joshuaConfiguration configuration object kept for later use by the decoder
-  */
- public MertCore(String[] args, JoshuaConfiguration joshuaConfiguration) {
-   this.joshuaConfiguration = joshuaConfiguration;
-
-   // the set of known metrics has to be registered before the arguments are processed
-   EvaluationMetric.set_knownMetrics();
-   processArgsArray(args);
-
-   // 0 random values to skip, i.e. this is a first-time initialization
-   initialize(0);
- }
-
- /**
-  * Creates a fully initialized MertCore from a Z-MERT configuration file.
-  *
-  * @param configFileName file turned into an argument array by cfgFileToArgsArray()
-  * @param joshuaConfiguration configuration object kept for later use by the decoder
-  */
- public MertCore(String configFileName, JoshuaConfiguration joshuaConfiguration) {
-   this.joshuaConfiguration = joshuaConfiguration;
-
-   EvaluationMetric.set_knownMetrics();
-
-   // convert the config file to an argument array, then handle it like command-line input
-   String[] argsFromConfig = cfgFileToArgsArray(configFileName);
-   processArgsArray(argsFromConfig);
-
-   // 0 random values to skip, i.e. this is a first-time initialization
-   initialize(0);
- }
-
- private void initialize(int randsToSkip) { // sets up RNG, dev-set data, parameters, metric and decoder; randsToSkip == 0 marks the first call
- println("NegInf: " + NegInf + ", PosInf: " + PosInf + ", epsilon: " + epsilon, 4);
- // re-create the generator from the seed and discard randsToSkip draws (presumably to resume an earlier random sequence; TODO confirm)
- randGen = new Random(seed);
- for (int r = 1; r <= randsToSkip; ++r) {
- randGen.nextDouble();
- }
- generatedRands = randsToSkip;
- // the banner below is printed on the first call only
- if (randsToSkip == 0) {
- println("----------------------------------------------------", 1);
- println("Initializing...", 1);
- println("----------------------------------------------------", 1);
- println("", 1);
-
- println("Random number generator initialized using seed: " + seed, 1);
- println("", 1);
- }
- // count the dev-set sentences from a reference file; with several references per sentence the first file is <refFileName>0 or <refFileName>.0
- if (refsPerSen > 1) {
- String refFile = refFileName + "0";
- if (! new File(refFile).exists())
- refFile = refFileName + ".0";
- if (! new File(refFile).exists()) {
- System.err.println(String.format("* FATAL: can't find first reference file '%s{0,.0}'", refFileName));
- System.exit(1);
- }
-
- numSentences = countLines(refFile);
- } else {
- numSentences = countLines(refFileName);
- }
-
- processDocInfo();
- // sets numDocuments and docOfSentence[]
-
- if (numDocuments > 1) metricName_display = "doc-level " + metricName;
-
- set_docSubsetInfo(docSubsetInfo);
-
-
-
- numParams = countNonEmptyLines(paramsFileName) - 1;
- // the parameter file contains one line per parameter
- // and one line for the normalization method
-
-
- paramNames = new String[1 + numParams];
- lambda = new double[1 + numParams]; // indexing starts at 1 in these arrays
- isOptimizable = new boolean[1 + numParams];
- minThValue = new double[1 + numParams];
- maxThValue = new double[1 + numParams];
- minRandValue = new double[1 + numParams];
- maxRandValue = new double[1 + numParams];
- // precision = new double[1+numParams];
- defaultLambda = new double[1 + numParams];
- normalizationOptions = new double[3];
- // first pass over the parameter file: the text before "|||" on each non-empty line is taken as the parameter name
- try {
- // read parameter names
- BufferedReader inFile_names = new BufferedReader(new FileReader(paramsFileName));
-
- for (int c = 1; c <= numParams; ++c) {
- String line = "";
- while (line != null && line.length() == 0) { // skip empty lines
- line = inFile_names.readLine();
- }
- String paramName = (line.substring(0, line.indexOf("|||"))).trim(); // NOTE(review): throws if the line is null or has no "|||" (indexOf returns -1)
- paramNames[c] = paramName;
- }
-
- inFile_names.close();
- } catch (FileNotFoundException e) {
- System.err.println("FileNotFoundException in MertCore.initialize(int): " + e.getMessage());
- System.exit(99901); // NOTE(review): POSIX keeps only the low 8 bits of an exit status, so this value is not what the caller will see
- } catch (IOException e) {
- System.err.println("IOException in MertCore.initialize(int): " + e.getMessage());
- System.exit(99902);
- }
-
- processParamFile();
- // sets the arrays declared just above
-
- // SentenceInfo.createV(); // uncomment ONLY IF using vocabulary implementation of SentenceInfo
-
-
- String[][] refSentences = new String[numSentences][refsPerSen];
- // refSentences[i][r] will hold the normalized r'th reference for sentence i
- try {
-
- // read in reference sentences
- BufferedReader reference_readers[] = new BufferedReader[refsPerSen];
- if (refsPerSen == 1) {
- reference_readers[0] = new BufferedReader(new InputStreamReader(new FileInputStream(new File(refFileName)), "utf8"));
- } else {
- for (int i = 0; i < refsPerSen; i++) {
- String refFile = refFileName + i;
- if (! new File(refFile).exists())
- refFile = refFileName + "." + i;
- if (! new File(refFile).exists()) {
- System.err.println(String.format("* FATAL: can't find reference file '%s'", refFile));
- System.exit(1);
- }
-
- reference_readers[i] = new BufferedReader(new InputStreamReader(new FileInputStream(new File(refFile)), "utf8"));
- }
- }
- // the reference files are read in lock step, one line from each file per sentence
- for (int i = 0; i < numSentences; ++i) {
- for (int r = 0; r < refsPerSen; ++r) {
- // read the rth reference translation for the ith sentence
- refSentences[i][r] = normalize(reference_readers[r].readLine(), textNormMethod);
- }
- }
-
- // close all the reference files
- for (int i = 0; i < refsPerSen; i++)
- reference_readers[i].close();
-
- // read in decoder command, if any (only the first line of the file is used)
- decoderCommand = null;
- if (decoderCommandFileName != null) {
- if (fileExists(decoderCommandFileName)) {
- BufferedReader inFile_comm = new BufferedReader(new FileReader(decoderCommandFileName));
- decoderCommand = inFile_comm.readLine();
- inFile_comm.close();
- }
- }
- } catch (FileNotFoundException e) {
- System.err.println("FileNotFoundException in MertCore.initialize(int): " + e.getMessage());
- System.exit(99901);
- } catch (IOException e) {
- System.err.println("IOException in MertCore.initialize(int): " + e.getMessage());
- System.exit(99902);
- }
-
-
- // set static data members for the EvaluationMetric class
- EvaluationMetric.set_numSentences(numSentences);
- EvaluationMetric.set_numDocuments(numDocuments);
- EvaluationMetric.set_refsPerSen(refsPerSen);
- EvaluationMetric.set_refSentences(refSentences);
- EvaluationMetric.set_tmpDirPrefix(tmpDirPrefix);
-
- evalMetric = EvaluationMetric.getMetric(metricName, metricOptions);
-
- suffStatsCount = evalMetric.get_suffStatsCount();
-
- // set static data members for the IntermediateOptimizer class
- IntermediateOptimizer.set_MERTparams(numSentences, numDocuments, docOfSentence, docSubsetInfo,
- numParams, normalizationOptions, isOptimizable, minThValue, maxThValue,
- oneModificationPerIteration, evalMetric, tmpDirPrefix, verbosity);
-
-
- // on the first call, print a summary of the setup and move the original decoder config out of the way
- if (randsToSkip == 0) { // i.e. first iteration
- println("Number of sentences: " + numSentences, 1);
- println("Number of documents: " + numDocuments, 1);
- println("Optimizing " + metricName_display, 1);
-
- print("docSubsetInfo: {", 1);
- for (int f = 0; f < 6; ++f)
- print(docSubsetInfo[f] + ", ", 1);
- println(docSubsetInfo[6] + "}", 1);
-
- println("Number of features: " + numParams, 1);
- print("Feature names: {", 1);
- for (int c = 1; c <= numParams; ++c) {
- print("\"" + paramNames[c] + "\"", 1);
- if (c < numParams) print(",", 1);
- }
- println("}", 1);
- println("", 1);
-
- println("c Default value\tOptimizable?\tCrit. val. range\tRand. val. range", 1);
-
- for (int c = 1; c <= numParams; ++c) {
- print(c + " " + f4.format(lambda[c]) + "\t\t", 1);
- if (!isOptimizable[c]) {
- println(" No", 1);
- } else {
- print(" Yes\t\t", 1);
- // print("[" + minThValue[c] + "," + maxThValue[c] + "] @ " + precision[c] +
- // " precision",1);
- print(" [" + minThValue[c] + "," + maxThValue[c] + "]", 1);
- print("\t\t", 1);
- print(" [" + minRandValue[c] + "," + maxRandValue[c] + "]", 1);
- println("", 1);
- }
- }
-
- println("", 1);
- print("Weight vector normalization method: ", 1);
- if (normalizationOptions[0] == 0) {
- println("none.", 1);
- } else if (normalizationOptions[0] == 1) { // NOTE(review): the field comment says nO[2] is the parameter and nO[1] the value; this message reads them the other way round. Confirm.
- println("weights will be scaled so that the \"" + paramNames[(int) normalizationOptions[1]]
- + "\" weight has an absolute value of " + normalizationOptions[2] + ".", 1);
- } else if (normalizationOptions[0] == 2) {
- println("weights will be scaled so that the maximum absolute value is "
- + normalizationOptions[1] + ".", 1);
- } else if (normalizationOptions[0] == 3) {
- println("weights will be scaled so that the minimum absolute value is "
- + normalizationOptions[1] + ".", 1);
- } else if (normalizationOptions[0] == 4) {
- println("weights will be scaled so that the L-" + normalizationOptions[1] + " norm is "
- + normalizationOptions[2] + ".", 1);
- }
-
- println("", 1);
-
- println("----------------------------------------------------", 1);
- println("", 1);
-
- // rename original config file so it doesn't get overwritten
- // (original name will be restored in finish())
- renameFile(decoderConfigFileName, decoderConfigFileName + ".ZMERT.orig");
-
- } // if (randsToSkip == 0)
-
- // load the in-process Joshua decoder only when neither a decoder command nor a fake file template is configured
- if (decoderCommand == null && fakeFileNameTemplate == null) {
- println("Loading Joshua decoder...", 1);
- myDecoder = new Decoder(joshuaConfiguration, decoderConfigFileName + ".ZMERT.orig");
- println("...finished loading @ " + (new Date()), 1);
- println("");
- } else {
- myDecoder = null;
- }
-
-
- // allocate one empty TreeSet per sentence (generic array creation needs the unchecked suppression)
- @SuppressWarnings("unchecked")
- TreeSet<Integer>[] temp_TSA = new TreeSet[numSentences];
- indicesOfInterest_all = temp_TSA;
-
- for (int i = 0; i < numSentences; ++i) {
- indicesOfInterest_all[i] = new TreeSet<Integer>();
- }
-
-
- } // void initialize(...)
-
- /** Runs Z-MERT using the min/max/prev iteration settings stored on this instance. */
- public void run_MERT() {
-   run_MERT(minMERTIterations, maxMERTIterations, prevMERTIterations);
- }
-
- public void run_MERT(int minIts, int maxIts, int prevIts) {
- println("----------------------------------------------------", 1);
- println("Z-MERT run started @ " + (new Date()), 1);
- // printMemoryUsage();
- println("----------------------------------------------------", 1);
- println("", 1);
-
- if (randInit) {
- println("Initializing lambda[] randomly.", 1);
-
- // initialize optimizable parameters randomly (sampling uniformly from
- // that parameter's random value range)
- lambda = randomLambda();
- }
-
- println("Initial lambda[]: " + lambdaToString(lambda), 1);
- println("", 1);
-
- double FINAL_score = evalMetric.worstPossibleScore();
-
-
- // int[] lastUsedIndex = new int[numSentences];
- int[] maxIndex = new int[numSentences];
- // used to grow featVal_array dynamically
- // HashMap<Integer,int[]>[] suffStats_array = new HashMap[numSentences];
- // suffStats_array[i] maps candidates of interest for sentence i to an array
- // storing the sufficient statistics for that candidate
- for (int i = 0; i < numSentences; ++i) {
- // lastUsedIndex[i] = -1;
- maxIndex[i] = sizeOfNBest - 1;
- // suffStats_array[i] = new HashMap<Integer,int[]>();
- }
- /*
- * double[][][] featVal_array = new double[1+numParams][][]; // indexed by
- * [param][sentence][candidate] featVal_array[0] = null; // param indexing starts at 1 for (int
- * c = 1; c <= numParams; ++c) { featVal_array[c] = new double[numSentences][]; for (int i = 0;
- * i < numSentences; ++i) { featVal_array[c][i] = new double[maxIndex[i]]; // will grow
- * dynamically as needed } }
- */
- int earlyStop = 0;
- // number of consecutive iteration an early stopping criterion was satisfied
-
- for (int iteration = 1;; ++iteration) {
-
- double[] A = run_single_iteration(iteration, minIts, maxIts, prevIts, earlyStop, maxIndex);
- if (A != null) {
- FINAL_score = A[0];
- earlyStop = (int) A[1];
- if (A[2] == 1) break;
- } else {
- break;
- }
-
- } // for (iteration)
-
- println("", 1);
-
- println("----------------------------------------------------", 1);
- println("Z-MERT run ended @ " + (new Date()), 1);
- // printMemoryUsage();
- println("----------------------------------------------------", 1);
- println("", 1);
- println("FINAL lambda: " + lambdaToString(lambda) + " (" + metricName_display + ": "
- + FINAL_score + ")", 1);
- // check if a lambda is outside its threshold range
- for (int c = 1; c <= numParams; ++c) {
- if (lambda[c] < minThValue[c] || lambda[c] > maxThValue[c]) {
- println("Warning: after normalization, lambda[" + c + "]=" + f4.format(lambda[c])
- + " is outside its critical value range.", 1);
- }
- }
- println("", 1);
-
- // delete intermediate .temp.*.it* decoder output files
- for (int iteration = 1; iteration <= maxIts; ++iteration) {
- if (compressFiles == 1) {
- deleteFile(tmpDirPrefix + "temp.sents.it" + iteration + ".gz");
- deleteFile(tmpDirPrefix + "temp.feats.it" + iteration + ".gz");
- if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".copy.gz")) {
- deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + ".copy.gz");
- } else {
- deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + ".gz");
- }
- } else {
- deleteFile(tmpDirPrefix + "temp.sents.it" + iteration);
- deleteFile(tmpDirPrefix + "temp.feats.it" + iteration);
- if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".copy")) {
- deleteFile(tmpDirPrefix + "temp.stats.it" + iteration + ".copy");
- } else {
- deleteFile(tmpDirPrefix + "temp.stats.it" + iteration);
- }
- }
- }
-
- } // void run_MERT(int maxIts)
-
-
- @SuppressWarnings("unchecked")
- public double[] run_single_iteration(int iteration, int minIts, int maxIts, int prevIts,
- int earlyStop, int[] maxIndex) {
- double FINAL_score = 0;
-
- double[] retA = new double[3];
- // retA[0]: FINAL_score
- // retA[1]: earlyStop
- // retA[2]: should this be the last iteration?
-
- boolean done = false;
- retA[2] = 1; // will only be made 0 if we don't break from the following loop
-
-
- double[][][] featVal_array = new double[1 + numParams][][];
- // indexed by [param][sentence][candidate]
- featVal_array[0] = null; // param indexing starts at 1
- for (int c = 1; c <= numParams; ++c) {
- featVal_array[c] = new double[numSentences][];
- for (int i = 0; i < numSentences; ++i) {
- featVal_array[c][i] = new double[maxIndex[i] + 1];
- // will grow dynamically as needed
- }
- }
-
-
- while (!done) { // NOTE: this "loop" will only be carried out once
- println("--- Starting Z-MERT iteration #" + iteration + " @ " + (new Date()) + " ---", 1);
-
- // printMemoryUsage();
-
- // run the decoder on all the sentences, producing for each sentence a set of
- // sizeOfNBest candidates, with numParams feature values for each candidate
-
- /******************************/
- // CREATE DECODER CONFIG FILE //
- /******************************/
-
- createConfigFile(lambda, decoderConfigFileName, decoderConfigFileName + ".ZMERT.orig");
- // i.e. use the original config file as a template
-
- /***************/
- // RUN DECODER //
- /***************/
-
- if (iteration == 1) {
- println("Decoding using initial weight vector " + lambdaToString(lambda), 1);
- } else {
- println("Redecoding using weight vector " + lambdaToString(lambda), 1);
- }
-
- String[] decRunResult = run_decoder(iteration); // iteration passed in case fake decoder will
- // be used
- // [0] name of file to be processed
- // [1] indicates how the output file was obtained:
- // 1: external decoder
- // 2: fake decoder
- // 3: internal decoder
-
- if (!decRunResult[1].equals("2")) {
- println("...finished decoding @ " + (new Date()), 1);
- }
-
- checkFile(decRunResult[0]);
-
- println("Producing temp files for iteration " + iteration, 3);
-
- produceTempFiles(decRunResult[0], iteration);
-
- if (saveInterFiles == 1 || saveInterFiles == 3) { // make copy of intermediate config file
- if (!copyFile(decoderConfigFileName, decoderConfigFileName + ".ZMERT.it" + iteration)) {
- println("Warning: attempt to make copy of decoder config file (to create"
- + decoderConfigFileName + ".ZMERT.it" + iteration + ") was unsuccessful!", 1);
- }
- }
- if (saveInterFiles == 2 || saveInterFiles == 3) { // make copy of intermediate decoder output
- // file...
-
- if (!decRunResult[1].equals("2")) { // ...but only if no fake decoder
- if (!decRunResult[0].endsWith(".gz")) {
- if (!copyFile(decRunResult[0], decRunResult[0] + ".ZMERT.it" + iteration)) {
- println("Warning: attempt to make copy of decoder output file (to create"
- + decRunResult[0] + ".ZMERT.it" + iteration + ") was unsuccessful!", 1);
- }
- } else {
- String prefix = decRunResult[0].substring(0, decRunResult[0].length() - 3);
- if (!copyFile(prefix + ".gz", prefix + ".ZMERT.it" + iteration + ".gz")) {
- println("Warning: attempt to make copy of decoder output file (to create" + prefix
- + ".ZMERT.it" + iteration + ".gz" + ") was unsuccessful!", 1);
- }
- }
-
- if (compressFiles == 1 && !decRunResult[0].endsWith(".gz")) {
- gzipFile(decRunResult[0] + ".ZMERT.it" + iteration);
- }
- } // if (!fake)
-
- }
-
- int[] candCount = new int[numSentences];
- int[] lastUsedIndex = new int[numSentences];
- ConcurrentHashMap<Integer, int[]>[] suffStats_array = new ConcurrentHashMap[numSentences];
- for (int i = 0; i < numSentences; ++i) {
- candCount[i] = 0;
- lastUsedIndex[i] = -1;
- // suffStats_array[i].clear();
- suffStats_array[i] = new ConcurrentHashMap<Integer, int[]>();
- }
-
- double[][] initialLambda = new double[1 + initsPerIt][1 + numParams];
- // the intermediate "initial" lambdas
- double[][] finalLambda = new double[1 + initsPerIt][1 + numParams];
- // the intermediate "final" lambdas
-
- // set initialLambda[][]
- System.arraycopy(lambda, 1, initialLambda[1], 1, numParams);
- for (int j = 2; j <= initsPerIt; ++j) {
- if (damianos_method == 0) {
- initialLambda[j] = randomLambda();
- } else {
- initialLambda[j] =
- randomPerturbation(initialLambda[1], iteration, damianos_method, damianos_param,
- damianos_mult);
- }
- }
-
-// double[] initialScore = new double[1 + initsPerIt];
- double[] finalScore = new double[1 + initsPerIt];
-
- int[][][] best1Cand_suffStats = new int[1 + initsPerIt][numSentences][suffStatsCount];
- double[][] best1Score = new double[1 + initsPerIt][numSentences];
- // Those two arrays are used to calculate initialScore[]
- // (the "score" in best1Score refers to that assigned by the
- // decoder; the "score" in initialScore refers to that
- // assigned by the evaluation metric)
-
- int firstIt = Math.max(1, iteration - prevIts);
- // i.e. only process candidates from the current iteration and candidates
- // from up to prevIts previous iterations.
- println("Reading candidate translations from iterations " + firstIt + "-" + iteration, 1);
- println("(and computing " + metricName
- + " sufficient statistics for previously unseen candidates)", 1);
- print(" Progress: ");
-
- int[] newCandidatesAdded = new int[1 + iteration];
- for (int it = 1; it <= iteration; ++it) {
- newCandidatesAdded[it] = 0;
- }
-
-
-
- try {
-
- // each inFile corresponds to the output of an iteration
- // (index 0 is not used; no corresponding index for the current iteration)
- BufferedReader[] inFile_sents = new BufferedReader[iteration];
- BufferedReader[] inFile_feats = new BufferedReader[iteration];
- BufferedReader[] inFile_stats = new BufferedReader[iteration];
-
- for (int it = firstIt; it < iteration; ++it) {
- InputStream inStream_sents, inStream_feats, inStream_stats;
- if (compressFiles == 0) {
- inStream_sents = new FileInputStream(tmpDirPrefix + "temp.sents.it" + it);
- inStream_feats = new FileInputStream(tmpDirPrefix + "temp.feats.it" + it);
- inStream_stats = new FileInputStream(tmpDirPrefix + "temp.stats.it" + it);
- } else {
- inStream_sents =
- new GZIPInputStream(
- new FileInputStream(tmpDirPrefix + "temp.sents.it" + it + ".gz"));
- inStream_feats =
- new GZIPInputStream(
- new FileInputStream(tmpDirPrefix + "temp.feats.it" + it + ".gz"));
- inStream_stats =
- new GZIPInputStream(
- new FileInputStream(tmpDirPrefix + "temp.stats.it" + it + ".gz"));
- }
-
- inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8"));
- inFile_feats[it] = new BufferedReader(new InputStreamReader(inStream_feats, "utf8"));
- inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8"));
- }
-
-
- InputStream inStream_sentsCurrIt, inStream_featsCurrIt, inStream_statsCurrIt;
- if (compressFiles == 0) {
- inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration);
- inStream_featsCurrIt = new FileInputStream(tmpDirPrefix + "temp.feats.it" + iteration);
- } else {
- inStream_sentsCurrIt =
- new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration
- + ".gz"));
- inStream_featsCurrIt =
- new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.feats.it" + iteration
- + ".gz"));
- }
-
- BufferedReader inFile_sentsCurrIt =
- new BufferedReader(new InputStreamReader(inStream_sentsCurrIt, "utf8"));
- BufferedReader inFile_featsCurrIt =
- new BufferedReader(new InputStreamReader(inStream_featsCurrIt, "utf8"));
-
- BufferedReader inFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below
- // is set to true
- PrintWriter outFile_statsCurrIt = null; // will only be used if statsCurrIt_exists below is
- // set to false
- boolean statsCurrIt_exists = false;
- if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration)) {
- inStream_statsCurrIt = new FileInputStream(tmpDirPrefix + "temp.stats.it" + iteration);
- inFile_statsCurrIt =
- new BufferedReader(new InputStreamReader(inStream_statsCurrIt, "utf8"));
- statsCurrIt_exists = true;
- copyFile(tmpDirPrefix + "temp.stats.it" + iteration, tmpDirPrefix + "temp.stats.it"
- + iteration + ".copy");
- } else if (fileExists(tmpDirPrefix + "temp.stats.it" + iteration + ".gz")) {
- inStream_statsCurrIt =
- new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.stats.it" + iteration
- + ".gz"));
- inFile_statsCurrIt =
- new BufferedReader(new InputStreamReader(inStream_statsCurrIt, "utf8"));
- statsCurrIt_exists = true;
- copyFile(tmpDirPrefix + "temp.stats.it" + iteration + ".gz", tmpDirPrefix
- + "temp.stats.it" + iteration + ".copy.gz");
- } else {
- outFile_statsCurrIt = new PrintWriter(tmpDirPrefix + "temp.stats.it" + iteration);
- }
-
- PrintWriter outFile_statsMerged = new PrintWriter(tmpDirPrefix + "temp.stats.merged");
- // write sufficient statistics from all the sentences
- // from the output files into a single file
- PrintWriter outFile_statsMergedKnown =
- new PrintWriter(tmpDirPrefix + "temp.stats.mergedKnown");
- // write sufficient statistics from all the sentences
- // from the output files into a single file
-
- FileOutputStream outStream_unknownCands =
- new FileOutputStream(tmpDirPrefix + "temp.currIt.unknownCands", false);
- OutputStreamWriter outStreamWriter_unknownCands =
- new OutputStreamWriter(outStream_unknownCands, "utf8");
- BufferedWriter outFile_unknownCands = new BufferedWriter(outStreamWriter_unknownCands);
-
- PrintWriter outFile_unknownIndices =
- new PrintWriter(tmpDirPrefix + "temp.currIt.unknownIndices");
-
-
- String sents_str, feats_str, stats_str;
-
- // BUG: this assumes a candidate string cannot be produced for two
- // different source sentences, which is not necessarily true
- // (It's not actually a bug, but only because existingCandStats gets
- // cleared before moving to the next source sentence.)
- // FIX: should be made an array, indexed by i
- HashMap<String, String> existingCandStats = new HashMap<String, String>();
- // Stores precalculated sufficient statistics for candidates, in case
- // the same candidate is seen again. (SS stored as a String.)
- // Q: Why do we care? If we see the same candidate again, aren't we going
- // to ignore it? So, why do we care about the SS of this repeat candidate?
- // A: A "repeat" candidate may not be a repeat candidate in later
- // iterations if the user specifies a value for prevMERTIterations
- // that causes MERT to skip candidates from early iterations.
- double[] currFeatVal = new double[1 + numParams];
- String[] featVal_str;
-
- int totalCandidateCount = 0;
-
-
-
- int[] sizeUnknown_currIt = new int[numSentences];
-
-
-
- for (int i = 0; i < numSentences; ++i) {
-
- for (int j = 1; j <= initsPerIt; ++j) {
- best1Score[j][i] = NegInf;
- }
-
- for (int it = firstIt; it < iteration; ++it) {
- // Why up to but *excluding* iteration?
- // Because the last iteration is handled a little differently, since
- // the SS must be calculated (and the corresponding file created),
- // which is not true for previous iterations.
-
- for (int n = 0; n <= sizeOfNBest; ++n) {
- // Why up to and *including* sizeOfNBest?
- // So that it would read the "||||||" separator even if there is
- // a complete list of sizeOfNBest candidates.
-
- // for the nth candidate for the ith sentence, read the sentence, feature values,
- // and sufficient statistics from the various temp files
-
- sents_str = inFile_sents[it].readLine();
- feats_str = inFile_feats[it].readLine();
- stats_str = inFile_stats[it].readLine();
-
- if (sents_str.equals("||||||")) {
- n = sizeOfNBest + 1;
- } else if (!existingCandStats.containsKey(sents_str)) {
-
- outFile_statsMergedKnown.println(stats_str);
-
- featVal_str = feats_str.split("\\s+");
-
- /* Sparse (labeled) feature version */
- if (feats_str.indexOf('=') != -1) {
- for (String featurePair: featVal_str) {
- String[] pair = featurePair.split("=");
- String name = pair[0];
- Double value = Double.parseDouble(pair[1]);
- currFeatVal[c_fromParamName(name)] = value;
- }
- } else {
- for (int c = 1; c <= numParams; ++c) {
- try {
- currFeatVal[c] = Double.parseDouble(featVal_str[c - 1]);
- } catch (Exception e) {
- currFeatVal[c] = 0.0;
- }
- // print("fV[" + c + "]=" + currFeatVal[c] + " ",4);
- }
- // println("",4);
- }
-
-
- for (int j = 1; j <= initsPerIt; ++j) {
- double score = 0; // i.e. score assigned by decoder
- for (int c = 1; c <= numParams; ++c) {
- score += initialLambda[j][c] * currFeatVal[c];
- }
- if (score > best1Score[j][i]) {
- best1Score[j][i] = score;
- String[] tempStats = stats_str.split("\\s+");
- for (int s = 0; s < suffStatsCount; ++s)
- best1Cand_suffStats[j][i][s] = Integer.parseInt(tempStats[s]);
- }
- } // for (j)
-
- existingCandStats.put(sents_str, stats_str);
-
- setFeats(featVal_array, i, lastUsedIndex, maxIndex, currFeatVal);
- candCount[i] += 1;
-
- newCandidatesAdded[it] += 1;
-
- } // if unseen candidate
-
- } // for (n)
-
- } // for (it)
-
- outFile_statsMergedKnown.println("||||||");
-
-
- // now process the candidates of the current iteration
- // now determine the new candidates of the current iteration
-
- /*
- * remember: BufferedReader inFile_sentsCurrIt BufferedReader inFile_featsCurrIt
- * PrintWriter outFile_statsCurrIt
- */
-
- String[] sentsCurrIt_currSrcSent = new String[sizeOfNBest + 1];
-
- Vector<String> unknownCands_V = new Vector<String>();
- // which candidates (of the i'th source sentence) have not been seen before
- // this iteration?
-
- for (int n = 0; n <= sizeOfNBest; ++n) {
- // Why up to and *including* sizeOfNBest?
- // So that it would read the "||||||" separator even if there is
- // a complete list of sizeOfNBest candidates.
-
- // for the nth candidate for the ith sentence, read the sentence,
- // and store it in the sentsCurrIt_currSrcSent array
-
- sents_str = inFile_sentsCurrIt.readLine();
- sentsCurrIt_currSrcSent[n] = sents_str; // Note: possibly "||||||"
-
- if (sents_str.equals("||||||")) {
- n = sizeOfNBest + 1;
- } else if (!existingCandStats.containsKey(sents_str)) {
- unknownCands_V.add(sents_str);
- writeLine(sents_str, outFile_unknownCands);
- outFile_unknownIndices.println(i);
- newCandidatesAdded[iteration] += 1;
- existingCandStats.put(sents_str, "U"); // i.e. unknown
- // we add sents_str to avoid duplicate entries in unknownCands_V
- }
-
- } // for (n)
-
-
-
- // now unknownCands_V has the candidates for which we need to calculate
- // sufficient statistics (for the i'th source sentence)
- int sizeUnknown = unknownCands_V.size();
- sizeUnknown_currIt[i] = sizeUnknown;
-
- /*********************************************/
- /*
- * String[] unknownCands = new String[sizeUnknown]; unknownCands_V.toArray(unknownCands);
- * int[] indices = new int[sizeUnknown]; for (int d = 0; d < sizeUnknown; ++d) {
- * existingCandStats.remove(unknownCands[d]); // remove the (unknownCands[d],"U") entry
- * from existingCandStats // (we had added it while constructing unknownCands_V to avoid
- * duplicate entries) indices[d] = i; }
- */
- /*********************************************/
-
- existingCandStats.clear();
-
- } // for (i)
-
- /*
- * int[][] newSuffStats = null; if (!statsCurrIt_exists && sizeUnknown > 0) { newSuffStats =
- * evalMetric.suffStats(unknownCands, indices); }
- */
-
- outFile_statsMergedKnown.close();
- outFile_unknownCands.close();
- outFile_unknownIndices.close();
-
-
- for (int it = firstIt; it < iteration; ++it) {
- inFile_sents[it].close();
- inFile_stats[it].close();
-
- InputStream inStream_sents, inStream_stats;
- if (compressFiles == 0) {
- inStream_sents = new FileInputStream(tmpDirPrefix + "temp.sents.it" + it);
- inStream_stats = new FileInputStream(tmpDirPrefix + "temp.stats.it" + it);
- } else {
- inStream_sents =
- new GZIPInputStream(
- new FileInputStream(tmpDirPrefix + "temp.sents.it" + it + ".gz"));
- inStream_stats =
- new GZIPInputStream(
- new FileInputStream(tmpDirPrefix + "temp.stats.it" + it + ".gz"));
- }
-
- inFile_sents[it] = new BufferedReader(new InputStreamReader(inStream_sents, "utf8"));
- inFile_stats[it] = new BufferedReader(new InputStreamReader(inStream_stats, "utf8"));
- }
-
- inFile_sentsCurrIt.close();
- if (compressFiles == 0) {
- inStream_sentsCurrIt = new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration);
- } else {
- inStream_sentsCurrIt =
- new GZIPInputStream(new FileInputStream(tmpDirPrefix + "temp.sents.it" + iteration
- + ".gz"));
- }
- inFile_sentsCurrIt =
- new BufferedReader(new InputStreamReader(inStream_sentsCurrIt, "utf8"));
-
-
-
- // calculate SS for unseen candidates and write them to file
- FileInputStream inStream_statsCurrIt_unknown = null;
- BufferedReader inFile_statsCurrIt_unknown = null;
-
- if (!statsCurrIt_exists && newCandidatesAdded[iteration] > 0) {
- // create the file...
- evalMetric.createSuffStatsFile(tmpDirPrefix + "temp.currIt.unknownCands", tmpDirPrefix
- + "temp.currIt.unknownIndices", tmpDirPrefix + "temp.stats.unknown", sizeOfNBest);
-
- // ...and open it
- inStream_statsCurrIt_unknown = new FileInputStream(tmpDirPrefix + "temp.stats.unknown");
- inFile_statsCurrIt_unknown =
- new BufferedReader(new InputStreamReader(inStream_statsCurrIt_unknown, "utf8"));
- }
-
- // OPEN mergedKnown file
- FileInputStream instream_statsMergedKnown =
- new FileInputStream(tmpDirPrefix + "temp.stats.mergedKnown");
- BufferedReader inFile_statsMergedKnown =
- new BufferedReader(new InputStreamReader(instream_statsMergedKnown, "utf8"));
-
- for (int i = 0; i < numSentences; ++i) {
-
- // reprocess candidates from previous iterations
- for (int it = firstIt; it < iteration; ++it) {
- for (int n = 0; n <= sizeOfNBest; ++n) {
-
- sents_str = inFile_sents[it].readLine();
- stats_str = inFile_stats[it].readLine();
-
- if (sents_str.equals("||||||")) {
- n = sizeOfNBest + 1;
- } else if (!existingCandStats.containsKey(sents_str)) {
- existingCandStats.put(sents_str, stats_str);
- } // if unseen candidate
-
- } // for (n)
- } // for (it)
-
- // copy relevant portion from mergedKnown to the merged file
- String line_mergedKnown = inFile_statsMergedKnown.readLine();
- while (!line_mergedKnown.equals("||||||")) {
- outFile_statsMerged.println(line_mergedKnown);
- line_mergedKnown = inFile_statsMergedKnown.readLine();
- }
-
- int[] stats = new int[suffStatsCount];
-
- for (int n = 0; n <= sizeOfNBest; ++n) {
- // Why up to and *including* sizeOfNBest?
- // So that it would read the "||||||" separator even if there is
- // a complete list of sizeOfNBest candidates.
-
- // for the nth candidate for the ith sentence, read the sentence, feature values,
- // and sufficient statistics from the various temp files
-
- sents_str = inFile_sentsCurrIt.readLine();
- feats_str = inFile_featsCurrIt.readLine();
-
- if (sents_str.equals("||||||")) {
- n = sizeOfNBest + 1;
- } else if (!existingCandStats.containsKey(sents_str)) {
-
- if (!statsCurrIt_exists) {
- stats_str = inFile_statsCurrIt_unknown.readLine();
-
- String[] temp_stats = stats_str.split("\\s+");
- for (int s = 0; s < suffStatsCount; ++s) {
- stats[s] = Integer.parseInt(temp_stats[s]);
- }
-
- /*
- * stats_str = ""; for (int s = 0; s < suffStatsCount-1; ++s) { stats[s] =
- * newSuffStats[d][s]; stats_str += (stats[s] + " "); } stats[suffStatsCount-1] =
- * newSuffStats[d][suffStatsCount-1]; stats_str += stats[suffStatsCount-1];
- */
-
- outFile_statsCurrIt.println(stats_str);
- } else {
- stats_str = inFile_statsCurrIt.readLine();
- String[] temp_stats = stats_str.split("\\s+");
- for (int s = 0; s < suffStatsCount; ++s) {
- try {
- stats[s] = Integer.parseInt(temp_stats[s]);
- } catch (Exception e) {
- stats[s] = 0;
- }
- }
- }
-
- outFile_statsMerged.println(stats_str);
-
- featVal_str = feats_str.split("\\s+");
-
- if (feats_str.indexOf('=') != -1) {
- for (String featurePair: featVal_str) {
- String[] pair = featurePair.split("=");
- String name = pair[0];
- Double value = Double.parseDouble(pair[1]);
- currFeatVal[c_fromParamName(name)] = value;
- }
- } else {
- for (int c = 1; c <= numParams; ++c) {
- try {
- currFeatVal[c] = Double.parseDouble(featVal_str[c - 1]);
- } catch (Exception e) {
- // NumberFormatException, ArrayIndexOutOfBoundsException
- currFeatVal[c] = 0.0;
- }
-
- // print("fV[" + c + "]=" + currFeatVal[c] + " ",4);
- }
- }
- // println("",4);
-
-
- for (int j = 1; j <= initsPerIt; ++j) {
- double score = 0; // i.e. score assigned by decoder
- for (int c = 1; c <= numParams; ++c) {
- score += initialLambda[j][c] * currFeatVal[c];
- }
- if (score > best1Score[j][i]) {
- best1Score[j][i] = score;
- for (int s = 0; s < suffStatsCount; ++s)
- best1Cand_suffStats[j][i][s] = stats[s];
- }
- } // for (j)
-
- existingCandStats.put(sents_str, stats_str);
-
- setFeats(featVal_array, i, lastUsedIndex, maxIndex, currFeatVal);
- candCount[i] += 1;
-
- // newCandidatesAdded[iteration] += 1;
- // moved to code above detecting new candidates
-
- } else {
- if (statsCurrIt_exists)
- inFile_statsCurrIt.readLine();
- else {
- // write SS to outFile_statsCurrIt
- stats_str = existingCandStats.get(sents_str);
- outFile_statsCurrIt.println(stats_str);
- }
- }
-
- } // for (n)
-
- // now d = sizeUnknown_currIt[i] - 1
-
- if (statsCurrIt_exists)
- inFile_statsCurrIt.readLine();
- else
- outFile_statsCurrIt.println("||||||");
-
- existingCandStats.clear();
- totalCandidateCount += candCount[i];
-
- if ((i + 1) % 500 == 0) {
- print((i + 1) + "\n" + " ", 1);
- } else if ((i + 1) % 100 == 0) {
- print("+", 1);
- } else if ((i + 1) % 25 == 0) {
- print(".", 1);
- }
-
- } // for (i)
-
- inFile_statsMergedKnown.close();
- outFile_statsMerged.close();
-
- println("", 1); // finish progress line
-
- for (int it = firstIt; it < iteration; ++it) {
- inFile_sents[it].close();
- inFile_feats[it].close();
- inFile_stats[it].close();
- }
-
- inFile_sentsCurrIt.close();
- inFile_featsCurrIt.close();
- if (statsCurrIt_exists)
- inFile_statsCurrIt.close();
- else
- outFile_statsCurrIt.close();
-
- if (compressFiles == 1 && !statsCurrIt_exists) {
- gzipFile(tmpDirPrefix + "temp.stats.it" + iteration);
- }
-
- deleteFile(tmpDirPrefix + "temp.currIt.unknownCands");
- deleteFile(tmpDirPrefix + "temp.currIt.unknownIndices");
- deleteFile(tmpDirPrefix + "temp.stats.unknown");
- deleteFile(tmpDirPrefix + "temp.stats.mergedKnown");
-
- // cleanupMemory();
-
- println("Processed " + totalCandidateCount + " distinct candidates " + "(about "
- + totalCandidateCount / numSentences + " per sentence):", 1);
- for (int it = firstIt; it <= iteration; ++it) {
- println("newCandidatesAdded[it=" + it + "] = " + newCandidatesAdded[it] + " (about "
- + newCandidatesAdded[it] / numSentences + " per sentence)", 1);
- }
-
- println("", 1);
-
- } catch (FileNotFoundException e) {
- System.err.println("FileNotFoundException in MertCore.run_single_iteration(6): "
- + e.getMessage());
- System.exit(99901);
- } catch (IOException e) {
- System.err.println("IOException in MertCore.run_single_iteration(6): " + e.getMessage());
- System.exit(99902);
- }
-
-
- if (newCandidatesAdded[iteration] == 0) {
- if (!oneModificationPerIteration) {
- println("No new candidates added in this iteration; exiting Z-MERT.", 1);
- println("", 1);
- println("--- Z-MERT iteration #" + iteration + " ending @ " + (new Date()) + " ---", 1);
- println("", 1);
- return null; // THIS MEANS THAT THE OLD VALUES SHOULD BE KEPT BY THE CALLER
- } else {
- println("Note: No new candidates added in this iteration.", 1);
- }
- }
-
- // run the initsPerIt optimizations, in parallel, across numOptThreads threads
- ExecutorService pool = Executors.newFixedThreadPool(numOptThreads);
- Semaphore blocker = new Semaphore(0);
- Vector<String>[] threadOutput = new Vector[initsPerIt + 1];
-
- for (int j = 1; j <= initsPerIt; ++j) {
- threadOutput[j] = new Vector<String>();
- pool.execute(new IntermediateOptimizer(j, blocker, threadOutput[j], initialLambda[j],
- finalLambda[j], best1Cand_suffStats[j], finalScore, candCount, featVal_array,
- suffStats_array));
- }
-
- pool.shutdown();
-
- try {
- blocker.acquire(initsPerIt);
- } catch (java.lang.InterruptedException e) {
- System.err.println("InterruptedException in MertCore.run_single_iteration(): "
- + e.getMessage());
- System.exit(99906);
- }
-
- // extract output from threadOutput[]
- for (int j = 1; j <= initsPerIt; ++j) {
- for (String str : threadOutput[j]) {
- println(str); // no verbosity check needed; thread already checked
- }
- }
-
- int best_j = 1;
- double bestFinalScore = finalScore[1];
- for (int j = 2; j <= initsPerIt; ++j) {
- if (evalMetric.isBetter(finalScore[j], bestFinalScore)) {
- best_j = j;
- bestFinalScore = finalScore[j];
- }
- }
-
- if (initsPerIt > 1) {
- println("Best final lambda is lambda[j=" + best_j + "] " + "(" + metricName_display + ": "
- + f4.format(bestFinalScore) + ").", 1);
- println("", 1);
- }
-
- FINAL_score = bestFinalScore;
-
- boolean anyParamChanged = false;
- boolean anyParamChangedSignificantly = false;
-
- for (int c = 1; c <= numParams; ++c) {
- if (finalLambda[best_j][c] != lambda[c]) {
- anyParamChanged = true;
- }
- if (Math.abs(finalLambda[best_j][c] - lambda[c]) > stopSigValue) {
- anyParamChangedSignificantly = true;
- }
- }
-
- System.arraycopy(finalLambda[best_j], 1, lambda, 1, numParams);
- println("--- Z-MERT iteration #" + iteration + " ending @ " + (new Date()) + " ---", 1);
- println("", 1);
-
- if (!anyParamChanged) {
- println("No parameter value changed in this iteration; exiting Z-MERT.", 1);
- println("", 1);
- break; // exit for (iteration) loop preemptively
- }
-
- // check if a lambda is outside its threshold range
- for (int c = 1; c <= numParams; ++c) {
- if (lambda[c] < minThValue[c] || lambda[c] > maxThValue[c]) {
- println("Warning: after normalization, lambda[" + c + "]=" + f4.format(lambda[c])
- + " is outside its critical value range.", 1);
- }
- }
-
- // was an early stopping criterion satisfied?
- boolean critSatisfied = false;
- if (!anyParamChangedSignificantly && stopSigValue >= 0) {
- println("Note: No parameter value changed significantly " + "(i.e. by more than "
- + stopSigValue + ") in this iteration.", 1);
- critSatisfied = true;
- }
-
- if (critSatisfied) {
- ++earlyStop;
- println("", 1);
- } else {
- earlyStop = 0;
- }
-
- // if min number of iterations executed, investigate if early exit should happen
- if (iteration >= minIts && earlyStop >= stopMinIts) {
- println("Some early stopping criteria has been observed " + "in " + stopMinIts
- + " consecutive iterations; exiting Z-MERT.", 1);
- println("", 1);
- break; // exit for (iteration) loop preemptively
- }
-
- // if max number of iterations executed, exit
- if (iteration >= maxIts) {
- println("Maximum number of MERT iterations reached; exiting Z-MERT.", 1);
- println("", 1);
- break; // exit for (iteration) loop
- }
-
- println("Next iteration will decode with lambda: " + lambdaToString(lambda), 1);
- println("", 1);
-
- // printMemoryUsage();
- for (int i = 0; i < numSentences; ++i) {
- suffStats_array[i].clear();
- }
- // cleanupMemory();
- // println("",2);
-
-
- retA[2] = 0; // i.e. this should NOT be the last iteration
- done = true;
-
- } // while (!done) // NOTE: this "loop" will only be carried out once
-
-
- // delete .temp.stats.merged file, since it is not needed in the next
- // iteration (it will be recreated from scratch)
- deleteFile(tmpDirPrefix + "temp.stats.merged");
-
- retA[0] = FINAL_score;
- retA[1] = earlyStop;
- return retA;
-
- } // run_single_iteration
-
- /**
-  * Renders entries 1..numParams of a weight vector as "{v1, v2, ..., vN}".
-  * Entry 0 of the array is not included.
-  *
-  * @param lambdaA weight vector; indices 1 through numParams are read
-  * @return the brace-delimited, comma-separated rendering
-  */
- private String lambdaToString(double[] lambdaA) {
-   StringBuilder rendered = new StringBuilder("{");
-
-   // every entry except the last is followed by a separator
-   for (int idx = 1; idx < numParams; ++idx) {
-     rendered.append(lambdaA[idx]).append(", ");
-   }
-
-   rendered.append(lambdaA[numParams]).append("}");
-   return rendered.toString();
- }
-
- private String[] run_decoder(int iteration) {
- String[] retSA = new String[2];
- // [0] name of file to be processed
- // [1] indicates how the output file was obtained:
- // 1: external decoder
- // 2: fake decoder
- // 3: internal decoder
-
- if (fakeFileNameTemplate != null
- && fileExists(fakeFileNamePrefix + iteration + fakeFileNameSuffix)) {
- String fakeFileName = fakeFileNamePrefix + iteration + fakeFileNameSuffix;
- println("Not running decoder; using " + fakeFileName + " instead.", 1);
- /*
- * if (fakeFileName.endsWith(".gz")) { copyFile(fakeFileName,decoderOutFileName+".gz");
- * gunzipFile(decoderOutFileName+".gz"); } else { copyFile(fakeFileName,decoderOutFileName); }
- */
- retSA[0] = fakeFileName;
- retSA[1] = "2";
-
- } else {
- println("Running external decoder...", 1);
-
- try {
- ArrayList<String> cmd = new ArrayList<String>();
- cmd.add(decoderCommandFileName);
-
- if (passIterationToDecoder)
- cmd.add(Integer.toString(iteration));
-
- ProcessBuilder pb = new ProcessBuilder(cmd);
- // this merges the error and output streams of the subprocess
- pb.redirectErrorStream(true);
- Process p = pb.start();
-
- // capture the sub-command's output
- StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), decVerbosity);
- outputGobbler.start();
-
- int decStatus = p.waitFor();
- if (decStatus != validDecoderExitValue) {
- println("Call to decoder returned " + decStatus + "; was expecting "
- + validDecoderExitValue + ".");
- System.exit(30);
- }
- } catch (IOException e) {
- System.err.println("IOException in MertCore.run_decoder(int): " + e.getMessage());
- System.exit(99902);
- } catch (InterruptedException e) {
- System.err.println("InterruptedException in MertCore.run_decoder(int): " + e.getMessage());
- System.exit(99903);
- }
-
- retSA[0] = decoderOutFileName;
- retSA[1] = "1";
-
- }
-
- return retSA;
-
- }
-
- /**
-  * Splits an n-best list into two per-iteration temp files:
-  * tmpDirPrefix + "temp.sents.it" + iteration receives the candidate strings (after
-  * normalize(candidate, textNormMethod)), and tmpDirPrefix + "temp.feats.it" + iteration
-  * receives the matching feature strings. A "||||||" line is written to both files
-  * after the candidates of each source sentence.
-  *
-  * Expected line format of the n-best file:
-  *
-  *   i ||| words of candidate translation . ||| feat-1_val feat-2_val ... feat-numParams_val .*
-  *
-  * Updated September 2012: features can now be named (for sparse feature compatibility).
-  * You must name all features or none of them.
-  *
-  * Blank lines and lines that contain no "|||" are skipped. If compressFiles == 1, both
-  * output files are handed to gzipFile once they are closed. A FileNotFoundException or
-  * IOException is reported on System.err and terminates the JVM.
-  *
-  * @param nbestFileName n-best file to read; read through a GZIPInputStream when the
-  *        name ends with ".gz"
-  * @param iteration iteration number used in the names of the temp files
-  */
- private void produceTempFiles(String nbestFileName, int iteration) {
-   try {
-     String sentsFileName = tmpDirPrefix + "temp.sents.it" + iteration;
-     String featsFileName = tmpDirPrefix + "temp.feats.it" + iteration;
-
-     FileOutputStream outStream_sents = new FileOutputStream(sentsFileName, false);
-     OutputStreamWriter outStreamWriter_sents = new OutputStreamWriter(outStream_sents, "utf8");
-     BufferedWriter outFile_sents = new BufferedWriter(outStreamWriter_sents);
-
-     PrintWriter outFile_feats = new PrintWriter(featsFileName);
-
-     InputStream inStream_nbest = null;
-     if (nbestFileName.endsWith(".gz")) {
-       inStream_nbest = new GZIPInputStream(new FileInputStream(nbestFileName));
-     } else {
-       inStream_nbest = new FileInputStream(nbestFileName);
-     }
-     BufferedReader inFile_nbest =
-         new BufferedReader(new InputStreamReader(inStream_nbest, "utf8"));
-
-     String line;
-     String candidate_str;
-     String feats_str;
-
-     int i = 0; // index of the source sentence currently being written
-     int n = 0; // number of candidates written so far for sentence i
-
-     // The reader is advanced in the loop condition so that the "continue" statements
-     // below always move on to the next line. (Reading only at the bottom of the loop
-     // would re-test the same skipped line forever.)
-     while ((line = inFile_nbest.readLine()) != null) {
-
-       // skip blank lines
-       if (line.equals("")) {
-         continue;
-       }
-
-       // skip lines that aren't formatted correctly
-       if (line.indexOf("|||") == -1) {
-         continue;
-       }
-
-       // in a well formed file, we'd find the nth candidate for the ith sentence
-
-       int read_i = Integer.parseInt((line.substring(0, line.indexOf("|||"))).trim());
-
-       if (read_i != i) {
-         // the previous sentence had fewer than sizeOfNBest candidates; close its list
-         writeLine("||||||", outFile_sents);
-         outFile_feats.println("||||||");
-         n = 0;
-         ++i;
-       }
-
-       line = (line.substring(line.indexOf("|||") + 3)).trim(); // get rid of initial text
-
-       candidate_str = (line.substring(0, line.indexOf("|||"))).trim();
-       feats_str = (line.substring(line.indexOf("|||") + 3)).trim();
-       // get rid of candidate string
-
-       // drop anything that follows the feature field
-       int junk_i = feats_str.indexOf("|||");
-       if (junk_i >= 0) {
-         feats_str = (feats_str.substring(0, junk_i)).trim();
-       }
-
-       writeLine(normalize(candidate_str, textNormMethod), outFile_sents);
-       outFile_feats.println(feats_str);
-
-       ++n;
-       if (n == sizeOfNBest) {
-         // a complete list of sizeOfNBest candidates has been written
-         writeLine("||||||", outFile_sents);
-         outFile_feats.println("||||||");
-         n = 0;
-         ++i;
-       }
-     }
-
-     if (i != numSentences) { // last sentence had too few candidates
-       writeLine("||||||", outFile_sents);
-       outFile_feats.println("||||||");
-     }
-
-     inFile_nbest.close();
-     outFile_sents.close();
-     outFile_feats.close();
-
-     if (compressFiles == 1) {
-       gzipFile(sentsFileName);
-       gzipFile(featsFileName);
-     }
-
-   } catch (FileNotFoundException e) {
-     System.err.println("FileNotFoundException in MertCore.produceTempFiles(int): "
-         + e.getMessage());
-     System.exit(99901);
-   } catch (IOException e) {
-     System.err.println("IOException in MertCore.produceTempFiles(int): " + e.getMessage());
-     System.exit(99902);
-   }
-
- }
-
- /**
-  * Writes cfgFileName as a copy of templateFileName in which every line that starts
-  * with a parameter name followed by a space is replaced by that name and its value
-  * from params[]. All other lines are copied unchanged.
-  *
-  * An IOException is reported on System.err and terminates the JVM.
-  *
-  * @param params parameter values; indices 1 through numParams are consulted
-  * @param cfgFileName name of the config file to create
-  * @param templateFileName name of the template file to read
-  */
- private void createConfigFile(double[] params, String cfgFileName, String templateFileName) {
-   try {
-     BufferedReader template = new BufferedReader(new FileReader(templateFileName));
-     PrintWriter config = new PrintWriter(cfgFileName);
-
-     for (String row = template.readLine(); row != null; row = template.readLine()) {
-       // Unless a parameter name matches, the row is passed through as is.
-       String output = row;
-       for (int p = 1; p <= numParams; ++p) {
-         if (row.startsWith(paramNames[p] + " ")) {
-           // the first matching parameter wins
-           output = paramNames[p] + " " + params[p];
-           break;
-         }
-       }
-       config.println(output);
-     }
-
-     template.close();
-     config.close();
-   } catch (IOException e) {
-     System.err.println("IOException in MertCore.createConfigFile(double[],String,String): "
-         + e.getMessage());
-     System.exit(99902);
-   }
- }
-
- /**
-  * Reads the parameter file named by paramsFileName and fills in lambda[],
-  * defaultLambda[], isOptimizable[], minThValue[], maxThValue[], minRandValue[],
-  * maxRandValue[] and normalizationOptions[].
-  *
-  * For each of the numParams parameters, tokens are consumed in this order:
-  *
-  *   (name tokens) ||| defaultValue Opt|Fix minTh maxTh minRand maxRand
-  *
-  * The four range tokens are read but ignored for "Fix" parameters. The first
-  * non-empty line after the parameter entries is taken as the normalization setting;
-  * the text after its "=" must be one of:
-  *
-  *   none | absval V paramName | maxabsval V | minabsval V | LNorm P V
-  *
-  * Invalid content is reported via println and terminates the JVM with status 21;
-  * a missing parameter file terminates it with status 99901.
-  */
- private void processParamFile() {
-   // process parameter file
-   Scanner inFile_init = null;
-   try {
-     inFile_init = new Scanner(new FileReader(paramsFileName));
-   } catch (FileNotFoundException e) {
-     System.err.println("FileNotFoundException in MertCore.processParamFile(): " + e.getMessage());
-     System.exit(99901);
-   }
-
-   String dummy = "";
-
-   // initialize lambda[] and other related arrays
-   for (int c = 1; c <= numParams; ++c) {
-     // skip parameter name
-     while (!dummy.equals("|||")) {
-       dummy = inFile_init.next();
-     }
-
-     // read default value
-     lambda[c] = inFile_init.nextDouble();
-     defaultLambda[c] = lambda[c];
-
-     // read isOptimizable
-     dummy = inFile_init.next();
-     if (dummy.equals("Opt")) {
-       isOptimizable[c] = true;
-     } else if (dummy.equals("Fix")) {
-       isOptimizable[c] = false;
-     } else {
-       println("Unknown isOptimizable string " + dummy + " (must be either Opt or Fix)");
-       System.exit(21);
-     }
-
-     if (!isOptimizable[c]) { // skip next four values
-       dummy = inFile_init.next();
-       dummy = inFile_init.next();
-       dummy = inFile_init.next();
-       dummy = inFile_init.next();
-     } else {
-       // set minThValue[c] and maxThValue[c] (range for thresholds to investigate)
-       dummy = inFile_init.next();
-       if (dummy.equals("-Inf")) {
-         minThValue[c] = NegInf;
-       } else if (dummy.equals("+Inf")) {
-         println("minThValue[" + c + "] cannot be +Inf!");
-         System.exit(21);
-       } else {
-         minThValue[c] = Double.parseDouble(dummy);
-       }
-
-       dummy = inFile_init.next();
-       if (dummy.equals("-Inf")) {
-         println("maxThValue[" + c + "] cannot be -Inf!");
-         System.exit(21);
-       } else if (dummy.equals("+Inf")) {
-         maxThValue[c] = PosInf;
-       } else {
-         maxThValue[c] = Double.parseDouble(dummy);
-       }
-
-       // set minRandValue[c] and maxRandValue[c] (range for random values)
-       dummy = inFile_init.next();
-       if (dummy.equals("-Inf") || dummy.equals("+Inf")) {
-         println("minRandValue[" + c + "] cannot be -Inf or +Inf!");
-         System.exit(21);
-       } else {
-         minRandValue[c] = Double.parseDouble(dummy);
-       }
-
-       dummy = inFile_init.next();
-       if (dummy.equals("-Inf") || dummy.equals("+Inf")) {
-         println("maxRandValue[" + c + "] cannot be -Inf or +Inf!");
-         System.exit(21);
-       } else {
-         maxRandValue[c] = Double.parseDouble(dummy);
-       }
-
-
-       // check for illogical values
-       if (minThValue[c] > maxThValue[c]) {
-         println("minThValue[" + c + "]=" + minThValue[c] + " > " + maxThValue[c] + "=maxThValue["
-             + c + "]!");
-         System.exit(21);
-       }
-       if (minRandValue[c] > maxRandValue[c]) {
-         println("minRandValue[" + c + "]=" + minRandValue[c] + " > " + maxRandValue[c]
-             + "=maxRandValue[" + c + "]!");
-         System.exit(21);
-       }
-
-       // check for odd values
-       if (!(minThValue[c] <= lambda[c] && lambda[c] <= maxThValue[c])) {
-         println("Warning: lambda[" + c + "] has initial value (" + lambda[c] + ")", 1);
-         println(" that is outside its critical value range " + "[" + minThValue[c] + ","
-             + maxThValue[c] + "]", 1);
-       }
-
-       if (minThValue[c] == maxThValue[c]) {
-         println("Warning: lambda[" + c + "] has " + "minThValue = maxThValue = " + minThValue[c]
-             + ".", 1);
-       }
-
-       if (minRandValue[c] == maxRandValue[c]) {
-         println("Warning: lambda[" + c + "] has " + "minRandValue = maxRandValue = "
-             + minRandValue[c] + ".", 1);
-       }
-
-       if (minRandValue[c] < minThValue[c] || minRandValue[c] > maxThValue[c]
-           || maxRandValue[c] < minThValue[c] || maxRandValue[c] > maxThValue[c]) {
-         println("Warning: The random value range for lambda[" + c + "] is not contained", 1);
-         println(" within its critical value range.", 1);
-       }
-
-     } // if (!isOptimizable[c])
-
-     /*
-      * precision[c] = inFile_init.nextDouble(); if (precision[c] < 0) { println("precision[" + c +
-      * "]=" + precision[c] + " < 0! Must be non-negative."); System.exit(21); }
-      */
-
-   }
-
-   // set normalizationOptions[]
-   // Find the first non-empty line. Scanner.nextLine() throws at end of input instead of
-   // returning null, so hasNextLine() is checked; if no such line exists, origLine stays
-   // empty and the "Unrecognized normalization method" branch below reports it.
-   String origLine = "";
-   while (origLine.length() == 0 && inFile_init.hasNextLine()) {
-     origLine = inFile_init.nextLine();
-   }
-
-
-   // How should a lambda[] vector be normalized (before decoding)?
-   // nO[0] = 0: no normalization
-   // nO[0] = 1: scale so that parameter nO[2] has absolute value nO[1]
-   // nO[0] = 2: scale so that the maximum absolute value is nO[1]
-   // nO[0] = 3: scale so that the minimum absolute value is nO[1]
-   // nO[0] = 4: scale so that the L-nO[1] norm equals nO[2]
-
-   // normalization = none
-   // normalization = absval 1 lm
-   // normalization = maxabsval 1
-   // normalization = minabsval 1
-   // normalization = LNorm 2 1
-
-   dummy = (origLine.substring(origLine.indexOf("=") + 1)).trim();
-   String[] dummyA = dummy.split("\\s+");
-
-   if (dummyA[0].equals("none")) {
-     normalizationOptions[0] = 0;
-   } else if (dummyA[0].equals("absval")) {
-     normalizationOptions[0] = 1;
-     normalizationOptions[1] = Double.parseDouble(dummyA[1]);
-     String pName = dummyA[2];
-     for (int i = 3; i < dummyA.length; ++i) { // in case parameter name has multiple words
-       pName = pName + " " + dummyA[i];
-     }
-     normalizationOptions[2] = c_fromParamName(pName);
-
-     if (normalizationOptions[1] <= 0) {
-       println("Value for the absval normalization method must be positive.");
-       System.exit(21);
-     }
-     if (normalizationOptions[2] == 0) {
-       // report the name that was looked up, not the (zero) lookup result
-       println("Unrecognized feature name " + pName
-           + " for absval normalization method.", 1);
-       System.exit(21);
-     }
-   } else if (dummyA[0].equals("maxabsval")) {
-     normalizationOptions[0] = 2;
-     normalizationOptions[1] = Double.parseDouble(dummyA[1]);
-     if (normalizationOptions[1] <= 0) {
-       println("Value for the maxabsval normalization method must be positive.");
-       System.exit(21);
-     }
-   } else if (dummyA[0].equals("minabsval")) {
-     normalizationOptions[0] = 3;
-     normalizationOptions[1] = Double.parseDouble(dummyA[1]);
-     if (normalizationOptions[1] <= 0) {
-       println("Value for the minabsval normalization method must be positive.");
-       System.exit(21);
-     }
-   } else if (dummyA[0].equals("LNorm")) {
-     normalizationOptions[0] = 4;
-     normalizationOptions[1] = Double.parseDouble(dummyA[1]);
-     normalizationOptions[2] = Double.parseDouble(dummyA[2]);
-     if (normalizationOptions[1] <= 0 || normalizationOptions[2] <= 0) {
-       println("Both values for the LNorm normalization method must be positive.");
-       System.exit(21);
-     }
-   } else {
-     println("Unrecognized normalization method " + dummyA[0] + "; "
-         + "must be one of none, absval, maxabsval, minabsval, and LNorm.");
-     System.exit(21);
-   } // if (dummyA[0])
-
-   inFile_init.close();
- }
-
- /**
-  * Sets numDocuments and docOfSentence[] (one entry per sentence, holding the
-  * 0-based index of the document the sentence is assigned to).
-  *
-  * When docInfoFileName is null, all sentences are assigned to a single document.
-  * Otherwise the file is interpreted according to its number of non-empty lines:
-  *
-  *   fewer lines than sentences (one line per document):
-  *     1) a list of numbers, giving the number of sentences in each document, or
-  *     2) a list of "docName size" pairs, giving each document's name and number
-  *        of sentences;
-  *   as many lines as sentences (one line per sentence):
-  *     3) a list of docNames, naming the document each sentence belongs to, or
-  *     4) a list of docName_number entries, naming the document each sentence
-  *        belongs to and its order in that document ('-' may be used instead of '_').
-  *
-  * A file with more non-empty lines than sentences is not processed at all.
-  * A FileNotFoundException or IOException is reported on System.err and terminates
-  * the JVM.
-  */
- private void processDocInfo() {
-   docOfSentence = new int[numSentences];
-
-   if (docInfoFileName == null) {
-     // Single document: every sentence maps to document 0, which is already the
-     // value of each element of the freshly allocated array.
-     numDocuments = 1;
-     return;
-   }
-
-   try {
-     int nonEmptyLineCount = countNonEmptyLines(docInfoFileName);
-
-     if (nonEmptyLineCount < numSentences) { // format #1 or #2
-       numDocuments = nonEmptyLineCount;
-
-       BufferedReader reader = new BufferedReader(new FileReader(docInfoFileName));
-       String entry = reader.readLine();
-       // the first line decides the format: no space means sizes only (format #1)
-       boolean sizesOnly = !entry.contains(" ");
-
-       int sentIndex = 0;
-       int docIndex = 0;
-       while (docIndex < numDocuments) {
-         if (docIndex > 0) {
-           entry = reader.readLine();
-         }
-
-         int docSize;
-         if (sizesOnly) {
-           docSize = Integer.parseInt(entry);
-         } else {
-           docSize = Integer.parseInt(entry.split("\\s+")[1]);
-         }
-
-         // the next docSize sentences belong to this document
-         for (int remaining = docSize; remaining > 0; --remaining) {
-           docOfSentence[sentIndex] = docIndex;
-           ++sentIndex;
-         }
-
-         ++docIndex;
-       }
-
-       reader.close();
-
-     } else if (nonEmptyLineCount == numSentences) { // format #3 or #4
-
-       // First pass: a repeated line identifies format #3.
-       boolean namesOnly = false;
-       HashSet<String> linesSeen = new HashSet<String>();
-       BufferedReader reader = new BufferedReader(new FileReader(docInfoFileName));
-       for (int s = 0; s < numSentences; ++s) {
-         if (!linesSeen.add(reader.readLine())) {
-           namesOnly = true;
-         }
-       }
-       reader.close();
-
-       // Second pass: number the documents in order of first appearance.
-       HashMap<String, Integer> docOrder = new HashMap<String, Integer>();
-       // maps a document name to the order (0-indexed) in which it was seen
-
-       reader = new BufferedReader(new FileReader(docInfoFileName));
-       for (int s = 0; s < numSentences; ++s) {
-         String entry = reader.readLine();
-
-         String docName;
-         if (namesOnly) {
-           docName = entry;
-         } else {
-           // strip the trailing "_number" (or "-number") part
-           int sep = Math.max(entry.lastIndexOf('_'), entry.lastIndexOf('-'));
-           docName = entry.substring(0, sep);
-         }
-
-         Integer order = docOrder.get(docName);
-         if (order == null) {
-           order = Integer.valueOf(docOrder.size());
-           docOrder.put(docName, order);
-         }
-
-         docOfSentence[s] = order.intValue();
-       }
-       reader.close();
-
-       numDocuments = docOrder.size();
-
-     }
-     // else: badly formatted (more non-empty lines than sentences); nothing is done
-
-   } catch (FileNotFoundException e) {
-     System.err.println("FileNotFoundException in MertCore.processDocInfo(): " + e.getMessage());
-     System.exit(99901);
-   } catch (IOException e) {
-     System.err.println("IOException in MertCore.processDocInfo(): " + e.getMessage());
-     System.exit(99902);
-   }
-
- }
-
- /**
-  * Copies the bytes of one file into another, overwriting the destination.
-  *
-  * @param origFileName path of the file to read
-  * @param newFileName path of the file to write
-  * @return true if the copy completed; false if a FileNotFoundException or IOException occurred
-  *         (a message is printed to System.err in that case)
-  */
- private boolean copyFile(String origFileName, String newFileName) {
-   InputStream in = null;
-   OutputStream out = null;
-   try {
-     in = new FileInputStream(new File(origFileName));
-     out = new FileOutputStream(new File(newFileName));
-
-     byte[] buffer = new byte[1024];
-     int len;
-     // read() returns -1 at end of stream
-     while ((len = in.read(buffer)) != -1) {
-       out.write(buffer, 0, len);
-     }
-
-     // close the output here so that a failure while closing is reported as a failed copy
-     out.close();
-     out = null;
-
-     return true;
-   } catch (FileNotFoundException e) {
-     System.err.println("FileNotFoundException in MertCore.copyFile(String,String): "
-         + e.getMessage());
-     return false;
-   } catch (IOException e) {
-     System.err.println("IOException in MertCore.copyFile(String,String): " + e.getMessage());
-     return false;
-   } finally {
-     // release both streams on every path, including a failure in the middle of the copy
-     if (in != null) {
-       try {
-         in.close();
-       } catch (IOException ignored) {
-         // the outcome of the copy has already been decided above
-       }
-     }
-     if (out != null) {
-       try {
-         out.close();
-       } catch (IOException ignored) {
-         // only reached after an earlier failure, which has already been reported
-       }
-     }
-   }
- }
-
- /**
-  * Renames origFileName to newFileName, first deleting any file already at newFileName.
-  * Prints a warning if the original file does not exist or if the rename fails.
-  *
-  * @param origFileName current path of the file
-  * @param newFileName path the file should have afterwards
-  */
- private void renameFile(String origFileName, String newFileName) {
-   if (!fileExists(origFileName)) {
-     println("Warning: file " + origFileName + " does not exist! (in MertCore.renameFile)", 1);
-     return;
-   }
-
-   // clear the target name before renaming onto it
-   deleteFile(newFileName);
-
-   boolean renamed = new File(origFileName).renameTo(new File(newFileName));
-   if (!renamed) {
-     println("Warning: attempt to rename " + origFileName + " to " + newFileName
-         + " was unsuccessful!", 1);
-   }
- }
-
- /**
-  * Deletes the named file if it exists, printing a warning when the deletion fails.
-  * Does nothing when the file does not exist.
-  *
-  * @param fileName path of the file to delete
-  */
- private void deleteFile(String fileName) {
-   if (!fileExists(fileName)) {
-     return;
-   }
-
-   boolean deleted = new File(fileName).delete();
-   if (!deleted) {
-     println("Warning: attempt to delete " + fileName + " was unsuccessful!", 1);
-   }
- }
-
- /**
-  * Writes one line of text followed by a line separator, then flushes the writer.
-  *
-  * @param line text to write
-  * @param writer destination writer
-  * @throws IOException if writing or flushing fails
-  */
- private void writeLine(String line, BufferedWriter writer) throws IOException {
-   writer.write(line);
-   writer.newLine();
-   writer.flush(); // push the line out immediately rather than leaving it buffered
- }
-
- /**
-  * Wraps up a run: cleans up the decoder (if any), writes the final decoder config file,
-  * removes the working config file and decoder output, restores the original config file name,
-  * and, when finalLambdaFileName is set, writes one "name ||| value" line per parameter to it.
-  *
-  * Exits the JVM if the final lambda file cannot be created or written.
-  */
- public void finish() {
-   if (myDecoder != null) {
-     myDecoder.cleanUp();
-   }
-
-   // create config file with final values
-   createConfigFile(lambda, decoderConfigFileName + ".ZMERT.final", decoderConfigFileName
-       + ".ZMERT.orig");
-
-   // delete current decoder config file and decoder output
-   deleteFile(decoderConfigFileName);
-   deleteFile(decoderOutFileName);
-
-   // restore original name for config file (name was changed
-   // in initialize() so it doesn't get overwritten)
-   renameFile(decoderConfigFileName + ".ZMERT.orig", decoderConfigFileName);
-
-   if (finalLambdaFileName != null) {
-     PrintWriter outFile_lambdas = null;
-     try {
-       outFile_lambdas = new PrintWriter(finalLambdaFileName);
-       for (int c = 1; c <= numParams; ++c) {
-         outFile_lambdas.println(paramNames[c] + " ||| " + lambda[c]);
-       }
-
-       // PrintWriter never throws on a failed write; checkError() flushes and reports it
-       if (outFile_lambdas.checkError()) {
-         throw new IOException("could not write " + finalLambdaFileName);
-       }
-
-     } catch (IOException e) {
-       System.err.println("IOException in MertCore.finish(): " + e.getMessage());
-       System.exit(99902);
-     } finally {
-       if (outFile_lambdas != null) {
-         outFile_lambdas.close();
-       }
-     }
-   }
-
- }
-
- /**
-  * Reads a configuration file and flattens it into a command-line style argument array.
-  *
-  * Everything from the first '#' on a line is a comment. Lines that are empty once the comment
-  * and surrounding whitespace are removed are skipped. Every remaining line must look like
-  * "-xxx XXX"; only -m, -docSet and -damianos may carry more than one value. Any other line is
-  * reported as malformed and the JVM exits.
-  *
-  * @param fileName path of the configuration file
-  * @return the option names and values, in file order
-  */
- private String[] cfgFileToArgsArray(String fileName) {
-   checkFile(fileName);
-
-   Vector<String> argsVector = new Vector<String>();
-
-   BufferedReader inFile = null;
-   try {
-     inFile = new BufferedReader(new FileReader(fileName));
-     String origLine; // kept untouched for error reporting purposes
-     while ((origLine = inFile.readLine()) != null) {
-
-       String line = origLine;
-       int commentStart = line.indexOf('#');
-       if (commentStart != -1) { // discard comment
-         line = line.substring(0, commentStart);
-       }
-       line = line.trim();
-
-       // blank lines, whitespace-only lines and comment-only lines (indented or not) carry no
-       // option
-       if (line.length() == 0) {
-         continue;
-       }
-
-       // now line should look like "-xxx XXX"
-
-       String[] paramA = line.split("\\s+");
-
-       if (paramA.length == 2 && paramA[0].charAt(0) == '-') {
-         argsVector.add(paramA[0]);
-         argsVector.add(paramA[1]);
-       } else if (paramA.length > 2
-           && (paramA[0].equals("-m") || paramA[0].equals("-docSet") || paramA[0]
-               .equals("-damianos"))) {
-         // -m (metricName), -docSet, and -damianos are allowed to have extra options
-         for (int opt = 0; opt < paramA.length; ++opt) {
-           argsVector.add(paramA[opt]);
-         }
-       } else {
-         println("Malformed line in config file:");
-         println(origLine);
-         System.exit(70);
-       }
-
-     }
-   } catch (FileNotFoundException e) {
-     println("Z-MERT configuration file " + fileName + " was not found!");
-     System.err.println("FileNotFoundException in MertCore.cfgFileToArgsArray(String): "
-         + e.getMessage());
-     System.exit(99901);
-   } catch (IOException e) {
-     System.err.println("IOException in MertCore.cfgFileToArgsArray(String): " + e.getMessage());
-     System.exit(99902);
-   } finally {
-     if (inFile != null) {
-       try {
-         inFile.close();
-       } catch (IOException ignored) {
-         // nothing useful can be done about a failed close of a reader
-       }
-     }
-   }
-
-   return argsVector.toArray(new String[argsVector.size()]);
- }
-
- /**
-  * Convenience overload: forwards to processArgsArray(String[], boolean) with firstTime set to
-  * true.
-  *
-  * @param args option names and values to process
-  */
- private void processArgsArray(String[] args) {
-   final boolean firstTime = true;
-   processArgsArray(args, firstTime);
- }
-
- private void processArgsArray(String[] args, boolean firstTime) {
- /* set default values */
- // Relevant files
- dirPrefix = null;
- sourceFileName = null;
- refFileName = "reference.txt";
- refsPerSen = 1;
- textNormMethod = 1;
- paramsFileName = "params.txt";
- docInfoFileName = null;
- finalLambdaFileName = null;
- // MERT specs
- metricName = "BLEU";
- metricName_display = metricName;
- metricOptions = new String[2];
- metricOptions[0] = "4";
- metricOptions[1] = "closest";
- docSubsetInfo = new int[7];
- docSubsetInfo[0] = 0;
- maxMERTIterations = 20;
- prevMERTIterations = 20;
- minMERTIterations = 5;
- stopMinIts = 3;
- stopSigValue = -1;
- //
- // /* possibly other early stopping criteria here */
- //
- numOptThreads = 1;
- saveInterFiles = 3;
- compressFiles = 0;
- initsPerIt = 20;
- oneModificationPerIteration = false;
- randInit = false;
- seed = System.currentTimeMillis();
- // useDisk = 2;
- // Decoder specs
- decoderCommandFileName = null;
- passIterationToDecoder = false;
- decoderOutFileName = "output.nbest";
- validDecoderExitValue = 0;
- decoderConfigFileName = "dec_cfg.txt";
- sizeOfNBest = 100;
- fakeFileNameTemplate = null;
- fakeFileNamePrefix = null;
- fakeFileNameSuffix = null;
- // Output specs
- verbosity = 1;
- decVerbosity = 1;
-
- damianos_method = 0;
- damianos_param = 0.0;
- damianos_mult = 0.0;
-
- int i = 0;
-
- while (i < args.length) {
- String option = args[i];
- // Relevant files
- if (option.equals("-dir")) {
- dirPrefix = args[i + 1];
- } else if (option.equals("-s")) {
- sourceFileName = args[i + 1];
- } else if (option.equals("-r")) {
- refFileName = args[i + 1];
- } else if (option.equals("-rps")) {
- refsPerSen = Integer.parseInt(args[i + 1]);
- if (refsPerSen < 1) {
- println("refsPerSen must be positive.");
- System.exit(10);
- }
- } else if (option.equals("-txtNrm")) {
- textNormMethod = Integer.parseInt(args[i + 1]);
- if (textNormMethod < 0 || textNormMethod > 4) {
- println("textNormMethod should be between 0 and 4");
- System.exit(10);
- }
- } else if (option.equals("-p")) {
- paramsFileName = args[i + 1];
- } else if (option.equals("-docInfo")) {
- docInfoFileName = args[i + 1];
- } else if (option.equals("-fin")) {
- finalLambdaFileName = args[i + 1];
- // MERT specs
- } else if (option.equals("-m")) {
- metricName = args[i + 1];
- metricName_display = metricName;
- if (EvaluationMetric.knownMetricName(metricName)) {
- int optionCount = EvaluationMetric.metricOptionCount(metricName);
- metricOptions = new String[optionCount];
- for (int opt = 0; opt < optionCount; ++opt) {
- metricOptions[opt] = args[i + opt + 2];
- }
- i += optionCount;
- } else {
- println("Unknown metric name " + metricName + ".");
- System.exit(10);
- }
- } else if (option.equals("-docSet")) {
- String method = args[i + 1];
-
- if (method.equals("all")) {
- docSubsetInfo[0] = 0;
- i += 0;
- } else if (method.equals("bottom")) {
- String a = args[i + 2];
- if (a.endsWith("d")) {
- docSubsetInfo[0] = 1;
- a = a.substring(0, a.indexOf("d"));
- } else {
- docSubsetInfo[0] = 2;
- a = a.substring(0, a.indexOf("%"));
- }
- docSubsetInfo[5] = Integer.parseInt(a);
- i += 1;
- } else if (method.equals("top")) {
- String a = args[i + 2];
- if (a.endsWith("d")) {
- docSubsetInfo[0] = 3;
- a = a.substring(0, a.indexOf("d"));
- } else {
- docSubsetInfo[0] = 4;
- a = a.substring(0, a.indexOf("%"));
- }
- docSubsetInfo[5] = Integer.parseInt(a);
- i += 1;
- } else if (method.equals("window")) {
- String a1 = args[i + 2];
- a1 = a1.substring(0, a1.indexOf("d")); // size of window
- String a2 = args[i + 4];
- if (a2.indexOf("p") > 0) {
- docSubsetInfo[0] = 5;
- a2 = a2.substring(0, a2.indexOf("p"));
- } else {
- docSubsetInfo[0] = 6;
- a2 = a2.substring(0, a2.indexOf("r"));
- }
- docSubsetInfo[5] = Integer.parseInt(a1);
- docSubsetInfo[6] = Integer.parseInt(a2);
- i += 3;
- } else {
- println("Unknown docSet method " + method + ".");
- System.exit(10);
- }
- } else if (option.equals("-maxIt")) {
- maxMERTIterations = Integer.parseInt(args[i + 1]);
- if (maxMERTIterations < 1) {
- println("maxMERTIts must be positive.");
- System.exit(10);
- }
- } else if (option.equals("-minIt")) {
- minMERTIterations = Integer.parseInt(args[i + 1]);
- if (minMERTIterations < 1) {
- println("minMERTIts must be positive.");
- System.exit(10);
- }
- } else if (option.equals("-prevIt")) {
- prevMERTIterations = Integer.parseInt(args[i + 1]);
- if (prevMERTIterations < 0) {
- println("prevMERTIts must be non-negative.");
- System.exit(10);
- }
- } else if (option.equals("-stopIt")) {
- stopMinIts = Integer.parseInt(args[i + 1]);
- if (stopMinIts < 1) {
- println("stopMinIts must be positive.");
- System.exit(10);
- }
- } else if (option.equals("-stopSig")) {
- stopSigValue = Double.parseDouble(args[i + 1]);
- }
- //
- // /* possibly other early stopping criteria here */
- //
- else if (option.equals("-thrCnt")) {
- numOptThreads = Integer.parseInt(args[i + 1]);
- if (numOptThreads < 1) {
- println("threadCount must be positive.");
- System.exit(10);
- }
- } else if (option.equals("-save")) {
- saveInterFiles = Integer.parseInt(args[i + 1]);
- if (saveInterFiles < 0 || saveInterFiles > 3) {
- println("save should be between 0 and 3");
- System.exit(10);
- }
- } else if (option.equals("-compress")) {
- compressFiles = Integer.parseInt(args[i + 1]);
- if (compressFiles < 0 || compressFiles > 1) {
- println("compressFiles should be either 0 or 1");
- System.exit(10);
- }
- } else if (option.equals("-ipi")) {
- initsPerIt = Integer.parseInt(args[i + 1]);
- if (initsPerIt < 1) {
- println("initsPerIt must be positive.");
- System.exit(10);
- }
- } else if (option.equals("-opi")) {
- int opi = Integer.parseInt(args[i + 1]);
- if (opi == 1) {
- oneModificationPerIteration = true;
- } else if (opi == 0) {
- oneModificationPerIteration = false;
- } else {
- println("oncePerIt must be either 0 or 1.");
- System.exit(10);
- }
- } else if (option.equals("-rand")) {
- int rand = Integer.parseInt(args[i + 1]);
- if (rand == 1) {
- randInit = true;
- } else if (rand == 0) {
- randInit = false;
- } else {
- println("randInit must be either 0 or 1.");
- System.exit(10);
- }
- } else if (option.equals("-seed")) {
- if (args[i + 1].equals("time")) {
- seed = System.currentTimeMillis();
- } else {
- seed = Long.parseLong(args[i + 1]);
- }
- }
- /*
- * else if (option.equals("-ud")) { useDisk = Integer.parseInt(args[i+1]); if (useDisk < 0 ||
- * useDisk > 2) { println("useDisk should be between 0 and 2"); System.exit(10); } }
- */
- // Decoder specs
- else if (option.equals("-cmd")) {
- decoderCommandFileName = args[i + 1];
-
<TRUNCATED>