You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/23 18:45:58 UTC

[47/60] [partial] incubator-joshua git commit: maven multi-module layout 1st commit: moving files into joshua-core

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/phrase_decoder/config
----------------------------------------------------------------------
diff --git a/joshua-core/resources/phrase_decoder/config b/joshua-core/resources/phrase_decoder/config
new file mode 100644
index 0000000..de781e3
--- /dev/null
+++ b/joshua-core/resources/phrase_decoder/config
@@ -0,0 +1,29 @@
+tm = moses -owner pt -maxspan 0 -path resources/phrase_decoder/rules.1.gz -max-source-len 5
+feature-function = StateMinimizingLanguageModel -lm_order 5 -lm_file resources/phrase_decoder/lm.1.gz
+
+search = stack
+
+mark-oovs = false
+pop-limit = 10
+top-n = 1
+
+output-format = %i ||| %s ||| %f ||| %c
+
+include-align-index = true
+reordering-limit = 6
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+feature-function = Distortion
+feature-function = PhrasePenalty -owner pt
+
+OOVPenalty 1.0
+Distortion 0.114849
+WordPenalty -0.201544
+PhrasePenalty -0.236965
+tm_pt_0 0.0370068
+tm_pt_1 0.0495759
+tm_pt_2 0.196742
+tm_pt_3 0.0745423
+lm_0 0.204412452147565

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/phrase_decoder/constrained.config
----------------------------------------------------------------------
diff --git a/joshua-core/resources/phrase_decoder/constrained.config b/joshua-core/resources/phrase_decoder/constrained.config
new file mode 100644
index 0000000..4642650
--- /dev/null
+++ b/joshua-core/resources/phrase_decoder/constrained.config
@@ -0,0 +1,28 @@
+tm = moses pt 0 resources/phrase_decoder/rules.1.gz
+
+lm = kenlm 5 true false 100 resources/phrase_decoder/lm.1.gz
+
+mark-oovs = false
+pop-limit = 10
+top-n = 5
+
+output-format = %i ||| %s ||| %f ||| %c
+
+include-align-index = true
+reordering-limit = 10
+
+# And these are the feature functions to activate.
+feature-function = OOVPenalty
+feature-function = WordPenalty
+feature-function = Distortion
+feature-function = PhrasePenalty -owner pt
+
+OOVPenalty 1.0
+Distortion 0.114849
+WordPenalty -0.201544
+PhrasePenalty -0.236965
+tm_pt_0 0.0370068
+tm_pt_1 0.0495759
+tm_pt_2 0.196742
+tm_pt_3 0.0745423
+lm_0 0.204412452147565

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/phrase_decoder/constrained.output.gold
----------------------------------------------------------------------
diff --git a/joshua-core/resources/phrase_decoder/constrained.output.gold b/joshua-core/resources/phrase_decoder/constrained.output.gold
new file mode 100644
index 0000000..238387c
--- /dev/null
+++ b/joshua-core/resources/phrase_decoder/constrained.output.gold
@@ -0,0 +1,5 @@
+0 ||| President Obama |8-8| to |7-7| hinder |4-4| a strategy |0-1| for |3-3| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-15.792 tm_pt_1=-17.550 tm_pt_2=-14.599 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=7.000 ||| -15.163
+0 ||| President Obama |8-8| to |7-7| hinder |4-4| a |0-0| strategy |1-1| for |3-3| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-16.919 tm_pt_1=-17.550 tm_pt_2=-14.917 tm_pt_3=-18.298 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-24.000 PhrasePenalty=8.000 ||| -15.505
+0 ||| President Obama |8-8| to hinder |3-4| a strategy |0-1| for |7-7| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-14.986 tm_pt_1=-17.951 tm_pt_2=-14.075 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=6.000 ||| -15.762
+0 ||| President Obama |8-8| to hinder |3-4| a |0-0| strategy |1-1| for |7-7| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-16.112 tm_pt_1=-17.951 tm_pt_2=-14.393 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=7.000 ||| -16.103
+0 ||| President Obama |8-8| to |3-3| hinder |4-4| a strategy |0-1| for |7-7| Republican |2-2| re @-@ election |5-6| ||| tm_pt_0=-16.329 tm_pt_1=-17.951 tm_pt_2=-15.136 tm_pt_3=-18.699 lm_0=-29.452 OOVPenalty=0.000 WordPenalty=-4.777 Distortion=-32.000 PhrasePenalty=7.000 ||| -16.257

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/phrase_decoder/lm.1.gz
----------------------------------------------------------------------
diff --git a/joshua-core/resources/phrase_decoder/lm.1.gz b/joshua-core/resources/phrase_decoder/lm.1.gz
new file mode 100644
index 0000000..3f4c453
Binary files /dev/null and b/joshua-core/resources/phrase_decoder/lm.1.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/phrase_decoder/output.gold
----------------------------------------------------------------------
diff --git a/joshua-core/resources/phrase_decoder/output.gold b/joshua-core/resources/phrase_decoder/output.gold
new file mode 100644
index 0000000..509a3de
--- /dev/null
+++ b/joshua-core/resources/phrase_decoder/output.gold
@@ -0,0 +1 @@
+0 ||| a strategy |0-1| republican |2-2| to hinder |3-4| reelection |5-6| Obama |7-8| ||| tm_pt_0=-9.702 tm_pt_1=-10.800 tm_pt_2=-7.543 tm_pt_3=-8.555 lm_0=-19.117 OOVPenalty=0.000 WordPenalty=-3.040 Distortion=0.000 PhrasePenalty=5.000 ||| -7.496

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/phrase_decoder/rules.1.gz
----------------------------------------------------------------------
diff --git a/joshua-core/resources/phrase_decoder/rules.1.gz b/joshua-core/resources/phrase_decoder/rules.1.gz
new file mode 100644
index 0000000..14466e9
Binary files /dev/null and b/joshua-core/resources/phrase_decoder/rules.1.gz differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/wa_grammar
----------------------------------------------------------------------
diff --git a/joshua-core/resources/wa_grammar b/joshua-core/resources/wa_grammar
new file mode 100644
index 0000000..82d0052
--- /dev/null
+++ b/joshua-core/resources/wa_grammar
@@ -0,0 +1,3 @@
+[X] ||| A [X,1] B1 [X,2] B2 C ||| a b [X,2] c1 [X,1] c2 ||| 1 1 1 1 1 1 OOV=1 ||| 0-0 2-1 4-1 5-3 5-5
+[X] ||| U Z1 Z2 ||| n1 u z ||| 1 1 1 1 1 1 OOV=2 ||| 0-1 1-2 2-2
+[X] ||| K ||| k1 k2 k3 n1 n2 n3 ||| 1 1 1 1 1 1 OOV=4 ||| 0-0 0-1 0-2
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/wa_grammar.packed/config
----------------------------------------------------------------------
diff --git a/joshua-core/resources/wa_grammar.packed/config b/joshua-core/resources/wa_grammar.packed/config
new file mode 100644
index 0000000..ebd1bf1
--- /dev/null
+++ b/joshua-core/resources/wa_grammar.packed/config
@@ -0,0 +1 @@
+max-source-len = 6

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/wa_grammar.packed/encoding
----------------------------------------------------------------------
diff --git a/joshua-core/resources/wa_grammar.packed/encoding b/joshua-core/resources/wa_grammar.packed/encoding
new file mode 100644
index 0000000..630f69f
Binary files /dev/null and b/joshua-core/resources/wa_grammar.packed/encoding differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/wa_grammar.packed/slice_00000.alignments
----------------------------------------------------------------------
diff --git a/joshua-core/resources/wa_grammar.packed/slice_00000.alignments b/joshua-core/resources/wa_grammar.packed/slice_00000.alignments
new file mode 100644
index 0000000..f1425eb
Binary files /dev/null and b/joshua-core/resources/wa_grammar.packed/slice_00000.alignments differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/wa_grammar.packed/slice_00000.features
----------------------------------------------------------------------
diff --git a/joshua-core/resources/wa_grammar.packed/slice_00000.features b/joshua-core/resources/wa_grammar.packed/slice_00000.features
new file mode 100644
index 0000000..5a4c774
Binary files /dev/null and b/joshua-core/resources/wa_grammar.packed/slice_00000.features differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/wa_grammar.packed/slice_00000.source
----------------------------------------------------------------------
diff --git a/joshua-core/resources/wa_grammar.packed/slice_00000.source b/joshua-core/resources/wa_grammar.packed/slice_00000.source
new file mode 100644
index 0000000..4607b89
Binary files /dev/null and b/joshua-core/resources/wa_grammar.packed/slice_00000.source differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/wa_grammar.packed/slice_00000.target
----------------------------------------------------------------------
diff --git a/joshua-core/resources/wa_grammar.packed/slice_00000.target b/joshua-core/resources/wa_grammar.packed/slice_00000.target
new file mode 100644
index 0000000..fe11a38
Binary files /dev/null and b/joshua-core/resources/wa_grammar.packed/slice_00000.target differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/wa_grammar.packed/slice_00000.target.lookup
----------------------------------------------------------------------
diff --git a/joshua-core/resources/wa_grammar.packed/slice_00000.target.lookup b/joshua-core/resources/wa_grammar.packed/slice_00000.target.lookup
new file mode 100644
index 0000000..7d82179
Binary files /dev/null and b/joshua-core/resources/wa_grammar.packed/slice_00000.target.lookup differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/resources/wa_grammar.packed/vocabulary
----------------------------------------------------------------------
diff --git a/joshua-core/resources/wa_grammar.packed/vocabulary b/joshua-core/resources/wa_grammar.packed/vocabulary
new file mode 100644
index 0000000..637651e
Binary files /dev/null and b/joshua-core/resources/wa_grammar.packed/vocabulary differ

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e2734396/joshua-core/src/main/java/org/apache/joshua/adagrad/AdaGrad.java
----------------------------------------------------------------------
diff --git a/joshua-core/src/main/java/org/apache/joshua/adagrad/AdaGrad.java b/joshua-core/src/main/java/org/apache/joshua/adagrad/AdaGrad.java
new file mode 100755
index 0000000..0784318
--- /dev/null
+++ b/joshua-core/src/main/java/org/apache/joshua/adagrad/AdaGrad.java
@@ -0,0 +1,160 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *  http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.joshua.adagrad;
+
+import org.apache.joshua.decoder.JoshuaConfiguration;
+import org.apache.joshua.util.FileUtility;
+import org.apache.joshua.util.StreamGobbler;
+
+public class AdaGrad {
+  public static void main(String[] args) throws Exception {
+    JoshuaConfiguration joshuaConfiguration = new JoshuaConfiguration();
+    boolean external = false; // should each AdaGrad iteration be launched externally?
+
+    if (args.length == 1) {
+      if (args[0].equals("-h")) {
+        printAdaGradUsage(args.length, true);
+        System.exit(2);
+      } else {
+        external = false;
+      }
+    } else if (args.length == 3) {
+      external = true;
+    } else {
+      printAdaGradUsage(args.length, false);
+      System.exit(1);
+    }
+
+    if (!external) {
+      AdaGradCore myAdaGrad = new AdaGradCore(args[0], joshuaConfiguration);
+      myAdaGrad.run_AdaGrad(); // optimize lambda[]
+      myAdaGrad.finish();
+    } else {
+
+      int maxMem = Integer.parseInt(args[1]);
+      String configFileName = args[2];
+      String stateFileName = FileUtility.dirname(configFileName) + "/AdaGrad.temp.state";
+      String cp = System.getProperty("java.class.path");
+      boolean done = false;
+      int iteration = 0;
+
+      while (!done) {
+        ++iteration;
+        Runtime rt = Runtime.getRuntime();
+        Process p =
+            rt.exec("java -Xmx" + maxMem + "m -cp " + cp + " org.apache.joshua.adagrad.AdaGradCore " + configFileName
+                + " " + stateFileName + " " + iteration);
+        /*
+         * BufferedReader br_i = new BufferedReader(new InputStreamReader(p.getInputStream()));
+         * BufferedReader br_e = new BufferedReader(new InputStreamReader(p.getErrorStream()));
+         * String dummy_line = null; while ((dummy_line = br_i.readLine()) != null) {
+         * System.out.println(dummy_line); } while ((dummy_line = br_e.readLine()) != null) {
+         * System.out.println(dummy_line); }
+         */
+        StreamGobbler errorGobbler = new StreamGobbler(p.getErrorStream(), 1);
+        StreamGobbler outputGobbler = new StreamGobbler(p.getInputStream(), 1);
+
+        errorGobbler.start();
+        outputGobbler.start();
+
+        int status = p.waitFor();
+
+        if (status == 90) {
+          done = true;
+        } else if (status == 91) {
+          done = false;
+        } else {
+          System.out.println("AdaGrad exiting prematurely (AdaGradCore returned " + status + ")...");
+          break;
+        }
+      }
+    }
+
+    System.exit(0);
+
+  } // main(String[] args)
+
+  public static void printAdaGradUsage(int argsLen, boolean detailed) {
+    if (!detailed) {
+      println("Oops, you provided " + argsLen + " args!");
+      println("");
+      println("Usage:");
+      println("           AdaGrad -maxMem maxMemoryInMB AdaGrad_configFile");
+      println("");
+      println("Where -maxMem specifies the maximum amount of memory (in MB) AdaGrad is");
+      println("allowed to use when performing its calculations (no memroy is needed while");
+      println("the decoder is running),");
+      println("and the config file contains any subset of AdaGrad's 20-some parameters,");
+      println("one per line.  Run   AdaGrad -h   for more details on those parameters.");
+    } else {
+      println("Usage:");
+      println("           AdaGrad -maxMem maxMemoryInMB AdaGrad_configFile");
+      println("");
+      println("Where -maxMem specifies the maximum amount of memory (in MB) AdaGrad is");
+      println("allowed to use when performing its calculations (no memroy is needed while");
+      println("the decoder is running),");
+      println("and the config file contains any subset of AdaGrad's 20-some parameters,");
+      println("one per line.  Those parameters, and their default values, are:");
+      println("");
+      println("Relevant files:");
+      println("  -dir dirPrefix: working directory\n    [[default: null string (i.e. they are in the current directory)]]");
+      println("  -s sourceFile: source sentences (foreign sentences) of the AdaGrad dataset\n    [[default: null string (i.e. file name is not needed by AdaGrad)]]");
+      println("  -r refFile: target sentences (reference translations) of the AdaGrad dataset\n    [[default: reference.txt]]");
+      println("  -rps refsPerSen: number of reference translations per sentence\n    [[default: 1]]");
+      //println("  -txtNrm textNormMethod: how should text be normalized?\n       (0) don't normalize text,\n    or (1) \"NIST-style\", and also rejoin 're, *'s, n't, etc,\n    or (2) apply 1 and also rejoin dashes between letters,\n    or (3) apply 1 and also drop non-ASCII characters,\n    or (4) apply 1+2+3\n    [[default: 1]]");
+      println("  -p paramsFile: file containing parameter names, initial values, and ranges\n    [[default: params.txt]]");
+      //println("  -docInfo documentInfoFile: file informing AdaGrad which document each\n    sentence belongs to\n    [[default: null string (i.e. all sentences are in one 'document')]]");
+      println("  -fin finalLambda: file name for final lambda[] values\n    [[default: null string (i.e. no such file will be created)]]");
+      println("");
+      println("AdaGrad specs:");
+      println("  -m metricName metric options: name of evaluation metric and its options\n    [[default: BLEU 4 closest]]");
+      println("  -maxIt maxAdaGradIts: maximum number of AdaGrad iterations\n    [[default: 20]]");
+      println("  -prevIt prevAdaGradIts: maximum number of previous AdaGrad iterations to\n    construct candidate sets from\n    [[default: 20]]");
+      println("  -minIt minAdaGradIts: number of iterations before considering an early exit\n    [[default: 5]]");
+      println("  -stopIt stopMinIts: some early stopping criterion must be satisfied in\n    stopMinIts *consecutive* iterations before an early exit\n    [[default: 3]]");
+      println("  -stopSig sigValue: early AdaGrad exit if no weight changes by more than sigValue\n    [[default: -1 (i.e. this criterion is never investigated)]]");
+      //println("  -thrCnt threadCount: number of threads to run in parallel when optimizing\n    [[default: 1]]");
+      println("  -save saveInter: save intermediate cfg files (1) or decoder outputs (2)\n    or both (3) or neither (0)\n    [[default: 3]]");
+      println("  -compress compressFiles: should AdaGrad compress the files it produces (1)\n    or not (0)\n    [[default: 0]]");
+      //println("  -ipi initsPerIt: number of intermediate initial points per iteration\n    [[default: 20]]");
+      //println("  -opi oncePerIt: modify a parameter only once per iteration (1) or not (0)\n    [[default: 0]]");
+      //println("  -rand randInit: choose initial point randomly (1) or from paramsFile (0)\n    [[default: 0]]");
+      //println("  -seed seed: seed used to initialize random number generator\n    [[default: time (i.e. value returned by System.currentTimeMillis()]]");
+      // println("  -ud useDisk: reliance on disk (0-2; higher value => more reliance)\n    [[default: 2]]");
+      println("");
+      println("Decoder specs:");
+      println("  -cmd commandFile: name of file containing commands to run the decoder\n    [[default: null string (i.e. decoder is a JoshuaDecoder object)]]");
+      println("  -passIt passIterationToDecoder: should iteration number be passed\n    to command file (1) or not (0)\n    [[default: 0]]");
+      println("  -decOut decoderOutFile: name of the output file produced by the decoder\n    [[default: output.nbest]]");
+      println("  -decExit validExit: value returned by decoder to indicate success\n    [[default: 0]]");
+      println("  -dcfg decConfigFile: name of decoder config file\n    [[default: dec_cfg.txt]]");
+      println("  -N N: size of N-best list (per sentence) generated in each AdaGrad iteration\n    [[default: 100]]");
+      println("");
+      println("Output specs:");
+      println("  -v verbosity: AdaGrad verbosity level (0-2; higher value => more verbose)\n    [[default: 1]]");
+      println("  -decV decVerbosity: should decoder output be printed (1) or ignored (0)\n    [[default: 0]]");
+      println("");
+    }
+  }
+
+  private static void println(Object obj) {
+    System.out.println(obj);
+  }
+
+}