You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/10/19 12:46:10 UTC

incubator-joshua git commit: added template file

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 301f301cd -> 0819b9bb9


added template file


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/0819b9bb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/0819b9bb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/0819b9bb

Branch: refs/heads/master
Commit: 0819b9bb9039374f74edf8ed4c2ff63fd8183d17
Parents: 301f301
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Oct 19 08:45:58 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Oct 19 08:45:58 2016 -0400

----------------------------------------------------------------------
 scripts/training/templates/thrax-phrase-gt.conf | 76 ++++++++++++++++++++
 1 file changed, 76 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0819b9bb/scripts/training/templates/thrax-phrase-gt.conf
----------------------------------------------------------------------
diff --git a/scripts/training/templates/thrax-phrase-gt.conf b/scripts/training/templates/thrax-phrase-gt.conf
new file mode 100644
index 0000000..5d2efe4
--- /dev/null
+++ b/scripts/training/templates/thrax-phrase-gt.conf
@@ -0,0 +1,76 @@
+# this is an example Thrax configuration file
+# <- this symbol indicates a comment
+# each line should be a key-value pair separated by whitespace
+
+###
+### GRAMMAR OPTIONS
+###
+
+grammar     hiero   # or samt
+reverse     false
+source-is-parsed    false
+target-is-parsed    false
+# default-nt    X   # X is the default anyway
+
+min-rule-count 1
+
+# the number of reducers
+reducers 16
+
+# Maximum length of initial phrase pairs. These are set to be shorter than
+# those used by Hiero.
+initial-phrase-length   <MAXPHRLEN>
+lex-source-words        5
+lex-target-words        5
+
+# maximum number of NTs in a rule
+arity                   0
+
+# minimum number of aligned terminals in a rule
+lexicality              1   
+
+# allow adjacent nonterminals on source side
+adjacent-nts    false   
+
+# allow unaligned words at boundaries of phrases
+loose           true
+
+allow-abstract-rules    false
+allow-nonlexical-x      false
+allow-full-sentence-rules   false
+
+nonlex-source-length    5
+nonlex-target-length    5
+nonlex-source-words     5
+nonlex-target-words     5
+
+allow-double-plus    false
+
+rule-span-limit         12
+
+phrase-penalty  2.718
+
+# a whitespace-separated list of features
+# in this example, the features are smoothed phrase translation probabilities and
+# lexical probabilities (in both directions), rarity, phrase penalty, and alignment
+# features        phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
+features        e_given_f_phrase_gt_smoothed f_given_e_phrase_gt_smoothed e_given_f_lex f_given_e_lex rarity phrase-penalty alignment
+
+# joshua is the only option (and the default); later we will want to add formats for other
+# decoders such as moses and cdec, if they use other formats
+output-format   joshua  
+
+# label feature scores? each score will be output as name=score
+label-feature-scores false
+
+amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
+amazon-jar  s3://edu.jhu.cs.jonny/thrax.jar
+amazon-num-instances    15
+
+max-split-size  8388608
+
+# the format should be:
+# foreign sentence ||| english sentence ||| alignment
+# where the english is either parsed or not depending on whether you want
+# SAMT or you want Hiero.
+#input-file  s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en