You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/10/19 12:46:10 UTC
incubator-joshua git commit: added template file
Repository: incubator-joshua
Updated Branches:
refs/heads/master 301f301cd -> 0819b9bb9
added template file
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/0819b9bb
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/0819b9bb
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/0819b9bb
Branch: refs/heads/master
Commit: 0819b9bb9039374f74edf8ed4c2ff63fd8183d17
Parents: 301f301
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Oct 19 08:45:58 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Oct 19 08:45:58 2016 -0400
----------------------------------------------------------------------
scripts/training/templates/thrax-phrase-gt.conf | 76 ++++++++++++++++++++
1 file changed, 76 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/0819b9bb/scripts/training/templates/thrax-phrase-gt.conf
----------------------------------------------------------------------
diff --git a/scripts/training/templates/thrax-phrase-gt.conf b/scripts/training/templates/thrax-phrase-gt.conf
new file mode 100644
index 0000000..5d2efe4
--- /dev/null
+++ b/scripts/training/templates/thrax-phrase-gt.conf
@@ -0,0 +1,76 @@
+# this is an example Thrax configuration file
+# <- this symbol indicates a comment
+# each line should be a key-value pair separated by whitespace
+
+###
+### GRAMMAR OPTIONS
+###
+
+grammar hiero # or samt
+reverse false
+source-is-parsed false
+target-is-parsed false
+# default-nt X # X is the default anyway
+
+min-rule-count 1
+
+# the number of reducers
+reducers 16
+
+# Maximum length of initial phrase pairs. These are set to be shorter than
+# those used by Hiero.
+initial-phrase-length <MAXPHRLEN>
+lex-source-words 5
+lex-target-words 5
+
+# maximum number of NTs in a rule
+arity 0
+
+# minimum number of aligned terminals in a rule
+lexicality 1
+
+# allow adjacent nonterminals on source side
+adjacent-nts false
+
+# allow unaligned words at boundaries of phrases
+loose true
+
+allow-abstract-rules false
+allow-nonlexical-x false
+allow-full-sentence-rules false
+
+nonlex-source-length 5
+nonlex-target-length 5
+nonlex-source-words 5
+nonlex-target-words 5
+
+allow-double-plus false
+
+rule-span-limit 12
+
+phrase-penalty 2.718
+
+# a whitespace-separated list of features
+# in this example, the active features are the gt-smoothed phrase translation
+# probabilities and lexical probabilities (both directions), rarity, phrase penalty, and alignment
+# features phrase-penalty e2fphrase f2ephrase lexprob lexical abstract adjacent x-rule source-terminals-without-target target-terminals-without-source monotonic glue-rule rarity target-word-count unaligned-count
+features e_given_f_phrase_gt_smoothed f_given_e_phrase_gt_smoothed e_given_f_lex f_given_e_lex rarity phrase-penalty alignment
+
+# the only option, and the default; later we will want to add formats for other decoders such as
+# moses and cdec, if they use other formats
+output-format joshua
+
+# label feature scores? each score will be output as name=score
+label-feature-scores false
+
+amazon-work s3://edu.jhu.cs.jonny/wmt11/fr-en/hiero
+amazon-jar s3://edu.jhu.cs.jonny/thrax.jar
+amazon-num-instances 15
+
+max-split-size 8388608
+
+# the format should be:
+# foreign sentence ||| english sentence ||| alignment
+# where the english is either parsed or not depending on whether you want
+# SAMT or you want Hiero.
+#input-file s3://edu.jhu.cs.jonny/wmt11/corpus.fr-en