You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/20 02:43:26 UTC
[02/15] incubator-joshua git commit: updated scripts to work with the new format
updated scripts to work with the new format
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/32504c47
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/32504c47
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/32504c47
Branch: refs/heads/JOSHUA-284
Commit: 32504c47bbc90b3fd4a8d02298b9758fa8126a44
Parents: dcc7e7e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Aug 16 18:13:50 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Aug 16 18:13:50 2016 -0400
----------------------------------------------------------------------
scripts/support/phrase2hiero.py | 22 ++++------------------
scripts/training/pipeline.pl | 8 ++------
2 files changed, 6 insertions(+), 24 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/32504c47/scripts/support/phrase2hiero.py
----------------------------------------------------------------------
diff --git a/scripts/support/phrase2hiero.py b/scripts/support/phrase2hiero.py
index e741564..581a823 100755
--- a/scripts/support/phrase2hiero.py
+++ b/scripts/support/phrase2hiero.py
@@ -1,13 +1,10 @@
#!/usr/bin/python
"""
-Prepends nonterminals to source and target side of phrase rules, and also
-increments the alignment points (if present) to match.
-This allows them to be used in the phrase-based decoder.
+Converts a Moses phrase table to a Joshua phrase table. The differences are
+(a) adding an LHS and (b) applying -log() to all the model weights.
-Usage: gzip -cd grammar.gz | phrase2hiero.py [-moses] | gzip -9n > grammar.new.gz
-
-If you specify "-moses", it will also apply -log() to each of the model weights.
+Usage: gzip -cd grammar.gz | phrase2hiero.py | gzip -9n > grammar.new.gz
Author: Matt Post <po...@cs.jhu.edu>
Date: June 2016
@@ -16,7 +13,6 @@ Date: June 2016
import sys
import math
import codecs
-import argparse
reload(sys)
sys.setdefaultencoding('utf-8')
@@ -24,11 +20,6 @@ sys.stdin = codecs.getreader('utf-8')(sys.stdin)
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
sys.stdout.encoding = 'utf-8'
-def incr(alignment):
- """Takes an alignment point (0-1) and increments both sides"""
- points = alignment.split('-')
- return '%d-%d' % (int(points[0]) + 1, int(points[1]) + 1)
-
def maybelog(value):
"""Takes a feature value and returns -log(x) if it is a scalar"""
try:
@@ -46,14 +37,9 @@ for line in sys.stdin:
# Get all the fields
tokens = line.split(r' ||| ')
- tokens[1] = '[X,1] ' + tokens[1]
- tokens[2] = '[X,1] ' + tokens[2]
# take the -log() of each input token
if moses and len(tokens) >= 4:
tokens[3] = ' '.join(map(maybelog, tokens[3].split(' ')))
- if len(tokens) >= 5:
- tokens[4] = ' '.join(map(incr, tokens[4].split(' ')))
-
- print ' ||| '.join(tokens)
+ print ' ||| '.join(tokens),
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/32504c47/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index ea617bc..08933ec 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1123,7 +1123,7 @@ if (! defined $GRAMMAR_FILE) {
# Convert the model to Joshua format
$cachepipe->cmd("convert-moses-to-joshua",
- "$CAT model/phrase-table.gz | $SCRIPTDIR/support/phrase2hiero.py | gzip -9n > grammar.gz",
+ "$CAT model/phrase-table.gz | $SCRIPTDIR/support/phrase2hiero.py -moses | gzip -9n > grammar.gz",
"model/phrase-table.gz",
"grammar.gz",
);
@@ -1165,10 +1165,6 @@ if (! defined $GRAMMAR_FILE) {
$GRAMMAR_FILE = "grammar.gz";
- # Convert phrase model to hiero format (Thrax should do this!)
- if ($GRAMMAR_TYPE eq "phrase") {
- system("mv grammar.gz grammar.tmp.gz; gzip -cd grammar.tmp.gz | $SCRIPTDIR/support/phrase2hiero.py | gzip -9n > grammar.gz; rm -rf grammar.tmp.gz");
- }
} else {
print STDERR "* FATAL: There was no way to build a grammar, and none was passed in\n";
@@ -1181,7 +1177,7 @@ if (! defined $GRAMMAR_FILE) {
}
# Pack the entire model! Saves filtering and repacking of tuning and test sets
-if ($DO_PACK_GRAMMARS and ! $DO_FILTER_TM) {
+if ($DO_PACK_GRAMMARS and ! $DO_FILTER_TM and ! -e "grammar.packed") {
$cachepipe->cmd("pack-grammar",
"$SCRIPTDIR/support/grammar-packer.pl -a -T $TMPDIR -m $PACKER_MEM -g $GRAMMAR_FILE -o $RUNDIR/grammar.packed",
"$RUNDIR/grammar.packed/vocabulary",