You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/20 02:43:26 UTC

[02/15] incubator-joshua git commit: updated scripts to work with the new format

Updated scripts to work with the new format.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/32504c47
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/32504c47
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/32504c47

Branch: refs/heads/JOSHUA-284
Commit: 32504c47bbc90b3fd4a8d02298b9758fa8126a44
Parents: dcc7e7e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Aug 16 18:13:50 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Aug 16 18:13:50 2016 -0400

----------------------------------------------------------------------
 scripts/support/phrase2hiero.py | 22 ++++------------------
 scripts/training/pipeline.pl    |  8 ++------
 2 files changed, 6 insertions(+), 24 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/32504c47/scripts/support/phrase2hiero.py
----------------------------------------------------------------------
diff --git a/scripts/support/phrase2hiero.py b/scripts/support/phrase2hiero.py
index e741564..581a823 100755
--- a/scripts/support/phrase2hiero.py
+++ b/scripts/support/phrase2hiero.py
@@ -1,13 +1,10 @@
 #!/usr/bin/python
 
 """
-Prepends nonterminals to source and target side of phrase rules, and also
-increments the alignment points (if present) to match.
-This allows them to be used in the phrase-based decoder.
+Converts a Moses phrase table to a Joshua phrase table. The differences are
+(a) adding an LHS and (b) applying -log() to all the model weights.
 
-Usage: gzip -cd grammar.gz | phrase2hiero.py [-moses] | gzip -9n > grammar.new.gz
-
-If you specify "-moses", it will also apply -log() to each of the model weights.
+Usage: gzip -cd grammar.gz | phrase2hiero.py | gzip -9n > grammar.new.gz
 
 Author: Matt Post <po...@cs.jhu.edu>
 Date:   June 2016
@@ -16,7 +13,6 @@ Date:   June 2016
 import sys
 import math
 import codecs
-import argparse
 
 reload(sys)
 sys.setdefaultencoding('utf-8')
@@ -24,11 +20,6 @@ sys.stdin = codecs.getreader('utf-8')(sys.stdin)
 sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
 sys.stdout.encoding = 'utf-8'
 
-def incr(alignment):
-    """Takes an alignment point (0-1) and increments both sides"""
-    points = alignment.split('-')
-    return '%d-%d' % (int(points[0]) + 1, int(points[1]) + 1)
-
 def maybelog(value):
     """Takes a feature value and returns -log(x) if it is a scalar"""
     try:
@@ -46,14 +37,9 @@ for line in sys.stdin:
 
     # Get all the fields
     tokens = line.split(r' ||| ')
-    tokens[1] = '[X,1] ' + tokens[1]
-    tokens[2] = '[X,1] ' + tokens[2]
 
     # take the -log() of each input token
     if moses and len(tokens) >= 4:
         tokens[3] = ' '.join(map(maybelog, tokens[3].split(' ')))
 
-    if len(tokens) >= 5:
-        tokens[4] = ' '.join(map(incr, tokens[4].split(' ')))
-
-    print ' ||| '.join(tokens)
+    print ' ||| '.join(tokens),

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/32504c47/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index ea617bc..08933ec 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1123,7 +1123,7 @@ if (! defined $GRAMMAR_FILE) {
 
     # Convert the model to Joshua format
     $cachepipe->cmd("convert-moses-to-joshua",
-                    "$CAT model/phrase-table.gz | $SCRIPTDIR/support/phrase2hiero.py | gzip -9n > grammar.gz",
+                    "$CAT model/phrase-table.gz | $SCRIPTDIR/support/phrase2hiero.py -moses | gzip -9n > grammar.gz",
                     "model/phrase-table.gz",
                     "grammar.gz",
         );
@@ -1165,10 +1165,6 @@ if (! defined $GRAMMAR_FILE) {
 
     $GRAMMAR_FILE = "grammar.gz";
 
-    # Convert phrase model to hiero format (Thrax should do this!)
-    if ($GRAMMAR_TYPE eq "phrase") {
-        system("mv grammar.gz grammar.tmp.gz; gzip -cd grammar.tmp.gz | $SCRIPTDIR/support/phrase2hiero.py | gzip -9n > grammar.gz; rm -rf grammar.tmp.gz");
-     }
   } else {
 
     print STDERR "* FATAL: There was no way to build a grammar, and none was passed in\n";
@@ -1181,7 +1177,7 @@ if (! defined $GRAMMAR_FILE) {
 }
 
 # Pack the entire model! Saves filtering and repacking of tuning and test sets
-if ($DO_PACK_GRAMMARS and ! $DO_FILTER_TM) {
+if ($DO_PACK_GRAMMARS and ! $DO_FILTER_TM and ! -e "grammar.packed") {
   $cachepipe->cmd("pack-grammar",
                   "$SCRIPTDIR/support/grammar-packer.pl -a -T $TMPDIR -m $PACKER_MEM -g $GRAMMAR_FILE -o $RUNDIR/grammar.packed",
                   "$RUNDIR/grammar.packed/vocabulary",