You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/07/12 19:35:40 UTC

incubator-joshua git commit: bugfix: alignment points were not being adjusted when converting phrase-based grammars

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 5d1fc27dd -> 8ebea3881


bugfix: alignment points were not being adjusted when converting phrase-based grammars

Also simplified the pipeline for converting Moses and Thrax phrase tables into Joshua's Hiero format


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/8ebea388
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/8ebea388
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/8ebea388

Branch: refs/heads/master
Commit: 8ebea3881c740dccec1ef02a4da227ca441e60e9
Parents: 5d1fc27
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Jul 12 15:35:37 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Jul 12 15:35:37 2016 -0400

----------------------------------------------------------------------
 scripts/support/grammar-packer.pl | 25 ++++++--------
 scripts/support/phrase2hiero.pl   | 23 -------------
 scripts/support/phrase2hiero.py   | 59 ++++++++++++++++++++++++++++++++++
 scripts/training/pipeline.pl      | 11 +++++--
 4 files changed, 77 insertions(+), 41 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8ebea388/scripts/support/grammar-packer.pl
----------------------------------------------------------------------
diff --git a/scripts/support/grammar-packer.pl b/scripts/support/grammar-packer.pl
index a0caf3c..47ce66f 100755
--- a/scripts/support/grammar-packer.pl
+++ b/scripts/support/grammar-packer.pl
@@ -12,6 +12,10 @@
 #    input-grammar is the input grammar to be packed
 #    output-dir is the packed grammar directory to write to (default: grammar.packed)
 #    packer-config is the packer config file (default: all floats)
+#
+# This script *requires* the grammar to be in Hiero format. If you are working with a
+# phrase-based model (either extracted from Thrax or from Moses) you must first convert it
+# to Hiero format using the script $JOSHUA/scripts/support/phrase2hiero.py.
 
 use strict;
 use warnings;
@@ -65,22 +69,11 @@ foreach my $grammar (@grammars) {
   my (undef,$sorted_grammar) = tempfile("${name}XXXX", DIR => $opts{T}, UNLINK => 1);
   print STDERR "Sorting grammar to $sorted_grammar...\n" if $opts{v};
 
-  # We need to sort by source side, which is field 1 (for phrase tables not listing the LHS)
-  # or field 2 (convention, Thrax format)
-  chomp(my $first_line = `$CAT $grammar | head -n1`);
-  if ($first_line =~ /^\[/) {
-    # regular grammar
-    if (system("$CAT $grammar | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
-      print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
-      exit 2;
-    }
-  } else {
-    # Moses phrase-based grammar -- prepend nonterminal symbol and -log() the weights
-    if (system("$CAT $grammar | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k1,1 -k2,2 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
-      print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
-      exit 2;
-    }
-  }  
+  # regular grammar
+  if (system("$CAT $grammar | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
+    print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
+    exit 2;
+  }
 
   push(@sorted_grammars, $sorted_grammar);
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8ebea388/scripts/support/phrase2hiero.pl
----------------------------------------------------------------------
diff --git a/scripts/support/phrase2hiero.pl b/scripts/support/phrase2hiero.pl
deleted file mode 100755
index 980d221..0000000
--- a/scripts/support/phrase2hiero.pl
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/perl -C31
-# Matt Post <po...@cs.jhu.edu>
-# June 2016
-
-# Prepends nonterminals to source and target side of phrase rules.
-# This allows them to be used in the phrase-based decoder.
-#
-# Usage: gzip -cd grammar.gz | phrase2hiero.pl | gzip -9n > grammar.new.gz
-
-use strict;
-use warnings;
-use File::Basename;
-use Getopt::Std;
-
-binmode STDOUT, ':utf8';
-binmode STDIN, ':utf8';
-
-while (my $line = <>) {
-  my @tokens = split(/ \|\|\| /, $line);
-  $tokens[1] = "[X,1] $tokens[1]";
-  $tokens[2] = "[X,1] $tokens[2]";
-  print join(" ||| ", @tokens);
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8ebea388/scripts/support/phrase2hiero.py
----------------------------------------------------------------------
diff --git a/scripts/support/phrase2hiero.py b/scripts/support/phrase2hiero.py
new file mode 100755
index 0000000..e741564
--- /dev/null
+++ b/scripts/support/phrase2hiero.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+
+"""
+Prepends nonterminals to source and target side of phrase rules, and also
+increments the alignment points (if present) to match.
+This allows them to be used in the phrase-based decoder.
+
+Usage: gzip -cd grammar.gz | phrase2hiero.py | gzip -9n > grammar.new.gz
+
+Moses input needs no flag: lines not starting with '[' get an [X] left-hand side and -log() of each weight.
+
+Author: Matt Post <po...@cs.jhu.edu>
+Date:   June 2016
+"""
+
+import sys
+import math
+import codecs
+import argparse
+
+reload(sys)  # Python 2: site.py removes sys.setdefaultencoding at startup; reload restores it
+sys.setdefaultencoding('utf-8')
+sys.stdin = codecs.getreader('utf-8')(sys.stdin)  # decode stdin as UTF-8
+sys.stdout = codecs.getwriter('utf-8')(sys.stdout)  # encode stdout as UTF-8
+sys.stdout.encoding = 'utf-8'
+
+def incr(alignment):
+    """Shifts an alignment point such as '0-1' up by one on both sides, giving '1-2'"""
+    ends = alignment.split('-')
+    return '%d-%d' % (1 + int(ends[0]), 1 + int(ends[1]))
+
+def maybelog(value):
+    """Returns -log(value) as a string when value parses as a float, else value as-is"""
+    try:
+        return str(-math.log(float(value)))
+    except ValueError:  # non-numeric token, or a value outside log()'s domain
+        return value
+
+# Convert each rule read from stdin to Hiero format and print it to stdout.
+for line in sys.stdin:
+    # Drop the line terminator: 'print' re-adds one, so leaving it in place would
+    # emit a blank line whenever the last field is not rewritten below
+    line = line.rstrip('\r\n')
+    if not line.strip():
+        continue  # nothing to convert on a blank line
+
+    # Moses phrase tables do not have a left-hand side symbol, add that
+    moses = not line.startswith('[')
+    if moses:
+        line = '[X] ||| ' + line
+    # Fields: LHS, source, target, model weights, alignment (last two optional)
+    tokens = line.split(' ||| ')
+    tokens[1] = '[X,1] ' + tokens[1]
+    tokens[2] = '[X,1] ' + tokens[2]
+    if moses and len(tokens) >= 4:  # take the -log() of each Moses model weight
+        tokens[3] = ' '.join(map(maybelog, tokens[3].split(' ')))
+    if len(tokens) >= 5:  # shift points past the new [X,1]; split() skips empty tokens
+        tokens[4] = ' '.join(map(incr, tokens[4].split()))
+    print ' ||| '.join(tokens)

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8ebea388/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 286370f..3384221 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1120,7 +1120,14 @@ if (! defined $GRAMMAR_FILE) {
                     "model/phrase-table.gz",
         );
 
-    $GRAMMAR_FILE = "model/phrase-table.gz";
+    # Convert the model to Joshua format
+    $cachepipe->cmd("convert-moses-to-joshua",
+                    "$CAT model/phrase-table.gz | $SCRIPTDIR/support/phrase2hiero.py | gzip -9n > grammar.gz",
+                    "model/phrase-table.gz",
+                    "grammar.gz",
+        );
+
+    $GRAMMAR_FILE = "grammar.gz";
 
   } elsif ($GRAMMAR_TYPE eq "samt" or $GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrase") {
 
@@ -1159,7 +1166,7 @@ if (! defined $GRAMMAR_FILE) {
 
     # Convert phrase model to hiero format (Thrax should do this!)
     if ($GRAMMAR_TYPE eq "phrase") {
-        system("mv grammar.gz grammar.tmp.gz; gzip -cd grammar.tmp.gz | $SCRIPTDIR/support/phrase2hiero.pl | gzip -9n > grammar.gz; rm -rf grammar.tmp.gz");
+        system("mv grammar.gz grammar.tmp.gz; gzip -cd grammar.tmp.gz | $SCRIPTDIR/support/phrase2hiero.py | gzip -9n > grammar.gz; rm -rf grammar.tmp.gz");
      }
   } else {