You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/07/12 19:35:40 UTC
incubator-joshua git commit: bugfix: alignment points were not being
adjusted when converting phrase-based grammars
Repository: incubator-joshua
Updated Branches:
refs/heads/master 5d1fc27dd -> 8ebea3881
bugfix: alignment points were not being adjusted when converting phrase-based grammars
Also simplified the pipeline for converting Moses and Thrax phrase tables into Joshua's Hiero format
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/8ebea388
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/8ebea388
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/8ebea388
Branch: refs/heads/master
Commit: 8ebea3881c740dccec1ef02a4da227ca441e60e9
Parents: 5d1fc27
Author: Matt Post <po...@cs.jhu.edu>
Authored: Tue Jul 12 15:35:37 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Tue Jul 12 15:35:37 2016 -0400
----------------------------------------------------------------------
scripts/support/grammar-packer.pl | 25 ++++++--------
scripts/support/phrase2hiero.pl | 23 -------------
scripts/support/phrase2hiero.py | 59 ++++++++++++++++++++++++++++++++++
scripts/training/pipeline.pl | 11 +++++--
4 files changed, 77 insertions(+), 41 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8ebea388/scripts/support/grammar-packer.pl
----------------------------------------------------------------------
diff --git a/scripts/support/grammar-packer.pl b/scripts/support/grammar-packer.pl
index a0caf3c..47ce66f 100755
--- a/scripts/support/grammar-packer.pl
+++ b/scripts/support/grammar-packer.pl
@@ -12,6 +12,10 @@
# input-grammar is the input grammar to be packed
# output-dir is the packed grammar directory to write to (default: grammar.packed)
# packer-config is the packer config file (default: all floats)
+#
+# This script *requires* the grammar to be in Hiero format. If you are working with a
+# phrase-based model (either extracted from Thrax or from Moses) you must first convert it
+# to Hiero format using the script $JOSHUA/scripts/support/phrase2hiero.py.
use strict;
use warnings;
@@ -65,22 +69,11 @@ foreach my $grammar (@grammars) {
my (undef,$sorted_grammar) = tempfile("${name}XXXX", DIR => $opts{T}, UNLINK => 1);
print STDERR "Sorting grammar to $sorted_grammar...\n" if $opts{v};
- # We need to sort by source side, which is field 1 (for phrase tables not listing the LHS)
- # or field 2 (convention, Thrax format)
- chomp(my $first_line = `$CAT $grammar | head -n1`);
- if ($first_line =~ /^\[/) {
- # regular grammar
- if (system("$CAT $grammar | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
- print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
- exit 2;
- }
- } else {
- # Moses phrase-based grammar -- prepend nonterminal symbol and -log() the weights
- if (system("$CAT $grammar | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k1,1 -k2,2 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
- print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
- exit 2;
- }
- }
+ # regular grammar
+ if (system("$CAT $grammar | sed 's/ ||| /\t/g' | LC_ALL=C sort -t'\t' -k2,2 -k3,3 --buffer-size=$opts{m} -T $opts{T} | sed 's/\t/ ||| /g' | gzip -9n > $sorted_grammar")) {
+ print STDERR "* FATAL: Couldn't sort the grammar (not enough memory? short on tmp space?)\n";
+ exit 2;
+ }
push(@sorted_grammars, $sorted_grammar);
}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8ebea388/scripts/support/phrase2hiero.pl
----------------------------------------------------------------------
diff --git a/scripts/support/phrase2hiero.pl b/scripts/support/phrase2hiero.pl
deleted file mode 100755
index 980d221..0000000
--- a/scripts/support/phrase2hiero.pl
+++ /dev/null
@@ -1,23 +0,0 @@
-#!/usr/bin/perl -C31
-# Matt Post <po...@cs.jhu.edu>
-# June 2016
-
-# Prepends nonterminals to source and target side of phrase rules.
-# This allows them to be used in the phrase-based decoder.
-#
-# Usage: gzip -cd grammar.gz | phrase2hiero.pl | gzip -9n > grammar.new.gz
-
-use strict;
-use warnings;
-use File::Basename;
-use Getopt::Std;
-
-binmode STDOUT, ':utf8';
-binmode STDIN, ':utf8';
-
-while (my $line = <>) {
- my @tokens = split(/ \|\|\| /, $line);
- $tokens[1] = "[X,1] $tokens[1]";
- $tokens[2] = "[X,1] $tokens[2]";
- print join(" ||| ", @tokens);
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8ebea388/scripts/support/phrase2hiero.py
----------------------------------------------------------------------
diff --git a/scripts/support/phrase2hiero.py b/scripts/support/phrase2hiero.py
new file mode 100755
index 0000000..e741564
--- /dev/null
+++ b/scripts/support/phrase2hiero.py
@@ -0,0 +1,59 @@
+#!/usr/bin/python
+
+"""
+Prepends nonterminals to source and target side of phrase rules, and also
+increments the alignment points (if present) to match.
+This allows them to be used in the phrase-based decoder.
+
+Usage: gzip -cd grammar.gz | phrase2hiero.py | gzip -9n > grammar.new.gz
+
+Moses phrase tables (lines not starting with "[") are detected automatically and also get -log() applied to each of the model weights.
+
+Author: Matt Post <po...@cs.jhu.edu>
+Date: June 2016
+"""
+
+import sys
+import math
+import codecs
+import argparse
+
+reload(sys)
+sys.setdefaultencoding('utf-8')
+sys.stdin = codecs.getreader('utf-8')(sys.stdin)
+sys.stdout = codecs.getwriter('utf-8')(sys.stdout)
+sys.stdout.encoding = 'utf-8'
+
+def incr(alignment):
+ """Takes an alignment point (0-1) and increments both sides"""
+ points = alignment.split('-')
+ return '%d-%d' % (int(points[0]) + 1, int(points[1]) + 1)
+
+def maybelog(value):
+ """Takes a feature value and returns -log(x) if it is a scalar"""
+ try:
+ return str(-1.0 * math.log(float(value)))
+ except ValueError:
+ return value
+
+for line in sys.stdin:
+ moses = False
+
+ # Moses phrase tables do not have a left-hand side symbol, add that
+ if not line.startswith('['):
+ line = '[X] ||| ' + line
+ moses = True
+
+ # Get all the fields
+ tokens = line.split(r' ||| ')
+ tokens[1] = '[X,1] ' + tokens[1]
+ tokens[2] = '[X,1] ' + tokens[2]
+
+ # for Moses input, take the -log() of each feature value in the weights field
+ if moses and len(tokens) >= 4:
+ tokens[3] = ' '.join(map(maybelog, tokens[3].split(' ')))
+
+ if len(tokens) >= 5:
+ tokens[4] = ' '.join(map(incr, tokens[4].split(' ')))
+
+ print ' ||| '.join(tokens)
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/8ebea388/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 286370f..3384221 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1120,7 +1120,14 @@ if (! defined $GRAMMAR_FILE) {
"model/phrase-table.gz",
);
- $GRAMMAR_FILE = "model/phrase-table.gz";
+ # Convert the model to Joshua format
+ $cachepipe->cmd("convert-moses-to-joshua",
+ "$CAT model/phrase-table.gz | $SCRIPTDIR/support/phrase2hiero.py | gzip -9n > grammar.gz",
+ "model/phrase-table.gz",
+ "grammar.gz",
+ );
+
+ $GRAMMAR_FILE = "grammar.gz";
} elsif ($GRAMMAR_TYPE eq "samt" or $GRAMMAR_TYPE eq "hiero" or $GRAMMAR_TYPE eq "phrase") {
@@ -1159,7 +1166,7 @@ if (! defined $GRAMMAR_FILE) {
# Convert phrase model to hiero format (Thrax should do this!)
if ($GRAMMAR_TYPE eq "phrase") {
- system("mv grammar.gz grammar.tmp.gz; gzip -cd grammar.tmp.gz | $SCRIPTDIR/support/phrase2hiero.pl | gzip -9n > grammar.gz; rm -rf grammar.tmp.gz");
+ system("mv grammar.gz grammar.tmp.gz; gzip -cd grammar.tmp.gz | $SCRIPTDIR/support/phrase2hiero.py | gzip -9n > grammar.gz; rm -rf grammar.tmp.gz");
}
} else {