You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/03 20:53:00 UTC

incubator-joshua git commit: Added script to convert Thrax-extracted phrase tables to Hiero format

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 8ce27c973 -> d49477e9d


Added script to convert Thrax-extracted phrase tables to Hiero format


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/d49477e9
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/d49477e9
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/d49477e9

Branch: refs/heads/master
Commit: d49477e9d59cb832a24ace28e3c26e04c368599b
Parents: 8ce27c9
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Jun 3 16:52:56 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Jun 3 16:52:56 2016 -0400

----------------------------------------------------------------------
 scripts/support/phrase2hiero.pl | 23 +++++++++++++++++++++++
 scripts/training/pipeline.pl    |  5 +++++
 2 files changed, 28 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/d49477e9/scripts/support/phrase2hiero.pl
----------------------------------------------------------------------
diff --git a/scripts/support/phrase2hiero.pl b/scripts/support/phrase2hiero.pl
new file mode 100755
index 0000000..fb2ddb9
--- /dev/null
+++ b/scripts/support/phrase2hiero.pl
@@ -0,0 +1,23 @@
+#!/usr/bin/env perl
+# Matt Post <po...@cs.jhu.edu>
+# June 2016
+
+# Prepends the [X,1] nonterminal to the source and target side of phrase rules
+# so they can be used in the phrase-based decoder. Lines with fewer than three
+# " ||| "-separated fields are copied through unchanged, with a warning.
+# Usage: gzip -cd grammar.gz | phrase2hiero.pl | gzip -9n > grammar.new.gz
+
+use strict;
+use warnings;
+use File::Basename;
+use Getopt::Std;
+
+# No "-C31" on the #! line: env(1) on Linux cannot pass it, so set UTF-8 here.
+binmode $_, ':utf8' for \*STDIN, \*STDOUT, \*STDERR;
+
+while (my $line = <>) {
+  my @tokens = split(/ \|\|\| /, $line);
+  if (@tokens < 3) { warn "input line $. has < 3 fields; left unchanged\n"; print $line; next }
+  $tokens[$_] = "[X,1] $tokens[$_]" for 1, 2;  # fields: 0 = LHS, 1 = source, 2 = target
+  print join(" ||| ", @tokens);
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/d49477e9/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index b0ec4ae..6e4f68c 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1154,6 +1154,11 @@ if (! defined $GRAMMAR_FILE) {
 #perl -pi -e 's/\.?0+\b//g' grammar; 
 
     $GRAMMAR_FILE = "grammar.gz";
+
+    # Convert phrase model to hiero format (Thrax should do this!)
+    if ($GRAMMAR_TYPE eq "phrase") {
+        system("mv grammar.gz grammar.tmp.gz; gzip -cd grammar.tmp.gz | $SCRIPTDIR/support/phrase2hiero.pl | gzip -9n > grammar.gz; rm -rf grammar.tmp.gz");
+     }
   } else {
 
     print STDERR "* FATAL: There was no way to build a grammar, and none was passed in\n";