You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/10/15 12:31:27 UTC

[1/3] incubator-joshua git commit: filtering now only happens if a 6th field is present

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 990711b1d -> 3b6f7e811


filtering now only happens if a 6th field is present


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3a6e9169
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3a6e9169
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3a6e9169

Branch: refs/heads/master
Commit: 3a6e91698ff94406afc1b638a853986d380a1c5e
Parents: 990711b
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Oct 14 16:02:47 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Oct 14 16:02:47 2016 -0400

----------------------------------------------------------------------
 scripts/training/filter-rules.pl | 48 ++++++++++++++++++++---------------
 scripts/training/pipeline.pl     |  2 +-
 2 files changed, 29 insertions(+), 21 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3a6e9169/scripts/training/filter-rules.pl
----------------------------------------------------------------------
diff --git a/scripts/training/filter-rules.pl b/scripts/training/filter-rules.pl
index 68b88ee..bebf02e 100755
--- a/scripts/training/filter-rules.pl
+++ b/scripts/training/filter-rules.pl
@@ -7,25 +7,24 @@
 # of thousands of translation options, due to garbage collection (Moore, 2004), all of
 # which are then retained. These can be filtered out by this script, which will reduce
 # the grammar to contain only the top 100 translation options (by count) for each source
-# side. You just need to provide the field that contains the "Rarity Penalty" computed
-# by thrax. This is field 3 (0-indexed) by default. To filter in this way:
+# side. This only works for grammars that have a 6th |||-delimited field containing
+# rule counts:
 #
-# gzip -cd grammar.gz | filter-rules.pl -t 100 -f 3 | gzip -9n > grammar-filtered.gz
+#   gzip -cd grammar.gz | filter-rules.pl -t 100 | gzip -9n > grammar-filtered.gz
 #
-# You can also filter by using the model weights, say after tuning:
+# If you don't have that field but do have a tuned model (e.g., after tuning), you can
+# filter by the model weights instead:
 #
-# gzip -cd grammar.gz | filter-rules.pl -t 100 -c /path/to/joshua.config -o pt ...
+#   gzip -cd grammar.gz | filter-rules.pl -t 100 -c /path/to/joshua.config -o pt ...
 #
-# Really this should just be built into Thrax, which could use the rarity penalty there.
+# Really this should all just be built into thrax.
 
 use strict;
 use warnings;
 use List::Util qw/max sum/;
 use Getopt::Std;
 
-my %opts = ( 
-  f => 3, # default field for rarity penalty is 3 (0-indexed)
-);
+my %opts = ();
 my $ret = getopts("bps:uvc:t:o:f:", \%opts);
 
 if (!$ret) {
@@ -35,8 +34,7 @@ if (!$ret) {
   print "   -s SCOPE: remove rules with scope > SCOPE (Hopkins & Langmead, 2010)\n";
   print "   -u: remove abstract unary rules\n";
   print "   -v: be verbose\n";
-  print "   -t: only include top N candidates (requires either -f or (-c and -o)\n";
-  print "   -f: rarity penalty field to use when filtering (index or name) to -t without -c (default:3)\n";
+  print "   -t: only include top N candidates\n";
   print "   -c: path to joshua config file\n";
   print "   -o: grammar owner (required for -t)\n";
   exit;
@@ -151,19 +149,29 @@ sub filter_and_print_rules {
     @filtered_rules = splice(@sorted_rules, 0, $opts{t});
     $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
 
-  } elsif ($opts{t} and $opts{f}) {
+  } elsif ($opts{t}) {
     # Filter using field f (0-indexed), which is assumed to be the rarity penalty field
-    my %rarities;
+    my %counts;
     foreach my $rule (@rules) {
       my @tokens = split(/ \|\|\| /, $rule);
-      my $features = $tokens[3];
-      my @features = split(" ", $features);
-      my $rarity = $features[$opts{f}] || 1.0;
-      $rarities{$rule} = 1-log($rarity); # Thrax sets rarity = exp(1-count(e,f)), sigh
+      if (@tokens < 6) {
+        print STDERR "* WARNING, no counts present in field, not filtering\n";
+        delete $opts{t};
+        last;
+      }
+      my $countstr = $tokens[5];
+      my @counts = split(" ", $countstr);
+      my $count = $counts[0];
+      $counts{$rule} = $count;
+    }
+
+    if (len(keys(%counts)) > 0) {
+      my @sorted_rules = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
+      @filtered_rules = splice(@sorted_rules, 0, $opts{t});
+      $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
+    } else {
+      @filtered_rules = @rules;
     }
-    my @sorted_rules = sort { $rarities{$b} <=> $rarities{$a} } keys(%rarities);
-    @filtered_rules = splice(@sorted_rules, 0, $opts{t});
-    $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
 
   } else {
     @filtered_rules = @rules;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3a6e9169/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 7b55b34..6c4d154 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1154,7 +1154,7 @@ if (! defined $GRAMMAR_FILE) {
     system("mv $thrax_file.tmp $thrax_file");
 
     $cachepipe->cmd("thrax-run",
-                    "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -cat $THRAXDIR/final/* | tee grammar-unfiltered.gz | gzip -cd | $JOSHUA/scripts/training/filter-rules.pl -t 100 -f 3 | gzip -9n > grammar.gz",
+                    "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -cat $THRAXDIR/final/* | gzip -cd | $JOSHUA/scripts/training/filter-rules.pl -t 100 | gzip -9n > grammar.gz",
                     "$DATA_DIRS{train}/thrax-input-file",
                     $thrax_file,
                     "grammar.gz");


[3/3] incubator-joshua git commit: bugfix: len -> scalar

Posted by mj...@apache.org.
bugfix: len -> scalar


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3b6f7e81
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3b6f7e81
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3b6f7e81

Branch: refs/heads/master
Commit: 3b6f7e811184cdfa1b6d8fe7552126217668abe1
Parents: 6256361
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sat Oct 15 08:31:17 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sat Oct 15 08:31:17 2016 -0400

----------------------------------------------------------------------
 scripts/training/filter-rules.pl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3b6f7e81/scripts/training/filter-rules.pl
----------------------------------------------------------------------
diff --git a/scripts/training/filter-rules.pl b/scripts/training/filter-rules.pl
index bebf02e..1240f64 100755
--- a/scripts/training/filter-rules.pl
+++ b/scripts/training/filter-rules.pl
@@ -165,7 +165,7 @@ sub filter_and_print_rules {
       $counts{$rule} = $count;
     }
 
-    if (len(keys(%counts)) > 0) {
+    if (scalar(keys(%counts)) > 0) {
       my @sorted_rules = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
       @filtered_rules = splice(@sorted_rules, 0, $opts{t});
       $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);


[2/3] incubator-joshua git commit: bugfix: don't delete temp file before using

Posted by mj...@apache.org.
bugfix: don't delete temp file before using


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/62563615
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/62563615
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/62563615

Branch: refs/heads/master
Commit: 625636152416e1eca9e21a84859ebed237b9af29
Parents: 3a6e916
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sat Oct 15 08:30:51 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sat Oct 15 08:30:51 2016 -0400

----------------------------------------------------------------------
 scripts/training/run_thrax.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/62563615/scripts/training/run_thrax.py
----------------------------------------------------------------------
diff --git a/scripts/training/run_thrax.py b/scripts/training/run_thrax.py
index 4457245..1356b05 100755
--- a/scripts/training/run_thrax.py
+++ b/scripts/training/run_thrax.py
@@ -17,7 +17,7 @@
 # limitations under the License.
 #
 """
-Runs the Z-MERT and PRO tuners.
+Runs Thrax.
 """
 from __future__ import print_function
 from itertools import izip
@@ -91,7 +91,7 @@ paste(args.source_corpus, args.target_corpus, args.alignment_file, thrax_file)
 run('%s/bin/hadoop fs -put %s %s/input-file' % (HADOOP, thrax_file, THRAXDIR))
 
 # Copy the template
-conf_file = tempfile.NamedTemporaryFile(prefix='thrax.conf')
+conf_file = tempfile.NamedTemporaryFile(prefix='thrax.conf', delete=False)
 for line in open(args.thrax_config):
     if not line.startswith('input-file'):
         conf_file.write(line)
@@ -106,5 +106,6 @@ run('%s/bin/hadoop fs -getmerge %s/final/ %s' % (HADOOP, THRAXDIR, args.output_f
 
 # Cleanup
 if not args.debug:
+    os.remove(conf_file.name)  # conf_file is a NamedTemporaryFile object; os.remove needs its path
     os.remove(thrax_file)
     run('%s/bin/hadoop fs -rm -r %s' % (HADOOP, THRAXDIR))