You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/10/15 12:31:27 UTC
[1/3] incubator-joshua git commit: filtering now only happens if a 6th field is present
Repository: incubator-joshua
Updated Branches:
refs/heads/master 990711b1d -> 3b6f7e811
filtering now only happens if a 6th field is present
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3a6e9169
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3a6e9169
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3a6e9169
Branch: refs/heads/master
Commit: 3a6e91698ff94406afc1b638a853986d380a1c5e
Parents: 990711b
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Oct 14 16:02:47 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Oct 14 16:02:47 2016 -0400
----------------------------------------------------------------------
scripts/training/filter-rules.pl | 48 ++++++++++++++++++++---------------
scripts/training/pipeline.pl | 2 +-
2 files changed, 29 insertions(+), 21 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3a6e9169/scripts/training/filter-rules.pl
----------------------------------------------------------------------
diff --git a/scripts/training/filter-rules.pl b/scripts/training/filter-rules.pl
index 68b88ee..bebf02e 100755
--- a/scripts/training/filter-rules.pl
+++ b/scripts/training/filter-rules.pl
@@ -7,25 +7,24 @@
# of thousands of translation options, due to garbage collection (Moore, 2004), all of
# which are then retained. These can be filtered out by this script, which will reduce
# the grammar to contain only the top 100 translation options (by count) for each source
-# side. You just need to provide the field that contains the "Rarity Penalty" computed
-# by thrax. This is field 3 (0-indexed) by default. To filter in this way:
+# side. This only works for grammars that have a 6th |||-delimited field containing
+# rule counts:
#
-# gzip -cd grammar.gz | filter-rules.pl -t 100 -f 3 | gzip -9n > grammar-filtered.gz
+# gzip -cd grammar.gz | filter-rules.pl -t 100 | gzip -9n > grammar-filtered.gz
#
-# You can also filter by using the model weights, say after tuning:
+# If you don't have that field, but do have a tuned model, you can also filter by using that,
+# say after tuning:
#
-# gzip -cd grammar.gz | filter-rules.pl -t 100 -c /path/to/joshua.config -o pt ...
+# gzip -cd grammar.gz | filter-rules.pl -t 100 -c /path/to/joshua.config -o pt ...
#
-# Really this should just be built into Thrax, which could use the rarity penalty there.
+# Really this should all just be built into thrax.
use strict;
use warnings;
use List::Util qw/max sum/;
use Getopt::Std;
-my %opts = (
- f => 3, # default field for rarity penalty is 3 (0-indexed)
-);
+my %opts = ();
my $ret = getopts("bps:uvc:t:o:f:", \%opts);
if (!$ret) {
@@ -35,8 +34,7 @@ if (!$ret) {
print " -s SCOPE: remove rules with scope > SCOPE (Hopkins & Langmead, 2010)\n";
print " -u: remove abstract unary rules\n";
print " -v: be verbose\n";
- print " -t: only include top N candidates (requires either -f or (-c and -o)\n";
- print " -f: rarity penalty field to use when filtering (index or name) to -t without -c (default:3)\n";
+ print " -t: only include top N candidates\n";
print " -c: path to joshua config file\n";
print " -o: grammar owner (required for -t)\n";
exit;
@@ -151,19 +149,29 @@ sub filter_and_print_rules {
@filtered_rules = splice(@sorted_rules, 0, $opts{t});
$SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
- } elsif ($opts{t} and $opts{f}) {
+ } elsif ($opts{t}) {
# Filter using field f (0-indexed), which is assumed to be the rarity penalty field
- my %rarities;
+ my %counts;
foreach my $rule (@rules) {
my @tokens = split(/ \|\|\| /, $rule);
- my $features = $tokens[3];
- my @features = split(" ", $features);
- my $rarity = $features[$opts{f}] || 1.0;
- $rarities{$rule} = 1-log($rarity); # Thrax sets rarity = exp(1-count(e,f)), sigh
+ if (@tokens < 6) {
+ print STDERR "* WARNING, no counts present in field, not filtering\n";
+ delete $opts{t};
+ last;
+ }
+ my $countstr = $tokens[5];
+ my @counts = split(" ", $countstr);
+ my $count = $counts[0];
+ $counts{$rule} = $count;
+ }
+
+ if (len(keys(%counts)) > 0) {
+ my @sorted_rules = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
+ @filtered_rules = splice(@sorted_rules, 0, $opts{t});
+ $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
+ } else {
+ @filtered_rules = @rules;
}
- my @sorted_rules = sort { $rarities{$b} <=> $rarities{$a} } keys(%rarities);
- @filtered_rules = splice(@sorted_rules, 0, $opts{t});
- $SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
} else {
@filtered_rules = @rules;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3a6e9169/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 7b55b34..6c4d154 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1154,7 +1154,7 @@ if (! defined $GRAMMAR_FILE) {
system("mv $thrax_file.tmp $thrax_file");
$cachepipe->cmd("thrax-run",
- "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -cat $THRAXDIR/final/* | tee grammar-unfiltered.gz | gzip -cd | $JOSHUA/scripts/training/filter-rules.pl -t 100 -f 3 | gzip -9n > grammar.gz",
+ "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -cat $THRAXDIR/final/* | gzip -cd | $JOSHUA/scripts/training/filter-rules.pl -t 100 | gzip -9n > grammar.gz",
"$DATA_DIRS{train}/thrax-input-file",
$thrax_file,
"grammar.gz");
[3/3] incubator-joshua git commit: bugfix: len -> scalar
Posted by mj...@apache.org.
bugfix: len -> scalar
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3b6f7e81
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3b6f7e81
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3b6f7e81
Branch: refs/heads/master
Commit: 3b6f7e811184cdfa1b6d8fe7552126217668abe1
Parents: 6256361
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sat Oct 15 08:31:17 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sat Oct 15 08:31:17 2016 -0400
----------------------------------------------------------------------
scripts/training/filter-rules.pl | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3b6f7e81/scripts/training/filter-rules.pl
----------------------------------------------------------------------
diff --git a/scripts/training/filter-rules.pl b/scripts/training/filter-rules.pl
index bebf02e..1240f64 100755
--- a/scripts/training/filter-rules.pl
+++ b/scripts/training/filter-rules.pl
@@ -165,7 +165,7 @@ sub filter_and_print_rules {
$counts{$rule} = $count;
}
- if (len(keys(%counts)) > 0) {
+ if (scalar(keys(%counts)) > 0) {
my @sorted_rules = sort { $counts{$b} <=> $counts{$a} } keys(%counts);
@filtered_rules = splice(@sorted_rules, 0, $opts{t});
$SKIPPED{redundant} += scalar(@sorted_rules) - scalar(@filtered_rules);
[2/3] incubator-joshua git commit: bugfix: don't delete temp file before using
Posted by mj...@apache.org.
bugfix: don't delete temp file before using
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/62563615
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/62563615
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/62563615
Branch: refs/heads/master
Commit: 625636152416e1eca9e21a84859ebed237b9af29
Parents: 3a6e916
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sat Oct 15 08:30:51 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sat Oct 15 08:30:51 2016 -0400
----------------------------------------------------------------------
scripts/training/run_thrax.py | 5 +++--
1 file changed, 3 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/62563615/scripts/training/run_thrax.py
----------------------------------------------------------------------
diff --git a/scripts/training/run_thrax.py b/scripts/training/run_thrax.py
index 4457245..1356b05 100755
--- a/scripts/training/run_thrax.py
+++ b/scripts/training/run_thrax.py
@@ -17,7 +17,7 @@
# limitations under the License.
#
"""
-Runs the Z-MERT and PRO tuners.
+Runs Thrax.
"""
from __future__ import print_function
from itertools import izip
@@ -91,7 +91,7 @@ paste(args.source_corpus, args.target_corpus, args.alignment_file, thrax_file)
run('%s/bin/hadoop fs -put %s %s/input-file' % (HADOOP, thrax_file, THRAXDIR))
# Copy the template
-conf_file = tempfile.NamedTemporaryFile(prefix='thrax.conf')
+conf_file = tempfile.NamedTemporaryFile(prefix='thrax.conf', delete=False)
for line in open(args.thrax_config):
if not line.startswith('input-file'):
conf_file.write(line)
@@ -106,5 +106,6 @@ run('%s/bin/hadoop fs -getmerge %s/final/ %s' % (HADOOP, THRAXDIR, args.output_f
# Cleanup
if not args.debug:
+ os.remove(conf_file)
os.remove(thrax_file)
run('%s/bin/hadoop fs -rm -r %s' % (HADOOP, THRAXDIR))