You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/08 15:30:41 UTC
[1/2] incubator-joshua git commit: pipeline now allows multiple --tune args
Repository: incubator-joshua
Updated Branches:
refs/heads/master 6fc831ea8 -> 3171b6a8c
pipeline now allows multiple --tune args
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/cd68a9e4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/cd68a9e4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/cd68a9e4
Branch: refs/heads/master
Commit: cd68a9e4300aeecd357cf554b81af5a248f9eb1f
Parents: 6fc831e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Jun 8 09:55:38 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Jun 8 09:55:38 2016 -0400
----------------------------------------------------------------------
scripts/training/pipeline.pl | 26 ++++++++++++++------------
1 file changed, 14 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cd68a9e4/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 6e4f68c..38cb379 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -62,7 +62,7 @@ delete $ENV{GREP_OPTIONS};
die not_defined("JAVA_HOME") unless exists $ENV{JAVA_HOME};
-my (@CORPORA,$TUNE,$TEST,$ALIGNMENT,$SOURCE,$TARGET,@LMFILES,$GRAMMAR_FILE,$GLUE_GRAMMAR_FILE,$_TUNE_GRAMMAR_FILE,$_TEST_GRAMMAR_FILE,$THRAX_CONF_FILE, $_JOSHUA_CONFIG, $_JOSHUA_ARGS);
+my (@CORPORA,@TUNE,$TEST,$ALIGNMENT,$SOURCE,$TARGET,@LMFILES,$GRAMMAR_FILE,$GLUE_GRAMMAR_FILE,$_TUNE_GRAMMAR_FILE,$_TEST_GRAMMAR_FILE,$THRAX_CONF_FILE, $_JOSHUA_CONFIG, $_JOSHUA_ARGS);
my $FIRST_STEP = "SUBSAMPLE";
my $LAST_STEP = "LAST";
my $LMFILTER = "$ENV{HOME}/code/filter/filter";
@@ -241,7 +241,7 @@ my $retval = GetOptions(
"readme=s" => \$README,
"corpus=s" => \@CORPORA,
"parsed-corpus=s" => \$PARSED_CORPUS,
- "tune=s" => \$TUNE,
+ "tune=s" => \@TUNE,
"test=s" => \$TEST,
"prepare!" => \$DO_PREPARE_CORPORA,
"aligner=s" => \$ALIGNER,
@@ -434,9 +434,9 @@ if (@CORPORA == 0 and $STEPS{$FIRST_STEP} < $STEPS{TUNE}) {
}
# make sure a tuning corpus was provided if we're doing tuning
-if (! defined $TUNE and ($STEPS{$FIRST_STEP} <= $STEPS{TUNE}
+if (scalar(@TUNE) == 0 and ($STEPS{$FIRST_STEP} <= $STEPS{TUNE}
and $STEPS{$LAST_STEP} >= $STEPS{TUNE})) {
- print "* FATAL: need a tuning set (--tune)\n";
+ print "* FATAL: need at least one tuning set (--tune)\n";
exit 1;
}
@@ -504,7 +504,9 @@ map {
} (0..$#CORPORA);
# Do the same for tuning and test data, and other files
-$TUNE = get_absolute_path($TUNE);
+map {
+ $TUNE[$_] = get_absolute_path($TUNE[$_]);
+} (0..$#TUNE);
$TEST = get_absolute_path($TEST);
$GRAMMAR_FILE = get_absolute_path($GRAMMAR_FILE);
@@ -609,9 +611,9 @@ if (defined $PARSED_CORPUS) {
$TRAIN{parsed} = get_absolute_path($PARSED_CORPUS);
}
-if ($TUNE) {
- $TUNE{source} = "$TUNE.$SOURCE";
- $TUNE{target} = "$TUNE.$TARGET";
+if (scalar(@TUNE) > 0) {
+ $TUNE{source} = "$TUNE[0].$SOURCE";
+ $TUNE{target} = "$TUNE[0].$TARGET";
if (! -e "$TUNE{source}") {
print "* FATAL: couldn't find tune source file at '$TUNE{source}'\n";
@@ -675,8 +677,8 @@ if (@CORPORA > 0) {
}
# prepare the tuning and development data
-if (defined $TUNE) {
- my $prefixes = prepare_data("tune",[$TUNE],$MAXLEN_TUNE);
+if (@TUNE > 0) {
+ my $prefixes = prepare_data("tune",\@TUNE,$MAXLEN_TUNE);
$TUNE{source} = "$DATA_DIRS{tune}/corpus.$SOURCE";
$TUNE{target} = "$DATA_DIRS{tune}/corpus.$TARGET";
my $ner_return = ner_annotate("$TUNE{source}", "$TUNE{source}.ner", $SOURCE);
@@ -1180,7 +1182,7 @@ TUNE:
# prep the tuning data, unless already prepped
if (! $PREPPED{TUNE}) {
- my $prefixes = prepare_data("tune",[$TUNE],$MAXLEN_TUNE);
+ my $prefixes = prepare_data("tune",\@TUNE,$MAXLEN_TUNE);
$TUNE{source} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$SOURCE";
$TUNE{target} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$TARGET";
$PREPPED{TUNE} = 1;
@@ -1509,7 +1511,7 @@ if (defined $TUNE_GRAMMAR) {
} elsif (-e "tune/model/$basename.packed") {
$TUNE_GRAMMAR = "tune/model/$basename.packed";
} else {
- print STDERR "* FATAL: tune model bundling didn't produce a grammar?";
+ print STDERR "* FATAL: tune model bundling didn't produce a grammar?\n";
exit 1;
}
}
[2/2] incubator-joshua git commit: added whole-grammar packing
Posted by mj...@apache.org.
added whole-grammar packing
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3171b6a8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3171b6a8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3171b6a8
Branch: refs/heads/master
Commit: 3171b6a8ca1d68c90f22b4c7d89d0e8d74fdba1b
Parents: cd68a9e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Jun 8 11:30:25 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Jun 8 11:30:25 2016 -0400
----------------------------------------------------------------------
scripts/training/pipeline.pl | 11 +++++++++++
1 file changed, 11 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3171b6a8/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 38cb379..7ecfed9 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1172,6 +1172,17 @@ if (! defined $GRAMMAR_FILE) {
}
}
+# Pack the entire model! Saves filtering and repacking of tuning and test sets
+if ($DO_PACK_GRAMMARS and ! $DO_FILTER_TM) {
+ $cachepipe->cmd("pack-grammar",
+ "$SCRIPTDIR/support/grammar-packer.pl -a -T $TMPDIR -m $PACKER_MEM -g $GRAMMAR_FILE -o $RUNDIR/grammar.packed",
+ "$RUNDIR/grammar.packed/vocabulary",
+ "$RUNDIR/grammar.packed/encoding",
+ "$RUNDIR/grammar.packed/slice_00000.source");
+ $GRAMMAR_FILE = "$RUNDIR/grammar.packed";
+}
+
+
maybe_quit("THRAX");
maybe_quit("GRAMMAR");
maybe_quit("MODEL");