You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/06/08 15:30:41 UTC

[1/2] incubator-joshua git commit: pipeline now allows multiple --tune args

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 6fc831ea8 -> 3171b6a8c


pipeline now allows multiple --tune args


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/cd68a9e4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/cd68a9e4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/cd68a9e4

Branch: refs/heads/master
Commit: cd68a9e4300aeecd357cf554b81af5a248f9eb1f
Parents: 6fc831e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Jun 8 09:55:38 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Jun 8 09:55:38 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/cd68a9e4/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 6e4f68c..38cb379 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -62,7 +62,7 @@ delete $ENV{GREP_OPTIONS};
 
 die not_defined("JAVA_HOME") unless exists $ENV{JAVA_HOME};
 
-my (@CORPORA,$TUNE,$TEST,$ALIGNMENT,$SOURCE,$TARGET,@LMFILES,$GRAMMAR_FILE,$GLUE_GRAMMAR_FILE,$_TUNE_GRAMMAR_FILE,$_TEST_GRAMMAR_FILE,$THRAX_CONF_FILE, $_JOSHUA_CONFIG, $_JOSHUA_ARGS);
+my (@CORPORA,@TUNE,$TEST,$ALIGNMENT,$SOURCE,$TARGET,@LMFILES,$GRAMMAR_FILE,$GLUE_GRAMMAR_FILE,$_TUNE_GRAMMAR_FILE,$_TEST_GRAMMAR_FILE,$THRAX_CONF_FILE, $_JOSHUA_CONFIG, $_JOSHUA_ARGS);
 my $FIRST_STEP = "SUBSAMPLE";
 my $LAST_STEP  = "LAST";
 my $LMFILTER = "$ENV{HOME}/code/filter/filter";
@@ -241,7 +241,7 @@ my $retval = GetOptions(
   "readme=s"    => \$README,
   "corpus=s"        => \@CORPORA,
   "parsed-corpus=s"   => \$PARSED_CORPUS,
-  "tune=s"          => \$TUNE,
+  "tune=s"          => \@TUNE,
   "test=s"            => \$TEST,
   "prepare!"          => \$DO_PREPARE_CORPORA,
   "aligner=s"         => \$ALIGNER,
@@ -434,9 +434,9 @@ if (@CORPORA == 0 and $STEPS{$FIRST_STEP} < $STEPS{TUNE}) {
 }
 
 # make sure a tuning corpus was provided if we're doing tuning
-if (! defined $TUNE and ($STEPS{$FIRST_STEP} <= $STEPS{TUNE}
+if (scalar(@TUNE) == 0 and ($STEPS{$FIRST_STEP} <= $STEPS{TUNE}
                          and $STEPS{$LAST_STEP} >= $STEPS{TUNE})) { 
-  print "* FATAL: need a tuning set (--tune)\n";
+  print "* FATAL: need at least one tuning set (--tune)\n";
   exit 1;
 }
 
@@ -504,7 +504,9 @@ map {
 } (0..$#CORPORA);
 
 # Do the same for tuning and test data, and other files
-$TUNE = get_absolute_path($TUNE);
+map {
+  $TUNE[$_] = get_absolute_path($TUNE[$_]);
+} (0..$#TUNE);
 $TEST = get_absolute_path($TEST);
 
 $GRAMMAR_FILE = get_absolute_path($GRAMMAR_FILE);
@@ -609,9 +611,9 @@ if (defined $PARSED_CORPUS) {
   $TRAIN{parsed} = get_absolute_path($PARSED_CORPUS);
 }
 
-if ($TUNE) {
-  $TUNE{source} = "$TUNE.$SOURCE";
-  $TUNE{target} = "$TUNE.$TARGET";
+if (scalar(@TUNE) > 0) {
+  $TUNE{source} = "$TUNE[0].$SOURCE";
+  $TUNE{target} = "$TUNE[0].$TARGET";
 
   if (! -e "$TUNE{source}") {
     print "* FATAL: couldn't find tune source file at '$TUNE{source}'\n";
@@ -675,8 +677,8 @@ if (@CORPORA > 0) {
 }
 
 # prepare the tuning and development data
-if (defined $TUNE) {
-  my $prefixes = prepare_data("tune",[$TUNE],$MAXLEN_TUNE);
+if (@TUNE > 0) {
+  my $prefixes = prepare_data("tune",\@TUNE,$MAXLEN_TUNE);
   $TUNE{source} = "$DATA_DIRS{tune}/corpus.$SOURCE";
   $TUNE{target} = "$DATA_DIRS{tune}/corpus.$TARGET";
   my $ner_return = ner_annotate("$TUNE{source}", "$TUNE{source}.ner", $SOURCE);
@@ -1180,7 +1182,7 @@ TUNE:
 
 # prep the tuning data, unless already prepped
 if (! $PREPPED{TUNE}) {
-  my $prefixes = prepare_data("tune",[$TUNE],$MAXLEN_TUNE);
+  my $prefixes = prepare_data("tune",\@TUNE,$MAXLEN_TUNE);
   $TUNE{source} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$SOURCE";
   $TUNE{target} = "$DATA_DIRS{tune}/$prefixes->{lowercased}.$TARGET";
   $PREPPED{TUNE} = 1;
@@ -1509,7 +1511,7 @@ if (defined $TUNE_GRAMMAR) {
   } elsif (-e "tune/model/$basename.packed") {
     $TUNE_GRAMMAR = "tune/model/$basename.packed";
   } else {
-    print STDERR "* FATAL: tune model bundling didn't produce a grammar?";
+    print STDERR "* FATAL: tune model bundling didn't produce a grammar?\n";
     exit 1;
   }
 }


[2/2] incubator-joshua git commit: added whole-grammar packing

Posted by mj...@apache.org.
added whole-grammar packing


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/3171b6a8
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/3171b6a8
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/3171b6a8

Branch: refs/heads/master
Commit: 3171b6a8ca1d68c90f22b4c7d89d0e8d74fdba1b
Parents: cd68a9e
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Jun 8 11:30:25 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Jun 8 11:30:25 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 11 +++++++++++
 1 file changed, 11 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/3171b6a8/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 38cb379..7ecfed9 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1172,6 +1172,17 @@ if (! defined $GRAMMAR_FILE) {
   }
 }
 
+# Pack the entire model! Saves filtering and repacking of tuning and test sets
+if ($DO_PACK_GRAMMARS and ! $DO_FILTER_TM) {
+  $cachepipe->cmd("pack-grammar",
+                  "$SCRIPTDIR/support/grammar-packer.pl -a -T $TMPDIR -m $PACKER_MEM -g $GRAMMAR_FILE -o $RUNDIR/grammar.packed",
+                  "$RUNDIR/grammar.packed/vocabulary",
+                  "$RUNDIR/grammar.packed/encoding",
+                  "$RUNDIR/grammar.packed/slice_00000.source");
+  $GRAMMAR_FILE = "$RUNDIR/grammar.packed";
+}
+
+
 maybe_quit("THRAX");
 maybe_quit("GRAMMAR");
 maybe_quit("MODEL");