You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/09 04:05:49 UTC
incubator-joshua git commit: added --maxlines to pipeline script

Repository: incubator-joshua
Updated Branches:
  refs/heads/master aa25ab975 -> 16b1c8e6a


added --maxlines to pipeline script


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/16b1c8e6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/16b1c8e6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/16b1c8e6

Branch: refs/heads/master
Commit: 16b1c8e6af09cd4f142ac12473430e5bbd6e1b8b
Parents: aa25ab9
Author: Matt Post <po...@cs.jhu.edu>
Authored: Mon May 9 00:05:41 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Mon May 9 00:05:41 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/16b1c8e6/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 373145f..c33d54b 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -63,6 +63,9 @@ my $MAXSPAN = 20;
 my $MAXLEN_TUNE = 0;
 my $MAXLEN_TEST = 0;
 
+# Maximum number of lines to take from any single corpus file (0 means no limit)
+my $MAXLINES = 0;
+
 # when doing phrase-based decoding, the maximum length of a phrase (source side)
 my $MAX_PHRASE_LEN = 5;
 
@@ -261,6 +264,7 @@ my $retval = GetOptions(
   "maxlen=i"        => \$MAXLEN,
   "maxlen-tune=i"        => \$MAXLEN_TUNE,
   "maxlen-test=i"        => \$MAXLEN_TEST,
+  "maxlines=i"        => \$MAXLINES,
   "tokenizer-source=s"      => \$TOKENIZER_SOURCE,
   "tokenizer-target=s"      => \$TOKENIZER_TARGET,
   "normalizer=s"      => \$NORMALIZER,
@@ -1831,10 +1835,10 @@ sub prepare_data {
   foreach my $ext (@exts) {
     my @files =  map { "$_.$ext" } @$corpora;
     push(@indeps, @files);
-    if (@files > 1) {
-      push(@infiles, "<(cat " . join(" ", @files) . ")");
+    if ($MAXLINES != 0) {
+      push(@infiles, "<(head -n $MAXLINES " . join(" ", @files) . ")");
     } else {
-      push(@infiles, $files[0]);
+      push(@infiles, "<(cat " . join(" ", @files) . ")");
     }
     push (@outfiles, "$DATA_DIRS{$label}/$label.$ext");
   }