You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/09 04:05:49 UTC
incubator-joshua git commit: added --maxlines to pipeline script
Repository: incubator-joshua
Updated Branches:
refs/heads/master aa25ab975 -> 16b1c8e6a
added --maxlines to pipeline script
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/16b1c8e6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/16b1c8e6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/16b1c8e6
Branch: refs/heads/master
Commit: 16b1c8e6af09cd4f142ac12473430e5bbd6e1b8b
Parents: aa25ab9
Author: Matt Post <po...@cs.jhu.edu>
Authored: Mon May 9 00:05:41 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Mon May 9 00:05:41 2016 -0400
----------------------------------------------------------------------
scripts/training/pipeline.pl | 10 +++++++---
1 file changed, 7 insertions(+), 3 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/16b1c8e6/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 373145f..c33d54b 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -63,6 +63,9 @@ my $MAXSPAN = 20;
my $MAXLEN_TUNE = 0;
my $MAXLEN_TEST = 0;
+# Maximum number of lines from any single corpus (0, the default, disables the limit)
+my $MAXLINES = 0;
+
# when doing phrase-based decoding, the maximum length of a phrase (source side)
my $MAX_PHRASE_LEN = 5;
@@ -261,6 +264,7 @@ my $retval = GetOptions(
"maxlen=i" => \$MAXLEN,
"maxlen-tune=i" => \$MAXLEN_TUNE,
"maxlen-test=i" => \$MAXLEN_TEST,
+ "maxlines=i" => \$MAXLINES,
"tokenizer-source=s" => \$TOKENIZER_SOURCE,
"tokenizer-target=s" => \$TOKENIZER_TARGET,
"normalizer=s" => \$NORMALIZER,
@@ -1831,10 +1835,10 @@ sub prepare_data {
foreach my $ext (@exts) {
my @files = map { "$_.$ext" } @$corpora;
push(@indeps, @files);
- if (@files > 1) {
- push(@infiles, "<(cat " . join(" ", @files) . ")");
+ if ($MAXLINES != 0) {
+ push(@infiles, "<(head -n $MAXLINES " . join(" ", @files) . ")");
} else {
- push(@infiles, $files[0]);
+ push(@infiles, "<(cat " . join(" ", @files) . ")");
}
push (@outfiles, "$DATA_DIRS{$label}/$label.$ext");
}