You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/07/25 14:25:35 UTC
[6/6] incubator-joshua git commit: Reverting commit e957a126f0bacd6c5aa1f8aa8513fc0f252403a8
Reverting commit e957a126f0bacd6c5aa1f8aa8513fc0f252403a8
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/80c17ef6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/80c17ef6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/80c17ef6
Branch: refs/heads/master
Commit: 80c17ef6a35f1e707cc224051389e680e129f859
Parents: b2b6480
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Jul 22 19:36:19 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Jul 22 19:36:19 2016 -0400
----------------------------------------------------------------------
scripts/training/pipeline.pl | 17 +++++++++++------
1 file changed, 11 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/80c17ef6/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 68b18f6..ea617bc 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1837,9 +1837,16 @@ sub prepare_data {
my $infiles = join(" ", @infiles);
my $outfiles = join(" ", @outfiles);
- $cachepipe->cmd("$label-copy-and-filter",
- "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
- @indeps, @outfiles);
+ # only skip blank lines for training data
+ if ($label ne "test") {
+ $cachepipe->cmd("$label-copy-and-filter",
+ "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/support/split2files $outfiles",
+ @indeps, @outfiles);
+ } else {
+ $cachepipe->cmd("$label-copy-and-filter",
+ "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
+ @indeps, @outfiles);
+ }
# Done concatenating and filtering files
@@ -1857,11 +1864,9 @@ sub prepare_data {
} else {
my $TOKENIZER = ($lang eq $SOURCE) ? $TOKENIZER_SOURCE : $TOKENIZER_TARGET;
- # Normalization can delete lines, so they might need to be filtered out
- my $maybe_filter = ($label eq "test") ? "" : "| grep -v ^\$";
my $ext = $lang; $ext =~ s/\.\d//;
$cachepipe->cmd("$label-tokenize-$lang",
- "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null $maybe_filter > $DATA_DIRS{$label}/$prefix.tok.$lang",
+ "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null > $DATA_DIRS{$label}/$prefix.tok.$lang",
"$DATA_DIRS{$label}/$prefix.$lang", "$DATA_DIRS{$label}/$prefix.tok.$lang");
}