You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/07/25 14:25:35 UTC
[6/6] incubator-joshua git commit: Reverting commit e957a126f0bacd6c5aa1f8aa8513fc0f252403a8

Reverting commit e957a126f0bacd6c5aa1f8aa8513fc0f252403a8


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/80c17ef6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/80c17ef6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/80c17ef6

Branch: refs/heads/master
Commit: 80c17ef6a35f1e707cc224051389e680e129f859
Parents: b2b6480
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Jul 22 19:36:19 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Jul 22 19:36:19 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/80c17ef6/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 68b18f6..ea617bc 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1837,9 +1837,16 @@ sub prepare_data {
 
   my $infiles =  join(" ", @infiles);
   my $outfiles = join(" ", @outfiles);
-  $cachepipe->cmd("$label-copy-and-filter",
-                  "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
-                  @indeps, @outfiles);
+  # only skip blank lines for training data
+  if ($label ne "test") {
+    $cachepipe->cmd("$label-copy-and-filter",
+                    "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/support/split2files $outfiles",
+                    @indeps, @outfiles);
+  } else {
+    $cachepipe->cmd("$label-copy-and-filter",
+                    "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
+                    @indeps, @outfiles);
+  }
 
   # Done concatenating and filtering files
 
@@ -1857,11 +1864,9 @@ sub prepare_data {
         } else {
           my $TOKENIZER = ($lang eq $SOURCE) ? $TOKENIZER_SOURCE : $TOKENIZER_TARGET;
 
-          # Normalization can delete lines, so they might need to be filtered out
-          my $maybe_filter = ($label eq "test") ? "" : "| grep -v ^\$";
           my $ext = $lang; $ext =~ s/\.\d//;
           $cachepipe->cmd("$label-tokenize-$lang",
-                          "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null $maybe_filter > $DATA_DIRS{$label}/$prefix.tok.$lang",
+                          "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null > $DATA_DIRS{$label}/$prefix.tok.$lang",
                           "$DATA_DIRS{$label}/$prefix.$lang", "$DATA_DIRS{$label}/$prefix.tok.$lang");
         }