You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/07/25 14:25:33 UTC
[4/6] incubator-joshua git commit: bugfix: move empty line filtering to after normalization

bugfix: move empty line filtering to after normalization

Normalization recently began deleting certain non-printing UTF-8
characters, which means it can now produce blank lines. Blank lines
must therefore be removed after normalization instead of before (for
both train and dev, since Moses kbmira croaks on blank outputs).


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/e957a126
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/e957a126
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/e957a126

Branch: refs/heads/master
Commit: e957a126f0bacd6c5aa1f8aa8513fc0f252403a8
Parents: ce27c2d
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Jul 15 11:03:11 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Jul 15 11:03:11 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e957a126/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index bd90707..68b18f6 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1837,16 +1837,10 @@ sub prepare_data {
 
   my $infiles =  join(" ", @infiles);
   my $outfiles = join(" ", @outfiles);
-  # only skip blank lines for training data
-  if ($label ne "test") {
-    $cachepipe->cmd("$label-copy-and-filter",
-                    "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/support/split2files $outfiles",
-                    @indeps, @outfiles);
-  } else {
-    $cachepipe->cmd("$label-copy-and-filter",
-                    "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
-                    @indeps, @outfiles);
-  }
+  $cachepipe->cmd("$label-copy-and-filter",
+                  "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
+                  @indeps, @outfiles);
+
   # Done concatenating and filtering files
 
   # record where the concatenated input files were
@@ -1862,9 +1856,12 @@ sub prepare_data {
           system("cp $DATA_DIRS{$label}/$prefix.$lang $DATA_DIRS{$label}/$prefix.tok.$lang");
         } else {
           my $TOKENIZER = ($lang eq $SOURCE) ? $TOKENIZER_SOURCE : $TOKENIZER_TARGET;
+
+          # Normalization can leave lines empty, so blank lines may need to be filtered out
+          my $maybe_filter = ($label eq "test") ? "" : "| grep -v ^\$";
           my $ext = $lang; $ext =~ s/\.\d//;
           $cachepipe->cmd("$label-tokenize-$lang",
-                          "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null > $DATA_DIRS{$label}/$prefix.tok.$lang",
+                          "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null $maybe_filter > $DATA_DIRS{$label}/$prefix.tok.$lang",
                           "$DATA_DIRS{$label}/$prefix.$lang", "$DATA_DIRS{$label}/$prefix.tok.$lang");
         }