You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/18 13:13:38 UTC
incubator-joshua git commit: combined split2files implementations

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 09fb6a2d3 -> f02bd279e


combined split2files implementations


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/f02bd279
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/f02bd279
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/f02bd279

Branch: refs/heads/master
Commit: f02bd279e892408c9eca2a2a241f21f59cb105e9
Parents: 09fb6a2
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 18 09:12:07 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 18 09:12:07 2016 -0400

----------------------------------------------------------------------
 scripts/support/split2files              | 44 +++++++++++++++++++++++++++
 scripts/support/splittabs.pl             | 42 -------------------------
 scripts/training/pipeline.pl             |  8 ++---
 scripts/training/split2files.pl          | 38 -----------------------
 scripts/training/trim_parallel_corpus.pl |  2 +-
 5 files changed, 49 insertions(+), 85 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/support/split2files
----------------------------------------------------------------------
diff --git a/scripts/support/split2files b/scripts/support/split2files
new file mode 100755
index 0000000..866ab0e
--- /dev/null
+++ b/scripts/support/split2files
@@ -0,0 +1,68 @@
+#!/usr/bin/perl
+
+# Reads any number of file names from the command line, then split()s
+# each line of STDIN on tabs and writes field N to file N. Example usage:
+#
+# paste file1 file2 file3 ... | split2files file1.new file2.new file3.new.gz ...
+#
+# A file name of "-" writes that field to STDOUT. A file name ending in ".gz"
+# is compressed by piping it through gzip.
+#
+# If there are more fields on STDIN than files on the command line, the extra
+# fields are silently discarded. If a line has fewer fields than there are
+# files, an empty line is written for each missing field so that all output
+# files keep the same number of lines.
+#
+# A common usage scenario is to paste together parallel lines and do some filtering,
+# then write out to a new set of files (thus retaining parallelization).
+
+use strict;
+use warnings;
+use FileHandle;
+
+$| = 1;   # don't buffer STDOUT (used when a file name is "-")
+
+if (@ARGV == 0) {
+  print STDERR "Usage: cat tabbed-file | split2files file1 [file2 [file3 ...]]\n";
+  exit 1;
+}
+
+my @files = @ARGV;
+my @fh = map { get_filehandle($_) } @files;
+@ARGV = ();   # make <> below read STDIN instead of the output files
+
+while (my $line = <>) {
+  chomp($line);
+  my @fields = split(/\t/, $line);
+
+  for my $i (0..$#fh) {
+    # A short line (or split() dropping trailing empty fields) leaves this undefined.
+    my $field = defined $fields[$i] ? $fields[$i] : "";
+    print {$fh[$i]} "$field\n" or die "can't write to '$files[$i]': $!";
+  }
+}
+
+# Close explicitly: buffered write errors and gzip failures only surface here.
+for my $i (0..$#fh) {
+  next if $files[$i] eq "-";   # leave STDOUT to be flushed at exit
+  close($fh[$i]) or die "error closing '$files[$i]': $! (exit status $?)";
+}
+
+# Opens $file for writing and returns the handle. "-" yields STDOUT; a name
+# ending in ".gz" is written through a gzip pipe. Dies if the open fails.
+sub get_filehandle {
+  my $file = shift;
+
+  return *STDOUT if $file eq "-";
+
+  my $fh;
+  if ($file =~ /\.gz$/) {
+    # Single-quote the name so the shell passes it to gzip's redirect verbatim.
+    (my $quoted = $file) =~ s/'/'\\''/g;
+    open($fh, "|-", "gzip -9n > '$quoted'")
+      or die "can't open compressed file '$file' for writing: $!";
+  } else {
+    open($fh, ">", $file) or die "can't open '$file' for writing: $!";
+  }
+  return $fh;
+}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/support/splittabs.pl
----------------------------------------------------------------------
diff --git a/scripts/support/splittabs.pl b/scripts/support/splittabs.pl
deleted file mode 100755
index db5d5d1..0000000
--- a/scripts/support/splittabs.pl
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/perl
-
-# Reads a stream from STDIN, splits them on tabs, and writes the fields to each of a series of
-# filenames, respectively, passed as arguments to the script.  If a filename ends in .gz, output
-# will be compressed using gzip.
-
-use strict;
-use warnings;
-use FileHandle;
-
-$| = 1;   # don't buffer output
-
-if (@ARGV <= 0) {
-  print "Usage: cat tabbed-file | splittabs.pl file1 [file2 [file3 ...]]\n";
-  exit;
-}
-
-my @fh = map { get_filehandle($_) } @ARGV;
-@ARGV = ();
-
-while (my $line = <>) {
-  chomp($line);
-  my (@fields) = split(/\t/,$line,scalar @fh);
-  
-  map { print {$fh[$_]} "$fields[$_]\n" } (0..$#fields);
-}
-
-sub get_filehandle {
-  my $file = shift;
-
-  if ($file eq "-") {
-    return *STDOUT;
-  } elsif ($file =~ /.gz$/) {
-    local *FH;
-    open FH, "| gzip -9n > $file" or die "can't open compressed file '$file' for writing";
-    return *FH;
-  } else {
-    local *FH;
-    open FH, ">$file" or die "can't open file '$file' for writing";
-    return *FH;
-  }
-}

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index deb6ebc..ff9d762 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1031,7 +1031,7 @@ if (! defined $GRAMMAR_FILE) {
   if ($GRAMMAR_TYPE eq "ghkm") {
     if ($GHKM_EXTRACTOR eq "galley") {
       $cachepipe->cmd("ghkm-extract",
-                      "java -Xmx4g -Xms4g -cp $JOSHUA/lib/ghkm-modified.jar:$JOSHUA/lib/fastutil.jar -XX:+UseCompressedOops edu.stanford.nlp.mt.syntax.ghkm.RuleExtractor -fCorpus $TRAIN{source} -eParsedCorpus $target_file -align $ALIGNMENT -threads $NUM_THREADS -joshuaFormat true -maxCompositions 1 -reversedAlignment false | $SCRIPTDIR/support/splittabs.pl ghkm-mapping.gz grammar.gz",
+                      "java -Xmx4g -Xms4g -cp $JOSHUA/lib/ghkm-modified.jar:$JOSHUA/lib/fastutil.jar -XX:+UseCompressedOops edu.stanford.nlp.mt.syntax.ghkm.RuleExtractor -fCorpus $TRAIN{source} -eParsedCorpus $target_file -align $ALIGNMENT -threads $NUM_THREADS -joshuaFormat true -maxCompositions 1 -reversedAlignment false | $SCRIPTDIR/support/split2files ghkm-mapping.gz grammar.gz",
                       $ALIGNMENT,
                       "grammar.gz");
     } elsif ($GHKM_EXTRACTOR eq "moses") {
@@ -1834,11 +1834,11 @@ sub prepare_data {
   # only skip blank lines for training data
   if ($label eq "train") {
     $cachepipe->cmd("$label-copy-and-filter",
-                    "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/training/split2files.pl $outfiles",
+                    "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/support/split2files $outfiles",
                     @indeps, @outfiles);
   } else {
     $cachepipe->cmd("$label-copy-and-filter",
-                    "$PASTE $infiles | $SCRIPTDIR/training/split2files.pl $outfiles",
+                    "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
                     @indeps, @outfiles);
   }
   # Done concatenating and filtering files
@@ -1884,7 +1884,7 @@ sub prepare_data {
 
       # trim training data
       $cachepipe->cmd("$label-trim",
-                      "$PASTE $infilelist | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/training/split2files.pl $outfilelist",
+                      "$PASTE $infilelist | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/support/split2files $outfilelist",
                       @infiles,
                       @outfiles);
       $prefix .= ".$maxlen";

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/training/split2files.pl
----------------------------------------------------------------------
diff --git a/scripts/training/split2files.pl b/scripts/training/split2files.pl
deleted file mode 100755
index e44c913..0000000
--- a/scripts/training/split2files.pl
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/perl
-
-# Reads any number of file names from the command line, then split()s
-# STDIN on tabs and writes them to those files. Example usage:
-#
-# paste file1 file2 file3 ... | split2files.pl file1.new file2.new file3.new.gz ...
-#
-# A common usage scenario is to paste together parallel lines and do some filtering,
-# then write out to a new set of files (thus retaining parallelization).
-
-use strict;
-use warnings;
-
-my @files = @ARGV;
-@ARGV = ();
-
-my @fh;
-foreach my $file (@files) {
-  my $fh;
-  if ($file =~ /gz$/) {
-	open $fh, "|gzip -9 > $file" or die "can't pipe through gzip";
-  } else {
-	open $fh, ">", $file or die "can't write to file '$file'";
-  }
-  push(@fh, $fh);
-}
-
-while (my $line = <>) {
-  chomp($line);
-
-  my @lines = split(/\t/, $line, scalar @files);
-
-  map {
-  	print { $fh[$_] } "$lines[$_]\n";
-  } (0..$#fh);
-}
-
-map { close($_) } @fh;

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/training/trim_parallel_corpus.pl
----------------------------------------------------------------------
diff --git a/scripts/training/trim_parallel_corpus.pl b/scripts/training/trim_parallel_corpus.pl
index 2248c0a..39b635c 100755
--- a/scripts/training/trim_parallel_corpus.pl
+++ b/scripts/training/trim_parallel_corpus.pl
@@ -5,7 +5,7 @@
 # the first two fields has mroe than N tokens, the line is skipped.
 
 # e.g.,
-# paste corpus.en corpus.fr | trim_parallel_corpus.pl 40 | split2files.pl en.trimmed.40 fr.trimmed.40
+# paste corpus.en corpus.fr | trim_parallel_corpus.pl 40 | split2files en.trimmed.40 fr.trimmed.40
 
 my $thresh = shift || 100;