You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/18 13:13:38 UTC
incubator-joshua git commit: combined split2files implementations
Repository: incubator-joshua
Updated Branches:
refs/heads/master 09fb6a2d3 -> f02bd279e
combined split2files implementations
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/f02bd279
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/f02bd279
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/f02bd279
Branch: refs/heads/master
Commit: f02bd279e892408c9eca2a2a241f21f59cb105e9
Parents: 09fb6a2
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 18 09:12:07 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 18 09:12:07 2016 -0400
----------------------------------------------------------------------
scripts/support/split2files | 44 +++++++++++++++++++++++++++
scripts/support/splittabs.pl | 42 -------------------------
scripts/training/pipeline.pl | 8 ++---
scripts/training/split2files.pl | 38 -----------------------
scripts/training/trim_parallel_corpus.pl | 2 +-
5 files changed, 49 insertions(+), 85 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/support/split2files
----------------------------------------------------------------------
diff --git a/scripts/support/split2files b/scripts/support/split2files
new file mode 100755
index 0000000..866ab0e
--- /dev/null
+++ b/scripts/support/split2files
@@ -0,0 +1,44 @@
+#!/usr/bin/perl
+
+# Reads any number of file names from the command line, then split()s
+# STDIN on tabs and writes them to those files. Example usage:
+#
+# paste file1 file2 file3 ... | split2files file1.new file2.new file3.new.gz ...
+#
+# If there are more fields on STDIN than files on the command-line, the extra
+# fields are silently discarded.
+#
+# A common usage scenario is to paste together parallel lines and do some filtering,
+# then write out to a new set of files (thus retaining parallelization).
+
+use FileHandle;
+
+use strict;
+use warnings;
+
+# Autoflush the currently selected handle (STDOUT). This only matters when "-"
+# is given as one of the output files.
+$| = 1;
+
+# At least one output file is required; without any, every field read from
+# STDIN would be silently thrown away.
+if (@ARGV == 0) {
+    print STDERR "Usage: cat tabbed-file | split2files file1 [file2 [file3 ...]]\n";
+    exit 1;
+}
+
+# Remember the output names (for error messages) and open one handle per
+# name, in command-line order.
+my @files = @ARGV;
+my @fh    = map { get_filehandle($_) } @files;
+
+# Empty @ARGV so that the <> operator below reads STDIN rather than trying to
+# read the output files.
+@ARGV = ();
+
+while (my $line = <>) {
+    chomp($line);
+
+    # A limit of -1 keeps trailing empty fields, so "a\t" really yields two
+    # fields. Fields beyond the number of output files are ignored.
+    my @fields = split(/\t/, $line, -1);
+
+    for my $i (0 .. $#fh) {
+        # A line with fewer fields than files produces an empty line in the
+        # remaining files, which keeps all outputs the same length.
+        my $field = defined $fields[$i] ? $fields[$i] : '';
+        print { $fh[$i] } "$field\n";
+    }
+}
+
+# Close every output explicitly: buffered write errors (and, for piped
+# handles, a failing child process) are only reported by close().
+for my $i (0 .. $#fh) {
+    next if $files[$i] eq "-";    # leave STDOUT to be flushed at exit
+    close($fh[$i])
+        or die "error closing '$files[$i]' (status $?): $!\n";
+}
+
+# Opens one output destination and returns a handle that can be used as
+# print {$handle} ...
+#
+#   "-"            -> STDOUT
+#   name ending in .gz -> the data is piped through "gzip -9n" into that file
+#   anything else  -> the file is created/truncated and written directly
+#
+# Dies with the system error message if the file or the gzip pipe cannot be
+# opened.
+sub get_filehandle {
+    my $file = shift;
+
+    if ($file eq "-") {
+        return *STDOUT;
+    }
+
+    my $handle;
+    if ($file =~ /\.gz\z/) {
+        # List-form pipe open: the file name is handed to sh as the
+        # positional parameter $1 and never interpolated into the command
+        # string, so spaces or shell metacharacters in the name are harmless.
+        open($handle, '|-', 'sh', '-c', 'exec gzip -9n > "$1"', 'sh', $file)
+            or die "can't start gzip to write '$file': $!";
+    } else {
+        # Three-argument open so the name is never parsed for a mode prefix.
+        open($handle, '>', $file)
+            or die "can't open '$file' for writing: $!";
+    }
+
+    return $handle;
+}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/support/splittabs.pl
----------------------------------------------------------------------
diff --git a/scripts/support/splittabs.pl b/scripts/support/splittabs.pl
deleted file mode 100755
index db5d5d1..0000000
--- a/scripts/support/splittabs.pl
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/usr/bin/perl
-
-# Reads a stream from STDIN, splits them on tabs, and writes the fields to each of a series of
-# filenames, respectively, passed as arguments to the script. If a filename ends in .gz, output
-# will be compressed using gzip.
-
-use strict;
-use warnings;
-use FileHandle;
-
-$| = 1; # don't buffer output
-
-if (@ARGV <= 0) {
- print "Usage: cat tabbed-file | splittabs.pl file1 [file2 [file3 ...]]\n";
- exit;
-}
-
-my @fh = map { get_filehandle($_) } @ARGV;
-@ARGV = ();
-
-while (my $line = <>) {
- chomp($line);
- my (@fields) = split(/\t/,$line,scalar @fh);
-
- map { print {$fh[$_]} "$fields[$_]\n" } (0..$#fields);
-}
-
-sub get_filehandle {
- my $file = shift;
-
- if ($file eq "-") {
- return *STDOUT;
- } elsif ($file =~ /.gz$/) {
- local *FH;
- open FH, "| gzip -9n > $file" or die "can't open compressed file '$file' for writing";
- return *FH;
- } else {
- local *FH;
- open FH, ">$file" or die "can't open file '$file' for writing";
- return *FH;
- }
-}
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index deb6ebc..ff9d762 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1031,7 +1031,7 @@ if (! defined $GRAMMAR_FILE) {
if ($GRAMMAR_TYPE eq "ghkm") {
if ($GHKM_EXTRACTOR eq "galley") {
$cachepipe->cmd("ghkm-extract",
- "java -Xmx4g -Xms4g -cp $JOSHUA/lib/ghkm-modified.jar:$JOSHUA/lib/fastutil.jar -XX:+UseCompressedOops edu.stanford.nlp.mt.syntax.ghkm.RuleExtractor -fCorpus $TRAIN{source} -eParsedCorpus $target_file -align $ALIGNMENT -threads $NUM_THREADS -joshuaFormat true -maxCompositions 1 -reversedAlignment false | $SCRIPTDIR/support/splittabs.pl ghkm-mapping.gz grammar.gz",
+ "java -Xmx4g -Xms4g -cp $JOSHUA/lib/ghkm-modified.jar:$JOSHUA/lib/fastutil.jar -XX:+UseCompressedOops edu.stanford.nlp.mt.syntax.ghkm.RuleExtractor -fCorpus $TRAIN{source} -eParsedCorpus $target_file -align $ALIGNMENT -threads $NUM_THREADS -joshuaFormat true -maxCompositions 1 -reversedAlignment false | $SCRIPTDIR/support/split2files ghkm-mapping.gz grammar.gz",
$ALIGNMENT,
"grammar.gz");
} elsif ($GHKM_EXTRACTOR eq "moses") {
@@ -1834,11 +1834,11 @@ sub prepare_data {
# only skip blank lines for training data
if ($label eq "train") {
$cachepipe->cmd("$label-copy-and-filter",
- "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/training/split2files.pl $outfiles",
+ "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/support/split2files $outfiles",
@indeps, @outfiles);
} else {
$cachepipe->cmd("$label-copy-and-filter",
- "$PASTE $infiles | $SCRIPTDIR/training/split2files.pl $outfiles",
+ "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
@indeps, @outfiles);
}
# Done concatenating and filtering files
@@ -1884,7 +1884,7 @@ sub prepare_data {
# trim training data
$cachepipe->cmd("$label-trim",
- "$PASTE $infilelist | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/training/split2files.pl $outfilelist",
+ "$PASTE $infilelist | $SCRIPTDIR/training/trim_parallel_corpus.pl $maxlen | $SCRIPTDIR/support/split2files $outfilelist",
@infiles,
@outfiles);
$prefix .= ".$maxlen";
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/training/split2files.pl
----------------------------------------------------------------------
diff --git a/scripts/training/split2files.pl b/scripts/training/split2files.pl
deleted file mode 100755
index e44c913..0000000
--- a/scripts/training/split2files.pl
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/perl
-
-# Reads any number of file names from the command line, then split()s
-# STDIN on tabs and writes them to those files. Example usage:
-#
-# paste file1 file2 file3 ... | split2files.pl file1.new file2.new file3.new.gz ...
-#
-# A common usage scenario is to paste together parallel lines and do some filtering,
-# then write out to a new set of files (thus retaining parallelization).
-
-use strict;
-use warnings;
-
-my @files = @ARGV;
-@ARGV = ();
-
-my @fh;
-foreach my $file (@files) {
- my $fh;
- if ($file =~ /gz$/) {
- open $fh, "|gzip -9 > $file" or die "can't pipe through gzip";
- } else {
- open $fh, ">", $file or die "can't write to file '$file'";
- }
- push(@fh, $fh);
-}
-
-while (my $line = <>) {
- chomp($line);
-
- my @lines = split(/\t/, $line, scalar @files);
-
- map {
- print { $fh[$_] } "$lines[$_]\n";
- } (0..$#fh);
-}
-
-map { close($_) } @fh;
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/f02bd279/scripts/training/trim_parallel_corpus.pl
----------------------------------------------------------------------
diff --git a/scripts/training/trim_parallel_corpus.pl b/scripts/training/trim_parallel_corpus.pl
index 2248c0a..39b635c 100755
--- a/scripts/training/trim_parallel_corpus.pl
+++ b/scripts/training/trim_parallel_corpus.pl
@@ -5,7 +5,7 @@
# the first two fields has mroe than N tokens, the line is skipped.
# e.g.,
-# paste corpus.en corpus.fr | trim_parallel_corpus.pl 40 | split2files.pl en.trimmed.40 fr.trimmed.40
+# paste corpus.en corpus.fr | trim_parallel_corpus.pl 40 | split2files en.trimmed.40 fr.trimmed.40
my $thresh = shift || 100;