You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/08/29 17:42:40 UTC

[04/10] incubator-joshua git commit: Updated format of corpus splits to get Berkeley Aligner working again

Updated format of corpus splits to get Berkeley Aligner working again

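I'm not sure why, but the Berkeley Aligner broke. It seems that the jar
file that used to be included with Joshua was an old version, despite the
fact that the Berkeley Aligner itself hasn't been updated for almost
a decade. This change makes some minor adjustments that get it working
again: each corpus split now lives in its own directory
(splits/<chunk>/corpus.<lang>), and the Berkeley alignment output is read
from training.<target>-<source>.align.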


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/38eebb3b
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/38eebb3b
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/38eebb3b

Branch: refs/heads/master
Commit: 38eebb3b58375d0da584f470de66df68476ab938
Parents: fb5d35d
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Aug 24 16:16:29 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Aug 24 16:16:29 2016 -0400

----------------------------------------------------------------------
 scripts/training/paralign.pl                       | 12 ++++++------
 scripts/training/pipeline.pl                       | 17 +++++++----------
 .../training/templates/alignment/word-align.conf   |  2 +-
 3 files changed, 14 insertions(+), 17 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/38eebb3b/scripts/training/paralign.pl
----------------------------------------------------------------------
diff --git a/scripts/training/paralign.pl b/scripts/training/paralign.pl
index 2f04fc1..8b0fd28 100755
--- a/scripts/training/paralign.pl
+++ b/scripts/training/paralign.pl
@@ -54,9 +54,9 @@ sub run_giza {
   my ($chunkdir,$chunkno,$do_parallel) = @_;
   my $parallel = ($do_parallel == 1) ? "-parallel" : "";
   $cachepipe->cmd("giza-$chunkno",
-                  "rm -f $chunkdir/corpus.0-0.*; $args{giza_trainer} --root-dir $chunkdir -e $args{target}.$chunkno -f $args{source}.$chunkno -corpus $args{train_dir}/splits/corpus -merge $args{giza_merge} $parallel > $chunkdir/giza.log 2>&1",
-                  "$args{train_dir}/splits/corpus.$args{source}.$chunkno",
-                  "$args{train_dir}/splits/corpus.$args{target}.$chunkno",
+                  "rm -f $chunkdir/corpus.0-0.*; $args{giza_trainer} --root-dir $chunkdir -e $args{target} -f $args{source} -corpus $args{train_dir}/splits/$chunkno/corpus -merge $args{giza_merge} $parallel > $chunkdir/giza.log 2>&1",
+                  "$args{train_dir}/splits/$chunkno/corpus.$args{source}",
+                  "$args{train_dir}/splits/$chunkno/corpus.$args{target}",
                   "$chunkdir/model/aligned.$args{giza_merge}");
 }
 
@@ -67,8 +67,8 @@ sub run_berkeley_aligner {
   open FROM, $aligner_conf or die "can't read berkeley alignment template";
   open TO, ">", "alignments/$chunkno/word-align.conf" or die "can't write to 'alignments/$chunkno/word-align.conf'";
   while (<FROM>) {
-    s/<SOURCE>/$args{source}.$chunkno/g;
-    s/<TARGET>/$args{target}.$chunkno/g;
+    s/<SOURCE>/$args{source}/g;
+    s/<TARGET>/$args{target}/g;
     s/<CHUNK>/$chunkno/g;
     s/<TRAIN_DIR>/$args{train_dir}/g;
     print TO;
@@ -91,5 +91,5 @@ sub run_jacana_aligner {
 
   # run the job
   $cachepipe->cmd("jacana-aligner-chunk-$chunkno",
-                  "java -d64 -Xmx$args{aligner_mem} -DJACANA_HOME=$jacana_home -jar $JOSHUA/lib/jacana-xy.jar -m $jacana_home/resources/model/fr-en.model -src fr -tgt en -a $args{train_dir}/splits/corpus.$args{source}.$chunkno -b $args{train_dir}/splits/corpus.$args{target}.$chunkno -o $chunkdir/training.align");
+                  "java -d64 -Xmx$args{aligner_mem} -DJACANA_HOME=$jacana_home -jar $JOSHUA/lib/jacana-xy.jar -m $jacana_home/resources/model/fr-en.model -src fr -tgt en -a $args{train_dir}/splits/$chunkno/corpus.$args{source} -b $args{train_dir}/splits/$chunkno/corpus.$args{target} -o $chunkdir/training.align");
 }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/38eebb3b/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 08933ec..c0e33d3 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -797,8 +797,11 @@ if (! defined $ALIGNMENT) {
 		if ($chunk != $lastchunk) {
 			close CHUNK_SOURCE;
 			close CHUNK_TARGET;
-			open CHUNK_SOURCE, ">", "$DATA_DIRS{train}/splits/corpus.$SOURCE.$chunk" or die;
-			open CHUNK_TARGET, ">", "$DATA_DIRS{train}/splits/corpus.$TARGET.$chunk" or die;
+
+      mkdir("$DATA_DIRS{train}/splits/$chunk");
+
+			open CHUNK_SOURCE, ">", "$DATA_DIRS{train}/splits/$chunk/corpus.$SOURCE" or die;
+			open CHUNK_TARGET, ">", "$DATA_DIRS{train}/splits/$chunk/corpus.$TARGET" or die;
 
 			$lastchunk = $chunk;
 		}
@@ -817,13 +820,7 @@ if (! defined $ALIGNMENT) {
   #   $max_aligner_threads /= 2;
   # }
 
-  # # With multi-threading, we can use a pool to set up concurrent GIZA jobs on the chunks.
-  #
-  # TODO: implement this.  There appears to be a problem with calling system() in threads.
-  #
-  # my $pool = new Thread::Pool(Min => 1, Max => $max_aligner_threads);
-
-  system("mkdir alignments") unless -d "alignments";
+  mkdir("alignments") unless -d "alignments";
 
   my $aligner_cmd = (
     "$SCRIPTDIR/training/paralign.pl "
@@ -875,7 +872,7 @@ if (! defined $ALIGNMENT) {
   if ($ALIGNER eq "giza") {
     @aligned_files = map { "alignments/$_/model/aligned.$GIZA_MERGE" } (0..$lastchunk);
   } elsif ($ALIGNER eq "berkeley") {
-    @aligned_files = map { "alignments/$_/training.align" } (0..$lastchunk);
+    @aligned_files = map { "alignments/$_/training.$TARGET-$SOURCE.align" } (0..$lastchunk);
   } elsif ($ALIGNER eq "jacana") {
     @aligned_files = map { "alignments/$_/training.align" } (0..$lastchunk);
   }

http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/38eebb3b/scripts/training/templates/alignment/word-align.conf
----------------------------------------------------------------------
diff --git a/scripts/training/templates/alignment/word-align.conf b/scripts/training/templates/alignment/word-align.conf
index 5fe3e0c..1622fb9 100644
--- a/scripts/training/templates/alignment/word-align.conf
+++ b/scripts/training/templates/alignment/word-align.conf
@@ -33,7 +33,7 @@ foreignSuffix	<SOURCE>
 englishSuffix	<TARGET>
 
 # Choose the training sources, which can either be directories or files that list files/directories
-trainSources	<TRAIN_DIR>/splits/corpus
+trainSources	<TRAIN_DIR>/splits/<CHUNK>
 sentences	MAX
 testSources	/dev/null
 overwriteExecDir	true