You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/07/25 14:25:30 UTC

[1/6] incubator-joshua git commit: updated release version

Repository: incubator-joshua
Updated Branches:
  refs/heads/master 5f13ce921 -> 80c17ef6a


updated release version


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/b259ddf4
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/b259ddf4
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/b259ddf4

Branch: refs/heads/master
Commit: b259ddf49a034c4d82ad1f0dc6ef9058de49d287
Parents: 5f13ce9
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Jul 15 11:01:39 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Jul 15 11:01:39 2016 -0400

----------------------------------------------------------------------
 pom.xml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b259ddf4/pom.xml
----------------------------------------------------------------------
diff --git a/pom.xml b/pom.xml
index bc13483..da67edb 100644
--- a/pom.xml
+++ b/pom.xml
@@ -26,7 +26,7 @@
   <groupId>org.apache.joshua</groupId>
   <artifactId>joshua</artifactId>
   <packaging>jar</packaging>
-  <version>6.0.6-SNAPSHOT</version>
+  <version>6.1-SNAPSHOT</version>
   <name>Apache Joshua Machine Translation Toolkit</name>
   <description>Joshua is an open-source statistical machine
   translation decoder for phrase-based, hierarchical,


[6/6] incubator-joshua git commit: Reverting commit e957a126f0bacd6c5aa1f8aa8513fc0f252403a8

Posted by mj...@apache.org.
Reverting commit e957a126f0bacd6c5aa1f8aa8513fc0f252403a8


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/80c17ef6
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/80c17ef6
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/80c17ef6

Branch: refs/heads/master
Commit: 80c17ef6a35f1e707cc224051389e680e129f859
Parents: b2b6480
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Jul 22 19:36:19 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Jul 22 19:36:19 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/80c17ef6/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 68b18f6..ea617bc 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1837,9 +1837,16 @@ sub prepare_data {
 
   my $infiles =  join(" ", @infiles);
   my $outfiles = join(" ", @outfiles);
-  $cachepipe->cmd("$label-copy-and-filter",
-                  "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
-                  @indeps, @outfiles);
+  # only skip blank lines for training data
+  if ($label ne "test") {
+    $cachepipe->cmd("$label-copy-and-filter",
+                    "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/support/split2files $outfiles",
+                    @indeps, @outfiles);
+  } else {
+    $cachepipe->cmd("$label-copy-and-filter",
+                    "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
+                    @indeps, @outfiles);
+  }
 
   # Done concatenating and filtering files
 
@@ -1857,11 +1864,9 @@ sub prepare_data {
         } else {
           my $TOKENIZER = ($lang eq $SOURCE) ? $TOKENIZER_SOURCE : $TOKENIZER_TARGET;
 
-          # Normalization can delete lines, so they might need to be filtered out
-          my $maybe_filter = ($label eq "test") ? "" : "| grep -v ^\$";
           my $ext = $lang; $ext =~ s/\.\d//;
           $cachepipe->cmd("$label-tokenize-$lang",
-                          "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null $maybe_filter > $DATA_DIRS{$label}/$prefix.tok.$lang",
+                          "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null > $DATA_DIRS{$label}/$prefix.tok.$lang",
                           "$DATA_DIRS{$label}/$prefix.$lang", "$DATA_DIRS{$label}/$prefix.tok.$lang");
         }
 


[5/6] incubator-joshua git commit: fixed java path for apache

Posted by mj...@apache.org.
fixed java path for apache


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/b2b64806
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/b2b64806
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/b2b64806

Branch: refs/heads/master
Commit: b2b648066d271ff21e177f61c92987cde811e89f
Parents: e957a12
Author: Matt Post <po...@cs.jhu.edu>
Authored: Sun Jul 17 13:45:36 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Sun Jul 17 13:45:36 2016 -0400

----------------------------------------------------------------------
 scripts/support/filter_grammar.sh | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/b2b64806/scripts/support/filter_grammar.sh
----------------------------------------------------------------------
diff --git a/scripts/support/filter_grammar.sh b/scripts/support/filter_grammar.sh
index 8b037cf..6b05e32 100755
--- a/scripts/support/filter_grammar.sh
+++ b/scripts/support/filter_grammar.sh
@@ -17,4 +17,6 @@
 #
 # Wrapper around the grammar filter
 
-java -Xmx4g -Dfile.encoding=utf8 -cp $JOSHUA/class joshua.tools.TestSetFilter "$@"
+JOSHUA=$(readlink -f $(dirname $0)/../..)
+JAR_PATH=$JOSHUA/target/joshua-*-jar-with-dependencies.jar
+java -Xmx4g -Dfile.encoding=utf8 -cp $JAR_PATH org.apache.joshua.tools.TestSetFilter "$@"


[3/6] incubator-joshua git commit: bugfix: don't pack a nonexistent TM

Posted by mj...@apache.org.
bugfix: don't pack a nonexistent TM


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/ce27c2de
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/ce27c2de
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/ce27c2de

Branch: refs/heads/master
Commit: ce27c2de672a8d48fb876f38181dbc860b9dc909
Parents: fcdd7cd
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Jul 15 11:02:35 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Jul 15 11:02:35 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/ce27c2de/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index b8512fc..bd90707 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1648,9 +1648,11 @@ if ($OPTIMIZER_RUN == 1) {
 }
 
 $tm_switch = "";
-$tm_copy_config_args = "";
-$tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
-$tm_switch .= " $TEST_GRAMMAR";
+if (defined $TEST_GRAMMAR) {
+  $tm_copy_config_args = "";
+  $tm_switch .= ($DO_PACK_GRAMMARS) ? "--pack-tm" : "--tm";
+  $tm_switch .= " $TEST_GRAMMAR";
+}
 
 # Add in the glue grammar
 if (defined $GLUE_GRAMMAR_FILE) {


[4/6] incubator-joshua git commit: bugfix: move empty line filtering to after normalization

Posted by mj...@apache.org.
bugfix: move empty line filtering to after normalization

Normalization recently began deleting certain non-printing UTF-8
characters, which means that normalization itself can produce blank lines.
We therefore have to remove blank lines after normalization instead of
before it (for both train and dev, since Moses kbmira croaks on blank
outputs).


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/e957a126
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/e957a126
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/e957a126

Branch: refs/heads/master
Commit: e957a126f0bacd6c5aa1f8aa8513fc0f252403a8
Parents: ce27c2d
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Jul 15 11:03:11 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Jul 15 11:03:11 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 19 ++++++++-----------
 1 file changed, 8 insertions(+), 11 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/e957a126/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index bd90707..68b18f6 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1837,16 +1837,10 @@ sub prepare_data {
 
   my $infiles =  join(" ", @infiles);
   my $outfiles = join(" ", @outfiles);
-  # only skip blank lines for training data
-  if ($label ne "test") {
-    $cachepipe->cmd("$label-copy-and-filter",
-                    "$PASTE $infiles | $SCRIPTDIR/training/filter-empty-lines.pl | $SCRIPTDIR/support/split2files $outfiles",
-                    @indeps, @outfiles);
-  } else {
-    $cachepipe->cmd("$label-copy-and-filter",
-                    "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
-                    @indeps, @outfiles);
-  }
+  $cachepipe->cmd("$label-copy-and-filter",
+                  "$PASTE $infiles | $SCRIPTDIR/support/split2files $outfiles",
+                  @indeps, @outfiles);
+
   # Done concatenating and filtering files
 
   # record where the concatenated input files were
@@ -1862,9 +1856,12 @@ sub prepare_data {
           system("cp $DATA_DIRS{$label}/$prefix.$lang $DATA_DIRS{$label}/$prefix.tok.$lang");
         } else {
           my $TOKENIZER = ($lang eq $SOURCE) ? $TOKENIZER_SOURCE : $TOKENIZER_TARGET;
+
+          # Normalization can delete lines, so they might need to be filtered out
+          my $maybe_filter = ($label eq "test") ? "" : "| grep -v ^\$";
           my $ext = $lang; $ext =~ s/\.\d//;
           $cachepipe->cmd("$label-tokenize-$lang",
-                          "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null > $DATA_DIRS{$label}/$prefix.tok.$lang",
+                          "$CAT $DATA_DIRS{$label}/$prefix.$lang | $NORMALIZER $ext | $TOKENIZER -l $ext 2> /dev/null $maybe_filter > $DATA_DIRS{$label}/$prefix.tok.$lang",
                           "$DATA_DIRS{$label}/$prefix.$lang", "$DATA_DIRS{$label}/$prefix.tok.$lang");
         }
 


[2/6] incubator-joshua git commit: added pipeline parameter

Posted by mj...@apache.org.
added pipeline parameter


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/fcdd7cd0
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/fcdd7cd0
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/fcdd7cd0

Branch: refs/heads/master
Commit: fcdd7cd05c69e718fed3a476f1112244ddb29f8b
Parents: b259ddf
Author: Matt Post <po...@cs.jhu.edu>
Authored: Fri Jul 15 11:02:13 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Fri Jul 15 11:02:13 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 1 +
 1 file changed, 1 insertion(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/fcdd7cd0/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index 2a3cc92..b8512fc 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -276,6 +276,7 @@ my $retval = GetOptions(
   "maxlen-tune=i"        => \$MAXLEN_TUNE,
   "maxlen-test=i"        => \$MAXLEN_TEST,
   "maxlines=i"        => \$MAXLINES,
+  "maxlen-phrase=i"   => \$MAX_PHRASE_LEN,
   "tokenizer-source=s"      => \$TOKENIZER_SOURCE,
   "tokenizer-target=s"      => \$TOKENIZER_TARGET,
   "normalizer=s"      => \$NORMALIZER,