You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/09/07 20:42:36 UTC

[1/4] incubator-joshua git commit: only cleanup Hadoop if grammar was created

Repository: incubator-joshua
Updated Branches:
  refs/heads/master b4e7c0095 -> 56be05ec6


only cleanup Hadoop if grammar was created


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/d61eb538
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/d61eb538
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/d61eb538

Branch: refs/heads/master
Commit: d61eb538a23e6b1c75fb5eabdba9562c0a1d06e6
Parents: b4e7c00
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Sep 7 16:39:41 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Sep 7 16:39:41 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/d61eb538/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index c0e33d3..41a0cbb 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1154,7 +1154,7 @@ if (! defined $GRAMMAR_FILE) {
     system("mv $thrax_file.tmp $thrax_file");
 
     $cachepipe->cmd("thrax-run",
-                    "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -getmerge $THRAXDIR/final/ grammar.gz; hadoop fs -rm -r $THRAXDIR",
+                    "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -getmerge $THRAXDIR/final/ grammar.gz",
                     "$DATA_DIRS{train}/thrax-input-file",
                     $thrax_file,
                     "grammar.gz");
@@ -1162,6 +1162,11 @@ if (! defined $GRAMMAR_FILE) {
 
     $GRAMMAR_FILE = "grammar.gz";
 
+    # clean up the Hadoop directory only if the grammar file exists and is non-empty
+    if (-s $GRAMMAR_FILE) {
+      system("hadoop fs -rm -r $THRAXDIR");
+    }
+
   } else {
 
     print STDERR "* FATAL: There was no way to build a grammar, and none was passed in\n";


[4/4] incubator-joshua git commit: use temporary file name to avoid potential clash

Posted by mj...@apache.org.
use temporary file name to avoid potential clash


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/56be05ec
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/56be05ec
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/56be05ec

Branch: refs/heads/master
Commit: 56be05ec63b964fa06f1a9317c048182e623dddb
Parents: 209f222
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Sep 7 16:41:05 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Sep 7 16:41:05 2016 -0400

----------------------------------------------------------------------
 scripts/training/run_thrax.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/56be05ec/scripts/training/run_thrax.py
----------------------------------------------------------------------
diff --git a/scripts/training/run_thrax.py b/scripts/training/run_thrax.py
index eef2c19..4457245 100755
--- a/scripts/training/run_thrax.py
+++ b/scripts/training/run_thrax.py
@@ -91,13 +91,13 @@ paste(args.source_corpus, args.target_corpus, args.alignment_file, thrax_file)
 run('%s/bin/hadoop fs -put %s %s/input-file' % (HADOOP, thrax_file, THRAXDIR))
 
 # Copy the template
-conf_file_name = 'thrax.conf'
-conf_file = open(conf_file_name, 'w')
+conf_file = tempfile.NamedTemporaryFile(prefix='thrax.conf')
 for line in open(args.thrax_config):
     if not line.startswith('input-file'):
         conf_file.write(line)
 conf_file.write('input-file %s/input-file\n' % (THRAXDIR))
 conf_file.close()
+conf_file_name = conf_file.name
 
 # Run Hadoop
 run('%s/bin/hadoop jar %s -D mapred.child.java.opts="-Xmx%s" -D hadoop.tmp.dir=%s %s %s > thrax.log 2>&1' % (HADOOP, THRAX_JAR, '4g', args.tmp_dir, conf_file_name, THRAXDIR))


[3/4] incubator-joshua git commit: fixed documentation

Posted by mj...@apache.org.
fixed documentation


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/209f2223
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/209f2223
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/209f2223

Branch: refs/heads/master
Commit: 209f22238b2c3d691935a91fbd0cfe76356464d6
Parents: 60bf717
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Sep 7 16:40:41 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Sep 7 16:40:41 2016 -0400

----------------------------------------------------------------------
 scripts/training/run_thrax.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/209f2223/scripts/training/run_thrax.py
----------------------------------------------------------------------
diff --git a/scripts/training/run_thrax.py b/scripts/training/run_thrax.py
index e38f6b2..eef2c19 100755
--- a/scripts/training/run_thrax.py
+++ b/scripts/training/run_thrax.py
@@ -35,10 +35,11 @@ EXAMPLE = r"""
 Example invocation:
 
 $JOSHUA/scripts/support/run_thrax.py \
+  /path/to/thrax.config \
   /path/to/corpus.SOURCE \
   /path/to/corpus.TARGET \
   /path/to/alignment \
-  -c /path/to/thrax.config \
+  /path/to/thrax.config \
   -o grammar.gz
 """
 parser = argparse.ArgumentParser(description='Run thrax')


[2/4] incubator-joshua git commit: set default phrase length to 5 instead of 10

Posted by mj...@apache.org.
set default phrase length to 5 instead of 10


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/60bf7170
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/60bf7170
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/60bf7170

Branch: refs/heads/master
Commit: 60bf717055b4a52f07690b865186f222a040a753
Parents: d61eb53
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Sep 7 16:40:15 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Sep 7 16:40:15 2016 -0400

----------------------------------------------------------------------
 scripts/training/templates/thrax-phrase.conf | 14 +++++---------
 1 file changed, 5 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/60bf7170/scripts/training/templates/thrax-phrase.conf
----------------------------------------------------------------------
diff --git a/scripts/training/templates/thrax-phrase.conf b/scripts/training/templates/thrax-phrase.conf
index 630c76f..1585383 100644
--- a/scripts/training/templates/thrax-phrase.conf
+++ b/scripts/training/templates/thrax-phrase.conf
@@ -17,15 +17,11 @@ min-rule-count 1
 # the number of reducers
 reducers 16
 
-# not only do these next six options have the suggested values as given
-# in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also
-# Thrax's default values! You could comment them out and the resulting grammar
-# would be identical.
-
-# maximum length of initial phrase pairs
-initial-phrase-length   10
-lex-source-words        10
-lex-target-words        10
+# Maximum length of initial phrase pairs. These are set to be shorter than
+# the values used by Hiero.
+initial-phrase-length   5
+lex-source-words        5
+lex-target-words        5
 
 # maximum number of NTs in a rule
 arity                   0