You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/09/07 20:42:36 UTC
[1/4] incubator-joshua git commit: only cleanup Hadoop if grammar was created
Repository: incubator-joshua
Updated Branches:
refs/heads/master b4e7c0095 -> 56be05ec6
only cleanup Hadoop if grammar was created
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/d61eb538
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/d61eb538
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/d61eb538
Branch: refs/heads/master
Commit: d61eb538a23e6b1c75fb5eabdba9562c0a1d06e6
Parents: b4e7c00
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Sep 7 16:39:41 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Sep 7 16:39:41 2016 -0400
----------------------------------------------------------------------
scripts/training/pipeline.pl | 7 ++++++-
1 file changed, 6 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/d61eb538/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index c0e33d3..41a0cbb 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -1154,7 +1154,7 @@ if (! defined $GRAMMAR_FILE) {
system("mv $thrax_file.tmp $thrax_file");
$cachepipe->cmd("thrax-run",
- "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -getmerge $THRAXDIR/final/ grammar.gz; hadoop fs -rm -r $THRAXDIR",
+ "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -getmerge $THRAXDIR/final/ grammar.gz",
"$DATA_DIRS{train}/thrax-input-file",
$thrax_file,
"grammar.gz");
@@ -1162,6 +1162,11 @@ if (! defined $GRAMMAR_FILE) {
$GRAMMAR_FILE = "grammar.gz";
+ # cleanup if successful
+ if (-s $GRAMMAR_FILE) {
+ system("hadoop fs -rm -r $THRAXDIR");
+ }
+
} else {
print STDERR "* FATAL: There was no way to build a grammar, and none was passed in\n";
[4/4] incubator-joshua git commit: use temporary file name to avoid potential clash
Posted by mj...@apache.org.
use temporary file name to avoid potential clash
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/56be05ec
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/56be05ec
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/56be05ec
Branch: refs/heads/master
Commit: 56be05ec63b964fa06f1a9317c048182e623dddb
Parents: 209f222
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Sep 7 16:41:05 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Sep 7 16:41:05 2016 -0400
----------------------------------------------------------------------
scripts/training/run_thrax.py | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/56be05ec/scripts/training/run_thrax.py
----------------------------------------------------------------------
diff --git a/scripts/training/run_thrax.py b/scripts/training/run_thrax.py
index eef2c19..4457245 100755
--- a/scripts/training/run_thrax.py
+++ b/scripts/training/run_thrax.py
@@ -91,13 +91,13 @@ paste(args.source_corpus, args.target_corpus, args.alignment_file, thrax_file)
run('%s/bin/hadoop fs -put %s %s/input-file' % (HADOOP, thrax_file, THRAXDIR))
# Copy the template
-conf_file_name = 'thrax.conf'
-conf_file = open(conf_file_name, 'w')
+conf_file = tempfile.NamedTemporaryFile(prefix='thrax.conf')
for line in open(args.thrax_config):
if not line.startswith('input-file'):
conf_file.write(line)
conf_file.write('input-file %s/input-file\n' % (THRAXDIR))
conf_file.close()
+conf_file_name = conf_file.name
# Run Hadoop
run('%s/bin/hadoop jar %s -D mapred.child.java.opts="-Xmx%s" -D hadoop.tmp.dir=%s %s %s > thrax.log 2>&1' % (HADOOP, THRAX_JAR, '4g', args.tmp_dir, conf_file_name, THRAXDIR))
[3/4] incubator-joshua git commit: fixed documentation
Posted by mj...@apache.org.
fixed documentation
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/209f2223
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/209f2223
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/209f2223
Branch: refs/heads/master
Commit: 209f22238b2c3d691935a91fbd0cfe76356464d6
Parents: 60bf717
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Sep 7 16:40:41 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Sep 7 16:40:41 2016 -0400
----------------------------------------------------------------------
scripts/training/run_thrax.py | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/209f2223/scripts/training/run_thrax.py
----------------------------------------------------------------------
diff --git a/scripts/training/run_thrax.py b/scripts/training/run_thrax.py
index e38f6b2..eef2c19 100755
--- a/scripts/training/run_thrax.py
+++ b/scripts/training/run_thrax.py
@@ -35,10 +35,11 @@ EXAMPLE = r"""
Example invocation:
$JOSHUA/scripts/support/run_thrax.py \
+ /path/to/thrax.config \
/path/to/corpus.SOURCE \
/path/to/corpus.TARGET \
/path/to/alignment \
- -c /path/to/thrax.config \
+ /path/to/thrax.config \
-o grammar.gz
"""
parser = argparse.ArgumentParser(description='Run thrax')
[2/4] incubator-joshua git commit: set default phrase length to 5 instead of 10
Posted by mj...@apache.org.
set default phrase length to 5 instead of 10
Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/60bf7170
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/60bf7170
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/60bf7170
Branch: refs/heads/master
Commit: 60bf717055b4a52f07690b865186f222a040a753
Parents: d61eb53
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed Sep 7 16:40:15 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed Sep 7 16:40:15 2016 -0400
----------------------------------------------------------------------
scripts/training/templates/thrax-phrase.conf | 14 +++++---------
1 file changed, 5 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/60bf7170/scripts/training/templates/thrax-phrase.conf
----------------------------------------------------------------------
diff --git a/scripts/training/templates/thrax-phrase.conf b/scripts/training/templates/thrax-phrase.conf
index 630c76f..1585383 100644
--- a/scripts/training/templates/thrax-phrase.conf
+++ b/scripts/training/templates/thrax-phrase.conf
@@ -17,15 +17,11 @@ min-rule-count 1
# the number of reducers
reducers 16
-# not only do these next six options have the suggested values as given
-# in Chiang's "Hierarchical Phrase-based Translation" (CL), they are also
-# Thrax's default values! You could comment them out and the resulting grammar
-# would be identical.
-
-# maximum length of initial phrase pairs
-initial-phrase-length 10
-lex-source-words 10
-lex-target-words 10
+# Maximum length of initial phrase pairs. These are set to be shorter than
+# used by Hiero.
+initial-phrase-length 5
+lex-source-words 5
+lex-target-words 5
# maximum number of NTs in a rule
arity 0