You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@joshua.apache.org by mj...@apache.org on 2016/05/25 18:08:18 UTC

[14/15] incubator-joshua git commit: pipeline no longer depends on $HADOOP, also will not setup a cluster for you

pipeline no longer depends on $HADOOP, also will not setup a cluster for you

The pipeline used to have the option of setting up a Hadoop cluster for you, rolling it out in standalone mode. This is not a good approach to take here; it's much better to have the user do it, wrapping Joshua with one of the many tools available for that purpose. Therefore, this functionality is removed. The developer is now responsible for setting up her own Hadoop cluster and ensuring that the "hadoop" command is in her path.


Project: http://git-wip-us.apache.org/repos/asf/incubator-joshua/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-joshua/commit/78f727f3
Tree: http://git-wip-us.apache.org/repos/asf/incubator-joshua/tree/78f727f3
Diff: http://git-wip-us.apache.org/repos/asf/incubator-joshua/diff/78f727f3

Branch: refs/heads/master
Commit: 78f727f3a2fd6616b3f77ca50655c464a91671b4
Parents: aef0b2d
Author: Matt Post <po...@cs.jhu.edu>
Authored: Wed May 25 06:01:43 2016 -0400
Committer: Matt Post <po...@cs.jhu.edu>
Committed: Wed May 25 06:01:43 2016 -0400

----------------------------------------------------------------------
 scripts/training/pipeline.pl | 89 ++++-----------------------------------
 1 file changed, 8 insertions(+), 81 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-joshua/blob/78f727f3/scripts/training/pipeline.pl
----------------------------------------------------------------------
diff --git a/scripts/training/pipeline.pl b/scripts/training/pipeline.pl
index ff9d762..88d641c 100755
--- a/scripts/training/pipeline.pl
+++ b/scripts/training/pipeline.pl
@@ -40,7 +40,6 @@ use v5.12;
 # Thus we undefine CDPATH to ensure this doesn't happen.
 delete $ENV{CDPATH};
 
-my $HADOOP = $ENV{HADOOP};
 my $MOSES = $ENV{MOSES};
 my $METEOR = $ENV{METEOR};
 my $THRAX = "$JOSHUA/thrax";
@@ -127,9 +126,6 @@ my $JOSHUA_MEM = "4g";
 # Hadoop via -Dmapred.child.java.opts
 my $HADOOP_MEM = "4g";
 
-# The location of a custom core-site.xml file, if desired (optional).
-my $HADOOP_CONF = undef;
-
 # memory available to the parser
 my $PARSER_MEM = "2g";
 
@@ -291,8 +287,6 @@ my $retval = GetOptions(
   "first-step=s"     => \$FIRST_STEP,
   "last-step=s"      => \$LAST_STEP,
   "aligner-chunk-size=s" => \$ALIGNER_BLOCKSIZE,
-  "hadoop=s"          => \$HADOOP,
-  "hadoop-conf=s"          => \$HADOOP_CONF,
   "tmp=s"             => \$TMPDIR,
   "nbest=i"           => \$NBEST,
   "reordering-limit=i" => \$REORDERING_LIMIT,
@@ -504,7 +498,6 @@ $_TUNE_GRAMMAR_FILE = get_absolute_path($_TUNE_GRAMMAR_FILE);
 $_TEST_GRAMMAR_FILE = get_absolute_path($_TEST_GRAMMAR_FILE);
 $THRAX_CONF_FILE = get_absolute_path($THRAX_CONF_FILE);
 $ALIGNMENT = get_absolute_path($ALIGNMENT);
-$HADOOP_CONF = get_absolute_path($HADOOP_CONF);
 
 foreach my $corpus (@CORPORA) {
   foreach my $ext ($TARGET,$SOURCE) {
@@ -561,11 +554,6 @@ if ($FILTERING eq "fast") {
   exit 1;
 }
 
-if (defined $HADOOP_CONF && ! -e $HADOOP_CONF) {
-  print STDERR "* FATAL: Couldn't find \$HADOOP_CONF file '$HADOOP_CONF'\n";
-  exit 1;
-}
-
 ## END SANITY CHECKS
 
 ####################################################################################################
@@ -1125,29 +1113,17 @@ if (! defined $GRAMMAR_FILE) {
                     $TRAIN{source}, $target_file, $ALIGNMENT,
                     "$DATA_DIRS{train}/thrax-input-file");
 
-    # Rollout the hadoop cluster if needed.  This causes $HADOOP to be defined (pointing to the
-    # unrolled directory).
-    start_hadoop_cluster() unless defined $HADOOP;
-
     # put the hadoop files in place
-    my $THRAXDIR;
     my $thrax_input;
-    if (! defined $HADOOP or $HADOOP eq "") {
-      $THRAXDIR = "thrax";
-
-      $thrax_input = "$DATA_DIRS{train}/thrax-input-file"
-
-    } else {
-      $THRAXDIR = "pipeline-$SOURCE-$TARGET-$GRAMMAR_TYPE-$RUNDIR";
-      $THRAXDIR =~ s#/#_#g;
+    my $THRAXDIR = "pipeline-$SOURCE-$TARGET-$GRAMMAR_TYPE-$RUNDIR";
+    $THRAXDIR =~ s#/#_#g;
 
-      $cachepipe->cmd("thrax-prep",
-                      "$HADOOP/bin/hadoop fs -rm -r $THRAXDIR; $HADOOP/bin/hadoop fs -mkdir $THRAXDIR; $HADOOP/bin/hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
-                      "$DATA_DIRS{train}/thrax-input-file", 
-                      "grammar.gz");
+    $cachepipe->cmd("thrax-prep",
+                    "hadoop fs -rm -r $THRAXDIR; hadoop fs -mkdir $THRAXDIR; hadoop fs -put $DATA_DIRS{train}/thrax-input-file $THRAXDIR/input-file",
+                    "$DATA_DIRS{train}/thrax-input-file", 
+                    "grammar.gz");
 
-      $thrax_input = "$THRAXDIR/input-file";
-    }
+    $thrax_input = "$THRAXDIR/input-file";
 
     # copy the thrax config file
     my $thrax_file = "thrax-$GRAMMAR_TYPE.conf";
@@ -1156,25 +1132,12 @@ if (! defined $GRAMMAR_FILE) {
     system("mv $thrax_file.tmp $thrax_file");
 
     $cachepipe->cmd("thrax-run",
-                    "$HADOOP/bin/hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; $HADOOP/bin/hadoop fs -getmerge $THRAXDIR/final/ grammar.gz", #; $HADOOP/bin/hadoop fs -rm -r $THRAXDIR",
+                    "hadoop jar $THRAX/bin/thrax.jar -D mapreduce.task.timeout=0 -D mapreduce.map.java.opts='-Xmx$HADOOP_MEM' -D mapreduce.reduce.java.opts='-Xmx$HADOOP_MEM' -D hadoop.tmp.dir=$TMPDIR $thrax_file $THRAXDIR > thrax.log 2>&1; rm -f grammar grammar.gz; hadoop fs -getmerge $THRAXDIR/final/ grammar.gz", #; hadoop fs -rm -r $THRAXDIR",
                     "$DATA_DIRS{train}/thrax-input-file",
                     $thrax_file,
                     "grammar.gz");
 #perl -pi -e 's/\.?0+\b//g' grammar; 
 
-    stop_hadoop_cluster() if $HADOOP eq "hadoop";
-
-    # cache the thrax-prep step, which depends on grammar.gz
-#    if ($HADOOP ne "hadoop") {
-#      $cachepipe->cmd("thrax-prep", "--cache-only");
-#    }
-
-    # clean up
-    # TODO: clean up real hadoop clusters too
-    # if ($HADOOP eq "hadoop") {
-    #   system("rm -rf $THRAXDIR hadoop hadoop-2.5.2");
-    # }
-
     $GRAMMAR_FILE = "grammar.gz";
   } else {
 
@@ -1973,42 +1936,6 @@ sub get_numrefs {
   }
 }
 
-sub start_hadoop_cluster {
-  rollout_hadoop_cluster();
-
-  # start the cluster
-  # system("./hadoop/bin/start-all.sh");
-  # sleep(120);
-}
-
-sub rollout_hadoop_cluster {
-  # if it's not already unpacked, unpack it
-  if (! -d "hadoop") {
-
-    my $hadoop_tmp_dir = tempdir("hadoop-XXXX", DIR => $TMPDIR, CLEANUP => 0);
-		system("tar xzf $JOSHUA/lib/hadoop-2.5.2.tar.gz -C $hadoop_tmp_dir");
-		system("ln -sf $hadoop_tmp_dir/hadoop-2.5.2 hadoop");
-    if (defined $HADOOP_CONF) {
-      print STDERR "Copying HADOOP_CONF($HADOOP_CONF) to hadoop/conf/core-site.xml\n";
-      system("cp $HADOOP_CONF hadoop/conf/core-site.xml");
-    }
-  }
-  
-  $ENV{HADOOP} = $HADOOP = "hadoop";
-  $ENV{HADOOP_CONF_DIR} = "";
-}
-
-sub stop_hadoop_cluster {
-  if ($HADOOP ne "hadoop") {
-		system("hadoop/bin/stop-all.sh");
-  }
-}
-
-#sub teardown_hadoop_cluster {
-#  stop_hadoop_cluster();
-#  system("rm -f hadoop");
-#}
-
 sub is_lattice {
   my $file = shift;
   open READ, "$CAT $file|" or die "can't read from potential lattice '$file'";