Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 10:38:16 UTC

svn commit: r1132076 - in /incubator/mesos/trunk/ec2: ./ deploy.lucid64/root/hadoop-0.20.2/conf/ deploy.lucid64/root/mesos-ec2/ deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/

Author: benh
Date: Sun Jun  5 08:38:16 2011
New Revision: 1132076

URL: http://svn.apache.org/viewvc?rev=1132076&view=rev
Log:
More work on EC2 scripts:
- Files are now deployed to the master using rsync instead of scp,
  making cluster startup faster (see the sketch after this list)
- A cluster-url file is created in /root/mesos-ec2 so that users can
  look up the cluster URL without having to list the ZooKeeper nodes
  themselves
- This cluster URL is also used by the Hadoop framework
- HDFS no longer attempts to use all the masters as secondary
  namenodes, only the first one (the previous behavior was likely a
  bug)
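
For reference, the rsync-based deployment mentioned in the first item
roughly follows the pattern sketched below. This is a minimal,
self-contained illustration of the approach (substitute template
parameters into a temporary directory, then push the whole tree to the
master with a single rsync); the function name and arguments are
hypothetical, simplified from the actual deploy_files() changes to
mesos_ec2.py further down.

# Minimal sketch (hypothetical helper, simplified from deploy_files):
# fill in templates locally, then push everything with one rsync.
import os, shutil, subprocess, tempfile

def deploy_templates(root_dir, template_vars, master, identity_file):
  tmp_dir = tempfile.mkdtemp()
  for path, dirs, files in os.walk(root_dir):
    dest_dir = os.path.join('/', path[len(root_dir):])
    local_dir = tmp_dir + dest_dir
    if not os.path.exists(local_dir):
      os.makedirs(local_dir)
    for filename in files:
      with open(os.path.join(path, filename)) as src:
        text = src.read()
      for key in template_vars:
        text = text.replace("{{" + key + "}}", template_vars[key])
      with open(tmp_dir + os.path.join(dest_dir, filename), "w") as dest:
        dest.write(text)
  # One rsync call replaces the old per-file scp round trips
  command = ("rsync -rv -e 'ssh -o StrictHostKeyChecking=no -i %s' "
             "'%s/' 'root@%s:/'" % (identity_file, tmp_dir, master))
  subprocess.check_call(command, shell=True)
  shutil.rmtree(tmp_dir)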

Added:
    incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url
Modified:
    incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters
    incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml
    incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup
    incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos
    incubator/mesos/trunk/ec2/mesos_ec2.py

Modified: incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters (original)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters Sun Jun  5 08:38:16 2011
@@ -1 +1 @@
-{{master_list}}
+{{active_master}}

Added: incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url?rev=1132076&view=auto
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url (added)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url Sun Jun  5 08:38:16 2011
@@ -0,0 +1 @@
+{{cluster_url}}
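
As the log message notes, this one-line file exists so that users and
frameworks can pick up the cluster URL without assembling the ZooKeeper
list themselves; start-mesos below reads it with a plain cat. A
hypothetical Python consumer would be equally simple:

# Hypothetical consumer of the deployed cluster-url file
def read_cluster_url(path="/root/mesos-ec2/cluster-url"):
  with open(path) as f:
    return f.read().strip()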

Modified: incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml (original)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml Sun Jun  5 08:38:16 2011
@@ -22,7 +22,7 @@
 
   <property>
     <name>mapred.mesos.master</name>
-    <value>1@{{active_master}}:5050</value>
+    <value>{{cluster_url}}</value>
   </property>
 
   <property>

Modified: incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup (original)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup Sun Jun  5 08:38:16 2011
@@ -25,6 +25,7 @@ ZOOS=`cat zoo`
 
 if [[ $ZOOS = *NONE* ]]; then
   NUM_ZOOS=0
+  ZOOS=""
 else
   NUM_ZOOS=`cat zoo | wc -l`
 fi
@@ -95,7 +96,7 @@ if [[ $NUM_MASTERS -gt 1 ]] ; then
 fi
 
 if [[ $NUM_ZOOS != 0 ]] ; then
-  echo "RSYNC'ing /root/mesos-ec2 to other ZooKeeper servers..."
+  echo "RSYNC'ing /root/mesos-ec2 to ZooKeeper servers..."
   for zoo in $ZOOS; do
       echo $zoo
       rsync -e "ssh $SSH_OPTS" -az /root/mesos-ec2 $zoo:/root & sleep 0.3
@@ -111,27 +112,15 @@ for slave in $SLAVES; do
 done
 wait
 
-echo "Setting up slaves..."
-for slave in $SLAVES; do
-  echo $slave
-  ssh -t $SSH_OPTS root@$slave "mesos-ec2/setup-slave" &
+echo "Running slave setup script on all nodes..."
+for node in $SLAVES $MASTERS $ZOOS; do
+  echo $node
+  ssh -t $SSH_OPTS root@$node "mesos-ec2/setup-slave" & sleep 0.3
 done
 wait
 
-if [[ $NUM_ZOOS != 0 ]] ; then
-  echo "Running slave setup on ZooKeeper nodes..."
-  for slave in $SLAVES; do
-    echo $slave
-    ssh -t $SSH_OPTS root@$slave "mesos-ec2/setup-slave" &
-  done
-  wait
-fi
-
-echo "Running slave setup on master (i.e. for local)..."
-./setup-slave
-
 if [[ $NUM_MASTERS -gt 1 ]] ; then
-  echo "RSYNC'ing Hadoop config files for HDFS to other masters..."
+  echo "RSYNC'ing HDFS config files to other masters..."
   for master in `cat $MASTERS_FILE | sed '1d'`; do
     echo $master
     rsync -e "ssh $SSH_OPTS" -az $HADOOP_HOME/conf $master:$HADOOP_HOME & sleep 0.3
@@ -139,7 +128,7 @@ if [[ $NUM_MASTERS -gt 1 ]] ; then
   wait
 fi
 
-echo "RSYNC'ing Hadoop config files for HDFS to slaves..."
+echo "RSYNC'ing HDFS config files to slaves..."
 for slave in $SLAVES; do
   echo $slave
   rsync -e "ssh $SSH_OPTS" -az $HADOOP_HOME/conf $slave:$HADOOP_HOME & sleep 0.3
@@ -192,6 +181,9 @@ echo "Setting up haproxy+apache framewor
 cp haproxy+apache/* /root/mesos/frameworks/haproxy+apache
 
 echo "Setting up Spark config files..."
+# TODO: This currently overwrites whatever the user wrote there; on
+# the other hand, we also don't want to leave an old file created by
+# us because it would have the wrong hostname for HDFS etc
 mkdir -p /root/spark/conf
 echo "-Dspark.dfs=hdfs://$HOSTNAME:9000 -Dspark.repl.classdir=/nfs" \
      > /root/spark/conf/java-opts

Modified: incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos (original)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos Sun Jun  5 08:38:16 2011
@@ -15,25 +15,15 @@ fi
 
 SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=2"
 
-if [[ $NUM_ZOOS == 0 ]]; then
-  master_arg="1@${ACTIVE_MASTER}:5050"
-else
-  master_arg="zoo://"
-  add=""
-  for zoo in $ZOOS; do
-    master_arg+=$add
-    master_arg+=$zoo":2181/mesos"
-    add=","
-  done
-fi
+cluster_url=`cat cluster-url`
 
-echo "Running with master parameter: "$master_arg
+echo "Running with cluster URL: "$cluster_url
 
 if [[ $NUM_ZOOS != 0 ]]; then
   masterid=1
   for master in $MASTERS; do
     echo "Starting master $masterid on $master"
-    ssh $SSH_OPTS $master "/root/mesos-ec2/mesos-daemon mesos-master -p 5050 -u $master_arg $@ </dev/null >/dev/null" & sleep 0.1
+    ssh $SSH_OPTS $master "/root/mesos-ec2/mesos-daemon mesos-master -p 5050 -u $cluster_url $@ </dev/null >/dev/null" & sleep 0.1
     masterid=$(($masterid+1))
   done
   wait
@@ -73,7 +63,7 @@ MEM=''
 
 for slave in $SLAVES; do
   echo "Starting $COUNT slave(s) on $slave"
-  ssh $SSH_OPTS $slave "for ((i = 0; i < $COUNT; i++)); do /root/mesos-ec2/mesos-daemon mesos-slave -u ${master_arg} $CPUS $MEM; done </dev/null >/dev/null" &
+  ssh $SSH_OPTS $slave "for ((i = 0; i < $COUNT; i++)); do /root/mesos-ec2/mesos-daemon mesos-slave -u ${cluster_url} $CPUS $MEM; done </dev/null >/dev/null" &
   sleep 0.1
 done
 wait
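
For context, the cluster URL stored in /root/mesos-ec2/cluster-url (and
read above via cat) takes one of two forms depending on whether the
cluster runs ZooKeeper; the deploy_files() changes below construct it.
A minimal sketch with made-up hostnames:

# Sketch of the two cluster URL forms (hostnames are hypothetical)
zoo_nodes = ["zoo1.example.com", "zoo2.example.com"]
active_master = "master1.example.com"

if zoo_nodes:
  # ZooKeeper-backed cluster: comma-separated zoo:// URL
  cluster_url = "zoo://" + ",".join(
      ["%s:2181/mesos" % z for z in zoo_nodes])
else:
  # Single-master cluster: point frameworks directly at the master
  cluster_url = "1@%s:5050" % active_master

print cluster_url  # zoo://zoo1.example.com:2181/mesos,zoo2.example.com:2181/mesos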

Modified: incubator/mesos/trunk/ec2/mesos_ec2.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/mesos_ec2.py?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/mesos_ec2.py (original)
+++ incubator/mesos/trunk/ec2/mesos_ec2.py Sun Jun  5 08:38:16 2011
@@ -5,13 +5,13 @@ from __future__ import with_statement
 import boto
 import logging
 import os
+import shutil
 import subprocess
 import sys
 import tempfile
 import time
 from optparse import OptionParser
 from sys import stderr
-from tempfile import NamedTemporaryFile
 from boto.ec2.blockdevicemapping import BlockDeviceMapping, EBSBlockDeviceType
 
 
@@ -295,9 +295,6 @@ def get_num_disks(instance_type):
 # the first master instance in the cluster, and we expect the setup
 # script to be run on that instance to copy them to other nodes.
 def deploy_files(conn, root_dir, opts, master_res, slave_res, zoo_res):
-  # TODO: Speed up deployment by creating a temp directory with the
-  # template-transformed files and then rsyncing it
-
   active_master = master_res.instances[0].public_dns_name
 
   num_disks = get_num_disks(opts.instance_type)
@@ -308,38 +305,50 @@ def deploy_files(conn, root_dir, opts, m
       hdfs_data_dirs += ",/mnt%d/hdfs/dfs/data" % i
       mapred_local_dirs += ",/mnt%d/hadoop/mrlocal" % i
 
+  if zoo_res != None:
+    zoo_list = '\n'.join([i.public_dns_name for i in zoo_res.instances])
+    cluster_url = "zoo://" + ",".join(
+        ["%s:2181/mesos" % i.public_dns_name for i in zoo_res.instances])
+  else:
+    zoo_list = "NONE"
+    cluster_url = "1@%s:5050" % active_master
+
   template_vars = {
     "master_list": '\n'.join([i.public_dns_name for i in master_res.instances]),
     "active_master": active_master,
     "slave_list": '\n'.join([i.public_dns_name for i in slave_res.instances]),
+    "zoo_list": zoo_list,
+    "cluster_url": cluster_url,
     "hdfs_data_dirs": hdfs_data_dirs,
     "mapred_local_dirs": mapred_local_dirs
   }
 
-  if opts.ft > 1:
-    zoo = zoo_res.instances[0].public_dns_name
-    template_vars["zoo_list"] = '\n'.join(
-        [i.public_dns_name for i in zoo_res.instances])
-  else:
-    template_vars["zoo_list"] = "NONE";
-
+  # Create a temp directory in which we will place all the files to be
+# deployed after we substitute template parameters in them
+  tmp_dir = tempfile.mkdtemp()
   for path, dirs, files in os.walk(root_dir):
     dest_dir = os.path.join('/', path[len(root_dir):])
-    if len(files) > 0: # Only mkdir for low-level directories since we use -p
-      ssh(active_master, opts, 'mkdir -p "%s"' % dest_dir)
+    local_dir = tmp_dir + dest_dir
+    if not os.path.exists(local_dir):
+      os.makedirs(local_dir)
     for filename in files:
       if filename[0] not in '#.~' and filename[-1] != '~':
         dest_file = os.path.join(dest_dir, filename)
-        print "Setting up %s" % dest_file
-        with open(os.path.join(path, filename)) as file:
-          text = file.read()
-          for key in template_vars:
-            text = text.replace("{{" + key + "}}", template_vars[key])
-          temp_file = NamedTemporaryFile()
-          temp_file.write(text)
-          temp_file.flush()
-          scp(active_master, opts, temp_file.name, dest_file)
-          temp_file.close()
+        local_file = tmp_dir + dest_file
+        with open(os.path.join(path, filename)) as src:
+          with open(local_file, "w") as dest:
+            text = src.read()
+            for key in template_vars:
+              text = text.replace("{{" + key + "}}", template_vars[key])
+            dest.write(text)
+            dest.close()
+  # rsync the whole directory over to the master machine
+  command = (("rsync -rv -e 'ssh -o StrictHostKeyChecking=no -i %s' " + 
+      "'%s/' 'root@%s:/'") % (opts.identity_file, tmp_dir, active_master))
+  print command
+  subprocess.check_call(command, shell=True)
+  # Remove the temp directory
+  shutil.rmtree(tmp_dir)
 
 
 # Copy a file to a given host through scp, throwing an exception if scp fails
@@ -367,7 +376,7 @@ def main():
     else:
       (master_res, slave_res, zoo_res) = launch_cluster(
           conn, opts, cluster_name)
-    wait_for_cluster(conn, master_res, slave_res, zoo_res)
+      wait_for_cluster(conn, master_res, slave_res, zoo_res)
     setup_cluster(conn, master_res, slave_res, zoo_res, opts, True)
   elif action == "destroy":
     response = raw_input("Are you sure you want to destroy the cluster " +
@@ -382,7 +391,7 @@ def main():
       print "Terminating slaves..."
       for inst in slave_res.instances:
         inst.terminate()
-      if opts.ft > 1:
+      if zoo_res != None:
         print "Terminating zoo..."
         for inst in zoo_res.instances:
           inst.terminate()
@@ -406,7 +415,7 @@ def main():
     response = raw_input("Are you sure you want to stop the cluster " +
         cluster_name + "?\nDATA ON EPHEMERAL DISKS WILL BE LOST, " +
         "BUT THE CLUSTER WILL KEEP USING SPACE ON\n" + 
-        "AMAZON EBS IF IT IS EBS-BACKED!\n" +
+        "AMAZON EBS IF IT IS EBS-BACKED!!\n" +
         "Stop cluster " + cluster_name + " (y/N): ")
     if response == "y":
       (master_res, slave_res, zoo_res) = get_existing_cluster(
@@ -417,7 +426,7 @@ def main():
       print "Stopping slaves..."
       for inst in slave_res.instances:
         inst.stop()
-      if opts.ft > 1:
+      if zoo_res != None:
         print "Stopping zoo..."
         for inst in zoo_res.instances:
           inst.stop()
@@ -425,13 +434,13 @@ def main():
   elif action == "start":
     (master_res, slave_res, zoo_res) = get_existing_cluster(
         conn, opts, cluster_name)
-    print "Starting master..."
-    for inst in master_res.instances:
-      inst.start()
     print "Starting slaves..."
     for inst in slave_res.instances:
       inst.start()
-    if opts.ft > 1:
+    print "Starting master..."
+    for inst in master_res.instances:
+      inst.start()
+    if zoo_res != None:
       print "Starting zoo..."
       for inst in zoo_res.instances:
         inst.start()