Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 05:32:25 UTC

svn commit: r1131632 - in /incubator/mesos/trunk/src/ec2: ./ deploy.karmic64/root/nexus-ec2/ deploy.solaris/root/nexus-ec2/

Author: benh
Date: Sun Jun  5 03:32:24 2011
New Revision: 1131632

URL: http://svn.apache.org/viewvc?rev=1131632&view=rev
Log:
EC2 scripts modified with a --ft parameter that will create a ZooKeeper group and instance; it will also launch multiple master instances and start them. Slaves and masters are connected to ZooKeeper. The nexus-ec2 directory now contains a zoo file with the address of the ZK instance, and the master file can now contain multiple master addresses.
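
For context, a hedged usage sketch (invocation path, argument order, and cluster name are assumptions; per parse_args below, --ft defaults to 1 and gives the number of masters):

    # Launch a fault-tolerant cluster with 3 masters; --ft > 1 also
    # brings up a ZooKeeper instance in its own "<cluster>-zoo" group.
    ./nexus_ec2.py -f 3 launch my-cluster

    # The default (--ft=1) keeps the old single-master behavior.
    ./nexus_ec2.py launch my-cluster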

Added:
    incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/zoo
    incubator/mesos/trunk/src/ec2/deploy.solaris/root/nexus-ec2/zoo
Modified:
    incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/redeploy-nexus
    incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup
    incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/start-nexus
    incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/stop-nexus
    incubator/mesos/trunk/src/ec2/nexus_ec2.py

Modified: incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/redeploy-nexus
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/redeploy-nexus?rev=1131632&r1=1131631&r2=1131632&view=diff
==============================================================================
--- incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/redeploy-nexus (original)
+++ incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/redeploy-nexus Sun Jun  5 03:32:24 2011
@@ -2,11 +2,19 @@
 
 SLAVES=/root/nexus-ec2/slaves
 MASTER=/root/nexus-ec2/master
+ZOO=/root/nexus-ec2/zoo
 
 SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=2"
 
-echo "RSYNC'ing /root/nexus to slaves..."
-for slave in `cat $SLAVES`; do
-    echo $slave
-    rsync -e "ssh $SSH_OPTS" -az --exclude '*.d' --exclude '*.o' --exclude '*.cpp' --exclude '*.hpp' --exclude '*.pyc' --exclude 'nexus/frameworks/hadoop-0.20.0/logs/*' /root/nexus $slave:/root
+SERVERS=`cat $SLAVES`
+SERVERS+=" "
+SERVERS+=`cat $MASTER | sed '1d'`
+SERVERS+=" "
+SERVERS+=`cat $ZOO`
+
+
+echo "RSYNC'ing /root/nexus to other servers..."
+for server in $SERVERS; do
+    echo $server
+    rsync -e "ssh $SSH_OPTS" -az --exclude '*.d' --exclude '*.o' --exclude '*.cpp' --exclude '*.hpp' --exclude '*.pyc' --exclude 'nexus/frameworks/hadoop-0.20.0/logs/*' /root/nexus $server:/root
 done
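
The new SERVERS list concatenates three host files; a minimal standalone sketch of the same pattern (file contents are placeholders, and the first master line is skipped, presumably because redeploy-nexus runs on that machine):

    #!/bin/bash
    SLAVES=/root/nexus-ec2/slaves   # one slave hostname per line
    MASTER=/root/nexus-ec2/master   # master hostnames; first is this host
    ZOO=/root/nexus-ec2/zoo         # ZooKeeper hostnames (empty if non-FT)

    # Every remote host to sync: all slaves, all masters except the
    # first, and all ZooKeeper servers.
    SERVERS="`cat $SLAVES` `cat $MASTER | sed '1d'` `cat $ZOO`"

    for server in $SERVERS; do
        echo "$server"
    done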

Modified: incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup?rev=1131632&r1=1131631&r2=1131632&view=diff
==============================================================================
--- incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup (original)
+++ incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup Sun Jun  5 03:32:24 2011
@@ -11,6 +11,9 @@ export PATH=$PATH:/root/scala-2.7.7.fina
 
 MASTER=/root/nexus-ec2/master
 SLAVES=/root/nexus-ec2/slaves
+ZOO=/root/nexus-ec2/zoo
+
+ISFT=`cat $ZOO | wc -l`
 
 # Scripts that get used for/while running Nexus.
 SCRIPTS="copy-dir
@@ -45,9 +48,24 @@ echo "Setting executable permissions on 
 for s in $SCRIPTS; do chmod u+x $s; done
 
 echo "SSH'ing to local machine to approve key..."
-ssh -q $SSH_OPTS `cat $MASTER` echo -n
+for master in `cat $MASTER`; do
+  echo $master
+  ssh $SSH_OPTS $master echo -n &
+  sleep 0.3
+done
+
 ssh -q $SSH_OPTS localhost echo -n
 
+if [[ $ISFT != 0 ]] ; then
+  echo "SSH'ing to Zoo server(s) to approve keys..."
+  for zoo in `cat $ZOO`; do
+    echo $zoo
+    ssh $SSH_OPTS $zoo echo -n &
+    ssh $SSH_OPTS $zoo mkdir -p /tmp/zookeeper &
+    sleep 0.3
+  done
+fi
+
 echo "SSH'ing to slaves to approve keys..."
 for slave in `cat $SLAVES`; do
   echo $slave
@@ -57,6 +75,22 @@ done
 echo "Waiting for commands to finish..."
 wait
 
+if [[ `cat $MASTER | wc -l` -gt 1 ]] ; then
+  echo "RSYNC'ing /root/nexus-ec2 to other master servers..."
+  for master in `cat $MASTER | sed '1d'`; do
+      echo $master
+      rsync -e "ssh $SSH_OPTS" -az /root/nexus-ec2 $master:/root
+  done
+fi
+
+if [[ $ISFT != 0 ]] ; then
+  echo "RSYNC'ing /root/nexus-ec2 to other Zoo servers..."
+  for zoo in `cat $ZOO`; do
+      echo $zoo
+      rsync -e "ssh $SSH_OPTS" -az /root/nexus-ec2 $zoo:/root
+  done
+fi
+
 echo "RSYNC'ing /root/nexus-ec2 to slaves..."
 for slave in `cat $SLAVES`; do
   echo $slave
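
setup (like start-nexus and stop-nexus below) detects fault-tolerant mode by counting lines in the zoo file. The idiom in isolation, assuming the zoo template renders to an empty file when --ft is 1:

    ZOO=/root/nexus-ec2/zoo
    ISFT=`cat $ZOO | wc -l`   # zero lines means no ZooKeeper servers

    if [[ $ISFT != 0 ]]; then
        echo "FT mode: ZooKeeper on `head -1 $ZOO`"
    else
        echo "single-master mode"
    fi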

Modified: incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/start-nexus
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/start-nexus?rev=1131632&r1=1131631&r2=1131632&view=diff
==============================================================================
--- incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/start-nexus (original)
+++ incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/start-nexus Sun Jun  5 03:32:24 2011
@@ -2,12 +2,38 @@
 cd /root/nexus-ec2
 
 MASTER="`cat master`"
+MASTER1="`cat master | head -1`"
 SLAVES="`cat slaves`"
+ZOO1="`cat zoo | head -1`"
+ZOO="`cat zoo`"
+
+ISFT="`cat zoo | wc -l`"
 
 SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=2"
 
-echo "Starting master on $MASTER"
-ssh $SSH_OPTS $MASTER "/root/nexus-ec2/nexus-daemon nexus-master -p 1111 </dev/null >/dev/null"
+if [[ $ISFT == 0 ]]; then
+  master_arg="1@${MASTER}:1111"
+else
+  master_arg="zoo://${ZOO1}:2181"
+fi
+
+
+if [[ $ISFT != 0 ]]; then
+  echo "Starting ZooKeeper on $ZOO1"
+  ssh $SSH_OPTS $ZOO1 "/root/nexus/src/third_party/zookeeper-*/bin/zkServer.sh start </dev/null >/dev/null"
+
+  sleep 2
+
+  masterid=1
+  for master in $MASTER; do
+    echo "Starting master $masterid on $master"
+    ssh $SSH_OPTS $master "/root/nexus-ec2/nexus-daemon nexus-master -p 1111 -f $master_arg </dev/null >/dev/null"
+    masterid=$(($masterid+1))
+  done
+else
+  echo "Starting master on $MASTER1"
+  ssh $SSH_OPTS $MASTER1 "/root/nexus-ec2/nexus-daemon nexus-master -p 1111 </dev/null >/dev/null"
+fi
 
 COUNT=''
 CPUS=''
@@ -38,10 +64,19 @@ fi
 
 for slave in $SLAVES; do
   echo "Starting $COUNT slave(s) on $slave"
-  ssh $SSH_OPTS $slave "for ((i = 0; i < $COUNT; i++)); do /root/nexus-ec2/nexus-daemon nexus-slave 1@$MASTER:1111 $CPUS $MEM; done </dev/null >/dev/null" &
+  ssh $SSH_OPTS $slave "for ((i = 0; i < $COUNT; i++)); do /root/nexus-ec2/nexus-daemon nexus-slave ${master_arg} $CPUS $MEM; done </dev/null >/dev/null" &
   sleep 0.1
 done
 wait
 
+if [[ $ISFT != 0 ]]; then
+  echo "ZooKeeper is running at"
+  for zoo in $ZOO; do
+    echo "      $zoo:2181"
+  done
+fi
+
 echo "Everything's started! You can view the master Web UI at"
-echo "      http://$MASTER:8080"
+for master in $MASTER; do
+  echo "      http://$master:8080"
+done
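
start-nexus derives a single master_arg that both the masters (via -f) and the slaves consume, switching between a direct master endpoint and ZooKeeper-based discovery. The two forms in isolation (hostnames come from the master and zoo files):

    MASTER1="`cat master | head -1`"
    ZOO1="`cat zoo | head -1`"
    ISFT="`cat zoo | wc -l`"

    if [[ $ISFT == 0 ]]; then
        # Non-FT: slaves connect straight to the single master.
        master_arg="1@${MASTER1}:1111"
    else
        # FT: masters and slaves rendezvous through ZooKeeper.
        master_arg="zoo://${ZOO1}:2181"
    fi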

Modified: incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/stop-nexus
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/stop-nexus?rev=1131632&r1=1131631&r2=1131632&view=diff
==============================================================================
--- incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/stop-nexus (original)
+++ incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/stop-nexus Sun Jun  5 03:32:24 2011
@@ -3,6 +3,9 @@ cd /root/nexus-ec2
 
 MASTER="`cat master`"
 SLAVES="`cat slaves`"
+ZOO="`cat zoo`"
+
+ISFT="`cat zoo | wc -l`"
 
 SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=2"
 
@@ -14,4 +17,18 @@ done
 wait
 
 echo "Stopping master on $MASTER"
-ssh $SSH_OPTS $MASTER pkill nexus-master
+for master in $MASTER; do
+  echo "Stopping master on $master"
+  ssh $SSH_OPTS $master pkill nexus-master &
+  sleep 0.1
+done
+wait
+
+if [[ $ISFT != 0 ]] ; then
+  for zoo in $ZOO; do
+    echo "Stopping ZK on $zoo"
+    ssh $SSH_OPTS $zoo "/root/nexus/src/third_party/zookeeper-*/bin/zkServer.sh stop </dev/null >/dev/null"
+    sleep 0.1
+  done
+  wait
+fi
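
stop-nexus uses the same fan-out idiom as the other scripts: background each ssh, stagger the connections, then wait for all of them. Sketched on its own:

    for host in `cat /root/nexus-ec2/master`; do
        ssh $SSH_OPTS $host pkill nexus-master &   # run in background
        sleep 0.1                                  # stagger connections
    done
    wait   # block until every backgrounded ssh has exited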

Added: incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/zoo
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/zoo?rev=1131632&view=auto
==============================================================================
--- incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/zoo (added)
+++ incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/zoo Sun Jun  5 03:32:24 2011
@@ -0,0 +1 @@
+{{zoo}}

Added: incubator/mesos/trunk/src/ec2/deploy.solaris/root/nexus-ec2/zoo
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/ec2/deploy.solaris/root/nexus-ec2/zoo?rev=1131632&view=auto
==============================================================================
--- incubator/mesos/trunk/src/ec2/deploy.solaris/root/nexus-ec2/zoo (added)
+++ incubator/mesos/trunk/src/ec2/deploy.solaris/root/nexus-ec2/zoo Sun Jun  5 03:32:24 2011
@@ -0,0 +1 @@
+{{zoo}}
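
Both zoo files hold only the {{zoo}} placeholder. deploy_files() below defines a "zoo" template variable as newline-joined public DNS names, so assuming the deploy templating substitutes it in place, a deployed zoo file would read something like (hostname hypothetical):

    ec2-203-0-113-10.compute-1.amazonaws.com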

Modified: incubator/mesos/trunk/src/ec2/nexus_ec2.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/ec2/nexus_ec2.py?rev=1131632&r1=1131631&r2=1131632&view=diff
==============================================================================
--- incubator/mesos/trunk/src/ec2/nexus_ec2.py (original)
+++ incubator/mesos/trunk/src/ec2/nexus_ec2.py Sun Jun  5 03:32:24 2011
@@ -48,7 +48,11 @@ def parse_args():
   parser.add_option("--resume", action="store_true", default=False,
       help="Resume installation on a previously launched cluster " +
            "(for debugging)")
+  parser.add_option("-f", "--ft", default="1", 
+      help="Number of masters to run. Default is 1. " + 
+           "Greater values cause Nexus to run in FT mode with ZooKeeper")
   (opts, args) = parser.parse_args()
+  opts.ft = int(opts.ft)
   if len(args) != 2:
     parser.print_help()
     sys.exit(1)
@@ -96,12 +100,15 @@ def wait_for_instances(conn, reservation
 
 
 def launch_cluster(conn, opts, cluster_name):
+  zoo_res = None
   print "Setting up security groups..."
   master_group = get_or_make_group(conn, cluster_name + "-master")
   slave_group = get_or_make_group(conn, cluster_name + "-slaves")
+  zoo_group = get_or_make_group(conn, cluster_name + "-zoo")
   if master_group.rules == []: # Group was just now created
     master_group.authorize(src_group=master_group)
     master_group.authorize(src_group=slave_group)
+    master_group.authorize(src_group=zoo_group)
     master_group.authorize('tcp', 22, 22, '0.0.0.0/0')
     master_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
     master_group.authorize('tcp', 50030, 50030, '0.0.0.0/0')
@@ -109,15 +116,22 @@ def launch_cluster(conn, opts, cluster_n
   if slave_group.rules == []: # Group was just now created
     slave_group.authorize(src_group=master_group)
     slave_group.authorize(src_group=slave_group)
+    slave_group.authorize(src_group=zoo_group)
     slave_group.authorize('tcp', 22, 22, '0.0.0.0/0')
     slave_group.authorize('tcp', 8080, 8081, '0.0.0.0/0')
     slave_group.authorize('tcp', 50060, 50060, '0.0.0.0/0')
     slave_group.authorize('tcp', 50075, 50075, '0.0.0.0/0')
+  if zoo_group.rules == []: # Group was just now created
+    zoo_group.authorize(src_group=master_group)
+    zoo_group.authorize(src_group=slave_group)
+    zoo_group.authorize(src_group=zoo_group)
+    zoo_group.authorize('tcp', 22, 22, '0.0.0.0/0')
+    zoo_group.authorize('tcp', 2181, 2181, '0.0.0.0/0')
   print "Checking for running cluster..."
   reservations = conn.get_all_instances()
   for res in reservations:
     group_names = [g.id for g in res.groups]
-    if master_group.name in group_names or slave_group.name in group_names:
+    if master_group.name in group_names or slave_group.name in group_names or zoo_group.name in group_names:
       active = [i for i in res.instances if i.state in ['pending', 'running']]
       if len(active) > 0:
         print >> stderr, ("ERROR: There are already instances running in " +
@@ -142,9 +156,17 @@ def launch_cluster(conn, opts, cluster_n
   master_res = image.run(key_name = opts.key_pair,
                          security_groups = [master_group],
                          instance_type = master_type,
-                         placement = opts.zone)
+                         placement = opts.zone,
+                         min_count = opts.ft,
+                         max_count = opts.ft)
   print "Launched master, regid = " + master_res.id
-  return (master_res, slave_res)
+  if opts.ft > 1:
+    zoo_res = image.run(key_name = opts.key_pair,
+                        security_groups = [zoo_group],
+                        instance_type = opts.instance_type,
+                        placement = opts.zone)
+    print "Launched zoo, regid = " + zoo_res.id
+  return (master_res, slave_res, zoo_res)
 
 
 def get_existing_cluster(conn, opts, cluster_name):
@@ -152,6 +174,7 @@ def get_existing_cluster(conn, opts, clu
   reservations = conn.get_all_instances()
   master_res = None
   slave_res = None
+  zoo_res = None
   for res in reservations:
     active = [i for i in res.instances if i.state in ['pending', 'running']]
     if len(active) > 0:
@@ -160,10 +183,14 @@ def get_existing_cluster(conn, opts, clu
         master_res = res
       elif group_names == [cluster_name + "-slaves"]:
         slave_res = res
+      elif group_names == [cluster_name + "-zoo"]:
+        zoo_res = res
   if master_res != None and slave_res != None:
     print "Found master regid: " + master_res.id
     print "Found slave regid: " + slave_res.id
-    return (master_res, slave_res)
+    if zoo_res != None:
+      print "Found zoo regid: " + zoo_res.id
+    return (master_res, slave_res, zoo_res)
   else:
     if master_res == None and slave_res != None:
       print "ERROR: Could not find master in group " + cluster_name + "-master"
@@ -174,14 +201,21 @@ def get_existing_cluster(conn, opts, clu
     sys.exit(1)
 
 
-def deploy_files(conn, root_dir, instance, opts, master_res, slave_res):
+def deploy_files(conn, root_dir, instance, opts, master_res, slave_res, zoo_res):
   # TODO: Speed up deployment by creating a temp directory with the
   # template-transformed files and then rsyncing it
+
   master = master_res.instances[0].public_dns_name
+
   template_vars = {
-    "master" : master,
+    "master" : '\n'.join([i.public_dns_name for i in master_res.instances]),
     "slave_list" : '\n'.join([i.public_dns_name for i in slave_res.instances])
   }
+
+  if opts.ft > 1:
+    zoo = zoo_res.instances[0].public_dns_name
+    template_vars[ "zoo" ] = '\n'.join([i.public_dns_name for i in zoo_res.instances])
+
   for path, dirs, files in os.walk(root_dir):
     dest_dir = os.path.join('/', path[len(root_dir):])
     if len(files) > 0: # Only mkdir for low-level directories since we use -p
@@ -219,18 +253,20 @@ def main():
   conn = boto.connect_ec2()
   if action == "launch":
     if opts.resume:
-      (master_res, slave_res) = get_existing_cluster(conn, opts, cluster_name)
+      (master_res, slave_res, zoo_res) = get_existing_cluster(conn, opts, cluster_name)
     else:
-      (master_res, slave_res) = launch_cluster(conn, opts, cluster_name)
+      (master_res, slave_res, zoo_res) = launch_cluster(conn, opts, cluster_name)
       print "Waiting for instances to start up..."
       time.sleep(5)
       wait_for_instances(conn, master_res)
       wait_for_instances(conn, slave_res)
+      if opts.ft > 1:
+        wait_for_instances(conn, zoo_res)
       print "Waiting 20 more seconds..."
       time.sleep(20)
     print "Deploying files to master..."
     deploy_files(conn, "deploy." + opts.os, master_res.instances[0],
-        opts, master_res, slave_res)
+        opts, master_res, slave_res, zoo_res)
     print "Copying SSH key %s to master..." % opts.identity_file
     master = master_res.instances[0].public_dns_name
     ssh(master, opts, 'mkdir -p /root/.ssh')
@@ -243,13 +279,16 @@ def main():
     response = raw_input("Are you sure you want to shut down the cluster " +
         cluster_name + "? (y/N) ")
     if response == "y":
-      (master_res, slave_res) = get_existing_cluster(conn, opts, cluster_name)
+      (master_res, slave_res, zoo_res) = get_existing_cluster(conn, opts, cluster_name)
       print "Shutting down master..."
       master_res.stop_all()
       print "Shutting down slaves..."
       slave_res.stop_all()
+      if opts.ft > 1:
+        print "Shutting down zoo..."
+        zoo_res.stop_all()
   elif action == "login":
-    (master_res, slave_res) = get_existing_cluster(conn, opts, cluster_name)
+    (master_res, slave_res, zoo_res) = get_existing_cluster(conn, opts, cluster_name)
     master = master_res.instances[0].public_dns_name
     print "Logging into master " + master + "..."
     proxy_opt = ""
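
Taken together, a hedged end-to-end session ("launch" and "login" are the action names visible in main() above; the shutdown action's name is not shown in this diff, and the argument order is assumed):

    ./nexus_ec2.py -f 3 launch testcluster   # 3 masters + a ZooKeeper node
    ./nexus_ec2.py login testcluster         # ssh into the first master
    # on the master: /root/nexus-ec2/start-nexus, stop-nexus, ...
    ./nexus_ec2.py <shutdown-action> testcluster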