You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 10:38:16 UTC
svn commit: r1132076 - in /incubator/mesos/trunk/ec2: ./
deploy.lucid64/root/hadoop-0.20.2/conf/ deploy.lucid64/root/mesos-ec2/
deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/
Author: benh
Date: Sun Jun 5 08:38:16 2011
New Revision: 1132076
URL: http://svn.apache.org/viewvc?rev=1132076&view=rev
Log:
More work on EC2 scripts:
- Files are now deployed to the master using rsync instead of scp,
making cluster startup faster
- A cluster-url file is created in /root/mesos-ec2 to let users know the
cluster URL without having to list the ZooKeeper nodes, etc., themselves
- This cluster URL is also used by the Hadoop framework
- HDFS no longer attempts to use all the masters as secondary name
nodes, only the first one (this was likely a bug before)
Added:
incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url
Modified:
incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters
incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml
incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup
incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos
incubator/mesos/trunk/ec2/mesos_ec2.py
Modified: incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters (original)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/hadoop-0.20.2/conf/masters Sun Jun 5 08:38:16 2011
@@ -1 +1 @@
-{{master_list}}
+{{active_master}}
Added: incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url?rev=1132076&view=auto
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url (added)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/cluster-url Sun Jun 5 08:38:16 2011
@@ -0,0 +1 @@
+{{cluster_url}}
Modified: incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml (original)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/hadoop-framework-conf/mapred-site.xml Sun Jun 5 08:38:16 2011
@@ -22,7 +22,7 @@
<property>
<name>mapred.mesos.master</name>
- <value>1@{{active_master}}:5050</value>
+ <value>{{cluster_url}}</value>
</property>
<property>
Modified: incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup (original)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/setup Sun Jun 5 08:38:16 2011
@@ -25,6 +25,7 @@ ZOOS=`cat zoo`
if [[ $ZOOS = *NONE* ]]; then
NUM_ZOOS=0
+ ZOOS=""
else
NUM_ZOOS=`cat zoo | wc -l`
fi
@@ -95,7 +96,7 @@ if [[ $NUM_MASTERS -gt 1 ]] ; then
fi
if [[ $NUM_ZOOS != 0 ]] ; then
- echo "RSYNC'ing /root/mesos-ec2 to other ZooKeeper servers..."
+ echo "RSYNC'ing /root/mesos-ec2 to ZooKeeper servers..."
for zoo in $ZOOS; do
echo $zoo
rsync -e "ssh $SSH_OPTS" -az /root/mesos-ec2 $zoo:/root & sleep 0.3
@@ -111,27 +112,15 @@ for slave in $SLAVES; do
done
wait
-echo "Setting up slaves..."
-for slave in $SLAVES; do
- echo $slave
- ssh -t $SSH_OPTS root@$slave "mesos-ec2/setup-slave" &
+echo "Running slave setup script on all nodes..."
+for node in $SLAVES $MASTERS $ZOO; do
+ echo $node
+ ssh -t $SSH_OPTS root@$node "mesos-ec2/setup-slave" & wait 0.3
done
wait
-if [[ $NUM_ZOOS != 0 ]] ; then
- echo "Running slave setup on ZooKeeper nodes..."
- for slave in $SLAVES; do
- echo $slave
- ssh -t $SSH_OPTS root@$slave "mesos-ec2/setup-slave" &
- done
- wait
-fi
-
-echo "Running slave setup on master (i.e. for local)..."
-./setup-slave
-
if [[ $NUM_MASTERS -gt 1 ]] ; then
- echo "RSYNC'ing Hadoop config files for HDFS to other masters..."
+ echo "RSYNC'ing HDFS config files to other masters..."
for master in `cat $MASTERS_FILE | sed '1d'`; do
echo $master
rsync -e "ssh $SSH_OPTS" -az $HADOOP_HOME/conf $master:$HADOOP_HOME & sleep 0.3
@@ -139,7 +128,7 @@ if [[ $NUM_MASTERS -gt 1 ]] ; then
wait
fi
-echo "RSYNC'ing Hadoop config files for HDFS to slaves..."
+echo "RSYNC'ing HDFS config files to slaves..."
for slave in $SLAVES; do
echo $slave
rsync -e "ssh $SSH_OPTS" -az $HADOOP_HOME/conf $slave:$HADOOP_HOME & sleep 0.3
@@ -192,6 +181,9 @@ echo "Setting up haproxy+apache framewor
cp haproxy+apache/* /root/mesos/frameworks/haproxy+apache
echo "Setting up Spark config files..."
+# TODO: This currently overwrites whatever the user wrote there; on
+# the other hand, we also don't want to leave an old file created by
+# us because it would have the wrong hostname for HDFS etc
mkdir -p /root/spark/conf
echo "-Dspark.dfs=hdfs://$HOSTNAME:9000 -Dspark.repl.classdir=/nfs" \
> /root/spark/conf/java-opts
Modified: incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos (original)
+++ incubator/mesos/trunk/ec2/deploy.lucid64/root/mesos-ec2/start-mesos Sun Jun 5 08:38:16 2011
@@ -15,25 +15,15 @@ fi
SSH_OPTS="-o StrictHostKeyChecking=no -o ConnectTimeout=2"
-if [[ $NUM_ZOOS == 0 ]]; then
- master_arg="1@${ACTIVE_MASTER}:5050"
-else
- master_arg="zoo://"
- add=""
- for zoo in $ZOOS; do
- master_arg+=$add
- master_arg+=$zoo":2181/mesos"
- add=","
- done
-fi
+cluster_url=`cat cluster-url`
-echo "Running with master parameter: "$master_arg
+echo "Running with cluster URL: "$cluster_url
if [[ $NUM_ZOOS != 0 ]]; then
masterid=1
for master in $MASTERS; do
echo "Starting master $masterid on $master"
- ssh $SSH_OPTS $master "/root/mesos-ec2/mesos-daemon mesos-master -p 5050 -u $master_arg $@ </dev/null >/dev/null" & sleep 0.1
+ ssh $SSH_OPTS $master "/root/mesos-ec2/mesos-daemon mesos-master -p 5050 -u $cluster_url $@ </dev/null >/dev/null" & sleep 0.1
masterid=$(($masterid+1))
done
wait
@@ -73,7 +63,7 @@ MEM=''
for slave in $SLAVES; do
echo "Starting $COUNT slave(s) on $slave"
- ssh $SSH_OPTS $slave "for ((i = 0; i < $COUNT; i++)); do /root/mesos-ec2/mesos-daemon mesos-slave -u ${master_arg} $CPUS $MEM; done </dev/null >/dev/null" &
+ ssh $SSH_OPTS $slave "for ((i = 0; i < $COUNT; i++)); do /root/mesos-ec2/mesos-daemon mesos-slave -u ${cluster_url} $CPUS $MEM; done </dev/null >/dev/null" &
sleep 0.1
done
wait
Modified: incubator/mesos/trunk/ec2/mesos_ec2.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/ec2/mesos_ec2.py?rev=1132076&r1=1132075&r2=1132076&view=diff
==============================================================================
--- incubator/mesos/trunk/ec2/mesos_ec2.py (original)
+++ incubator/mesos/trunk/ec2/mesos_ec2.py Sun Jun 5 08:38:16 2011
@@ -5,13 +5,13 @@ from __future__ import with_statement
import boto
import logging
import os
+import shutil
import subprocess
import sys
import tempfile
import time
from optparse import OptionParser
from sys import stderr
-from tempfile import NamedTemporaryFile
from boto.ec2.blockdevicemapping import BlockDeviceMapping, EBSBlockDeviceType
@@ -295,9 +295,6 @@ def get_num_disks(instance_type):
# the first master instance in the cluster, and we expect the setup
# script to be run on that instance to copy them to other nodes.
def deploy_files(conn, root_dir, opts, master_res, slave_res, zoo_res):
- # TODO: Speed up deployment by creating a temp directory with the
- # template-transformed files and then rsyncing it
-
active_master = master_res.instances[0].public_dns_name
num_disks = get_num_disks(opts.instance_type)
@@ -308,38 +305,50 @@ def deploy_files(conn, root_dir, opts, m
hdfs_data_dirs += ",/mnt%d/hdfs/dfs/data" % i
mapred_local_dirs += ",/mnt%d/hadoop/mrlocal" % i
+ if zoo_res != None:
+ zoo_list = '\n'.join([i.public_dns_name for i in zoo_res.instances])
+ cluster_url = "zoo://" + ",".join(
+ ["%s:2181/mesos" % i.public_dns_name for i in zoo_res.instances])
+ else:
+ zoo_list = "NONE"
+ cluster_url = "1@%s:5050" % active_master
+
template_vars = {
"master_list": '\n'.join([i.public_dns_name for i in master_res.instances]),
"active_master": active_master,
"slave_list": '\n'.join([i.public_dns_name for i in slave_res.instances]),
+ "zoo_list": zoo_list,
+ "cluster_url": cluster_url,
"hdfs_data_dirs": hdfs_data_dirs,
"mapred_local_dirs": mapred_local_dirs
}
- if opts.ft > 1:
- zoo = zoo_res.instances[0].public_dns_name
- template_vars["zoo_list"] = '\n'.join(
- [i.public_dns_name for i in zoo_res.instances])
- else:
- template_vars["zoo_list"] = "NONE";
-
+ # Create a temp directory in which we will place all the files to be
+ # deployed after we substitute template parameters in them
+ tmp_dir = tempfile.mkdtemp()
for path, dirs, files in os.walk(root_dir):
dest_dir = os.path.join('/', path[len(root_dir):])
- if len(files) > 0: # Only mkdir for low-level directories since we use -p
- ssh(active_master, opts, 'mkdir -p "%s"' % dest_dir)
+ local_dir = tmp_dir + dest_dir
+ if not os.path.exists(local_dir):
+ os.makedirs(local_dir)
for filename in files:
if filename[0] not in '#.~' and filename[-1] != '~':
dest_file = os.path.join(dest_dir, filename)
- print "Setting up %s" % dest_file
- with open(os.path.join(path, filename)) as file:
- text = file.read()
- for key in template_vars:
- text = text.replace("{{" + key + "}}", template_vars[key])
- temp_file = NamedTemporaryFile()
- temp_file.write(text)
- temp_file.flush()
- scp(active_master, opts, temp_file.name, dest_file)
- temp_file.close()
+ local_file = tmp_dir + dest_file
+ with open(os.path.join(path, filename)) as src:
+ with open(local_file, "w") as dest:
+ text = src.read()
+ for key in template_vars:
+ text = text.replace("{{" + key + "}}", template_vars[key])
+ dest.write(text)
+ dest.close()
+ # rsync the whole directory over to the master machine
+ command = (("rsync -rv -e 'ssh -o StrictHostKeyChecking=no -i %s' " +
+ "'%s/' 'root@%s:/'") % (opts.identity_file, tmp_dir, active_master))
+ print command
+ subprocess.check_call(command, shell=True)
+ # Remove the temp directory
+ shutil.rmtree(tmp_dir)
# Copy a file to a given host through scp, throwing an exception if scp fails
@@ -367,7 +376,7 @@ def main():
else:
(master_res, slave_res, zoo_res) = launch_cluster(
conn, opts, cluster_name)
- wait_for_cluster(conn, master_res, slave_res, zoo_res)
+ wait_for_cluster(conn, master_res, slave_res, zoo_res)
setup_cluster(conn, master_res, slave_res, zoo_res, opts, True)
elif action == "destroy":
response = raw_input("Are you sure you want to destroy the cluster " +
@@ -382,7 +391,7 @@ def main():
print "Terminating slaves..."
for inst in slave_res.instances:
inst.terminate()
- if opts.ft > 1:
+ if zoo_res != None:
print "Terminating zoo..."
for inst in zoo_res.instances:
inst.terminate()
@@ -406,7 +415,7 @@ def main():
response = raw_input("Are you sure you want to stop the cluster " +
cluster_name + "?\nDATA ON EPHEMERAL DISKS WILL BE LOST, " +
"BUT THE CLUSTER WILL KEEP USING SPACE ON\n" +
- "AMAZON EBS IF IT IS EBS-BACKED!\n" +
+ "AMAZON EBS IF IT IS EBS-BACKED!!\n" +
"Stop cluster " + cluster_name + " (y/N): ")
if response == "y":
(master_res, slave_res, zoo_res) = get_existing_cluster(
@@ -417,7 +426,7 @@ def main():
print "Stopping slaves..."
for inst in slave_res.instances:
inst.stop()
- if opts.ft > 1:
+ if zoo_res != None:
print "Stopping zoo..."
for inst in zoo_res.instances:
inst.stop()
@@ -425,13 +434,13 @@ def main():
elif action == "start":
(master_res, slave_res, zoo_res) = get_existing_cluster(
conn, opts, cluster_name)
- print "Starting master..."
- for inst in master_res.instances:
- inst.start()
print "Starting slaves..."
for inst in slave_res.instances:
inst.start()
- if opts.ft > 1:
+ print "Starting master..."
+ for inst in master_res.instances:
+ inst.start()
+ if zoo_res != None:
print "Starting zoo..."
for inst in zoo_res.instances:
inst.start()