You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@spark.apache.org by jo...@apache.org on 2014/12/20 02:02:44 UTC

spark git commit: [SPARK-4890] Upgrade Boto to 2.34.0; automatically download Boto from PyPi instead of packaging it

Repository: spark
Updated Branches:
  refs/heads/master 7981f9697 -> c28083f46


[SPARK-4890] Upgrade Boto to 2.34.0; automatically download Boto from PyPi instead of packaging it

This patch upgrades `spark-ec2`'s Boto version to 2.34.0, since this is blocking several features.  Newer versions of Boto don't work properly when they're loaded from a zipfile since they try to read a JSON file from a path relative to the Boto library sources.

Therefore, this patch also changes spark-ec2 to automatically download Boto from PyPi if it's not present in `SPARK_EC2_DIR/lib`, similar to what we do in the `sbt/sbt` script. This shouldn't be an issue for users since they already need to have an internet connection to launch an EC2 cluster.  By performing the downloading in spark_ec2.py instead of the Bash script, this should also work for Windows users.

I've tested this with Python 2.6, too.

Author: Josh Rosen <jo...@databricks.com>

Closes #3737 from JoshRosen/update-boto and squashes the following commits:

0aa43cc [Josh Rosen] Remove unused setup_standalone_cluster() method.
f02935d [Josh Rosen] Enable Python deprecation warnings and fix one Boto warning:
587ae89 [Josh Rosen] [SPARK-4890] Upgrade Boto to 2.34.0; automatically download Boto from PyPi instead of packaging it


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/c28083f4
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/c28083f4
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/c28083f4

Branch: refs/heads/master
Commit: c28083f468685d468c4daa5ce336ef04fbec3144
Parents: 7981f96
Author: Josh Rosen <jo...@databricks.com>
Authored: Fri Dec 19 17:02:37 2014 -0800
Committer: Josh Rosen <jo...@databricks.com>
Committed: Fri Dec 19 17:02:37 2014 -0800

----------------------------------------------------------------------
 ec2/spark-ec2                  |   3 +--
 ec2/spark_ec2.py               |  48 +++++++++++++++++++++++++++---------
 ec2/third_party/boto-2.4.1.zip | Bin 860195 -> 0 bytes
 3 files changed, 38 insertions(+), 13 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/c28083f4/ec2/spark-ec2
----------------------------------------------------------------------
diff --git a/ec2/spark-ec2 b/ec2/spark-ec2
index 4aa9082..3abd3f3 100755
--- a/ec2/spark-ec2
+++ b/ec2/spark-ec2
@@ -22,5 +22,4 @@
 #+ the underlying Python script.
 SPARK_EC2_DIR="$(dirname $0)"
 
-PYTHONPATH="${SPARK_EC2_DIR}/third_party/boto-2.4.1.zip/boto-2.4.1:$PYTHONPATH" \
-    python "${SPARK_EC2_DIR}/spark_ec2.py" "$@"
+python -Wdefault "${SPARK_EC2_DIR}/spark_ec2.py" "$@"

http://git-wip-us.apache.org/repos/asf/spark/blob/c28083f4/ec2/spark_ec2.py
----------------------------------------------------------------------
diff --git a/ec2/spark_ec2.py b/ec2/spark_ec2.py
index 4e8f5c1..556d99d 100755
--- a/ec2/spark_ec2.py
+++ b/ec2/spark_ec2.py
@@ -21,6 +21,7 @@
 
 from __future__ import with_statement
 
+import hashlib
 import logging
 import os
 import pipes
@@ -29,6 +30,7 @@ import shutil
 import string
 import subprocess
 import sys
+import tarfile
 import tempfile
 import time
 import urllib2
@@ -36,9 +38,6 @@ import warnings
 from datetime import datetime
 from optparse import OptionParser
 from sys import stderr
-import boto
-from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType
-from boto import ec2
 
 DEFAULT_SPARK_VERSION = "1.1.0"
 SPARK_EC2_DIR = os.path.dirname(os.path.realpath(__file__))
@@ -48,6 +47,39 @@ MESOS_SPARK_EC2_BRANCH = "v4"
 AMI_PREFIX = "https://raw.github.com/mesos/spark-ec2/{b}/ami-list".format(b=MESOS_SPARK_EC2_BRANCH)
 
 
+def setup_boto():
+    # Download Boto if it's not already present in the SPARK_EC2_DIR/lib folder:
+    version = "boto-2.34.0"
+    md5 = "5556223d2d0cc4d06dd4829e671dcecd"
+    url = "https://pypi.python.org/packages/source/b/boto/%s.tar.gz" % version
+    lib_dir = os.path.join(SPARK_EC2_DIR, "lib")
+    if not os.path.exists(lib_dir):
+        os.mkdir(lib_dir)
+    boto_lib_dir = os.path.join(lib_dir, version)
+    if not os.path.isdir(boto_lib_dir):
+        tgz_file_path = os.path.join(lib_dir, "%s.tar.gz" % version)
+        print "Downloading Boto from PyPi"
+        download_stream = urllib2.urlopen(url)
+        with open(tgz_file_path, "wb") as tgz_file:
+            tgz_file.write(download_stream.read())
+        with open(tgz_file_path) as tar:
+            if hashlib.md5(tar.read()).hexdigest() != md5:
+                print >> stderr, "ERROR: Got wrong md5sum for Boto"
+                sys.exit(1)
+        tar = tarfile.open(tgz_file_path)
+        tar.extractall(path=lib_dir)
+        tar.close()
+        os.remove(tgz_file_path)
+        print "Finished downloading Boto"
+    sys.path.insert(0, boto_lib_dir)
+
+
+setup_boto()
+import boto
+from boto.ec2.blockdevicemapping import BlockDeviceMapping, BlockDeviceType, EBSBlockDeviceType
+from boto import ec2
+
+
 class UsageError(Exception):
     pass
 
@@ -452,7 +484,7 @@ def launch_cluster(conn, opts, cluster_name):
                         active_instance_ids.append(id_to_req[i].instance_id)
                 if len(active_instance_ids) == opts.slaves:
                     print "All %d slaves granted" % opts.slaves
-                    reservations = conn.get_all_instances(active_instance_ids)
+                    reservations = conn.get_all_reservations(active_instance_ids)
                     slave_nodes = []
                     for r in reservations:
                         slave_nodes += r.instances
@@ -541,7 +573,7 @@ def launch_cluster(conn, opts, cluster_name):
 
 def get_existing_cluster(conn, opts, cluster_name, die_on_error=True):
     print "Searching for existing cluster " + cluster_name + "..."
-    reservations = conn.get_all_instances()
+    reservations = conn.get_all_reservations()
     master_nodes = []
     slave_nodes = []
     for res in reservations:
@@ -618,12 +650,6 @@ def setup_cluster(conn, master_nodes, slave_nodes, opts, deploy_ssh_key):
     print "Done!"
 
 
-def setup_standalone_cluster(master, slave_nodes, opts):
-    slave_ips = '\n'.join([i.public_dns_name for i in slave_nodes])
-    ssh(master, opts, "echo \"%s\" > spark/conf/slaves" % (slave_ips))
-    ssh(master, opts, "/root/spark/sbin/start-all.sh")
-
-
 def setup_spark_cluster(master, opts):
     ssh(master, opts, "chmod u+x spark-ec2/setup.sh")
     ssh(master, opts, "spark-ec2/setup.sh")

http://git-wip-us.apache.org/repos/asf/spark/blob/c28083f4/ec2/third_party/boto-2.4.1.zip
----------------------------------------------------------------------
diff --git a/ec2/third_party/boto-2.4.1.zip b/ec2/third_party/boto-2.4.1.zip
deleted file mode 100644
index 49886b8..0000000
Binary files a/ec2/third_party/boto-2.4.1.zip and /dev/null differ


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@spark.apache.org
For additional commands, e-mail: commits-help@spark.apache.org