You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datalab.apache.org by my...@apache.org on 2021/07/01 11:59:16 UTC
[incubator-datalab] branch DATALAB-2449 created (now 45599a2)
This is an automated email from the ASF dual-hosted git repository.
mykolabodnar pushed a change to branch DATALAB-2449
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git.
at 45599a2 [DATALAB-2449] - [GCP] GPU type and count fixed for Jupyter and Spark cluster
This branch includes the following new commits:
new 45599a2 [DATALAB-2449] - [GCP] GPU type and count fixed for Jupyter and Spark cluster
The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datalab.apache.org
For additional commands, e-mail: commits-help@datalab.apache.org
[incubator-datalab] 01/01: [DATALAB-2449] - [GCP] GPU type and
count fixed for Jupyter and Spark cluster
Posted by my...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
mykolabodnar pushed a commit to branch DATALAB-2449
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git
commit 45599a2448d91b27cd8cd206e1a896447324e236
Author: bodnarmykola <bo...@gmail.com>
AuthorDate: Thu Jul 1 14:58:56 2021 +0300
[DATALAB-2449] - [GCP] GPU type and count fixed for Jupyter and Spark cluster
---
.../src/general/lib/gcp/actions_lib.py | 10 +++----
.../general/scripts/gcp/common_create_instance.py | 4 ++-
.../general/scripts/gcp/common_prepare_notebook.py | 17 ++++++++----
.../general/scripts/gcp/dataengine_configure.py | 32 ++++++++++++++++++++++
.../src/general/scripts/gcp/dataengine_prepare.py | 31 ++++++++++++++-------
.../src/general/scripts/gcp/jupyter_configure.py | 2 +-
6 files changed, 73 insertions(+), 23 deletions(-)
diff --git a/infrastructure-provisioning/src/general/lib/gcp/actions_lib.py b/infrastructure-provisioning/src/general/lib/gcp/actions_lib.py
index a2429bc..fa1d891 100644
--- a/infrastructure-provisioning/src/general/lib/gcp/actions_lib.py
+++ b/infrastructure-provisioning/src/general/lib/gcp/actions_lib.py
@@ -320,7 +320,7 @@ class GCPActions:
initial_user, image_name, secondary_image_name, service_account_name, instance_class,
network_tag, labels, static_ip='',
primary_disk_size='12', secondary_disk_size='30',
- gpu_accelerator_type='None'):
+ gpu_accelerator_type='None', gpu_accelerator_count='1'):
key = RSA.importKey(open(ssh_key_path, 'rb').read())
ssh_key = key.publickey().exportKey("OpenSSH").decode('UTF-8')
unique_index = datalab.meta_lib.GCPMeta().get_index_by_service_account_name(service_account_name)
@@ -439,12 +439,12 @@ class GCPActions:
if instance_class == 'notebook' or instance_class == 'dataengine':
del instance_params['networkInterfaces'][0]['accessConfigs']
if gpu_accelerator_type != 'None':
- request = self.service.acceleratorTypes().list(project=self.project, zone = zone)
- result = request.execute().get('items')
- gpu_accelerator_type = result[0].get('name')
+ #request = self.service.acceleratorTypes().list(project=self.project, zone = zone)
+ #result = request.execute().get('items')
+ #gpu_accelerator_type = result[0].get('name')
instance_params['guestAccelerators'] = [
{
- "acceleratorCount": 1,
+ "acceleratorCount": gpu_accelerator_count,
"acceleratorType": "projects/{0}/zones/{1}/acceleratorTypes/{2}".format(
self.project, zone, gpu_accelerator_type)
}
diff --git a/infrastructure-provisioning/src/general/scripts/gcp/common_create_instance.py b/infrastructure-provisioning/src/general/scripts/gcp/common_create_instance.py
index 3ab863f..b62f882 100644
--- a/infrastructure-provisioning/src/general/scripts/gcp/common_create_instance.py
+++ b/infrastructure-provisioning/src/general/scripts/gcp/common_create_instance.py
@@ -45,6 +45,7 @@ parser.add_argument('--instance_class', type=str, default='')
parser.add_argument('--static_ip', type=str, default='')
parser.add_argument('--labels', type=str, default='{"empty":"string"}')
parser.add_argument('--gpu_accelerator_type', type=str, default='None')
+parser.add_argument('--gpu_accelerator_count', type=str, default='None')
parser.add_argument('--network_tag', type=str, default='')
parser.add_argument('--cluster_name', type=str, default='')
parser.add_argument('--service_base_name', type=str, default='')
@@ -62,7 +63,8 @@ if __name__ == "__main__":
args.instance_size, args.ssh_key_path, args.initial_user, args.image_name,
args.secondary_image_name, args.service_account_name, args.instance_class,
args.network_tag, json.loads(args.labels), args.static_ip,
- args.primary_disk_size, args.secondary_disk_size, args.gpu_accelerator_type)
+ args.primary_disk_size, args.secondary_disk_size, args.gpu_accelerator_type,
+ args.gpu_accelerator_count)
else:
parser.print_help()
sys.exit(2)
diff --git a/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py b/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py
index 3ba882d..17ac8e0 100644
--- a/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py
+++ b/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py
@@ -97,7 +97,7 @@ if __name__ == "__main__":
notebook_config['project_name'],
notebook_config['endpoint_name'],
notebook_config['exploratory_name'])
- notebook_config['primary_disk_size'] = (lambda x: '50' if x == 'deeplearning' else '16')(
+ notebook_config['primary_disk_size'] = (lambda x: '60' if x == 'deeplearning' else '20')(
os.environ['application'])
notebook_config['secondary_disk_size'] = os.environ['notebook_disk_size']
@@ -155,9 +155,14 @@ if __name__ == "__main__":
notebook_config['secondary_image_name'].get('name'))
notebook_config['gpu_accelerator_type'] = 'None'
+ notebook_config['gpu_accelerator_count'] = 'None'
if os.environ['application'] in ('tensor', 'tensor-rstudio', 'deeplearning') or os.environ['gpu_enabled'] == 'True':
- notebook_config['gpu_accelerator_type'] = os.environ['gcp_gpu_accelerator_type']
+ if os.environ['gpuType'] != '':
+ notebook_config['gpu_accelerator_type'] = os.environ['gpuType']
+ notebook_config['gpu_accelerator_count'] = os.environ['gpuCount']
+ else:
+ notebook_config['gpu_accelerator_type'] = os.environ['gcp_gpu_accelerator_type']
notebook_config['network_tag'] = '{0}-{1}-{2}-ps'.format(notebook_config['service_base_name'],
notebook_config['project_name'],
@@ -194,16 +199,16 @@ if __name__ == "__main__":
params = "--instance_name {0} --region {1} --zone {2} --vpc_name {3} --subnet_name {4} --instance_size {5} " \
"--ssh_key_path {6} --initial_user {7} --service_account_name {8} --image_name {9} " \
"--secondary_image_name {10} --instance_class {11} --primary_disk_size {12} " \
- "--secondary_disk_size {13} --gpu_accelerator_type {14} --network_tag {15} --labels '{16}' " \
- "--service_base_name {17}".\
+ "--secondary_disk_size {13} --gpu_accelerator_type {14} --gpu_accelerator_count {15} --network_tag {16} --labels '{17}' " \
+ "--service_base_name {18}".\
format(notebook_config['instance_name'], notebook_config['region'], notebook_config['zone'],
notebook_config['vpc_name'], notebook_config['subnet_name'], notebook_config['instance_size'],
notebook_config['ssh_key_path'], notebook_config['initial_user'],
notebook_config['notebook_service_account_name'], notebook_config['primary_image_name'],
notebook_config['secondary_image_name'], 'notebook', notebook_config['primary_disk_size'],
notebook_config['secondary_disk_size'], notebook_config['gpu_accelerator_type'],
- notebook_config['network_tag'], json.dumps(notebook_config['labels']),
- notebook_config['service_base_name'])
+ notebook_config['gpu_accelerator_count'], notebook_config['network_tag'],
+ json.dumps(notebook_config['labels']), notebook_config['service_base_name'])
try:
subprocess.run("~/scripts/{}.py {}".format('common_create_instance', params), shell=True, check=True)
except:
diff --git a/infrastructure-provisioning/src/general/scripts/gcp/dataengine_configure.py b/infrastructure-provisioning/src/general/scripts/gcp/dataengine_configure.py
index 8507703..87e6bb2 100644
--- a/infrastructure-provisioning/src/general/scripts/gcp/dataengine_configure.py
+++ b/infrastructure-provisioning/src/general/scripts/gcp/dataengine_configure.py
@@ -124,6 +124,22 @@ def configure_slave(slave_number, data_engine):
datalab.fab.append_result("Failed to configure slave node.", str(err))
sys.exit(1)
+ if 'slave_gpu_type' in os.environ:
+ try:
print('[INSTALLING GPU DRIVERS ON SLAVE NODE]')
+ params = "--hostname {} --keyfile {} --os_user {}".format(
+ slave_hostname, keyfile_name, data_engine['datalab_ssh_user'])
+ try:
+ subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', params), shell=True, check=True)
+ except:
+ datalab.fab.append_result("Failed installing gpu drivers")
+ raise Exception
+
+ except Exception as err:
+ datalab.fab.append_result("Failed to install GPU drivers.", str(err))
+ GCPActions.remove_instance(notebook_config['instance_name'], notebook_config['zone'])
+ sys.exit(1)
+
def clear_resources():
for i in range(data_engine['instance_count'] - 1):
@@ -298,6 +314,22 @@ if __name__ == "__main__":
clear_resources()
sys.exit(1)
+ if 'master_gpu_type' in os.environ:
+ try:
+ print('[INSTALLING GPU DRIVERS ON MASTER NODE]')
+ params = "--hostname {} --keyfile {} --os_user {}".format(
+ master_node_hostname, keyfile_name, data_engine['datalab_ssh_user'])
+ try:
+ subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', params), shell=True, check=True)
+ except:
+ datalab.fab.append_result("Failed installing gpu drivers")
+ raise Exception
+
+ except Exception as err:
+ datalab.fab.append_result("Failed to install GPU drivers.", str(err))
+ GCPActions.remove_instance(notebook_config['instance_name'], notebook_config['zone'])
+ sys.exit(1)
+
try:
jobs = []
for slave in range(data_engine['instance_count'] - 1):
diff --git a/infrastructure-provisioning/src/general/scripts/gcp/dataengine_prepare.py b/infrastructure-provisioning/src/general/scripts/gcp/dataengine_prepare.py
index 64a27c0..2051b7c 100644
--- a/infrastructure-provisioning/src/general/scripts/gcp/dataengine_prepare.py
+++ b/infrastructure-provisioning/src/general/scripts/gcp/dataengine_prepare.py
@@ -150,7 +150,17 @@ if __name__ == "__main__":
data_engine['gpu_accelerator_type'] = 'None'
if os.environ['application'] in ('tensor', 'tensor-rstudio', 'deeplearning'):
- data_engine['gpu_accelerator_type'] = os.environ['gcp_gpu_accelerator_type']
+ if os.environ['gpu_type'] != '':
data_engine['gpu_accelerator_type'] = os.environ['gpu_type']
+            else:
data_engine['gpu_accelerator_type'] = os.environ['gcp_gpu_accelerator_type']
+
+ if 'master_gpu_type' in os.environ:
+ data_engine['gpu_master_accelerator_type'] = os.environ['master_gpu_type']
+ data_engine['gpu_master_accelerator_count'] = os.environ['master_gpu_count']
+ data_engine['gpu_slave_accelerator_type'] = os.environ['slave_gpu_type']
+ data_engine['gpu_slave_accelerator_count'] = os.environ['slave_gpu_count']
+
data_engine['network_tag'] = '{0}-{1}-{2}-ps'.format(data_engine['service_base_name'],
data_engine['project_name'], data_engine['endpoint_name'])
additional_tags = os.environ['tags'].replace("': '", ":").replace("', '", ",").replace("{'", "").replace(
@@ -185,14 +195,14 @@ if __name__ == "__main__":
params = "--instance_name {0} --region {1} --zone {2} --vpc_name {3} --subnet_name {4} --instance_size {5} " \
"--ssh_key_path {6} --initial_user {7} --service_account_name {8} --image_name {9} " \
"--secondary_image_name {10} --instance_class {11} --primary_disk_size {12} " \
- "--secondary_disk_size {13} --gpu_accelerator_type {14} --network_tag {15} --cluster_name {16} " \
- "--labels '{17}' --service_base_name {18}". \
+ "--secondary_disk_size {13} --gpu_accelerator_type {14} --gpu_accelerator_count {15} --network_tag {16} --cluster_name {17} " \
+ "--labels '{18}' --service_base_name {19}". \
format(data_engine['master_node_name'], data_engine['region'], data_engine['zone'], data_engine['vpc_name'],
data_engine['subnet_name'], data_engine['master_size'], data_engine['ssh_key_path'], initial_user,
data_engine['dataengine_service_account_name'], data_engine['primary_image_name'],
data_engine['secondary_image_name'], 'dataengine', data_engine['primary_disk_size'],
- data_engine['secondary_disk_size'], data_engine['gpu_accelerator_type'],
- data_engine['network_tag'], data_engine['cluster_name'],
+ data_engine['secondary_disk_size'], data_engine['gpu_master_accelerator_type'],
+ data_engine['gpu_master_accelerator_count'], data_engine['network_tag'], data_engine['cluster_name'],
json.dumps(data_engine['master_labels']), data_engine['service_base_name'])
try:
subprocess.run("~/scripts/{}.py {}".format('common_create_instance', params), shell=True, check=True)
@@ -212,16 +222,17 @@ if __name__ == "__main__":
params = "--instance_name {0} --region {1} --zone {2} --vpc_name {3} --subnet_name {4} " \
"--instance_size {5} --ssh_key_path {6} --initial_user {7} --service_account_name {8} " \
"--image_name {9} --secondary_image_name {10} --instance_class {11} --primary_disk_size {12} " \
- "--secondary_disk_size {13} --gpu_accelerator_type {14} --network_tag {15} --cluster_name {16} " \
- "--labels '{17}' --service_base_name {18}". \
+ "--secondary_disk_size {13} --gpu_accelerator_type {14} --gpu_accelerator_count {15} --network_tag {16} --cluster_name {17} " \
+ "--labels '{18}' --service_base_name {19}". \
format(slave_name, data_engine['region'], data_engine['zone'],
data_engine['vpc_name'], data_engine['subnet_name'], data_engine['slave_size'],
data_engine['ssh_key_path'], initial_user, data_engine['dataengine_service_account_name'],
data_engine['primary_image_name'], data_engine['secondary_image_name'], 'dataengine',
data_engine['primary_disk_size'],
- data_engine['secondary_disk_size'], data_engine['gpu_accelerator_type'],
- data_engine['network_tag'], data_engine['cluster_name'],
- json.dumps(data_engine['slave_labels']), data_engine['service_base_name'])
+ data_engine['secondary_disk_size'], data_engine['gpu_slave_accelerator_type'],
+ data_engine['gpu_slave_accelerator_count'], data_engine['network_tag'],
+ data_engine['cluster_name'], json.dumps(data_engine['slave_labels']),
+ data_engine['service_base_name'])
try:
subprocess.run("~/scripts/{}.py {}".format('common_create_instance', params), shell=True, check=True)
except:
diff --git a/infrastructure-provisioning/src/general/scripts/gcp/jupyter_configure.py b/infrastructure-provisioning/src/general/scripts/gcp/jupyter_configure.py
index 14a48f6..0ede3eb 100644
--- a/infrastructure-provisioning/src/general/scripts/gcp/jupyter_configure.py
+++ b/infrastructure-provisioning/src/general/scripts/gcp/jupyter_configure.py
@@ -245,7 +245,7 @@ if __name__ == "__main__":
try:
subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', params), shell=True, check=True)
except:
- datalab.fab.append_result("Failed installing users key")
+ datalab.fab.append_result("Failed installing gpu drivers")
raise Exception
except Exception as err:
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datalab.apache.org
For additional commands, e-mail: commits-help@datalab.apache.org