Posted to commits@datalab.apache.org by my...@apache.org on 2021/06/22 10:32:12 UTC

[incubator-datalab] 01/01: [DATALAB-2372] - [GCP] Deeplearning deploy from cloud image implemented

This is an automated email from the ASF dual-hosted git repository.

mykolabodnar pushed a commit to branch DATALAB-2372
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git

commit e6d921eb3b81df1b56394cb043841617af75cd20
Author: bodnarmykola <bo...@gmail.com>
AuthorDate: Tue Jun 22 13:31:23 2021 +0300

    [DATALAB-2372] - [GCP] Deeplearning deploy from cloud image implemented
---
 .../src/base/scripts/install_user_key.py           |  8 ++++--
 .../scripts/configure_deep_learning_node.py        | 32 ++++++++++++++++++++--
 .../files/gcp/deeplearning_description.json        |  2 +-
 .../src/general/lib/gcp/meta_lib.py                | 17 ++++++++++++
 .../general/scripts/gcp/common_prepare_notebook.py | 20 +++++++++++---
 5 files changed, 69 insertions(+), 10 deletions(-)
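
For context, the heart of this change is a family-based image lookup against Google's public
deeplearning-platform-release project, which common_prepare_notebook.py then turns into a
project-qualified source image path. The sketch below mirrors the names used in the diff
(get_deeplearning_image_by_family and the 'projects/deeplearning-platform-release/global/images/...'
path); the standalone Compute client setup and the 'common-cu110' family are assumptions for
illustration only, and application default credentials are assumed to be available.

    # Sketch only, not part of the commit.
    import googleapiclient.discovery
    from googleapiclient import errors

    def get_deeplearning_image_by_family(service, family_name):
        """Return the newest image in a family from the public
        deeplearning-platform-release project, or '' if the family is unknown."""
        request = service.images().getFromFamily(
            project='deeplearning-platform-release', family=family_name)
        try:
            return request.execute()
        except errors.HttpError as err:
            if err.resp.status == 404:
                return ''
            raise

    if __name__ == '__main__':
        # Assumes application default credentials are configured for the Compute API.
        compute = googleapiclient.discovery.build('compute', 'v1')
        image = get_deeplearning_image_by_family(compute, 'common-cu110')
        if image:
            # Public images are referenced by a project-qualified path, as in the prepare script.
            print('projects/deeplearning-platform-release/global/images/{}'.format(image.get('name')))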

diff --git a/infrastructure-provisioning/src/base/scripts/install_user_key.py b/infrastructure-provisioning/src/base/scripts/install_user_key.py
index 3d417ab..d7a5faf 100644
--- a/infrastructure-provisioning/src/base/scripts/install_user_key.py
+++ b/infrastructure-provisioning/src/base/scripts/install_user_key.py
@@ -66,9 +66,11 @@ if __name__ == "__main__":
     except:
         print('Fail connection')
         sys.exit(2)
-
-    print("Ensuring safest ssh ciphers")
-    ensure_ciphers()
+    try:
+        print("Ensuring safest ssh ciphers")
+        ensure_ciphers()
+    except:
+        print('Failed to install safest ssh ciphers')
 
     print("Installing users key...")
     try:
diff --git a/infrastructure-provisioning/src/deeplearning/scripts/configure_deep_learning_node.py b/infrastructure-provisioning/src/deeplearning/scripts/configure_deep_learning_node.py
index 00600e5..54f8601 100644
--- a/infrastructure-provisioning/src/deeplearning/scripts/configure_deep_learning_node.py
+++ b/infrastructure-provisioning/src/deeplearning/scripts/configure_deep_learning_node.py
@@ -90,6 +90,22 @@ def install_itorch(os_user):
         conn.sudo('chown -R {0}:{0} /home/{0}/.local/share/jupyter/'.format(os_user))
         conn.sudo('touch /home/{}/.ensure_dir/itorch_ensured'.format(os_user))
 
+def configure_jupyterlab_at_gcp_image(os_user, exploratory_name):
+    if not exists(conn, '/home/{}/.ensure_dir/jupyterlab_ensured'.format(os_user)):
+        jupyter_conf_file = '/home/jupyter/.jupyter/jupyter_notebook_config.py'
+        conn.sudo('''bash -l -c 'sed -i "s|c.NotebookApp|#c.NotebookApp|g" {}' '''.format(jupyter_conf_file))
+        conn.sudo('''bash -l -c "echo 'c.NotebookApp.ip = \\"0.0.0.0\\" ' >> {}" '''.format(jupyter_conf_file))
+        conn.sudo('''bash -l -c "echo 'c.NotebookApp.port = 8888' >> {}" '''.format(jupyter_conf_file))
+        conn.sudo('''bash -l -c "echo 'c.NotebookApp.base_url = \\"/{0}/\\"' >> {1}" '''.format(exploratory_name,
+                                                                                                jupyter_conf_file))
+        conn.sudo('''bash -l -c "echo 'c.NotebookApp.open_browser = False' >> {}" '''.format(jupyter_conf_file))
+        conn.sudo('''bash -l -c "echo 'c.NotebookApp.allow_remote_access = True' >> {}" '''.format(jupyter_conf_file))
+        conn.sudo('''bash -l -c "echo 'c.NotebookApp.cookie_secret = b\\"{0}\\"' >> {1}" '''.format(id_generator(),
+                                                                                                    jupyter_conf_file))
+        conn.sudo('''bash -l -c "echo \\"c.NotebookApp.token = u''\\" >> {}" '''.format(jupyter_conf_file))
+        conn.sudo('systemctl restart jupyter')
+        conn.sudo('touch /home/{}/.ensure_dir/jupyterlab_ensured'.format(os_user))
+
 
 if __name__ == "__main__":
     print("Configure connections")
@@ -105,7 +121,16 @@ if __name__ == "__main__":
     except:
         sys.exit(1)
     print("Mount additional volume")
-    prepare_disk(args.os_user)
+    if os.environ['conf_cloud_provider'] == 'gcp' and os.environ['conf_deeplearning_cloud_ami'] == 'true':
+        print('Additional disk is pre-mounted by the Google image')
+        print('Installing nvidia drivers')
+        try:
+            conn.sudo('/opt/deeplearning/install-driver.sh')
+        except:
+            traceback.print_exc()
+            sys.exit(1)
+    else:
+        prepare_disk(args.os_user)
 
     if os.environ['conf_deeplearning_cloud_ami'] == 'false':
         # INSTALL LANGUAGES
@@ -157,10 +182,13 @@ if __name__ == "__main__":
         ensure_additional_python_libs(args.os_user)
         print("Install Matplotlib")
         ensure_matplot(args.os_user)
-    elif os.environ['conf_deeplearning_cloud_ami'] == 'true':
+    elif os.environ['conf_deeplearning_cloud_ami'] == 'true' and os.environ['conf_cloud_provider'] != 'gcp':
         # CONFIGURE JUPYTER NOTEBOOK
         print("Configure Jupyter")
         configure_jupyter(args.os_user, jupyter_conf_file, templates_dir, args.jupyter_version, args.exploratory_name)
+    else:
+        configure_jupyterlab_at_gcp_image(args.os_user, args.exploratory_name)
+
 
     # INSTALL UNGIT
     print("Install nodejs")
diff --git a/infrastructure-provisioning/src/general/files/gcp/deeplearning_description.json b/infrastructure-provisioning/src/general/files/gcp/deeplearning_description.json
index 79ed687..f3dd752 100644
--- a/infrastructure-provisioning/src/general/files/gcp/deeplearning_description.json
+++ b/infrastructure-provisioning/src/general/files/gcp/deeplearning_description.json
@@ -20,7 +20,7 @@
   "exploratory_environment_images" :
   [
     {"Image family": "common-cu110", "Description": "Google Deep Learning Image: Base, m67 CUDA11.0, A debian-10 Linux based image with CUDA 11.0 preinstalled."},
-    {"Image family": "ccommon-cu100", "Description": "Google Deep Learning Image: Base, m67 CUDA10.0. A debian-10 Linux based image with CUDA 10.0 preinstalled."},
+    {"Image family": "common-cu100", "Description": "Google Deep Learning Image: Base, m67 CUDA10.0. A debian-10 Linux based image with CUDA 10.0 preinstalled."},
     {"Image family": "common-cu92", "Description": "Google Deep Learning Image: Base, m67 CUDA 9.2, A Debian based image with CUDA 9.2 pre-installed."},
     {"Image family": "pytorch-latest-gpu", "Description": "Google Deep Learning Image: PyTorch 1.8, m67 CUDA 110, A debian-10 Linux based image with PyTorch 1.8 pre-installed."},
     {"Image family": "rapids-latest-gpu-experimental", "Description": "Google RAPIDS 0.5.1 with XGBoost, m64, RAPIDS 0.5.1 with XGBoost with CUDA 10.0."},
diff --git a/infrastructure-provisioning/src/general/lib/gcp/meta_lib.py b/infrastructure-provisioning/src/general/lib/gcp/meta_lib.py
index 583ec5f..be5d17b 100644
--- a/infrastructure-provisioning/src/general/lib/gcp/meta_lib.py
+++ b/infrastructure-provisioning/src/general/lib/gcp/meta_lib.py
@@ -394,6 +394,23 @@ class GCPMeta:
                                "error_message": str(err) + "\n Traceback: " + traceback.print_exc(file=sys.stdout)}))
             traceback.print_exc(file=sys.stdout)
 
+    def get_deeplearning_image_by_family(self, family_name):
+        try:
+            request = self.service.images().getFromFamily(project='deeplearning-platform-release', family=family_name)
+            try:
+                return request.execute()
+            except errors.HttpError as err:
+                if err.resp.status == 404:
+                    return ''
+                else:
+                    raise err
+        except Exception as err:
+            logging.info("Error with getting image by family: " + str(err) + "\n Traceback: " +
+                         traceback.format_exc())
+            append_result(str({"error": "Error with getting image by family",
+                               "error_message": str(err) + "\n Traceback: " + traceback.format_exc()}))
+            traceback.print_exc(file=sys.stdout)
+
     def get_disk(self, disk_name):
         try:
             request = self.service.disks().get(project=self.project, zone=os.environ['gcp_zone'], disk=disk_name)
diff --git a/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py b/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py
index 2cb3f64..1c3c038 100644
--- a/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py
+++ b/infrastructure-provisioning/src/general/scripts/gcp/common_prepare_notebook.py
@@ -119,19 +119,31 @@ if __name__ == "__main__":
             os.environ['application'], os.environ['notebook_image_name'].replace('_', '-').lower()) if (x != 'None' and x != '')
             else notebook_config['expected_primary_image_name'])(str(os.environ.get('notebook_image_name')))
         print('Searching pre-configured images')
-        notebook_config['primary_image_name'] = GCPMeta.get_image_by_name(
-            notebook_config['notebook_primary_image_name'])
+
+        if os.environ['conf_deeplearning_cloud_ami'] == 'true' and os.environ['application'] == 'deeplearning':
+            notebook_config['primary_image_name'] = GCPMeta.get_deeplearning_image_by_family(os.environ['notebook_image_name'])
+            if notebook_config['primary_image_name']:
+                deeplearning_ami = 'true'
+        else:
+            notebook_config['primary_image_name'] = GCPMeta.get_image_by_name(notebook_config['notebook_primary_image_name'])
+            deeplearning_ami = 'false'
         if notebook_config['primary_image_name'] == '':
             notebook_config['primary_image_name'] = os.environ['gcp_{}_image_name'.format(os.environ['conf_os_family'])]
         else:
             print('Pre-configured primary image found. Using: {}'.format(
                 notebook_config['primary_image_name'].get('name')))
-            notebook_config['primary_image_name'] = 'global/images/{}'.format(
+            if deeplearning_ami == 'true':
+                notebook_config['primary_image_name'] = 'projects/deeplearning-platform-release/global/images/{}'.format(
+                    notebook_config['primary_image_name'].get('name'))
+            else:
+                notebook_config['primary_image_name'] = 'global/images/{}'.format(
                 notebook_config['primary_image_name'].get('name'))
         notebook_config['notebook_secondary_image_name'] = (lambda x: '{0}-{1}-{2}-{3}-secondary-image-{4}'.format(
             notebook_config['service_base_name'], notebook_config['project_name'], notebook_config['endpoint_name'],
-            os.environ['application'], os.environ['notebook_image_name'].replace('_', '-').lower()) if (x != 'None' and x != '')
+            os.environ['application'], os.environ['notebook_image_name'].replace('_', '-').lower()[:63]) if (x != 'None' and x != '')
             else notebook_config['expected_secondary_image_name'])(str(os.environ.get('notebook_image_name')))
+        if notebook_config['notebook_secondary_image_name'][:63].endswith('-'):
+            notebook_config['notebook_secondary_image_name'] = notebook_config['notebook_secondary_image_name'][:63][:-1]
         notebook_config['secondary_image_name'] = GCPMeta.get_image_by_name(
             notebook_config['notebook_secondary_image_name'])
         if notebook_config['secondary_image_name'] == '':

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datalab.apache.org
For additional commands, e-mail: commits-help@datalab.apache.org