You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datalab.apache.org by lf...@apache.org on 2022/02/14 16:16:52 UTC

[incubator-datalab] branch DATALAB-2698 created (now 42e7adc)

This is an automated email from the ASF dual-hosted git repository.

lfrolov pushed a change to branch DATALAB-2698
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git.


      at 42e7adc  [DATALAB-2698]: fixed nvidia drivers for aws tensor

This branch includes the following new commits:

     new 42e7adc  [DATALAB-2698]: fixed nvidia drivers for aws tensor

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datalab.apache.org
For additional commands, e-mail: commits-help@datalab.apache.org


[incubator-datalab] 01/01: [DATALAB-2698]: fixed nvidia drivers for aws tensor

Posted by lf...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

lfrolov pushed a commit to branch DATALAB-2698
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git

commit 42e7adc25f7c82b552a3da466c60f0c596f6ca26
Author: leonidfrolov <fr...@gmail.com>
AuthorDate: Mon Feb 14 18:11:19 2022 +0200

    [DATALAB-2698]: fixed nvidia drivers for aws tensor
---
 .../src/general/lib/os/debian/notebook_lib.py      | 26 +++++++++++++++++++---
 .../general/scripts/aws/common_prepare_notebook.py | 10 +++++++--
 .../src/general/scripts/aws/tensor_configure.py    | 16 +++++++++++++
 .../scripts/{gcp => os}/common_install_gpu.py      |  0
 .../src/tensor/scripts/configure_tensor_node.py    |  3 +++
 5 files changed, 50 insertions(+), 5 deletions(-)

diff --git a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py
index 0fcea27..768e6aa 100644
--- a/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py
+++ b/infrastructure-provisioning/src/general/lib/os/debian/notebook_lib.py
@@ -248,6 +248,7 @@ def ensure_additional_python_libs(os_user):
                 datalab.fab.conn.sudo('pip3 install NumPy=={} SciPy pandas Sympy Pillow sklearn --no-cache-dir'.format(os.environ['notebook_numpy_version']))
             if os.environ['application'] in ('tensor', 'deeplearning'):
                 datalab.fab.conn.sudo('pip3 install opencv-python h5py --no-cache-dir')
+                #datalab.fab.conn.sudo('pip3 install python3-opencv scikit-learn --no-cache-dir')
             datalab.fab.conn.sudo('touch /home/' + os_user + '/.ensure_dir/additional_python_libs_ensured')
         except:
             sys.exit(1)
@@ -293,14 +294,22 @@ def ensure_python3_libraries(os_user):
 def install_nvidia_drivers(os_user):
     if not exists(datalab.fab.conn,'/home/{}/.ensure_dir/nvidia_ensured'.format(os_user)):
         try:
+            if os.environ['conf_cloud_provider'] == 'aws':
+                cuda_version = '11.3.0'
+                cuda_file_name = "cuda-repo-ubuntu2004-11-3-local_11.3.0-465.19.01-1_amd64.deb"
+                cuda_key = '/var/cuda-repo-ubuntu2004-11-3-local/7fa2af80.pub'
+            else:
+                cuda_version = '11.4.0'
+                cuda_file_name = 'cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb'
+                cuda_key = '/var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub'
             # install nvidia drivers
             datalab.fab.conn.sudo(
                 'wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-ubuntu2004.pin')
             datalab.fab.conn.sudo('mv cuda-ubuntu2004.pin /etc/apt/preferences.d/cuda-repository-pin-600')
             datalab.fab.conn.sudo(
-                'wget https://developer.download.nvidia.com/compute/cuda/11.4.0/local_installers/cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb')
-            datalab.fab.conn.sudo('dpkg -i cuda-repo-ubuntu2004-11-4-local_11.4.0-470.42.01-1_amd64.deb')
-            datalab.fab.conn.sudo('apt-key add /var/cuda-repo-ubuntu2004-11-4-local/7fa2af80.pub')
+                'wget https://developer.download.nvidia.com/compute/cuda/{}/local_installers/{}'.format(cuda_version, cuda_file_name))
+            datalab.fab.conn.sudo('dpkg -i {}'.format(cuda_file_name))
+            datalab.fab.conn.sudo('apt-key add {}'.format(cuda_key))
             manage_pkg('update', 'remote', '')
             manage_pkg('-y install', 'remote', 'cuda')
             #clean space on disk
@@ -386,6 +395,17 @@ def install_tensor(os_user, cuda_version, cuda_file_name,
             sys.exit(1)
 
 
+def ensure_pytorch(os_user, gpu=True):
+    if not exists(datalab.fab.conn, '/home/' + os_user + '/.ensure_dir/pytorch_ensured'):
+        if gpu:
+            install_venv_pip_pkg('torch==1.10.2+cu113 torchvision==0.11.3+cu113 torchaudio==0.10.2+cu113'
+                                 ' -f https://download.pytorch.org/whl/cu113/torch_stable.html')
+        else:
+            datalab.fab.conn.sudo('pip3 install torch==1.10.2+cpu torchvision==0.11.3+cpu torchaudio==0.10.2+cpu -f '
+                                  'https://download.pytorch.org/whl/cpu/torch_stable.html --no-cache-dir')
+        datalab.fab.conn.sudo('touch /home/' + os_user + '/.ensure_dir/pytorch_ensured')
+
+
 def install_maven(os_user):
     if not exists(datalab.fab.conn,'/home/' + os_user + '/.ensure_dir/maven_ensured'):
         manage_pkg('-y install', 'remote', 'maven')
diff --git a/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py b/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py
index 6d40ffe..a11cc32 100644
--- a/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py
+++ b/infrastructure-provisioning/src/general/scripts/aws/common_prepare_notebook.py
@@ -79,8 +79,14 @@ if __name__ == "__main__":
                                                                       notebook_config['project_name'],
                                                                       notebook_config['endpoint_name'],
                                                                       notebook_config['exploratory_name'], args.uuid)
-        notebook_config['primary_disk_size'] = (lambda x: '100' if x == 'deeplearning' else '16')(
-            os.environ['application'])
+        #notebook_config['primary_disk_size'] = (lambda x: '100' if x == 'deeplearning' else '16')(
+        #    os.environ['application'])
+        if os.environ['application'] == 'deeplearning':
+            notebook_config['primary_disk_size'] = '100'
+        elif os.environ['application'] == 'tensor':
+            notebook_config['primary_disk_size'] = '32'
+        else:
+            notebook_config['primary_disk_size'] = '16'
         notebook_config['role_profile_name'] = '{}-{}-{}-nb-de-profile'.format(
             notebook_config['service_base_name'], notebook_config['project_name'], notebook_config['endpoint_name'])
         notebook_config['security_group_name'] = '{}-{}-{}-nb-sg'.format(notebook_config['service_base_name'],
diff --git a/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py b/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py
index 2a0d115..d3b7d8e 100644
--- a/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py
+++ b/infrastructure-provisioning/src/general/scripts/aws/tensor_configure.py
@@ -154,6 +154,22 @@ if __name__ == "__main__":
         datalab.actions_lib.remove_ec2(notebook_config['tag_name'], notebook_config['instance_name'])
         sys.exit(1)
 
+    #Installing GPU drivers
+    try:
+        logging.info('[INSTALLING GPU DRIVERS]')
+        params = "--hostname {} --keyfile {} --os_user {}".format(
+            instance_hostname, keyfile_name, notebook_config['datalab_ssh_user'])
+        try:
+            subprocess.run("~/scripts/{}.py {}".format('common_install_gpu', params), shell=True, check=True)
+        except:
+            datalab.fab.append_result("Failed installing users key")
+            raise Exception
+
+    except Exception as err:
+        datalab.fab.append_result("Failed to install GPU drivers.", str(err))
+        GCPActions.remove_instance(notebook_config['instance_name'], notebook_config['zone'])
+        sys.exit(1)
+
     # installing and configuring TensorFlow and all dependencies
     try:
         logging.info('[CONFIGURE TENSORFLOW NOTEBOOK INSTANCE]')
diff --git a/infrastructure-provisioning/src/general/scripts/gcp/common_install_gpu.py b/infrastructure-provisioning/src/general/scripts/os/common_install_gpu.py
similarity index 100%
rename from infrastructure-provisioning/src/general/scripts/gcp/common_install_gpu.py
rename to infrastructure-provisioning/src/general/scripts/os/common_install_gpu.py
diff --git a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py
index c9b5e3f..3566518 100644
--- a/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py
+++ b/infrastructure-provisioning/src/tensor/scripts/configure_tensor_node.py
@@ -142,6 +142,9 @@ if __name__ == "__main__":
     # INSTALL OPTIONAL PACKAGES
     print("Installing additional Python packages")
     ensure_additional_python_libs(args.os_user)
+    if os.environ['conf_cloud_provider'] == 'aws':
+        print('Installing Pytorch')
+        ensure_pytorch(args.os_user)
     print("Install Matplotlib")
     ensure_matplot(args.os_user)
     

---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datalab.apache.org
For additional commands, e-mail: commits-help@datalab.apache.org