Posted to commits@datalab.apache.org by op...@apache.org on 2022/09/27 17:41:17 UTC

[incubator-datalab] 01/01: [DATALAB-2914]: added scripts to integrate jupyter with hdinsight

This is an automated email from the ASF dual-hosted git repository.

opolishchuk pushed a commit to branch DATALAB-2914
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git

commit db6f2f7462612618629ecd2350aae686758f4b73
Author: Oleksandr Polishchuk <po...@gmail.com>
AuthorDate: Tue Sep 27 20:40:14 2022 +0300

    [DATALAB-2914]: added scripts to integrate jupyter with hdinsight
---
 .../files/azure/dataengine-service_Dockerfile      |   2 +
 .../src/general/files/azure/jupyter_Dockerfile     |   2 +
 .../jupyter_dataengine-service_create_configs.py   |  88 +++++++++++++++++
 .../jupyter_install_dataengine-service_kernels.py  | 104 +++++++++++++++++++++
 .../dataengine-service_sparkmagic_config.json      |  17 ++++
 5 files changed, 213 insertions(+)

diff --git a/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile b/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile
index 966aaa5c2..cd9ea3ba8 100644
--- a/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile
+++ b/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile
@@ -34,6 +34,8 @@ COPY general/templates/os/inactive.sh /root/templates/
 COPY general/templates/os/inactive.service /root/templates/
 COPY general/templates/os/inactive.timer /root/templates/
 COPY general/templates/azure/dataengine-service_interpreter_livy.json /root/templates/dataengine-service_interpreter_livy.json
+COPY general/templates/azure/dataengine-service_sparkmagic_config.json /root/templates/dataengine-service_sparkmagic_config.json
+
 
 RUN chmod a+x /root/fabfile.py; \
     chmod a+x /root/scripts/*
diff --git a/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile b/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile
index 45736846c..18da125e8 100644
--- a/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile
+++ b/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile
@@ -45,6 +45,8 @@ COPY general/templates/os/inactive.sh /root/templates/
 COPY general/templates/os/inactive.service /root/templates/
 COPY general/templates/os/inactive.timer /root/templates/
 COPY general/templates/azure/core-site* /root/templates/
+COPY general/templates/azure/dataengine-service_sparkmagic_config.json /root/templates/
+
 
 
 RUN chmod a+x /root/fabfile.py; \
diff --git a/infrastructure-provisioning/src/general/scripts/azure/jupyter_dataengine-service_create_configs.py b/infrastructure-provisioning/src/general/scripts/azure/jupyter_dataengine-service_create_configs.py
new file mode 100644
index 000000000..488b782b6
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/jupyter_dataengine-service_create_configs.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import argparse
+import sys
+import subprocess
+from datalab.actions_lib import *
+from datalab.common_lib import *
+from datalab.fab import *
+from datalab.notebook_lib import *
+from fabric import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--bucket', type=str, default='')
+parser.add_argument('--cluster_name', type=str, default='')
+parser.add_argument('--dry_run', type=str, default='false')
+parser.add_argument('--hdinsight_version', type=str, default='')
+parser.add_argument('--dataproc_version', type=str, default='')
+parser.add_argument('--spark_version', type=str, default='')
+parser.add_argument('--hadoop_version', type=str, default='')
+parser.add_argument('--region', type=str, default='')
+parser.add_argument('--user_name', type=str, default='')
+parser.add_argument('--os_user', type=str, default='')
+parser.add_argument('--pip_mirror', type=str, default='')
+parser.add_argument('--application', type=str, default='')
+parser.add_argument('--scala_version', type=str, default='')
+parser.add_argument('--python_version', type=str, default='')
+parser.add_argument('--headnode_ip', type=str, default='')
+args = parser.parse_args()
+
+hdinsight_dir = '/opt/{}/jars/'.format(args.hdinsight_version)
+kernels_dir = '/home/{}/.local/share/jupyter/kernels/'.format(args.os_user)
+spark_dir = '/opt/{}/{}/spark/'.format(args.hdinsight_version, args.cluster_name)
+yarn_dir = '/opt/{}/{}/conf/'.format(args.hdinsight_version, args.cluster_name)
+
+
+def install_sparkamagic_kernels(args):
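+    # Register sparkmagic-backed PySpark and Spark kernelspecs for the user and
+    # point sparkmagic at the HDInsight headnode's Livy endpoint.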
+    try:
+        subprocess.run('sudo jupyter nbextension enable --py --sys-prefix widgetsnbextension', shell=True, check=True)
+        sparkmagic_dir = subprocess.run("sudo pip3 show sparkmagic | grep 'Location: ' | awk '{print $2}'", capture_output=True, shell=True, check=True).stdout.decode('UTF-8').rstrip("\n\r")
+        subprocess.run('sudo jupyter-kernelspec install {}/sparkmagic/kernels/sparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user), shell=True, check=True)
+        subprocess.run('sudo jupyter-kernelspec install {}/sparkmagic/kernels/pysparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user), shell=True, check=True)
+        #subprocess.run('sudo jupyter-kernelspec install {}/sparkmagic/kernels/sparkrkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user), shell=True, check=True)
+        pyspark_kernel_name = 'PySpark (Python-{0} / Spark-{1} ) [{2}]'.format(args.python_version, args.spark_version,
+                                                                         args.cluster_name)
+        subprocess.run('sed -i \'s|PySpark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/pysparkkernel/kernel.json'.format(
+            pyspark_kernel_name, args.os_user), shell=True, check=True)
+        spark_kernel_name = 'Spark (Scala-{0} / Spark-{1} ) [{2}]'.format(args.scala_version, args.spark_version,
+                                                                         args.cluster_name)
+        subprocess.run('sed -i \'s|Spark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkkernel/kernel.json'.format(
+            spark_kernel_name, args.os_user), shell=True, check=True)
+        #sparkr_kernel_name = 'SparkR (R-{0} / Spark-{1} ) [{2}]'.format(args.r_version, args.spark_version,
+        #                                                                    args.cluster_name)
+        #subprocess.run('sed -i \'s|SparkR|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkrkernel/kernel.json'.format(
+        #    sparkr_kernel_name, args.os_user), shell=True, check=True)
+        subprocess.run('mkdir -p /home/' + args.os_user + '/.sparkmagic', shell=True, check=True)
+        subprocess.run('cp -f /tmp/dataengine-service_sparkmagic_config.json /home/' + args.os_user + '/.sparkmagic/config.json', shell=True, check=True)
+        subprocess.run('sed -i \'s|HEADNODEIP:PORT|{0}:{2}|g\' /home/{1}/.sparkmagic/config.json'.format(
+                args.master_ip, args.os_user, args.livy_port), shell=True, check=True)
+        subprocess.run('sudo chown -R {0}:{0} /home/{0}/.sparkmagic/'.format(args.os_user), shell=True, check=True)
+    except:
+        sys.exit(1)
+
+if __name__ == "__main__":
+    if args.dry_run == 'true':
+        parser.print_help()
+    else:
+        args.master_ip = args.headnode_ip
+        args.livy_port = '8998'
+        install_sparkamagic_kernels(args)
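
The companion script below copies this file to /usr/local/bin/create_configs.py on the notebook host but drives the same steps directly over Fabric; if one wanted to run the helper by hand, a purely illustrative invocation with placeholder values (not part of this commit) could look like:

    # Illustrative only; all values below are placeholders, not taken from this commit.
    import subprocess

    cmd = ('/usr/bin/python3 /usr/local/bin/create_configs.py'
           ' --cluster_name hdi-demo-cluster'
           ' --os_user datalab_user'
           ' --headnode_ip 10.0.0.4'
           ' --spark_version 3.1.2'
           ' --python_version 3.8.10'
           ' --scala_version 2.12'
           ' --application jupyter')
    subprocess.run(cmd, shell=True, check=True)
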
diff --git a/infrastructure-provisioning/src/general/scripts/azure/jupyter_install_dataengine-service_kernels.py b/infrastructure-provisioning/src/general/scripts/azure/jupyter_install_dataengine-service_kernels.py
new file mode 100644
index 000000000..5136a4873
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/jupyter_install_dataengine-service_kernels.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import argparse
+import os
+import sys
+from datalab.actions_lib import *
+from datalab.meta_lib import *
+from datalab.fab import *
+from fabric import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--bucket', type=str, default='')
+parser.add_argument('--cluster_name', type=str, default='')
+parser.add_argument('--dry_run', type=str, default='false')
+parser.add_argument('--hdinsight_version', type=str, default='')
+parser.add_argument('--keyfile', type=str, default='')
+parser.add_argument('--region', type=str, default='')
+parser.add_argument('--notebook_ip', type=str, default='')
+parser.add_argument('--scala_version', type=str, default='')
+parser.add_argument('--edge_user_name', type=str, default='')
+parser.add_argument('--project_name', type=str, default='')
+parser.add_argument('--os_user', type=str, default='')
+parser.add_argument('--edge_hostname', type=str, default='')
+parser.add_argument('--proxy_port', type=str, default='')
+parser.add_argument('--pip_mirror', type=str, default='')
+parser.add_argument('--application', type=str, default='')
+parser.add_argument('--headnode_ip', type=str, default='')
+args = parser.parse_args()
+
+def configure_notebook(args):
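+    # Push the sparkmagic config template, the create_configs helper and the
+    # datalab libraries to the notebook host over the Fabric connection.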
+    templates_dir = '/root/templates/'
+    files_dir = '/root/files/'
+    scripts_dir = '/root/scripts/'
+    datalab.fab.conn.put(templates_dir + 'dataengine-service_sparkmagic_config.json', '/tmp/dataengine-service_sparkmagic_config.json')
+    datalab.fab.conn.put(scripts_dir + '{}_dataengine-service_create_configs.py'.format(args.application), '/tmp/create_configs.py')
+    datalab.fab.conn.sudo('\cp /tmp/create_configs.py /usr/local/bin/create_configs.py')
+    datalab.fab.conn.sudo('chmod 755 /usr/local/bin/create_configs.py')
+    datalab.fab.conn.sudo('mkdir -p /usr/lib/python3.8/datalab/')
+    datalab.fab.conn.run('mkdir -p /tmp/datalab_libs/')
+    host_string = args.os_user + "@" + args.notebook_ip
+    datalab.fab.conn.local('rsync -e "ssh -i {}" /usr/lib/python3.8/datalab/*.py {}:/tmp/datalab_libs/'.format(args.keyfile, host_string))
+    datalab.fab.conn.run('chmod a+x /tmp/datalab_libs/*')
+    datalab.fab.conn.sudo('mv /tmp/datalab_libs/* /usr/lib/python3.8/datalab/')
+    if exists(datalab.fab.conn, '/usr/lib64'):
+        datalab.fab.conn.sudo('mkdir -p /usr/lib64/python3.8')
+        datalab.fab.conn.sudo('ln -fs /usr/lib/python3.8/datalab /usr/lib64/python3.8/datalab')
+
+def install_sparkamagic_kernels(args):
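+    # Same kernel setup as the create_configs helper above, executed here over
+    # the Fabric connection to the notebook.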
+    try:
+        datalab.fab.conn.sudo('jupyter nbextension enable --py --sys-prefix widgetsnbextension')
+        sparkmagic_dir = datalab.fab.conn.sudo(''' bash -l -c 'pip3 show sparkmagic | grep "Location: "' ''').stdout.rstrip("\n\r").split(' ')[1]
+        datalab.fab.conn.sudo('jupyter-kernelspec install {}/sparkmagic/kernels/sparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user))
+        datalab.fab.conn.sudo('jupyter-kernelspec install {}/sparkmagic/kernels/pysparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user))
+        #datalab.fab.conn.sudo('jupyter-kernelspec install {}/sparkmagic/kernels/sparkrkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user))
+        pyspark_kernel_name = 'PySpark (Python-{0} / Spark-{1} ) [{2}]'.format(args.python_version, args.spark_version,
+                                                                         args.cluster_name)
+        datalab.fab.conn.sudo('sed -i \'s|PySpark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/pysparkkernel/kernel.json'.format(
+            pyspark_kernel_name, args.os_user))
+        spark_kernel_name = 'Spark (Scala-{0} / Spark-{1} ) [{2}]'.format(args.scala_version, args.spark_version,
+                                                                         args.cluster_name)
+        datalab.fab.conn.sudo('sed -i \'s|Spark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkkernel/kernel.json'.format(
+            spark_kernel_name, args.os_user))
+        #sparkr_kernel_name = 'SparkR (R-{0} / Spark-{1} ) [{2}]'.format(args.r_version, args.spark_version,
+        #                                                                   args.cluster_name)
+        #datalab.fab.conn.sudo('sed -i \'s|SparkR|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkrkernel/kernel.json'.format(
+        #    sparkr_kernel_name, args.os_user))
+        datalab.fab.conn.sudo('mkdir -p /home/' + args.os_user + '/.sparkmagic')
+        datalab.fab.conn.sudo('cp -f /tmp/dataengine-service_sparkmagic_config.json /home/' + args.os_user + '/.sparkmagic/config.json')
+        datalab.fab.conn.sudo('sed -i \'s|HEADNODEIP:PORT|{0}:{2}|g\' /home/{1}/.sparkmagic/config.json'.format(
+                args.master_ip, args.os_user, args.livy_port))
+        datalab.fab.conn.sudo('chown -R {0}:{0} /home/{0}/.sparkmagic/'.format(args.os_user))
+    except:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    global conn
+    conn = init_datalab_connection(args.notebook_ip, args.os_user, args.keyfile)
+    configure_notebook(args)
+    args.spark_version = '3.1.2'
+    args.python_version = '3.8.10'
+    args.livy_port = '8998'
+    args.master_ip = args.headnode_ip
+    install_sparkamagic_kernels(args)
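
Both code paths point sparkmagic at Livy on the HDInsight headnode (args.master_ip = args.headnode_ip, args.livy_port = '8998'). As an optional, purely illustrative sanity check (not part of this commit), Livy's REST API could be probed from the notebook before the kernels are installed:

    # Optional sanity check (illustrative): confirm Livy answers on the headnode.
    # Port 8998 matches args.livy_port above; the IP is a placeholder for args.headnode_ip.
    import json
    import urllib.request

    headnode_ip = '10.0.0.4'
    url = 'http://{}:8998/sessions'.format(headnode_ip)
    with urllib.request.urlopen(url, timeout=10) as resp:
        print(json.loads(resp.read().decode('utf-8')))
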
diff --git a/infrastructure-provisioning/src/general/templates/azure/dataengine-service_sparkmagic_config.json b/infrastructure-provisioning/src/general/templates/azure/dataengine-service_sparkmagic_config.json
new file mode 100644
index 000000000..d9907b454
--- /dev/null
+++ b/infrastructure-provisioning/src/general/templates/azure/dataengine-service_sparkmagic_config.json
@@ -0,0 +1,17 @@
+{
+  "kernel_python_credentials" : {
+    "username": "",
+    "password": "",
+    "url": "http://HEADNODEIP:PORT/",
+    "auth": "None"
+  },
+  "kernel_scala_credentials" : {
+    "username": "",
+    "password": "",
+    "url": "http://HEADNODEIP:PORT/",
+    "auth": "None"
+  },
+  "custom_headers" : {
+    "X-Requested-By": "livy"
+  }
+}
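
The HEADNODEIP:PORT placeholder in this template is what the sed calls above rewrite with the headnode IP and the Livy port; sparkmagic then reads the rendered file from ~/.sparkmagic/config.json. A rough, illustrative Python equivalent of that substitution (not part of this commit):

    # Rough equivalent of the sed substitution above (illustrative): render the
    # template with the headnode IP and Livy port and place it where sparkmagic
    # looks for it. The IP is a placeholder for args.headnode_ip.
    import pathlib

    template = pathlib.Path('/tmp/dataengine-service_sparkmagic_config.json').read_text()
    rendered = template.replace('HEADNODEIP:PORT', '10.0.0.4:8998')
    target = pathlib.Path.home() / '.sparkmagic' / 'config.json'
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_text(rendered)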


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datalab.apache.org
For additional commands, e-mail: commits-help@datalab.apache.org