You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@datalab.apache.org by op...@apache.org on 2022/09/27 17:41:17 UTC
[incubator-datalab] 01/01: [DATALAB-2914]: added scripts to integrate jupyter with hdinsight
This is an automated email from the ASF dual-hosted git repository.
opolishchuk pushed a commit to branch DATALAB-2914
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git
commit db6f2f7462612618629ecd2350aae686758f4b73
Author: Oleksandr Polishchuk <po...@gmail.com>
AuthorDate: Tue Sep 27 20:40:14 2022 +0300
[DATALAB-2914]: added scripts to integrate jupyter with hdinsight
---
.../files/azure/dataengine-service_Dockerfile | 2 +
.../src/general/files/azure/jupyter_Dockerfile | 2 +
.../jupyter_dataengine-service_create_configs.py | 88 +++++++++++++++++
.../jupyter_install_dataengine-service_kernels.py | 104 +++++++++++++++++++++
.../dataengine-service_sparkmagic_config.json | 17 ++++
5 files changed, 213 insertions(+)
diff --git a/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile b/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile
index 966aaa5c2..cd9ea3ba8 100644
--- a/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile
+++ b/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile
@@ -34,6 +34,8 @@ COPY general/templates/os/inactive.sh /root/templates/
COPY general/templates/os/inactive.service /root/templates/
COPY general/templates/os/inactive.timer /root/templates/
COPY general/templates/azure/dataengine-service_interpreter_livy.json /root/templates/dataengine-service_interpreter_livy.json
+COPY general/templates/azure/dataengine-service_sparkmagic_config.json /root/templates/dataengine-service_sparkmagic_config.json
+
RUN chmod a+x /root/fabfile.py; \
chmod a+x /root/scripts/*
diff --git a/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile b/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile
index 45736846c..18da125e8 100644
--- a/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile
+++ b/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile
@@ -45,6 +45,8 @@ COPY general/templates/os/inactive.sh /root/templates/
COPY general/templates/os/inactive.service /root/templates/
COPY general/templates/os/inactive.timer /root/templates/
COPY general/templates/azure/core-site* /root/templates/
+COPY general/templates/azure/dataengine-service_sparkmagic_config.json /root/templates/
+
RUN chmod a+x /root/fabfile.py; \
diff --git a/infrastructure-provisioning/src/general/scripts/azure/jupyter_dataengine-service_create_configs.py b/infrastructure-provisioning/src/general/scripts/azure/jupyter_dataengine-service_create_configs.py
new file mode 100644
index 000000000..488b782b6
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/jupyter_dataengine-service_create_configs.py
@@ -0,0 +1,88 @@
#!/usr/bin/python3

# *****************************************************************************
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# ******************************************************************************

# Creates sparkmagic-based Jupyter kernel configs on the notebook VM for an
# Azure HDInsight dataengine-service cluster. Executed remotely by
# jupyter_install_dataengine-service_kernels.py (copied to /usr/local/bin).

import argparse
import sys
import subprocess
from datalab.actions_lib import *
from datalab.common_lib import *
from datalab.fab import *
from datalab.notebook_lib import *
from fabric import *

# All arguments arrive as strings from the calling provisioning script.
parser = argparse.ArgumentParser()
parser.add_argument('--bucket', type=str, default='')
parser.add_argument('--cluster_name', type=str, default='')
parser.add_argument('--dry_run', type=str, default='false')
parser.add_argument('--hdinsight_version', type=str, default='')
# NOTE(review): --dataproc_version looks like a leftover from the GCP variant
# of this script — confirm it is unused before removing.
parser.add_argument('--dataproc_version', type=str, default='')
parser.add_argument('--spark_version', type=str, default='')
parser.add_argument('--hadoop_version', type=str, default='')
parser.add_argument('--region', type=str, default='')
parser.add_argument('--user_name', type=str, default='')
parser.add_argument('--os_user', type=str, default='')
parser.add_argument('--pip_mirror', type=str, default='')
parser.add_argument('--application', type=str, default='')
parser.add_argument('--scala_version', type=str, default='')
parser.add_argument('--python_version', type=str, default='')
parser.add_argument('--headnode_ip', type=str, default='')
args = parser.parse_args()

# NOTE(review): these directory layouts mirror the GCP/Dataproc scripts and
# are not referenced anywhere in this file — verify before relying on them.
hdinsight_dir = '/opt/{}/jars/'.format(args.hdinsight_version)
kernels_dir = '/home/{}/.local/share/jupyter/kernels/'.format(args.os_user)
spark_dir = '/opt/{}/{}/spark/'.format(args.hdinsight_version, args.cluster_name)
yarn_dir = '/opt/{}/{}/conf/'.format(args.hdinsight_version, args.cluster_name)
+
def install_sparkamagic_kernels(args):
    """Install sparkmagic PySpark/Spark Jupyter kernels for an HDInsight cluster.

    Registers the kernel specs bundled with the installed sparkmagic package
    under the notebook user's local Jupyter directory, renames them to embed
    cluster/version information, and installs a sparkmagic config pointing at
    the cluster's Livy endpoint. Exits the process with status 1 on failure.
    """
    try:
        # Original code passed a plain string without shell=True, which makes
        # subprocess look for an executable literally named the whole string.
        subprocess.run('sudo jupyter nbextension enable --py --sys-prefix widgetsnbextension',
                       shell=True, check=True)
        # Locate the installed sparkmagic package to find its bundled kernel specs.
        sparkmagic_dir = subprocess.run("sudo pip3 show sparkmagic | grep 'Location: ' | awk '{print $2}'",
                                        capture_output=True, shell=True,
                                        check=True).stdout.decode('UTF-8').rstrip("\n\r")
        subprocess.run('sudo jupyter-kernelspec install {}/sparkmagic/kernels/sparkkernel --prefix=/home/{}/.local/'.format(
            sparkmagic_dir, args.os_user), shell=True, check=True)
        subprocess.run('sudo jupyter-kernelspec install {}/sparkmagic/kernels/pysparkkernel --prefix=/home/{}/.local/'.format(
            sparkmagic_dir, args.os_user), shell=True, check=True)
        pyspark_kernel_name = 'PySpark (Python-{0} / Spark-{1} ) [{2}]'.format(args.python_version, args.spark_version,
                                                                              args.cluster_name)
        subprocess.run('sed -i \'s|PySpark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/pysparkkernel/kernel.json'.format(
            pyspark_kernel_name, args.os_user), shell=True, check=True)
        spark_kernel_name = 'Spark (Scala-{0} / Spark-{1} ) [{2}]'.format(args.scala_version, args.spark_version,
                                                                          args.cluster_name)
        subprocess.run('sed -i \'s|Spark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkkernel/kernel.json'.format(
            spark_kernel_name, args.os_user), shell=True, check=True)
        subprocess.run('mkdir -p /home/' + args.os_user + '/.sparkmagic', shell=True, check=True)
        subprocess.run('cp -f /tmp/dataengine-service_sparkmagic_config.json /home/' + args.os_user + '/.sparkmagic/config.json',
                       shell=True, check=True)
        # This script's CLI defines --headnode_ip but not --master_ip/--livy_port
        # (the original referenced args.master_ip/args.livy_port, which would
        # raise AttributeError here). Fall back to the head node and the default
        # Livy port when the caller did not inject those attributes.
        master_ip = getattr(args, 'master_ip', args.headnode_ip)
        livy_port = getattr(args, 'livy_port', '8998')
        # Original line was a syntax error: shell=True/check=True were passed
        # to .format() and subprocess.run's parenthesis was never closed.
        subprocess.run('sed -i \'s|HEADNODEIP:PORT|{0}:{2}|g\' /home/{1}/.sparkmagic/config.json'.format(
            master_ip, args.os_user, livy_port), shell=True, check=True)
        subprocess.run('sudo chown -R {0}:{0} /home/{0}/.sparkmagic/'.format(args.os_user), shell=True, check=True)
    except Exception as err:
        # Report the failure instead of swallowing it silently (bare except).
        print('Failed to install sparkmagic kernels: {}'.format(err))
        sys.exit(1)
+
if __name__ == "__main__":
    # Dry-run mode only prints the CLI usage; otherwise perform the install.
    if args.dry_run != 'true':
        install_sparkamagic_kernels(args)
    else:
        parser.print_help()
diff --git a/infrastructure-provisioning/src/general/scripts/azure/jupyter_install_dataengine-service_kernels.py b/infrastructure-provisioning/src/general/scripts/azure/jupyter_install_dataengine-service_kernels.py
new file mode 100644
index 000000000..5136a4873
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/jupyter_install_dataengine-service_kernels.py
@@ -0,0 +1,104 @@
#!/usr/bin/python3

# *****************************************************************************
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
#
# ******************************************************************************

# Installs sparkmagic Jupyter kernels on a notebook VM so it can talk to an
# Azure HDInsight dataengine-service cluster via Livy.

import argparse
import os
import sys  # fix: sys.exit() is called below but sys was never imported
from datalab.actions_lib import *
from datalab.meta_lib import *
from datalab.fab import *
from fabric import *

# All arguments arrive as strings from the provisioning orchestrator.
parser = argparse.ArgumentParser()
parser.add_argument('--bucket', type=str, default='')
parser.add_argument('--cluster_name', type=str, default='')
parser.add_argument('--dry_run', type=str, default='false')
parser.add_argument('--hdinsight_version', type=str, default='')
parser.add_argument('--keyfile', type=str, default='')
parser.add_argument('--region', type=str, default='')
parser.add_argument('--notebook_ip', type=str, default='')
parser.add_argument('--scala_version', type=str, default='')
parser.add_argument('--edge_user_name', type=str, default='')
parser.add_argument('--project_name', type=str, default='')
parser.add_argument('--os_user', type=str, default='')
parser.add_argument('--edge_hostname', type=str, default='')
parser.add_argument('--proxy_port', type=str, default='')
parser.add_argument('--pip_mirror', type=str, default='')
parser.add_argument('--application', type=str, default='')
parser.add_argument('--headnode_ip', type=str, default='')
args = parser.parse_args()
+
def configure_notebook(args):
    """Stage config templates, scripts and datalab libs on the notebook VM.

    Uses the already-initialised fabric connection (datalab.fab.conn) to push
    the sparkmagic config template and the application-specific
    create_configs script to the remote host, then rsyncs the local datalab
    python libraries into the remote python3.8 site tree.
    """
    templates_dir = '/root/templates/'
    # NOTE(review): files_dir appears unused in this function — confirm before removing.
    files_dir = '/root/files/'
    scripts_dir = '/root/scripts/'
    datalab.fab.conn.put(templates_dir + 'dataengine-service_sparkmagic_config.json', '/tmp/dataengine-service_sparkmagic_config.json')
    datalab.fab.conn.put(scripts_dir + '{}_dataengine-service_create_configs.py'.format(args.application), '/tmp/create_configs.py')
    # \cp bypasses any cp alias (e.g. cp -i) so the copy never prompts.
    datalab.fab.conn.sudo('\cp /tmp/create_configs.py /usr/local/bin/create_configs.py')
    datalab.fab.conn.sudo('chmod 755 /usr/local/bin/create_configs.py')
    datalab.fab.conn.sudo('mkdir -p /usr/lib/python3.8/datalab/')
    datalab.fab.conn.run('mkdir -p /tmp/datalab_libs/')
    host_string = args.os_user + "@" + args.notebook_ip
    # Runs on the provisioning container: copy the local datalab libs to the VM.
    datalab.fab.conn.local('rsync -e "ssh -i {}" /usr/lib/python3.8/datalab/*.py {}:/tmp/datalab_libs/'.format(args.keyfile, host_string))
    datalab.fab.conn.run('chmod a+x /tmp/datalab_libs/*')
    datalab.fab.conn.sudo('mv /tmp/datalab_libs/* /usr/lib/python3.8/datalab/')
    # On distros with /usr/lib64 (e.g. RHEL-family), mirror the libs there too.
    if exists(datalab.fab.conn, '/usr/lib64'):
        datalab.fab.conn.sudo('mkdir -p /usr/lib64/python3.8')
        datalab.fab.conn.sudo('ln -fs /usr/lib/python3.8/datalab /usr/lib64/python3.8/datalab')
+
def install_sparkamagic_kernels(args):
    """Install sparkmagic PySpark/Spark kernels on the remote notebook VM.

    Mirrors jupyter_dataengine-service_create_configs.py but executes every
    step over the fabric connection (datalab.fab.conn). Expects the caller to
    have injected args.master_ip, args.livy_port, args.python_version and
    args.spark_version; falls back to the head node IP and the default Livy
    port when the first two are absent. Exits with status 1 on failure.
    """
    try:
        datalab.fab.conn.sudo('jupyter nbextension enable --py --sys-prefix widgetsnbextension')
        # Locate the installed sparkmagic package to find its bundled kernel specs.
        sparkmagic_dir = datalab.fab.conn.sudo(''' bash -l -c 'pip3 show sparkmagic | grep "Location: "' ''').stdout.rstrip("\n\r").split(' ')[1]
        datalab.fab.conn.sudo('jupyter-kernelspec install {}/sparkmagic/kernels/sparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user))
        datalab.fab.conn.sudo('jupyter-kernelspec install {}/sparkmagic/kernels/pysparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user))
        # Rename the kernels so the Jupyter UI shows cluster and version info.
        pyspark_kernel_name = 'PySpark (Python-{0} / Spark-{1} ) [{2}]'.format(args.python_version, args.spark_version,
                                                                              args.cluster_name)
        datalab.fab.conn.sudo('sed -i \'s|PySpark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/pysparkkernel/kernel.json'.format(
            pyspark_kernel_name, args.os_user))
        spark_kernel_name = 'Spark (Scala-{0} / Spark-{1} ) [{2}]'.format(args.scala_version, args.spark_version,
                                                                          args.cluster_name)
        datalab.fab.conn.sudo('sed -i \'s|Spark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkkernel/kernel.json'.format(
            spark_kernel_name, args.os_user))
        datalab.fab.conn.sudo('mkdir -p /home/' + args.os_user + '/.sparkmagic')
        datalab.fab.conn.sudo('cp -f /tmp/dataengine-service_sparkmagic_config.json /home/' + args.os_user + '/.sparkmagic/config.json')
        # Point the sparkmagic config at the cluster's Livy endpoint.
        master_ip = getattr(args, 'master_ip', args.headnode_ip)
        livy_port = getattr(args, 'livy_port', '8998')
        datalab.fab.conn.sudo('sed -i \'s|HEADNODEIP:PORT|{0}:{2}|g\' /home/{1}/.sparkmagic/config.json'.format(
            master_ip, args.os_user, livy_port))
        datalab.fab.conn.sudo('chown -R {0}:{0} /home/{0}/.sparkmagic/'.format(args.os_user))
    except Exception as err:
        # Report the failure instead of swallowing it silently (bare except).
        print('Failed to install sparkmagic kernels: {}'.format(err))
        sys.exit(1)
+
+
if __name__ == "__main__":
    # NOTE(review): `global` at module level is a no-op — kept as-is; `conn`
    # is presumably the shared connection the datalab fabric helpers use.
    global conn
    # Open the SSH session to the notebook VM for all subsequent fabric calls.
    conn = init_datalab_connection(args.notebook_ip, args.os_user, args.keyfile)
    configure_notebook(args)
    # Hard-coded software versions for the current HDInsight image; injected
    # into `args` because install_sparkamagic_kernels reads them from there.
    args.spark_version = '3.1.2'
    args.python_version = '3.8.10'
    args.livy_port = '8998'
    args.master_ip = args.headnode_ip
    install_sparkamagic_kernels(args)
diff --git a/infrastructure-provisioning/src/general/templates/azure/dataengine-service_sparkmagic_config.json b/infrastructure-provisioning/src/general/templates/azure/dataengine-service_sparkmagic_config.json
new file mode 100644
index 000000000..d9907b454
--- /dev/null
+++ b/infrastructure-provisioning/src/general/templates/azure/dataengine-service_sparkmagic_config.json
@@ -0,0 +1,17 @@
+{
+ "kernel_python_credentials" : {
+ "username": "",
+ "password": "",
+ "url": "http://HEADNODEIP:PORT/",
+ "auth": "None"
+ },
+ "kernel_scala_credentials" : {
+ "username": "",
+ "password": "",
+ "url": "http://HEADNODEIP:PORT/",
+ "auth": "None"
+ },
+ "custom_headers" : {
+ "X-Requested-By": "livy"
+ }
+}
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datalab.apache.org
For additional commands, e-mail: commits-help@datalab.apache.org