Posted to commits@datalab.apache.org by op...@apache.org on 2022/09/27 17:41:16 UTC

[incubator-datalab] branch DATALAB-2914 created (now db6f2f746)

This is an automated email from the ASF dual-hosted git repository.

opolishchuk pushed a change to branch DATALAB-2914
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git


      at db6f2f746 [DATALAB-2914]: added scripts to integrate jupyter with hdinsight

This branch includes the following new commits:

     new db6f2f746 [DATALAB-2914]: added scripts to integrate jupyter with hdinsight

The 1 revision listed above as "new" is entirely new to this
repository and will be described in a separate email.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.



---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datalab.apache.org
For additional commands, e-mail: commits-help@datalab.apache.org


[incubator-datalab] 01/01: [DATALAB-2914]: added scripts to integrate jupyter with hdinsight

Posted by op...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

opolishchuk pushed a commit to branch DATALAB-2914
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git

commit db6f2f7462612618629ecd2350aae686758f4b73
Author: Oleksandr Polishchuk <po...@gmail.com>
AuthorDate: Tue Sep 27 20:40:14 2022 +0300

    [DATALAB-2914]: added scripts to integrate jupyter with hdinsight
---
 .../files/azure/dataengine-service_Dockerfile      |   2 +
 .../src/general/files/azure/jupyter_Dockerfile     |   2 +
 .../jupyter_dataengine-service_create_configs.py   |  88 +++++++++++++++++
 .../jupyter_install_dataengine-service_kernels.py  | 104 +++++++++++++++++++++
 .../dataengine-service_sparkmagic_config.json      |  17 ++++
 5 files changed, 213 insertions(+)
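
The flow introduced here: the Dockerfiles ship a sparkmagic config template
containing a HEADNODEIP:PORT placeholder, the install script pushes it to the
notebook, and the kernel-setup step substitutes the HDInsight head node IP
and the Livy port (8998, hard-coded by the install script). A minimal
standalone sketch of that substitution; the template path and head node IP
are illustrative, not part of the commit:

    #!/usr/bin/python3
    # Sketch only: render the sparkmagic config template the way the new
    # scripts do. Path and IP below are placeholders for illustration.
    import json

    def render_sparkmagic_config(template_path, headnode_ip, livy_port='8998'):
        with open(template_path) as f:
            text = f.read().replace('HEADNODEIP:PORT',
                                    '{}:{}'.format(headnode_ip, livy_port))
        return json.loads(text)  # parse to confirm the result is valid JSON

    if __name__ == '__main__':
        cfg = render_sparkmagic_config(
            'dataengine-service_sparkmagic_config.json', '10.0.0.4')
        print(cfg['kernel_python_credentials']['url'])  # http://10.0.0.4:8998/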

diff --git a/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile b/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile
index 966aaa5c2..cd9ea3ba8 100644
--- a/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile
+++ b/infrastructure-provisioning/src/general/files/azure/dataengine-service_Dockerfile
@@ -34,6 +34,8 @@ COPY general/templates/os/inactive.sh /root/templates/
 COPY general/templates/os/inactive.service /root/templates/
 COPY general/templates/os/inactive.timer /root/templates/
 COPY general/templates/azure/dataengine-service_interpreter_livy.json /root/templates/dataengine-service_interpreter_livy.json
+COPY general/templates/azure/dataengine-service_sparkmagic_config.json /root/templates/dataengine-service_sparkmagic_config.json
+
 
 RUN chmod a+x /root/fabfile.py; \
     chmod a+x /root/scripts/*
diff --git a/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile b/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile
index 45736846c..18da125e8 100644
--- a/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile
+++ b/infrastructure-provisioning/src/general/files/azure/jupyter_Dockerfile
@@ -45,6 +45,8 @@ COPY general/templates/os/inactive.sh /root/templates/
 COPY general/templates/os/inactive.service /root/templates/
 COPY general/templates/os/inactive.timer /root/templates/
 COPY general/templates/azure/core-site* /root/templates/
+COPY general/templates/azure/dataengine-service_sparkmagic_config.json /root/templates/
+
 
 
 RUN chmod a+x /root/fabfile.py; \
diff --git a/infrastructure-provisioning/src/general/scripts/azure/jupyter_dataengine-service_create_configs.py b/infrastructure-provisioning/src/general/scripts/azure/jupyter_dataengine-service_create_configs.py
new file mode 100644
index 000000000..488b782b6
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/jupyter_dataengine-service_create_configs.py
@@ -0,0 +1,88 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import argparse
+import sys
+import subprocess
+from datalab.actions_lib import *
+from datalab.common_lib import *
+from datalab.fab import *
+from datalab.notebook_lib import *
+from fabric import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--bucket', type=str, default='')
+parser.add_argument('--cluster_name', type=str, default='')
+parser.add_argument('--dry_run', type=str, default='false')
+parser.add_argument('--hdinsight_version', type=str, default='')
+parser.add_argument('--dataproc_version', type=str, default='')
+parser.add_argument('--spark_version', type=str, default='')
+parser.add_argument('--hadoop_version', type=str, default='')
+parser.add_argument('--region', type=str, default='')
+parser.add_argument('--user_name', type=str, default='')
+parser.add_argument('--os_user', type=str, default='')
+parser.add_argument('--pip_mirror', type=str, default='')
+parser.add_argument('--application', type=str, default='')
+parser.add_argument('--scala_version', type=str, default='')
+parser.add_argument('--python_version', type=str, default='')
+parser.add_argument('--headnode_ip', type=str, default='')
+args = parser.parse_args()
+
+hdinsight_dir = '/opt/{}/jars/'.format(args.hdinsight_version)
+kernels_dir = '/home/{}/.local/share/jupyter/kernels/'.format(args.os_user)
+spark_dir = '/opt/{}/{}/spark/'.format(args.hdinsight_version, args.cluster_name)
+yarn_dir = '/opt/{}/{}/conf/'.format(args.hdinsight_version, args.cluster_name)
+
+
+def install_sparkmagic_kernels(args):
+    try:
+        subprocess.run('sudo jupyter nbextension enable --py --sys-prefix widgetsnbextension', shell=True, check=True)
+        sparkmagic_dir = subprocess.run("sudo pip3 show sparkmagic | grep 'Location: ' | awk '{print $2}'", capture_output=True, shell=True, check=True).stdout.decode('UTF-8').rstrip("\n\r")
+        subprocess.run('sudo jupyter-kernelspec install {}/sparkmagic/kernels/sparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user), shell=True, check=True)
+        subprocess.run('sudo jupyter-kernelspec install {}/sparkmagic/kernels/pysparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user), shell=True, check=True)
+        #subprocess.run('sudo jupyter-kernelspec install {}/sparkmagic/kernels/sparkrkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user), shell=True, check=True)
+        pyspark_kernel_name = 'PySpark (Python-{0} / Spark-{1}) [{2}]'.format(args.python_version, args.spark_version,
+                                                                         args.cluster_name)
+        subprocess.run('sed -i \'s|PySpark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/pysparkkernel/kernel.json'.format(
+            pyspark_kernel_name, args.os_user), shell=True, check=True)
+        spark_kernel_name = 'Spark (Scala-{0} / Spark-{1}) [{2}]'.format(args.scala_version, args.spark_version,
+                                                                         args.cluster_name)
+        subprocess.run('sed -i \'s|Spark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkkernel/kernel.json'.format(
+            spark_kernel_name, args.os_user), shell=True, check=True)
+        #sparkr_kernel_name = 'SparkR (R-{0} / Spark-{1} ) [{2}]'.format(args.r_version, args.spark_version,
+        #                                                                    args.cluster_name)
+        #subprocess.run('sed -i \'s|SparkR|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkrkernel/kernel.json'.format(
+        #    sparkr_kernel_name, args.os_user), shell=True, check=True)
+        subprocess.run('mkdir -p /home/' + args.os_user + '/.sparkmagic', shell=True, check=True)
+        subprocess.run('cp -f /tmp/dataengine-service_sparkmagic_config.json /home/' + args.os_user + '/.sparkmagic/config.json', shell=True, check=True)
+        subprocess.run('sed -i \'s|HEADNODEIP:PORT|{0}:8998|g\' /home/{1}/.sparkmagic/config.json'.format(
+                args.headnode_ip, args.os_user), shell=True, check=True)  # 8998: default Livy port
+        subprocess.run('sudo chown -R {0}:{0} /home/{0}/.sparkmagic/'.format(args.os_user), shell=True, check=True)
+    except Exception:
+        sys.exit(1)
+
+if __name__ == "__main__":
+    if args.dry_run == 'true':
+        parser.print_help()
+    else:
+        install_sparkmagic_kernels(args)
diff --git a/infrastructure-provisioning/src/general/scripts/azure/jupyter_install_dataengine-service_kernels.py b/infrastructure-provisioning/src/general/scripts/azure/jupyter_install_dataengine-service_kernels.py
new file mode 100644
index 000000000..5136a4873
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/jupyter_install_dataengine-service_kernels.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import argparse
+import sys
+from datalab.actions_lib import *
+from datalab.meta_lib import *
+from datalab.fab import *
+from fabric import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--bucket', type=str, default='')
+parser.add_argument('--cluster_name', type=str, default='')
+parser.add_argument('--dry_run', type=str, default='false')
+parser.add_argument('--hdinsight_version', type=str, default='')
+parser.add_argument('--keyfile', type=str, default='')
+parser.add_argument('--region', type=str, default='')
+parser.add_argument('--notebook_ip', type=str, default='')
+parser.add_argument('--scala_version', type=str, default='')
+parser.add_argument('--edge_user_name', type=str, default='')
+parser.add_argument('--project_name', type=str, default='')
+parser.add_argument('--os_user', type=str, default='')
+parser.add_argument('--edge_hostname', type=str, default='')
+parser.add_argument('--proxy_port', type=str, default='')
+parser.add_argument('--pip_mirror', type=str, default='')
+parser.add_argument('--application', type=str, default='')
+parser.add_argument('--headnode_ip', type=str, default='')
+args = parser.parse_args()
+
+def configure_notebook(args):
+    templates_dir = '/root/templates/'
+    files_dir = '/root/files/'
+    scripts_dir = '/root/scripts/'
+    datalab.fab.conn.put(templates_dir + 'dataengine-service_sparkmagic_config.json', '/tmp/dataengine-service_sparkmagic_config.json')
+    datalab.fab.conn.put(scripts_dir + '{}_dataengine-service_create_configs.py'.format(args.application), '/tmp/create_configs.py')
+    datalab.fab.conn.sudo('\\cp /tmp/create_configs.py /usr/local/bin/create_configs.py')  # \cp bypasses any cp alias
+    datalab.fab.conn.sudo('chmod 755 /usr/local/bin/create_configs.py')
+    datalab.fab.conn.sudo('mkdir -p /usr/lib/python3.8/datalab/')
+    datalab.fab.conn.run('mkdir -p /tmp/datalab_libs/')
+    host_string = args.os_user + "@" + args.notebook_ip
+    datalab.fab.conn.local('rsync -e "ssh -i {}" /usr/lib/python3.8/datalab/*.py {}:/tmp/datalab_libs/'.format(args.keyfile, host_string))
+    datalab.fab.conn.run('chmod a+x /tmp/datalab_libs/*')
+    datalab.fab.conn.sudo('mv /tmp/datalab_libs/* /usr/lib/python3.8/datalab/')
+    if exists(datalab.fab.conn, '/usr/lib64'):
+        datalab.fab.conn.sudo('mkdir -p /usr/lib64/python3.8')
+        datalab.fab.conn.sudo('ln -fs /usr/lib/python3.8/datalab /usr/lib64/python3.8/datalab')
+
+def install_sparkmagic_kernels(args):
+    try:
+        datalab.fab.conn.sudo('jupyter nbextension enable --py --sys-prefix widgetsnbextension')
+        sparkmagic_dir = datalab.fab.conn.sudo(''' bash -l -c 'pip3 show sparkmagic | grep "Location: "' ''').stdout.rstrip("\n\r").split(' ')[1]
+        datalab.fab.conn.sudo('jupyter-kernelspec install {}/sparkmagic/kernels/sparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user))
+        datalab.fab.conn.sudo('jupyter-kernelspec install {}/sparkmagic/kernels/pysparkkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user))
+        #datalab.fab.conn.sudo('jupyter-kernelspec install {}/sparkmagic/kernels/sparkrkernel --prefix=/home/{}/.local/'.format(sparkmagic_dir, args.os_user))
+        pyspark_kernel_name = 'PySpark (Python-{0} / Spark-{1}) [{2}]'.format(args.python_version, args.spark_version,
+                                                                         args.cluster_name)
+        datalab.fab.conn.sudo('sed -i \'s|PySpark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/pysparkkernel/kernel.json'.format(
+            pyspark_kernel_name, args.os_user))
+        spark_kernel_name = 'Spark (Scala-{0} / Spark-{1}) [{2}]'.format(args.scala_version, args.spark_version,
+                                                                         args.cluster_name)
+        datalab.fab.conn.sudo('sed -i \'s|Spark|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkkernel/kernel.json'.format(
+            spark_kernel_name, args.os_user))
+        #sparkr_kernel_name = 'SparkR (R-{0} / Spark-{1} ) [{2}]'.format(args.r_version, args.spark_version,
+        #                                                                   args.cluster_name)
+        #datalab.fab.conn.sudo('sed -i \'s|SparkR|{0}|g\' /home/{1}/.local/share/jupyter/kernels/sparkrkernel/kernel.json'.format(
+        #    sparkr_kernel_name, args.os_user))
+        datalab.fab.conn.sudo('mkdir -p /home/' + args.os_user + '/.sparkmagic')
+        datalab.fab.conn.sudo('cp -f /tmp/dataengine-service_sparkmagic_config.json /home/' + args.os_user + '/.sparkmagic/config.json')
+        datalab.fab.conn.sudo('sed -i \'s|HEADNODEIP:PORT|{0}:{2}|g\' /home/{1}/.sparkmagic/config.json'.format(
+                args.master_ip, args.os_user, args.livy_port))
+        datalab.fab.conn.sudo('chown -R {0}:{0} /home/{0}/.sparkmagic/'.format(args.os_user))
+    except Exception:
+        sys.exit(1)
+
+
+if __name__ == "__main__":
+    global conn
+    conn = init_datalab_connection(args.notebook_ip, args.os_user, args.keyfile)
+    configure_notebook(args)
+    args.spark_version = '3.1.2'
+    args.python_version = '3.8.10'
+    args.livy_port = '8998'
+    args.master_ip = args.headnode_ip
+    install_sparkmagic_kernels(args)
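
Both copies of the kernel-install routine follow the same pattern:
jupyter-kernelspec install copies sparkmagic's bundled kernelspecs under the
user's ~/.local prefix, then sed rewrites each kernel.json so the display
name embeds the Python/Scala version, Spark version, and cluster name. A
stricter equivalent of that rewrite, sketched in Python; the path and
display name are illustrative, and the commit itself shells out to sed:

    #!/usr/bin/python3
    # Sketch only: rewrite the display_name field, which is what the sed
    # substitution in the scripts effectively changes.
    import json

    def rename_kernel(kernel_json_path, display_name):
        with open(kernel_json_path) as f:
            spec = json.load(f)
        spec['display_name'] = display_name
        with open(kernel_json_path, 'w') as f:
            json.dump(spec, f, indent=2)

    rename_kernel('/home/datalab-user/.local/share/jupyter/kernels/'
                  'pysparkkernel/kernel.json',
                  'PySpark (Python-3.8.10 / Spark-3.1.2) [my-cluster]')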
diff --git a/infrastructure-provisioning/src/general/templates/azure/dataengine-service_sparkmagic_config.json b/infrastructure-provisioning/src/general/templates/azure/dataengine-service_sparkmagic_config.json
new file mode 100644
index 000000000..d9907b454
--- /dev/null
+++ b/infrastructure-provisioning/src/general/templates/azure/dataengine-service_sparkmagic_config.json
@@ -0,0 +1,17 @@
+{
+  "kernel_python_credentials" : {
+    "username": "",
+    "password": "",
+    "url": "http://HEADNODEIP:PORT/",
+    "auth": "None"
+  },
+  "kernel_scala_credentials" : {
+    "username": "",
+    "password": "",
+    "url": "http://HEADNODEIP:PORT/",
+    "auth": "None"
+  },
+  "custom_headers" : {
+    "X-Requested-By": "livy"
+  }
+}
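
With the placeholder substituted, sparkmagic reads ~/.sparkmagic/config.json
and points both kernels at Livy on the HDInsight head node; the
X-Requested-By header above is the one Livy's CSRF protection expects on
state-changing requests. A hypothetical smoke test (not part of the commit,
standard library only) to confirm the endpoint answers:

    #!/usr/bin/python3
    # Sketch only: read the rendered config and list active Livy sessions.
    import json
    import os
    import urllib.request

    with open(os.path.expanduser('~/.sparkmagic/config.json')) as f:
        url = json.load(f)['kernel_python_credentials']['url']

    # GET /sessions returns JSON describing the active Livy sessions.
    req = urllib.request.Request(url.rstrip('/') + '/sessions',
                                 headers={'X-Requested-By': 'livy'})
    with urllib.request.urlopen(req, timeout=10) as r:
        print(json.loads(r.read().decode())['sessions'])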


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@datalab.apache.org
For additional commands, e-mail: commits-help@datalab.apache.org