Posted to commits@datalab.apache.org by lf...@apache.org on 2022/09/20 13:50:03 UTC

[incubator-datalab] 01/01: [DATALAB-2998]: added new files for zeppelin dataengine-service connection

This is an automated email from the ASF dual-hosted git repository.

lfrolov pushed a commit to branch DATALAB-2998
in repository https://gitbox.apache.org/repos/asf/incubator-datalab.git

commit e6d0b171333cf7fd1e65b55a34543965b811cd5a
Author: leonidfrolov <fr...@gmail.com>
AuthorDate: Tue Sep 20 16:49:48 2022 +0300

    [DATALAB-2998]: added new files for zeppelin dataengine-service connection
---
 .../src/general/lib/os/fab.py                      |   6 +
 ...common_notebook_configure_dataengine-service.py | 129 +++++++++++++++++++++
 .../zeppelin_dataengine-service_create_configs.py  |  94 +++++++++++++++
 .../zeppelin_install_dataengine-service_kernels.py | 116 ++++++++++++++++++
 4 files changed, 345 insertions(+)

diff --git a/infrastructure-provisioning/src/general/lib/os/fab.py b/infrastructure-provisioning/src/general/lib/os/fab.py
index 6635258d2..928a6b5c4 100644
--- a/infrastructure-provisioning/src/general/lib/os/fab.py
+++ b/infrastructure-provisioning/src/general/lib/os/fab.py
@@ -1428,3 +1428,9 @@ def update_pyopenssl_lib(os_user):
             conn.sudo('touch /home/{}/.ensure_dir/pyopenssl_updated'.format(os_user))
         except:
             sys.exit(1)
+
+def get_hdinsight_headnode_private_ip(os_user, cluster_name, keyfile):
+    # connect to the HDInsight cluster SSH endpoint and read the headnode private IP from /etc/hosts
+    conn = init_datalab_connection('{}-ssh.azurehdinsight.net'.format(cluster_name), os_user, keyfile)
+    headnode_private_ip = conn.sudo("cat /etc/hosts | grep headnode | awk '{print $1}'").stdout.strip()
+    conn.close()
+    return headnode_private_ip
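
For context, a minimal sketch of how the new helper could be called from a provisioning script once the libs are synced to /usr/lib/python3.8/datalab/ (the OS user, cluster name and key path below are hypothetical values, not taken from this commit):

    from datalab.fab import get_hdinsight_headnode_private_ip

    # hypothetical values; real scripts take these from os.environ / notebook_config
    headnode_ip = get_hdinsight_headnode_private_ip(
        os_user='datalab-user',
        cluster_name='sbn-prj-endp-des-cluster1',
        keyfile='/root/keys/KEY.pem')
    print(headnode_ip)
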
diff --git a/infrastructure-provisioning/src/general/scripts/azure/common_notebook_configure_dataengine-service.py b/infrastructure-provisioning/src/general/scripts/azure/common_notebook_configure_dataengine-service.py
new file mode 100644
index 000000000..92484657f
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/common_notebook_configure_dataengine-service.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import datalab.fab
+import datalab.actions_lib
+import datalab.meta_lib
+import json
+from datalab.logger import logging
+import os
+import sys
+import traceback
+import subprocess
+from fabric import *
+
+
+def clear_resources():
+    AzureActions.terminate_hdinsight_cluster(notebook_config['resource_group_name'], notebook_config['cluster_name'])
+    for storage_account in AzureMeta.list_storage_accounts(notebook_config['resource_group_name']):
+        if notebook_config['storage_account_name_tag'] == storage_account.tags["Name"]:
+            AzureActions.remove_storage_account(notebook_config['resource_group_name'], storage_account.name)
+    # AzureActions.remove_kernels(notebook_config['notebook_name'], notebook_config['cluster_name'],
+    #                             os.environ['dataproc_version'], os.environ['conf_os_user'], notebook_config['key_path'])
+
+
+if __name__ == "__main__":
+    # generating variables dictionary
+    AzureMeta = datalab.meta_lib.AzureMeta()
+    AzureActions = datalab.actions_lib.AzureActions()
+    logging.info('Generating infrastructure names and tags')
+    notebook_config = dict()
+    notebook_config['resource_group_name'] = os.environ['azure_resource_group_name']
+    notebook_config['service_base_name'] = (os.environ['conf_service_base_name'])
+    notebook_config['notebook_name'] = os.environ['notebook_instance_name']
+    notebook_config['edge_user_name'] = (os.environ['edge_user_name'])
+    notebook_config['project_name'] = (os.environ['project_name']).replace('_', '-').lower()
+    notebook_config['project_tag'] = notebook_config['project_name']
+    notebook_config['endpoint_name'] = (os.environ['endpoint_name']).replace('_', '-').lower()
+    notebook_config['endpoint_tag'] = notebook_config['endpoint_name']
+    notebook_config['tag_name'] = notebook_config['service_base_name'] + '-tag'
+    notebook_config['bucket_name'] = '{0}-{1}-{2}-bucket'.format(notebook_config['service_base_name'],
+                                                                 notebook_config['project_name'],
+                                                                 notebook_config['endpoint_name'])
+    notebook_config['computational_name'] = os.environ['computational_name'].replace('_', '-').lower()
+    notebook_config['cluster_name'] = '{}-{}-{}-des-{}'.format(notebook_config['service_base_name'],
+                                                               notebook_config['project_name'],
+                                                               notebook_config['endpoint_name'],
+                                                               notebook_config['computational_name'])
+    notebook_config['storage_account_name_tag'] = ('{}-bucket'.format(notebook_config['cluster_name'])).lower()
+    notebook_config['notebook_ip'] = AzureMeta.get_private_ip_address(notebook_config['resource_group_name'],
+                                                                      notebook_config['notebook_name'])
+    notebook_config['key_path'] = '{0}{1}.pem'.format(os.environ['conf_key_dir'], os.environ['conf_key_name'])
+    edge_instance_name = '{0}-{1}-{2}-edge'.format(notebook_config['service_base_name'],
+                                                   notebook_config['project_name'], notebook_config['endpoint_tag'])
+    edge_instance_hostname = AzureMeta.get_private_ip_address(notebook_config['resource_group_name'],
+                                                              edge_instance_name)
+
+    if os.environ['application'] == 'deeplearning':
+        application = 'jupyter'
+    else:
+        application = os.environ['application']
+
+    try:
+        logging.info('[INSTALLING KERNELS INTO SPECIFIED NOTEBOOK]')
+        params = "--bucket {} --cluster_name {} --dataproc_version {} --keyfile {} --notebook_ip {} --region {} " \
+                 "--edge_user_name {} --project_name {} --os_user {}  --edge_hostname {} --proxy_port {} " \
+                 "--scala_version {} --application {}" \
+            .format(notebook_config['storage_account_name_tag'], notebook_config['cluster_name'], os.environ['dataproc_version'],
+                    notebook_config['key_path'], notebook_config['notebook_ip'], os.environ['gcp_region'],
+                    notebook_config['edge_user_name'], notebook_config['project_name'], os.environ['conf_os_user'],
+                    edge_instance_hostname, '3128', os.environ['notebook_scala_version'], os.environ['application'])
+        try:
+            subprocess.run("~/scripts/{}_{}.py {}".format(application, 'install_dataengine-service_kernels', params), 
+                           shell=True, check=True)
+        except:
+            traceback.print_exc()
+            raise Exception
+    except Exception as err:
+        clear_resources()
+        datalab.fab.append_result("Failed installing Dataproc kernels.", str(err))
+        sys.exit(1)
+
+    try:
+        logging.info('[UPDATING SPARK CONFIGURATION FILES ON NOTEBOOK]')
+        params = "--hostname {0} " \
+                 "--keyfile {1} " \
+                 "--os_user {2} " \
+            .format(notebook_config['notebook_ip'],
+                    notebook_config['key_path'],
+                    os.environ['conf_os_user'])
+        try:
+            subprocess.run("~/scripts/{0}.py {1}".format('common_configure_spark', params), shell=True, check=True)
+        except:
+            traceback.print_exc()
+            raise Exception
+    except Exception as err:
+        datalab.fab.append_result("Failed to configure Spark.", str(err))
+        clear_resources()
+        sys.exit(1)
+
+    try:
+        with open("/root/result.json", 'w') as result:
+            res = {"notebook_name": notebook_config['notebook_name'],
+                   "Tag_name": notebook_config['tag_name'],
+                   "Action": "Configure notebook server"}
+            logging.info(json.dumps(res))
+            result.write(json.dumps(res))
+    except Exception as err:
+        datalab.fab.append_result("Error with writing results", str(err))
+        clear_resources()
+        sys.exit(1)
diff --git a/infrastructure-provisioning/src/general/scripts/azure/zeppelin_dataengine-service_create_configs.py b/infrastructure-provisioning/src/general/scripts/azure/zeppelin_dataengine-service_create_configs.py
new file mode 100644
index 000000000..f645b64b4
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/zeppelin_dataengine-service_create_configs.py
@@ -0,0 +1,94 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import argparse
+import subprocess
+from datalab.actions_lib import jars, yarn, install_emr_spark, spark_defaults, installing_python, configure_zeppelin_emr_interpreter
+from datalab.common_lib import *
+from datalab.fab import configuring_notebook, update_zeppelin_interpreters
+from datalab.notebook_lib import *
+from fabric import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--bucket', type=str, default='')
+parser.add_argument('--cluster_name', type=str, default='')
+parser.add_argument('--dry_run', type=str, default='false')
+parser.add_argument('--emr_version', type=str, default='')
+parser.add_argument('--spark_version', type=str, default='')
+parser.add_argument('--scala_version', type=str, default='')
+parser.add_argument('--hadoop_version', type=str, default='')
+parser.add_argument('--matplotlib_version', type=str, default='')
+parser.add_argument('--region', type=str, default='')
+parser.add_argument('--excluded_lines', type=str, default='')
+parser.add_argument('--project_name', type=str, default='')
+parser.add_argument('--os_user', type=str, default='')
+parser.add_argument('--edge_hostname', type=str, default='')
+parser.add_argument('--proxy_port', type=str, default='')
+parser.add_argument('--livy_version', type=str, default='')
+parser.add_argument('--multiple_clusters', type=str, default='')
+parser.add_argument('--numpy_version', type=str, default='')
+parser.add_argument('--application', type=str, default='')
+parser.add_argument('--r_enabled', type=str, default='')
+args = parser.parse_args()
+
+emr_dir = '/opt/' + args.emr_version + '/jars/'
+kernels_dir = '/home/' + args.os_user + '/.local/share/jupyter/kernels/'
+spark_dir = '/opt/' + args.emr_version + '/' + args.cluster_name + '/spark/'
+yarn_dir = '/opt/' + args.emr_version + '/' + args.cluster_name + '/conf/'
+
+
+def install_remote_livy(args):
+    subprocess.run('sudo chown ' + args.os_user + ':' + args.os_user + ' -R /opt/zeppelin/', shell=True, check=True)
+    subprocess.run('sudo service zeppelin-notebook stop', shell=True, check=True)
+    subprocess.run('sudo -i wget http://archive.cloudera.com/beta/livy/livy-server-' + args.livy_version + '.zip -O /opt/'
+          + args.emr_version + '/' + args.cluster_name + '/livy-server-' + args.livy_version + '.zip', shell=True, check=True)
+    subprocess.run('sudo unzip /opt/'
+          + args.emr_version + '/' + args.cluster_name + '/livy-server-' + args.livy_version + '.zip -d /opt/'
+          + args.emr_version + '/' + args.cluster_name + '/', shell=True, check=True)
+    subprocess.run('sudo mv /opt/' + args.emr_version + '/' + args.cluster_name + '/livy-server-' + args.livy_version +
+          '/ /opt/' + args.emr_version + '/' + args.cluster_name + '/livy/', shell=True, check=True)
+    livy_path = '/opt/' + args.emr_version + '/' + args.cluster_name + '/livy/'
+    subprocess.run('sudo mkdir -p ' + livy_path + '/logs', shell=True, check=True)
+    subprocess.run('sudo mkdir -p /var/run/livy', shell=True, check=True)
+    subprocess.run('sudo chown ' + args.os_user + ':' + args.os_user + ' -R /var/run/livy', shell=True, check=True)
+    subprocess.run('sudo chown ' + args.os_user + ':' + args.os_user + ' -R ' + livy_path, shell=True, check=True)
+
+
+if __name__ == "__main__":
+    if args.dry_run == 'true':
+        parser.print_help()
+    else:
+        result = prepare(emr_dir, yarn_dir)
+        # download the cluster jars only when prepare() reports they are not already present locally
+        if not result:
+            jars(args, emr_dir)
+        yarn(args, yarn_dir)
+        install_emr_spark(args)
+        spark_defaults(args)
+        configuring_notebook(args.emr_version)
+        if args.multiple_clusters == 'true':
+            install_remote_livy(args)
+        installing_python(args.region, args.bucket, args.project_name, args.cluster_name, args.application,
+                          args.numpy_version, args.matplotlib_version)
+        configure_zeppelin_emr_interpreter(args.emr_version, args.cluster_name, args.region, spark_dir, args.os_user,
+                                           yarn_dir, args.bucket, args.project_name, endpoint_url, args.multiple_clusters)
+        update_zeppelin_interpreters(args.multiple_clusters, args.r_enabled)
diff --git a/infrastructure-provisioning/src/general/scripts/azure/zeppelin_install_dataengine-service_kernels.py b/infrastructure-provisioning/src/general/scripts/azure/zeppelin_install_dataengine-service_kernels.py
new file mode 100644
index 000000000..b8ea051d1
--- /dev/null
+++ b/infrastructure-provisioning/src/general/scripts/azure/zeppelin_install_dataengine-service_kernels.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python3
+
+# *****************************************************************************
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# ******************************************************************************
+
+import argparse
+import datalab.fab
+import os
+from datalab.meta_lib import *
+from fabric import *
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--bucket', type=str, default='')
+parser.add_argument('--cluster_name', type=str, default='')
+parser.add_argument('--dry_run', type=str, default='false')
+parser.add_argument('--emr_version', type=str, default='')
+parser.add_argument('--keyfile', type=str, default='')
+parser.add_argument('--region', type=str, default='')
+parser.add_argument('--notebook_ip', type=str, default='')
+parser.add_argument('--scala_version', type=str, default='')
+parser.add_argument('--emr_excluded_spark_properties', type=str, default='')
+parser.add_argument('--edge_user_name', type=str, default='')
+parser.add_argument('--project_name', type=str, default='')
+parser.add_argument('--os_user', type=str, default='')
+parser.add_argument('--edge_hostname', type=str, default='')
+parser.add_argument('--proxy_port', type=str, default='')
+parser.add_argument('--application', type=str, default='')
+args = parser.parse_args()
+
+
+def configure_notebook(args):
+    templates_dir = '/root/templates/'
+    scripts_dir = '/root/scripts/'
+    if os.environ['notebook_multiple_clusters'] == 'true':
+        conn.put(templates_dir + 'dataengine-service_interpreter_livy.json', '/tmp/dataengine-service_interpreter.json')
+    else:
+        conn.put(templates_dir + 'dataengine-service_interpreter_spark.json', '/tmp/dataengine-service_interpreter.json')
+    conn.put('{}{}_dataengine-service_create_configs.py'.format(scripts_dir, args.application),
+             '/tmp/zeppelin_dataengine-service_create_configs.py')
+    conn.sudo('\cp /tmp/zeppelin_dataengine-service_create_configs.py '
+              '/usr/local/bin/zeppelin_dataengine-service_create_configs.py')
+    conn.sudo('chmod 755 /usr/local/bin/zeppelin_dataengine-service_create_configs.py')
+    conn.sudo('mkdir -p /usr/lib/python3.8/datalab/')
+    conn.run('mkdir -p /tmp/datalab_libs/')
+    conn.local('rsync -e "ssh -i {}" /usr/lib/python3.8/datalab/*.py {}@{}:/tmp/datalab_libs/'.format(args.keyfile, args.os_user, args.notebook_ip))
+    conn.run('chmod a+x /tmp/datalab_libs/*')
+    conn.sudo('mv /tmp/datalab_libs/* /usr/lib/python3.8/datalab/')
+    if exists(conn, '/usr/lib64'):
+        conn.sudo('mkdir -p /usr/lib64/python3.8')
+        conn.sudo('ln -fs /usr/lib/python3.8/datalab /usr/lib64/python3.8/datalab')
+
+
+if __name__ == "__main__":
+    global conn
+    conn = datalab.fab.init_datalab_connection(args.notebook_ip, args.os_user, args.keyfile)
+    configure_notebook(args)
+    spark_version = "None"  #get_spark_version(args.cluster_name)
+    hadoop_version = "None"  #get_hadoop_version(args.cluster_name)
+    livy_version = os.environ['notebook_livy_version']
+    r_enabled = os.environ['notebook_r_enabled']
+    numpy_version = os.environ['notebook_numpy_version']
+    matplotlib_version = os.environ['notebook_matplotlib_version']
+    command = "/usr/bin/python3 /usr/local/bin/zeppelin_dataengine-service_create_configs.py " \
+              "--bucket {0} " \
+              "--cluster_name {1} " \
+              "--emr_version {2} " \
+              "--spark_version {3} " \
+              "--hadoop_version {4} " \
+              "--region {5} " \
+              "--excluded_lines '{6}' " \
+              "--project_name {7} " \
+              "--os_user {8} " \
+              "--edge_hostname {9} " \
+              "--proxy_port {10} " \
+              "--scala_version {11} " \
+              "--livy_version {12} " \
+              "--multiple_clusters {13} " \
+              "--numpy_version {14} " \
+              "--matplotlib_version {15} " \
+              "--application {16} " \
+              "--r_enabled {17}" \
+        .format(args.bucket,
+                args.cluster_name,
+                args.emr_version,
+                spark_version,
+                hadoop_version,
+                args.region,
+                args.emr_excluded_spark_properties,
+                args.project_name,
+                args.os_user,
+                args.edge_hostname,
+                args.proxy_port,
+                args.scala_version,
+                livy_version,
+                os.environ['notebook_multiple_clusters'],
+                numpy_version,
+                matplotlib_version,
+                args.application,
+                r_enabled)
+    conn.sudo(command)
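
For clarity, with made-up values the command string assembled above would render to something like the following (all values are hypothetical; in a real run they come from the script arguments and os.environ lookups shown above):

    /usr/bin/python3 /usr/local/bin/zeppelin_dataengine-service_create_configs.py \
        --bucket sbn-prj-endp-des-cluster1-bucket --cluster_name sbn-prj-endp-des-cluster1 \
        --emr_version 4.0 --spark_version None --hadoop_version None --region westeurope \
        --excluded_lines '' --project_name prj --os_user datalab-user \
        --edge_hostname 10.0.1.4 --proxy_port 3128 --scala_version 2.12.8 \
        --livy_version 0.3.0 --multiple_clusters false --numpy_version 1.19.5 \
        --matplotlib_version 3.3.4 --application zeppelin --r_enabled true

zeppelin_dataengine-service_create_configs.py then parses these flags with the argparse definitions added in the previous file.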

