Posted to commits@bigtop.apache.org by yw...@apache.org on 2019/10/18 04:08:26 UTC

[bigtop] branch cnb updated: BIGTOP-3255: Add Spark operator into distribution

This is an automated email from the ASF dual-hosted git repository.

ywkim pushed a commit to branch cnb
in repository https://gitbox.apache.org/repos/asf/bigtop.git


The following commit(s) were added to refs/heads/cnb by this push:
     new 472c525  BIGTOP-3255: Add Spark operator into distribution
472c525 is described below

commit 472c525c701a53c815d5cff1240eee21a9f5e897
Author: Youngwoo Kim <yw...@apache.org>
AuthorDate: Fri Oct 18 11:59:38 2019 +0900

    BIGTOP-3255: Add Spark operator into distribution
---
 README.md                                       |  4 ++
 bigtop-tests/smoke-tests/spark/TestSpark.groovy | 53 +++++++---------------
 bigtop-tests/smoke-tests/spark/build.gradle     |  2 +-
 bigtop.bom                                      | 13 +++++-
 kubespray/vagrant/Vagrantfile                   |  2 +-
 spark/README.md                                 | 50 +++++++++++++++++++++
 spark/examples/spark-pi-batch.yaml              | 40 +++++++++++++++++
 spark/examples/spark-pi.yaml                    | 39 +++++++++++++++++
 spark/spark-rbac.yaml                           | 32 ++++++++++++++
 spark/values.yaml                               | 58 +++++++++++++++++++++++++
 10 files changed, 253 insertions(+), 40 deletions(-)

diff --git a/README.md b/README.md
index 53c288b..02fba6e 100755
--- a/README.md
+++ b/README.md
@@ -234,6 +234,10 @@ incubator/schema-registry
 
 ```
 
+## Spark
+
+See spark/README.md
+
 # Getting Started
 
 Below are some recipes for getting started with using Apache Bigtop. As Apache Bigtop has different subprojects, these recipes will continue to evolve.
diff --git a/bigtop-tests/smoke-tests/spark/TestSpark.groovy b/bigtop-tests/smoke-tests/spark/TestSpark.groovy
index 6d13ebf..29e0957 100644
--- a/bigtop-tests/smoke-tests/spark/TestSpark.groovy
+++ b/bigtop-tests/smoke-tests/spark/TestSpark.groovy
@@ -39,54 +39,33 @@ class TestSpark {
   static private Log LOG = LogFactory.getLog(Object.class)
 
   static Shell sh = new Shell("/bin/bash -s")
-  static final String SPARK_HOME = System.getenv("SPARK_HOME")
-  static final String SPARK_MASTER_IP = System.getenv("SPARK_MASTER_IP")
-  static final String SPARK_MASTER_PORT = System.getenv("SPARK_MASTER_PORT")
-  static final String TEST_SPARKSQL_LOG = "/tmp/TestSpark_testSparkSQL.log"
+  static final String BIGTOP_HOME = System.getenv("BIGTOP_HOME")
+  static final String BIGTOP_K8S_NS = "bigtop";
 
   @BeforeClass
   static void setUp() {
-    sh.exec("rm -f " + TEST_SPARKSQL_LOG)
-    // create HDFS examples/src/main/resources
-    sh.exec("hdfs dfs -mkdir -p examples/src/main/resources")
-    // extract people.txt file into it
-    String examplesJar = JarContent.getJarName("$SPARK_HOME/examples/jars", 'spark-examples.*jar')
-    assertNotNull(examplesJar, "spark-examples.jar file wasn't found")
-    ZipInputStream zipInputStream = new ZipInputStream(new FileInputStream("$SPARK_HOME/examples/jars/$examplesJar"))
-    File examplesDir = new File('examples')
-    examplesDir.mkdirs()
-    zipInputStream.unzip(examplesDir.getName(), 'people')
-    sh.exec("hdfs dfs -put examples/* examples/src/main/resources")
-    logError(sh)
   }
 
   @AfterClass
   public static void tearDown() {
-    sh.exec("hdfs dfs -ls")
-    logError(sh)
-    sh.exec("hdfs dfs -rm -r examples")
+    sh.exec("kubectl delete sparkapplication " + "-n " + BIGTOP_K8S_NS + " spark-pi")
     logError(sh)
   }
 
   @Test
-  void testSparkSQL() {
-    // Let's figure out the proper mode for the submission
-    // If SPARK_MASTER_IP nor SPARK_MASTER_PORT are set, we'll assume
-    // 'yarn-client' mode
-    String masterMode = 'yarn-client'
-    if (SPARK_MASTER_IP != null && SPARK_MASTER_PORT != null)
-      masterMode = "spark://$SPARK_MASTER_IP:$SPARK_MASTER_PORT"
-    else
-      println("SPARK_MASTER isn't set. yarn-client submission will be used. " +
-          "Refer to smoke-tests/README If this isn't what you you expect.")
-
-    final String SPARK_SHELL = SPARK_HOME + "/bin/spark-shell --master $masterMode"
-    // Let's use time, 'cause the test has one job
-    sh.exec("timeout 120 " + SPARK_SHELL +
-        " --class org.apache.spark.examples.sql.JavaSparkSQLExample " +
-        " --jars " + SPARK_HOME + "/examples/jars/spark-examples*.jar > " +
-        TEST_SPARKSQL_LOG + " 2>&1")
-    logError(sh)
+  void testSparkPi() {
+    sh.exec("kubectl apply -f " + BIGTOP_HOME + "/spark/examples/spark-pi.yaml");
+    logError(sh);
     assertTrue("Failed ...", sh.getRet() == 0);
+
+    // Wait 20 seconds for the driver pod to run to completion
+    sleep(20000);
+
+    sh.exec("kubectl logs " + "-n " + BIGTOP_K8S_NS + " spark-pi-driver");
+    logError(sh);
+    String out = sh.getOut().toString();
+    LOG.info("Output of Spark application driver:\n" + out);
+    assertTrue("Driver log does not contain the expected Pi output", out.contains("Pi is roughly"));
   }
 }
+
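
For reference, the rewritten smoke test boils down to the following kubectl sequence. This is a manual equivalent assembled from the diff above, not a script shipped by the commit; it assumes BIGTOP_HOME points at a Bigtop checkout and the Spark operator is already installed:

```
# Manual equivalent of testSparkPi() / tearDown(), per the diff above
kubectl apply -f $BIGTOP_HOME/spark/examples/spark-pi.yaml
sleep 20                                               # give the driver pod time to finish
kubectl logs -n bigtop spark-pi-driver | grep "Pi is roughly"
kubectl delete sparkapplication -n bigtop spark-pi     # cleanup, as in tearDown()
```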
diff --git a/bigtop-tests/smoke-tests/spark/build.gradle b/bigtop-tests/smoke-tests/spark/build.gradle
index 1ed3e14..81fa007 100644
--- a/bigtop-tests/smoke-tests/spark/build.gradle
+++ b/bigtop-tests/smoke-tests/spark/build.gradle
@@ -29,5 +29,5 @@ sourceSets {
 }
 
 test.doFirst {
-  checkEnv(["SPARK_HOME"])
+  checkEnv(["BIGTOP_HOME"])
 }
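
With this change the suite fails fast unless BIGTOP_HOME is exported. A minimal sketch of running it; the gradle task path below is an assumption, so adjust it to your Bigtop build layout:

```
# BIGTOP_HOME must point at a checkout that contains spark/examples/ (path assumed)
export BIGTOP_HOME=/path/to/bigtop
# task path is an assumption; check the top-level Bigtop build for the exact entry point
./gradlew bigtop-tests:smoke-tests:spark:test --info
```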
diff --git a/bigtop.bom b/bigtop.bom
index af45d12..20f2b1c 100644
--- a/bigtop.bom
+++ b/bigtop.bom
@@ -90,7 +90,7 @@
 
 bigtop {
 /** Base Configuration of the mirror and archives */
-  version = "1.5.0-SNAPSHOT"
+  version = "2.0.0-SNAPSHOT"
   stack {
     'jdk' { version = "1." + ( System.getenv('BIGTOP_JDK') ?: "8" ); version_base = version }
     'scala' { version = '2.11.8'; version_base = version }
@@ -144,6 +144,17 @@ bigtop {
       url     { site = "https://github.com/rook/rook/archive"
                 archive = site }
     }
+    'volcano' {
+      name    = "volcano"
+      pkg     = "volcano"
+      relNotes = "Volcano is system for runnning high performance workloads on Kubernetes"
+      website = "https://volcano.sh"
+      version { base = '0.2'; pkg = base; release = 1 }
+      tarball { destination = "$name-${version.base}.tar.gz"
+                source      = "v${version.base}.tar.gz" }
+      url     { site = "https://github.com/volcano-sh/volcano/archive"
+                archive = site }
+    }
     'zookeeper' {
       name    = 'zookeeper'
       pkg     = name
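
The new 'volcano' BOM entry composes its download URL from url.site plus tarball.source, saving the result under tarball.destination. A sketch of the fetch the build would perform, shown here with curl purely for illustration (Bigtop's build handles this itself):

```
# url.site + "/" + tarball.source, saved as tarball.destination
curl -L -o volcano-0.2.tar.gz \
  https://github.com/volcano-sh/volcano/archive/v0.2.tar.gz
```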
diff --git a/kubespray/vagrant/Vagrantfile b/kubespray/vagrant/Vagrantfile
index 99131d5..dff6145 100644
--- a/kubespray/vagrant/Vagrantfile
+++ b/kubespray/vagrant/Vagrantfile
@@ -178,7 +178,7 @@ Vagrant.configure("2") do |config|
       # Disable swap for each vm
       node.vm.provision "shell", inline: "swapoff -a"
 
-      node.vm.provision "shell", inline: "yum install -y lvm2"
+      node.vm.provision "shell", inline: "yum install -y lvm2 java-1.8.0-openjdk-devel"
       node.vm.provision "shell", inline: "sudo cp /bigtop/kubectl/plugin/kubectl-bigtop /usr/local/bin/ ; sudo chmod +x /usr/local/bin/kubectl-bigtop" 
 
       host_vars[vm_name] = {
diff --git a/spark/README.md b/spark/README.md
new file mode 100644
index 0000000..c209826
--- /dev/null
+++ b/spark/README.md
@@ -0,0 +1,50 @@
+# Spark on Kubernetes
+
+Install Volcano[1]:
+```
+$ cd $VOLCANO_HOME
+$ kubectl apply -f installer/volcano-development.yaml
+
+```
+*TODO*
+- Install Volcano via Helm
+
+Spark operator[2]:
+You can install the Spark operator in the 'bigtop' namespace:
+```
+$ cd $BIGTOP_HOME
+
+$ kubectl create -f spark/spark-rbac.yaml
+$ helm repo add incubator http://storage.googleapis.com/kubernetes-charts-incubator
+$ helm install incubator/sparkoperator --namespace bigtop -f spark/values.yaml \
+--set enableBatchScheduler=true \
+--set enableWebhook=true
+
+$ kubectl get po -n bigtop
+
+```
+
+Running the Spark Pi example:
+```
+$ kubectl apply -f spark/examples/spark-pi.yaml
+$ kubectl describe sparkapplication -n bigtop spark-pi
+......
+$ kubectl logs -n bigtop spark-pi-driver
+......
+
+Pi is roughly 3.1405357026785135
+
+```
+
+Running the Spark Pi example with the Volcano scheduler:
+```
+$ kubectl apply -f spark/examples/spark-pi-batch.yaml
+$ kubectl logs -n bigtop spark-pi-batch-driver
+
+```
+----
+
+1. https://volcano.sh
+
+2. https://github.com/GoogleCloudPlatform/spark-on-k8s-operator
+
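
Beyond tailing the driver log as the README does, the operator's CRD can be queried directly to watch an application's lifecycle. A small addition, not part of the README above, using standard kubectl against the SparkApplication custom resource:

```
# List SparkApplications and watch state transitions (e.g. SUBMITTED -> RUNNING -> COMPLETED)
kubectl get sparkapplication -n bigtop
kubectl get events -n bigtop --sort-by=.lastTimestamp | tail
```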
diff --git a/spark/examples/spark-pi-batch.yaml b/spark/examples/spark-pi-batch.yaml
new file mode 100644
index 0000000..6d328ad
--- /dev/null
+++ b/spark/examples/spark-pi-batch.yaml
@@ -0,0 +1,40 @@
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication
+metadata:
+  name: spark-pi-batch
+  namespace: bigtop
+spec:
+  type: Scala
+  mode: cluster
+  image: "gcr.io/spark-operator/spark:v2.4.4"
+  imagePullPolicy: Always
+  mainClass: org.apache.spark.examples.SparkPi
+  mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar"
+  sparkVersion: "2.4.4"
+  batchScheduler: "volcano"
+  restartPolicy:
+    type: Never
+  volumes:
+    - name: "test-volume"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+  driver:
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    labels:
+      version: 2.4.4
+    serviceAccount: spark
+    volumeMounts:
+      - name: "test-volume"
+        mountPath: "/tmp"
+  executor:
+    cores: 1
+    instances: 1
+    memory: "512m"
+    labels:
+      version: 2.4.4
+    volumeMounts:
+      - name: "test-volume"
+        mountPath: "/tmp"
diff --git a/spark/examples/spark-pi.yaml b/spark/examples/spark-pi.yaml
new file mode 100644
index 0000000..7738fac
--- /dev/null
+++ b/spark/examples/spark-pi.yaml
@@ -0,0 +1,39 @@
+apiVersion: "sparkoperator.k8s.io/v1beta2"
+kind: SparkApplication
+metadata:
+  name: spark-pi
+  namespace: bigtop
+spec:
+  type: Scala
+  mode: cluster
+  image: "gcr.io/spark-operator/spark:v2.4.4"
+  imagePullPolicy: Always
+  mainClass: org.apache.spark.examples.SparkPi
+  mainApplicationFile: "local:///opt/spark/examples/jars/spark-examples_2.11-2.4.4.jar"
+  sparkVersion: "2.4.4"
+  restartPolicy:
+    type: Never
+  volumes:
+    - name: "test-volume"
+      hostPath:
+        path: "/tmp"
+        type: Directory
+  driver:
+    cores: 1
+    coreLimit: "1200m"
+    memory: "512m"
+    labels:
+      version: 2.4.4
+    serviceAccount: spark
+    volumeMounts:
+      - name: "test-volume"
+        mountPath: "/tmp"
+  executor:
+    cores: 1
+    instances: 1
+    memory: "512m"
+    labels:
+      version: 2.4.4
+    volumeMounts:
+      - name: "test-volume"
+        mountPath: "/tmp"
diff --git a/spark/spark-rbac.yaml b/spark/spark-rbac.yaml
new file mode 100644
index 0000000..779d5cf
--- /dev/null
+++ b/spark/spark-rbac.yaml
@@ -0,0 +1,32 @@
+apiVersion: v1
+kind: ServiceAccount
+metadata:
+  name: spark
+  namespace: bigtop 
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  namespace: bigtop
+  name: spark-role
+rules:
+- apiGroups: [""]
+  resources: ["pods"]
+  verbs: ["*"]
+- apiGroups: [""]
+  resources: ["services"]
+  verbs: ["*"]
+---
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: spark-role-binding
+  namespace: bigtop
+subjects:
+- kind: ServiceAccount
+  name: spark
+  namespace: bigtop
+roleRef:
+  kind: Role
+  name: spark-role
+  apiGroup: rbac.authorization.k8s.io
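
A quick way to confirm the Role and RoleBinding above took effect is kubectl's impersonation check; a hedged verification sketch:

```
# Both should print "yes" once spark-rbac.yaml is applied
kubectl auth can-i create pods     -n bigtop --as=system:serviceaccount:bigtop:spark
kubectl auth can-i create services -n bigtop --as=system:serviceaccount:bigtop:spark
```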
diff --git a/spark/values.yaml b/spark/values.yaml
new file mode 100644
index 0000000..77f2dcf
--- /dev/null
+++ b/spark/values.yaml
@@ -0,0 +1,58 @@
+operatorImageName: gcr.io/spark-operator/spark-operator
+operatorVersion: v1beta2-1.0.1-2.4.4
+imagePullPolicy: IfNotPresent
+replicas: 1
+
+rbac:
+  create: true
+
+serviceAccounts:
+  spark:
+    create: true
+    name: spark
+  sparkoperator:
+    create: true
+    name:
+
+sparkJobNamespace: ""
+installCrds: true
+controllerThreads: 10
+resyncInterval: 30
+ingressUrlFormat: ""
+logLevel: 2
+
+securityContext: {}
+
+enableWebhook: true 
+webhookPort: 8080
+
+enableMetrics: true
+metricsPort: 10254
+metricsEndpoint: "/metrics"
+metricsPrefix: ""
+
+## Node labels for pod assignment
+## Ref: https://kubernetes.io/docs/user-guide/node-selection/
+##
+nodeSelector: {}
+
+podAnnotations: {}
+
+## Resources for the sparkoperator deployment
+## Ref: https://kubernetes.io/docs/concepts/configuration/manage-compute-resources-container/
+##
+resources: {}
+
+## Whether to enable batch scheduler for pod scheduling,
+## if enabled, end user can specify batch scheduler name in spark application.
+enableBatchScheduler: false
+
+## Whether to enable the ResourceQuota enforcement for SparkApplication resources.
+## Requires the webhook to be enabled by setting enableWebhook to true.
+## Ref: https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/blob/master/docs/user-guide.md#enabling-resource-quota-enforcement.
+enableResourceQuotaEnforcement: false
+
+## Whether to enable leader election when the operator Deployment has more than one replica.
+## Only applicable when `replicas` is set to a value greater than 1.
+## Ref: https://github.com/GoogleCloudPlatform/spark-on-k8s-operator/blob/master/docs/user-guide.md#enabling-leader-election-for-high-availability.
+enableLeaderElection: false
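
values.yaml also enables the operator's metrics endpoint (metricsPort 10254, metricsEndpoint /metrics). A hedged way to spot-check it; the deployment name below is an assumption, since the chart derives it from the Helm release name:

```
# Deployment name is an assumption; verify with `kubectl get deploy -n bigtop`
kubectl port-forward -n bigtop deploy/sparkoperator 10254:10254 &
curl -s http://localhost:10254/metrics | head
```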