You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by tr...@apache.org on 2018/08/09 08:04:16 UTC

[flink] 01/03: [FLINK-10063][tests] Use runit to supervise mesos processes.

This is an automated email from the ASF dual-hosted git repository.

trohrmann pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/flink.git

commit c029d2af9de1346cf3372a55fff663cd9b61e3f1
Author: gyao <ga...@data-artisans.com>
AuthorDate: Sun Aug 5 14:55:07 2018 +0200

    [FLINK-10063][tests] Use runit to supervise mesos processes.
---
 flink-jepsen/docker/Dockerfile-db       |  11 +-
 flink-jepsen/src/jepsen/flink/mesos.clj | 217 +++++++++++++++++++-------------
 2 files changed, 138 insertions(+), 90 deletions(-)

diff --git a/flink-jepsen/docker/Dockerfile-db b/flink-jepsen/docker/Dockerfile-db
index 1555329..cb60efc 100644
--- a/flink-jepsen/docker/Dockerfile-db
+++ b/flink-jepsen/docker/Dockerfile-db
@@ -21,7 +21,7 @@ FROM debian:jessie
 RUN echo "deb http://http.debian.net/debian jessie-backports main" >> /etc/apt/sources.list && \
     apt-get update && \
     apt-get install -y -t jessie-backports openjdk-8-jdk && \
-    apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget
+    apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog runit sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget
 
 RUN apt-get update && \
     apt-get -y install openssh-server && \
@@ -35,5 +35,12 @@ RUN mkdir -p /root/.ssh/ && \
     chmod 600 /root/.ssh/authorized_keys && \
     cat /root/id_rsa.pub >> /root/.ssh/authorized_keys
 
+COPY sshd-run /etc/sv/service/sshd/run
+RUN chmod +x /etc/sv/service/sshd/run && \
+    ln -sf /etc/sv/service/sshd /etc/service
+
 EXPOSE 22
-CMD exec /usr/sbin/sshd -D
+
+# Start runit process supervisor which will bring up sshd.
+# In our tests we can use runit to supervise more processes, e.g., Mesos.
+CMD runsvdir -P /etc/service /dev/null > /dev/null
diff --git a/flink-jepsen/src/jepsen/flink/mesos.clj b/flink-jepsen/src/jepsen/flink/mesos.clj
index fd75991..a73f25f 100644
--- a/flink-jepsen/src/jepsen/flink/mesos.clj
+++ b/flink-jepsen/src/jepsen/flink/mesos.clj
@@ -24,11 +24,37 @@
             [jepsen.os.debian :as debian]
             [jepsen.flink.zookeeper :refer [zookeeper-uri]]))
 
+;;; runit process supervisor (http://smarden.org/runit/)
+;;;
+;;; We use runit to supervise Mesos processes because Mesos takes a "fail-fast" approach to
+;;; error handling: e.g., the Mesos master exits when it discovers that it has been partitioned
+;;; away from the ZooKeeper quorum. A supervisor is needed to bring such processes back up.
+
+(def runit-version "2.1.2-3")
+
+(defn create-supervised-service!
+  "Registers a service with the process supervisor and starts it. Writes an executable run script that execs cmd to /etc/sv/<service-name>/run and symlinks the service directory into /etc/service."
+  [service-name cmd]
+  (let [service-dir (str "/etc/sv/" service-name)
+        run-script (str service-dir "/run")]
+    (c/su
+      (c/exec :mkdir :-p service-dir)
+      (c/exec :echo (clojure.string/join "\n" ["#!/bin/sh"
+                                               "exec 2>&1" ; run script: send stderr to wherever stdout points
+                                               (str "exec " cmd)]) :> run-script) ; cmd replaces the shell process
+      (c/exec :chmod :+x run-script)
+      (c/exec :ln :-sf service-dir (str "/etc/service/" service-name))))) ; NOTE(review): presumably a runsvdir watching /etc/service picks up the link and starts the service - confirm
+
+(defn stop-supervised-service!
+  "Stops a service and removes it from supervision. Best effort: a failing `sv down` (e.g., the service was never registered) is ignored."
+  [service-name]
+  (c/su
+    (meh (c/exec :sv :down service-name)) ; a non-zero exit must neither abort teardown nor skip the symlink removal below
+    (c/exec :rm :-f (str "/etc/service/" service-name))))
+
 ;;; Mesos
 
 (def master-count 1)
-(def master-pidfile "/var/run/mesos/master.pid")
-(def slave-pidfile "/var/run/mesos/slave.pid")
 (def master-dir "/var/lib/mesos/master")
 (def slave-dir "/var/lib/mesos/slave")
 (def log-dir "/var/log/mesos")
@@ -40,115 +66,130 @@
 
 (def marathon-bin "/usr/bin/marathon")
 (def zk-marathon-namespace "marathon")
-(def marathon-pidfile "/var/run/mesos/marathon.pid")
 (def marathon-rest-port 8080)
 
-(defn install!
-  [test node mesos-version marathon-version]
-  (c/su
-    (debian/add-repo! :mesosphere
-                      "deb http://repos.mesosphere.com/debian jessie main"
-                      "keyserver.ubuntu.com"
-                      "E56151BF")
-    (debian/install {:mesos    mesos-version
-                     :marathon marathon-version})
-    (c/exec :mkdir :-p "/var/run/mesos")
-    (c/exec :mkdir :-p master-dir)
-    (c/exec :mkdir :-p slave-dir)))
-
 ;;; Mesos functions
 
+(defn mesos-master-cmd
+  "Builds the command line, as a single space-separated string, that launches the Mesos master for node."
+  [test node]
+  (->> ["env GLOG_v=1"
+        master-bin
+        (str "--hostname=" (name node))
+        (str "--log_dir=" log-dir)
+        "--offer_timeout=30secs"
+        (str "--quorum=" (util/majority master-count))
+        "--registry_fetch_timeout=120secs"
+        "--registry_store_timeout=5secs"
+        (str "--work_dir=" master-dir)
+        (str "--zk=" (zookeeper-uri test zk-namespace))]
+       (clojure.string/join " ")))
+
+(defn mesos-slave-cmd
+  "Builds the command line, as a single space-separated string, that launches the Mesos agent for node."
+  [test node]
+  (->> ["env GLOG_v=1"
+        slave-bin
+        (str "--hostname=" (name node))
+        (str "--log_dir=" log-dir)
+        (str "--master=" (zookeeper-uri test zk-namespace))
+        "--recovery_timeout=30secs"
+        (str "--work_dir=" slave-dir)]
+       (clojure.string/join " ")))
+
+(defn create-mesos-master-supervised-service!
+  "Registers the Mesos master, launched via mesos-master-cmd, as the supervised service \"mesos-master\"."
+  [test node]
+  (->> (mesos-master-cmd test node) (create-supervised-service! "mesos-master")))
+
+(defn create-mesos-slave-supervised-service!
+  "Registers the Mesos agent, launched via mesos-slave-cmd, as the supervised service \"mesos-slave\"."
+  [test node]
+  (->> (mesos-slave-cmd test node) (create-supervised-service! "mesos-slave")))
+
+(defn master-node?
+  "Truthy when node is among the first master-count nodes of the test in sorted order, i.e., it should run the mesos master."
+  [test node]
+  (->> (:nodes test) sort (take master-count) (some #{node})))
+
 (defn start-master!
   [test node]
-  (when (some #{node} (take master-count (sort (:nodes test))))
+  (when (master-node? test node)
     (info node "Starting mesos master")
     (c/su
-      (c/exec :start-stop-daemon
-              :--background
-              :--chdir master-dir
-              :--exec "/usr/bin/env"
-              :--make-pidfile
-              :--no-close
-              :--oknodo
-              :--pidfile master-pidfile
-              :--start
-              :--
-              "GLOG_v=1"
-              master-bin
-              (str "--hostname=" (name node))
-              (str "--log_dir=" log-dir)
-              (str "--offer_timeout=30secs")
-              (str "--quorum=" (util/majority master-count))
-              (str "--registry_fetch_timeout=120secs")
-              (str "--registry_store_timeout=5secs")
-              (str "--work_dir=" master-dir)
-              (str "--zk=" (zookeeper-uri test zk-namespace))
-              :>> (str log-dir "/master.stdout")
-              (c/lit "2>&1")))))
+      (create-mesos-master-supervised-service! test node))))
 
 (defn start-slave!
   [test node]
-  (when-not (some #{node} (take master-count (sort (:nodes test))))
+  (when-not (master-node? test node)
     (info node "Starting mesos slave")
     (c/su
-      (c/exec :start-stop-daemon :--start
-              :--background
-              :--chdir slave-dir
-              :--exec slave-bin
-              :--make-pidfile
-              :--no-close
-              :--pidfile slave-pidfile
-              :--oknodo
-              :--
-              (str "--hostname=" (name node))
-              (str "--log_dir=" log-dir)
-              (str "--master=" (zookeeper-uri test zk-namespace))
-              (str "--recovery_timeout=30secs")
-              (str "--work_dir=" slave-dir)
-              :>> (str log-dir "/slave.stdout")
-              (c/lit "2>&1")))))
+      (create-mesos-slave-supervised-service! test node))))
 
 (defn stop-master!
-  [node]
-  (info node "Stopping mesos master")
-  (meh (cu/grepkill! :mesos-master))
-  (meh (c/exec :rm :-rf master-pidfile))
-  (meh (c/exec :rm :-rf
-               (c/lit (str log-dir "/*"))
-               (c/lit (str master-dir "/*")))))
+  [test node]
+  (when (master-node? test node)
+    (info node "Stopping mesos master")
+    (stop-supervised-service! "mesos-master")
+    (meh (c/exec :rm :-rf
+                 (c/lit (str log-dir "/*"))
+                 (c/lit (str master-dir "/*"))))))
 
 (defn stop-slave!
-  [node]
-  (info node "Stopping mesos slave")
-  (meh (cu/grepkill! :mesos-slave))
-  (meh (c/exec :rm :-rf slave-pidfile))
-  (meh (c/exec :rm :-rf
-               (c/lit (str log-dir "/*"))
-               (c/lit (str slave-dir "/*")))))
+  [test node]
+  (when-not (master-node? test node)
+    (info node "Stopping mesos slave")
+    (stop-supervised-service! "mesos-slave")
+    (meh (c/exec :rm :-rf
+                 (c/lit (str log-dir "/*"))
+                 (c/lit (str slave-dir "/*"))))))
 
 ;;; Marathon functions
 
+(defn install!
+  "Adds the Mesosphere apt repository, installs the given Mesos and Marathon versions plus runit, and creates the Mesos directories (as root)."
+  [test node mesos-version marathon-version]
+  (c/su
+    (debian/add-repo! :mesosphere
+                      "deb http://repos.mesosphere.com/debian jessie main"
+                      "keyserver.ubuntu.com" "E56151BF")
+    (debian/install {:mesos    mesos-version
+                     :marathon marathon-version
+                     :runit    runit-version})
+    (c/exec :mkdir :-p "/var/run/mesos")
+    (c/exec :mkdir :-p master-dir)
+    (c/exec :mkdir :-p slave-dir)))
+
+(defn marathon-cmd
+  "Returns the shell command line to run Marathon. The node is rendered with `name`, as in the Mesos commands, so keyword and string nodes both work."
+  [test node]
+  (clojure.string/join " "
+                       [marathon-bin
+                        (str "--hostname " (name node))
+                        (str "--master " (zookeeper-uri test zk-namespace))
+                        (str "--zk " (zookeeper-uri test zk-marathon-namespace))
+                        (str ">> " log-dir "/marathon.out")])) ; shell redirection appended to the command, not a Marathon flag
+
+(defn create-marathon-supervised-service!
+  "Registers Marathon, launched via marathon-cmd, as the supervised service \"marathon\"."
+  [test node]
+  (->> (marathon-cmd test node) (create-supervised-service! "marathon")))
+
+(defn marathon-node?
+  "True iff node equals the first of the test's nodes in sorted order."
+  [test node] (-> (:nodes test) sort first (= node)))
+
 (defn start-marathon!
   [test node]
-  (when (= node (first (sort (:nodes test))))
+  (when (marathon-node? test node)
     (info "Start marathon")
     (c/su
-      (c/exec :start-stop-daemon :--start
-              :--background
-              :--exec marathon-bin
-              :--make-pidfile
-              :--no-close
-              :--pidfile marathon-pidfile
-              :--
-              (c/lit (str "--hostname " node))
-              (c/lit (str "--master " (zookeeper-uri test zk-namespace)))
-              (c/lit (str "--zk " (zookeeper-uri test zk-marathon-namespace)))
-              :>> (str log-dir "/marathon.stdout")
-              (c/lit "2>&1")))))
+      (create-marathon-supervised-service! test node))))
 
 (defn stop-marathon!
-  []
-  (cu/grepkill! "marathon"))
+  [test node]
+  (when (marathon-node? test node)
+    (stop-supervised-service! "marathon")))
 
 (defn marathon-base-url
   [test]
@@ -163,9 +204,9 @@
       (start-slave! test node)
       (start-marathon! test node))
     (teardown! [this test node]
-      (stop-slave! node)
-      (stop-master! node)
-      (stop-marathon!))
+      (stop-slave! test node)
+      (stop-master! test node)
+      (stop-marathon! test node))
     db/LogFiles
     (log-files [_ test node]
       (if (cu/exists? log-dir) (cu/ls-full log-dir) []))))