You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by tr...@apache.org on 2018/08/09 08:04:16 UTC
[flink] 01/03: [FLINK-10063][tests] Use runit to supervise mesos processes.
This is an automated email from the ASF dual-hosted git repository.
trohrmann pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/flink.git
commit c029d2af9de1346cf3372a55fff663cd9b61e3f1
Author: gyao <ga...@data-artisans.com>
AuthorDate: Sun Aug 5 14:55:07 2018 +0200
[FLINK-10063][tests] Use runit to supervise mesos processes.
---
flink-jepsen/docker/Dockerfile-db | 11 +-
flink-jepsen/src/jepsen/flink/mesos.clj | 217 +++++++++++++++++++-------------
2 files changed, 138 insertions(+), 90 deletions(-)
diff --git a/flink-jepsen/docker/Dockerfile-db b/flink-jepsen/docker/Dockerfile-db
index 1555329..cb60efc 100644
--- a/flink-jepsen/docker/Dockerfile-db
+++ b/flink-jepsen/docker/Dockerfile-db
@@ -21,7 +21,7 @@ FROM debian:jessie
RUN echo "deb http://http.debian.net/debian jessie-backports main" >> /etc/apt/sources.list && \
apt-get update && \
apt-get install -y -t jessie-backports openjdk-8-jdk && \
- apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget
+ apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog runit sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget
RUN apt-get update && \
apt-get -y install openssh-server && \
@@ -35,5 +35,12 @@ RUN mkdir -p /root/.ssh/ && \
chmod 600 /root/.ssh/authorized_keys && \
cat /root/id_rsa.pub >> /root/.ssh/authorized_keys
+COPY sshd-run /etc/sv/service/sshd/run
+RUN chmod +x /etc/sv/service/sshd/run && \
+ ln -sf /etc/sv/service/sshd /etc/service
+
EXPOSE 22
-CMD exec /usr/sbin/sshd -D
+
+# Start runit process supervisor which will bring up sshd.
+# In our tests we can use runit to supervise more processes, e.g., Mesos.
+CMD runsvdir -P /etc/service /dev/null > /dev/null
diff --git a/flink-jepsen/src/jepsen/flink/mesos.clj b/flink-jepsen/src/jepsen/flink/mesos.clj
index fd75991..a73f25f 100644
--- a/flink-jepsen/src/jepsen/flink/mesos.clj
+++ b/flink-jepsen/src/jepsen/flink/mesos.clj
@@ -24,11 +24,37 @@
[jepsen.os.debian :as debian]
[jepsen.flink.zookeeper :refer [zookeeper-uri]]))
+;;; runit process supervisor (http://smarden.org/runit/)
+;;;
+;;; We use runit to supervise Mesos processes because Mesos uses a "fail-fast" approach to
+;;; error handling, e.g., the Mesos master will exit when it discovers it has been partitioned away
+;;; from the Zookeeper quorum.
+
+(def runit-version "2.1.2-3")
+
+(defn create-supervised-service!
+ "Registers a service with the process supervisor and starts it."
+ [service-name cmd]
+ (let [service-dir (str "/etc/sv/" service-name)
+ run-script (str service-dir "/run")]
+ (c/su
+ (c/exec :mkdir :-p service-dir)
+ (c/exec :echo (clojure.string/join "\n" ["#!/bin/sh"
+ "exec 2>&1"
+ (str "exec " cmd)]) :> run-script)
+ (c/exec :chmod :+x run-script)
+ (c/exec :ln :-sf service-dir (str "/etc/service/" service-name)))))
+
+(defn stop-supervised-service!
+ "Stops a service and removes it from supervision."
+ [service-name]
+ (c/su
+ (c/exec :sv :down service-name)
+ (c/exec :rm :-f (str "/etc/service/" service-name))))
+
;;; Mesos
(def master-count 1)
-(def master-pidfile "/var/run/mesos/master.pid")
-(def slave-pidfile "/var/run/mesos/slave.pid")
(def master-dir "/var/lib/mesos/master")
(def slave-dir "/var/lib/mesos/slave")
(def log-dir "/var/log/mesos")
@@ -40,115 +66,130 @@
(def marathon-bin "/usr/bin/marathon")
(def zk-marathon-namespace "marathon")
-(def marathon-pidfile "/var/run/mesos/marathon.pid")
(def marathon-rest-port 8080)
-(defn install!
- [test node mesos-version marathon-version]
- (c/su
- (debian/add-repo! :mesosphere
- "deb http://repos.mesosphere.com/debian jessie main"
- "keyserver.ubuntu.com"
- "E56151BF")
- (debian/install {:mesos mesos-version
- :marathon marathon-version})
- (c/exec :mkdir :-p "/var/run/mesos")
- (c/exec :mkdir :-p master-dir)
- (c/exec :mkdir :-p slave-dir)))
-
;;; Mesos functions
+(defn mesos-master-cmd
+ "Returns the command to run the mesos master."
+ [test node]
+ (clojure.string/join " "
+ ["env GLOG_v=1"
+ master-bin
+ (str "--hostname=" (name node))
+ (str "--log_dir=" log-dir)
+ (str "--offer_timeout=30secs")
+ (str "--quorum=" (util/majority master-count))
+ (str "--registry_fetch_timeout=120secs")
+ (str "--registry_store_timeout=5secs")
+ (str "--work_dir=" master-dir)
+ (str "--zk=" (zookeeper-uri test zk-namespace))]))
+
+(defn mesos-slave-cmd
+ "Returns the command to run the mesos agent."
+ [test node]
+ (clojure.string/join " "
+ ["env GLOG_v=1"
+ slave-bin
+ (str "--hostname=" (name node))
+ (str "--log_dir=" log-dir)
+ (str "--master=" (zookeeper-uri test zk-namespace))
+ (str "--recovery_timeout=30secs")
+ (str "--work_dir=" slave-dir)]))
+
+(defn create-mesos-master-supervised-service!
+ [test node]
+ (create-supervised-service! "mesos-master"
+ (mesos-master-cmd test node)))
+
+(defn create-mesos-slave-supervised-service!
+ [test node]
+ (create-supervised-service! "mesos-slave"
+ (mesos-slave-cmd test node)))
+
+(defn master-node?
+ "Returns a truthy value if the node should run the mesos master."
+ [test node]
+ (some #{node} (take master-count (sort (:nodes test)))))
+
(defn start-master!
[test node]
- (when (some #{node} (take master-count (sort (:nodes test))))
+ (when (master-node? test node)
(info node "Starting mesos master")
(c/su
- (c/exec :start-stop-daemon
- :--background
- :--chdir master-dir
- :--exec "/usr/bin/env"
- :--make-pidfile
- :--no-close
- :--oknodo
- :--pidfile master-pidfile
- :--start
- :--
- "GLOG_v=1"
- master-bin
- (str "--hostname=" (name node))
- (str "--log_dir=" log-dir)
- (str "--offer_timeout=30secs")
- (str "--quorum=" (util/majority master-count))
- (str "--registry_fetch_timeout=120secs")
- (str "--registry_store_timeout=5secs")
- (str "--work_dir=" master-dir)
- (str "--zk=" (zookeeper-uri test zk-namespace))
- :>> (str log-dir "/master.stdout")
- (c/lit "2>&1")))))
+ (create-mesos-master-supervised-service! test node))))
(defn start-slave!
[test node]
- (when-not (some #{node} (take master-count (sort (:nodes test))))
+ (when-not (master-node? test node)
(info node "Starting mesos slave")
(c/su
- (c/exec :start-stop-daemon :--start
- :--background
- :--chdir slave-dir
- :--exec slave-bin
- :--make-pidfile
- :--no-close
- :--pidfile slave-pidfile
- :--oknodo
- :--
- (str "--hostname=" (name node))
- (str "--log_dir=" log-dir)
- (str "--master=" (zookeeper-uri test zk-namespace))
- (str "--recovery_timeout=30secs")
- (str "--work_dir=" slave-dir)
- :>> (str log-dir "/slave.stdout")
- (c/lit "2>&1")))))
+ (create-mesos-slave-supervised-service! test node))))
(defn stop-master!
- [node]
- (info node "Stopping mesos master")
- (meh (cu/grepkill! :mesos-master))
- (meh (c/exec :rm :-rf master-pidfile))
- (meh (c/exec :rm :-rf
- (c/lit (str log-dir "/*"))
- (c/lit (str master-dir "/*")))))
+ [test node]
+ (when (master-node? test node)
+ (info node "Stopping mesos master")
+ (stop-supervised-service! "mesos-master")
+ (meh (c/exec :rm :-rf
+ (c/lit (str log-dir "/*"))
+ (c/lit (str master-dir "/*"))))))
(defn stop-slave!
- [node]
- (info node "Stopping mesos slave")
- (meh (cu/grepkill! :mesos-slave))
- (meh (c/exec :rm :-rf slave-pidfile))
- (meh (c/exec :rm :-rf
- (c/lit (str log-dir "/*"))
- (c/lit (str slave-dir "/*")))))
+ [test node]
+ (when-not (master-node? test node)
+ (info node "Stopping mesos slave")
+ (stop-supervised-service! "mesos-slave")
+ (meh (c/exec :rm :-rf
+ (c/lit (str log-dir "/*"))
+ (c/lit (str slave-dir "/*"))))))
;;; Marathon functions
+(defn install!
+ [test node mesos-version marathon-version]
+ (c/su
+ (debian/add-repo! :mesosphere
+ "deb http://repos.mesosphere.com/debian jessie main"
+ "keyserver.ubuntu.com"
+ "E56151BF")
+ (debian/install {:mesos mesos-version
+ :marathon marathon-version
+ :runit runit-version})
+ (c/exec :mkdir :-p "/var/run/mesos")
+ (c/exec :mkdir :-p master-dir)
+ (c/exec :mkdir :-p slave-dir)))
+
+(defn marathon-cmd
+ "Returns the command to run the marathon."
+ [test node]
+ (clojure.string/join " "
+ [marathon-bin
+ (str "--hostname " node)
+ (str "--master " (zookeeper-uri test zk-namespace))
+ (str "--zk " (zookeeper-uri test zk-marathon-namespace))
+ (str ">> " log-dir "/marathon.out")]))
+
+(defn create-marathon-supervised-service!
+ [test node]
+ (create-supervised-service! "marathon"
+ (marathon-cmd test node)))
+
+(defn marathon-node?
+ [test node]
+ (= node (first (sort (:nodes test)))))
+
(defn start-marathon!
[test node]
- (when (= node (first (sort (:nodes test))))
+ (when (marathon-node? test node)
(info "Start marathon")
(c/su
- (c/exec :start-stop-daemon :--start
- :--background
- :--exec marathon-bin
- :--make-pidfile
- :--no-close
- :--pidfile marathon-pidfile
- :--
- (c/lit (str "--hostname " node))
- (c/lit (str "--master " (zookeeper-uri test zk-namespace)))
- (c/lit (str "--zk " (zookeeper-uri test zk-marathon-namespace)))
- :>> (str log-dir "/marathon.stdout")
- (c/lit "2>&1")))))
+ (create-marathon-supervised-service! test node))))
(defn stop-marathon!
- []
- (cu/grepkill! "marathon"))
+ [test node]
+ (when (marathon-node? test node)
+ (stop-supervised-service! "marathon")))
(defn marathon-base-url
[test]
@@ -163,9 +204,9 @@
(start-slave! test node)
(start-marathon! test node))
(teardown! [this test node]
- (stop-slave! node)
- (stop-master! node)
- (stop-marathon!))
+ (stop-slave! test node)
+ (stop-master! test node)
+ (stop-marathon! test node))
db/LogFiles
(log-files [_ test node]
(if (cu/exists? log-dir) (cu/ls-full log-dir) []))))