You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@flink.apache.org by GitBox <gi...@apache.org> on 2018/08/09 08:05:38 UTC

[GitHub] asfgit closed pull request #6496: [FLINK-10063][tests] Use runit to supervise mesos processes.

asfgit closed pull request #6496: [FLINK-10063][tests] Use runit to supervise mesos processes.
URL: https://github.com/apache/flink/pull/6496
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below, because GitHub does not otherwise display it after the merge:

diff --git a/flink-jepsen/docker/Dockerfile-db b/flink-jepsen/docker/Dockerfile-db
index 1555329af3f..cb60efce2e5 100644
--- a/flink-jepsen/docker/Dockerfile-db
+++ b/flink-jepsen/docker/Dockerfile-db
@@ -21,7 +21,7 @@ FROM debian:jessie
 RUN echo "deb http://http.debian.net/debian jessie-backports main" >> /etc/apt/sources.list && \
     apt-get update && \
     apt-get install -y -t jessie-backports openjdk-8-jdk && \
-    apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget
+    apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog runit sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget
 
 RUN apt-get update && \
     apt-get -y install openssh-server && \
@@ -35,5 +35,12 @@ RUN mkdir -p /root/.ssh/ && \
     chmod 600 /root/.ssh/authorized_keys && \
     cat /root/id_rsa.pub >> /root/.ssh/authorized_keys
 
+COPY sshd-run /etc/sv/service/sshd/run
+RUN chmod +x /etc/sv/service/sshd/run && \
+    ln -sf /etc/sv/service/sshd /etc/service
+
 EXPOSE 22
-CMD exec /usr/sbin/sshd -D
+
+# Start runit process supervisor which will bring up sshd.
+# In our tests we can use runit to supervise more processes, e.g., Mesos.
+CMD runsvdir -P /etc/service /dev/null > /dev/null
diff --git a/flink-jepsen/src/jepsen/flink/db.clj b/flink-jepsen/src/jepsen/flink/db.clj
index 9a725d7149a..becc551e2cf 100644
--- a/flink-jepsen/src/jepsen/flink/db.clj
+++ b/flink-jepsen/src/jepsen/flink/db.clj
@@ -97,7 +97,7 @@
   (if (cu/exists? log-dir) (cu/ls-full log-dir) []))
 
 (defn flink-db
-  [test]
+  []
   (reify db/DB
     (setup! [_ test node]
       (c/su
@@ -131,7 +131,7 @@
   []
   (let [zk (zk/db deb-zookeeper-package)
         hadoop (hadoop/db hadoop-dist-url)
-        flink (flink-db test)]
+        flink (flink-db)]
     (combined-db [hadoop zk flink])))
 
 (defn exec-flink!
@@ -192,7 +192,7 @@
   (let [zk (zk/db deb-zookeeper-package)
         hadoop (hadoop/db hadoop-dist-url)
         mesos (mesos/db deb-mesos-package deb-marathon-package)
-        flink (flink-db test)]
+        flink (flink-db)]
     (combined-db [hadoop zk mesos flink])))
 
 (defn submit-job-with-retry!
@@ -209,24 +209,25 @@
     (let [r (fu/retry (fn []
                         (http/post
                           (str (mesos/marathon-base-url test) "/v2/apps")
-                          {:form-params  {:id   "flink"
-                                          :cmd  (str "HADOOP_CLASSPATH=`" hadoop/install-dir "/bin/hadoop classpath` "
-                                                     "HADOOP_CONF_DIR=" hadoop/hadoop-conf-dir " "
-                                                     install-dir "/bin/mesos-appmaster.sh "
-                                                     "-Dmesos.master=" (zookeeper-uri
-                                                                         test
-                                                                         mesos/zk-namespace) " "
-                                                     "-Djobmanager.rpc.address=$(hostname -f) "
-                                                     "-Djobmanager.heap.mb=2048 "
-                                                     "-Djobmanager.rpc.port=6123 "
-                                                     "-Djobmanager.web.port=8081 "
-                                                     "-Dmesos.resourcemanager.tasks.mem=2048 "
-                                                     "-Dtaskmanager.heap.mb=2048 "
-                                                     "-Dtaskmanager.numberOfTaskSlots=2 "
-                                                     "-Dmesos.resourcemanager.tasks.cpus=1 "
-                                                     "-Drest.bind-address=$(hostname -f) ")
-                                          :cpus 1.0
-                                          :mem  2048}
+                          {:form-params  {:id                    "flink"
+                                          :cmd                   (str "HADOOP_CLASSPATH=`" hadoop/install-dir "/bin/hadoop classpath` "
+                                                                      "HADOOP_CONF_DIR=" hadoop/hadoop-conf-dir " "
+                                                                      install-dir "/bin/mesos-appmaster.sh "
+                                                                      "-Dmesos.master=" (zookeeper-uri
+                                                                                          test
+                                                                                          mesos/zk-namespace) " "
+                                                                      "-Djobmanager.rpc.address=$(hostname -f) "
+                                                                      "-Djobmanager.heap.mb=2048 "
+                                                                      "-Djobmanager.rpc.port=6123 "
+                                                                      "-Djobmanager.web.port=8081 "
+                                                                      "-Dmesos.resourcemanager.tasks.mem=2048 "
+                                                                      "-Dtaskmanager.heap.mb=2048 "
+                                                                      "-Dtaskmanager.numberOfTaskSlots=2 "
+                                                                      "-Dmesos.resourcemanager.tasks.cpus=1 "
+                                                                      "-Drest.bind-address=$(hostname -f) ")
+                                          :cpus                  1.0
+                                          :mem                   2048
+                                          :maxLaunchDelaySeconds 3}
                            :content-type :json})))]
       (info "Submitted Flink Application via Marathon" r)
       (c/on (-> test :nodes sort first)
diff --git a/flink-jepsen/src/jepsen/flink/mesos.clj b/flink-jepsen/src/jepsen/flink/mesos.clj
index fd75991bdf5..a73f25fd489 100644
--- a/flink-jepsen/src/jepsen/flink/mesos.clj
+++ b/flink-jepsen/src/jepsen/flink/mesos.clj
@@ -24,11 +24,37 @@
             [jepsen.os.debian :as debian]
             [jepsen.flink.zookeeper :refer [zookeeper-uri]]))
 
+;;; runit process supervisor (http://smarden.org/runit/)
+;;;
+;;; We use runit to supervise Mesos processes because Mesos uses a "fail-fast" approach to
+;;; error handling, e.g., the Mesos master will exit when it discovers it has been partitioned away
+;;; from the Zookeeper quorum.
+
+(def runit-version "2.1.2-3")
+
+(defn create-supervised-service!
+  "Registers a service with the process supervisor and starts it."
+  [service-name cmd]
+  (let [service-dir (str "/etc/sv/" service-name)
+        run-script (str service-dir "/run")]
+    (c/su
+      (c/exec :mkdir :-p service-dir)
+      (c/exec :echo (clojure.string/join "\n" ["#!/bin/sh"
+                                               "exec 2>&1"
+                                               (str "exec " cmd)]) :> run-script)
+      (c/exec :chmod :+x run-script)
+      (c/exec :ln :-sf service-dir (str "/etc/service/" service-name)))))
+
+(defn stop-supervised-service!
+  "Stops a service and removes it from supervision."
+  [service-name]
+  (c/su
+    (c/exec :sv :down service-name)
+    (c/exec :rm :-f (str "/etc/service/" service-name))))
+
 ;;; Mesos
 
 (def master-count 1)
-(def master-pidfile "/var/run/mesos/master.pid")
-(def slave-pidfile "/var/run/mesos/slave.pid")
 (def master-dir "/var/lib/mesos/master")
 (def slave-dir "/var/lib/mesos/slave")
 (def log-dir "/var/log/mesos")
@@ -40,115 +66,130 @@
 
 (def marathon-bin "/usr/bin/marathon")
 (def zk-marathon-namespace "marathon")
-(def marathon-pidfile "/var/run/mesos/marathon.pid")
 (def marathon-rest-port 8080)
 
-(defn install!
-  [test node mesos-version marathon-version]
-  (c/su
-    (debian/add-repo! :mesosphere
-                      "deb http://repos.mesosphere.com/debian jessie main"
-                      "keyserver.ubuntu.com"
-                      "E56151BF")
-    (debian/install {:mesos    mesos-version
-                     :marathon marathon-version})
-    (c/exec :mkdir :-p "/var/run/mesos")
-    (c/exec :mkdir :-p master-dir)
-    (c/exec :mkdir :-p slave-dir)))
-
 ;;; Mesos functions
 
+(defn mesos-master-cmd
+  "Returns the command to run the mesos master."
+  [test node]
+  (clojure.string/join " "
+                       ["env GLOG_v=1"
+                        master-bin
+                        (str "--hostname=" (name node))
+                        (str "--log_dir=" log-dir)
+                        (str "--offer_timeout=30secs")
+                        (str "--quorum=" (util/majority master-count))
+                        (str "--registry_fetch_timeout=120secs")
+                        (str "--registry_store_timeout=5secs")
+                        (str "--work_dir=" master-dir)
+                        (str "--zk=" (zookeeper-uri test zk-namespace))]))
+
+(defn mesos-slave-cmd
+  "Returns the command to run the mesos agent."
+  [test node]
+  (clojure.string/join " "
+                       ["env GLOG_v=1"
+                        slave-bin
+                        (str "--hostname=" (name node))
+                        (str "--log_dir=" log-dir)
+                        (str "--master=" (zookeeper-uri test zk-namespace))
+                        (str "--recovery_timeout=30secs")
+                        (str "--work_dir=" slave-dir)]))
+
+(defn create-mesos-master-supervised-service!
+  [test node]
+  (create-supervised-service! "mesos-master"
+                              (mesos-master-cmd test node)))
+
+(defn create-mesos-slave-supervised-service!
+  [test node]
+  (create-supervised-service! "mesos-slave"
+                              (mesos-slave-cmd test node)))
+
+(defn master-node?
+  "Returns a truthy value if the node should run the mesos master."
+  [test node]
+  (some #{node} (take master-count (sort (:nodes test)))))
+
 (defn start-master!
   [test node]
-  (when (some #{node} (take master-count (sort (:nodes test))))
+  (when (master-node? test node)
     (info node "Starting mesos master")
     (c/su
-      (c/exec :start-stop-daemon
-              :--background
-              :--chdir master-dir
-              :--exec "/usr/bin/env"
-              :--make-pidfile
-              :--no-close
-              :--oknodo
-              :--pidfile master-pidfile
-              :--start
-              :--
-              "GLOG_v=1"
-              master-bin
-              (str "--hostname=" (name node))
-              (str "--log_dir=" log-dir)
-              (str "--offer_timeout=30secs")
-              (str "--quorum=" (util/majority master-count))
-              (str "--registry_fetch_timeout=120secs")
-              (str "--registry_store_timeout=5secs")
-              (str "--work_dir=" master-dir)
-              (str "--zk=" (zookeeper-uri test zk-namespace))
-              :>> (str log-dir "/master.stdout")
-              (c/lit "2>&1")))))
+      (create-mesos-master-supervised-service! test node))))
 
 (defn start-slave!
   [test node]
-  (when-not (some #{node} (take master-count (sort (:nodes test))))
+  (when-not (master-node? test node)
     (info node "Starting mesos slave")
     (c/su
-      (c/exec :start-stop-daemon :--start
-              :--background
-              :--chdir slave-dir
-              :--exec slave-bin
-              :--make-pidfile
-              :--no-close
-              :--pidfile slave-pidfile
-              :--oknodo
-              :--
-              (str "--hostname=" (name node))
-              (str "--log_dir=" log-dir)
-              (str "--master=" (zookeeper-uri test zk-namespace))
-              (str "--recovery_timeout=30secs")
-              (str "--work_dir=" slave-dir)
-              :>> (str log-dir "/slave.stdout")
-              (c/lit "2>&1")))))
+      (create-mesos-slave-supervised-service! test node))))
 
 (defn stop-master!
-  [node]
-  (info node "Stopping mesos master")
-  (meh (cu/grepkill! :mesos-master))
-  (meh (c/exec :rm :-rf master-pidfile))
-  (meh (c/exec :rm :-rf
-               (c/lit (str log-dir "/*"))
-               (c/lit (str master-dir "/*")))))
+  [test node]
+  (when (master-node? test node)
+    (info node "Stopping mesos master")
+    (stop-supervised-service! "mesos-master")
+    (meh (c/exec :rm :-rf
+                 (c/lit (str log-dir "/*"))
+                 (c/lit (str master-dir "/*"))))))
 
 (defn stop-slave!
-  [node]
-  (info node "Stopping mesos slave")
-  (meh (cu/grepkill! :mesos-slave))
-  (meh (c/exec :rm :-rf slave-pidfile))
-  (meh (c/exec :rm :-rf
-               (c/lit (str log-dir "/*"))
-               (c/lit (str slave-dir "/*")))))
+  [test node]
+  (when-not (master-node? test node)
+    (info node "Stopping mesos slave")
+    (stop-supervised-service! "mesos-slave")
+    (meh (c/exec :rm :-rf
+                 (c/lit (str log-dir "/*"))
+                 (c/lit (str slave-dir "/*"))))))
 
 ;;; Marathon functions
 
+(defn install!
+  [test node mesos-version marathon-version]
+  (c/su
+    (debian/add-repo! :mesosphere
+                      "deb http://repos.mesosphere.com/debian jessie main"
+                      "keyserver.ubuntu.com"
+                      "E56151BF")
+    (debian/install {:mesos    mesos-version
+                     :marathon marathon-version
+                     :runit    runit-version})
+    (c/exec :mkdir :-p "/var/run/mesos")
+    (c/exec :mkdir :-p master-dir)
+    (c/exec :mkdir :-p slave-dir)))
+
+(defn marathon-cmd
+  "Returns the command to run the marathon."
+  [test node]
+  (clojure.string/join " "
+                       [marathon-bin
+                        (str "--hostname " node)
+                        (str "--master " (zookeeper-uri test zk-namespace))
+                        (str "--zk " (zookeeper-uri test zk-marathon-namespace))
+                        (str ">> " log-dir "/marathon.out")]))
+
+(defn create-marathon-supervised-service!
+  [test node]
+  (create-supervised-service! "marathon"
+                              (marathon-cmd test node)))
+
+(defn marathon-node?
+  [test node]
+  (= node (first (sort (:nodes test)))))
+
 (defn start-marathon!
   [test node]
-  (when (= node (first (sort (:nodes test))))
+  (when (marathon-node? test node)
     (info "Start marathon")
     (c/su
-      (c/exec :start-stop-daemon :--start
-              :--background
-              :--exec marathon-bin
-              :--make-pidfile
-              :--no-close
-              :--pidfile marathon-pidfile
-              :--
-              (c/lit (str "--hostname " node))
-              (c/lit (str "--master " (zookeeper-uri test zk-namespace)))
-              (c/lit (str "--zk " (zookeeper-uri test zk-marathon-namespace)))
-              :>> (str log-dir "/marathon.stdout")
-              (c/lit "2>&1")))))
+      (create-marathon-supervised-service! test node))))
 
 (defn stop-marathon!
-  []
-  (cu/grepkill! "marathon"))
+  [test node]
+  (when (marathon-node? test node)
+    (stop-supervised-service! "marathon")))
 
 (defn marathon-base-url
   [test]
@@ -163,9 +204,9 @@
       (start-slave! test node)
       (start-marathon! test node))
     (teardown! [this test node]
-      (stop-slave! node)
-      (stop-master! node)
-      (stop-marathon!))
+      (stop-slave! test node)
+      (stop-master! test node)
+      (stop-marathon! test node))
     db/LogFiles
     (log-files [_ test node]
       (if (cu/exists? log-dir) (cu/ls-full log-dir) []))))


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services