You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@flink.apache.org by ga...@apache.org on 2019/07/23 19:32:23 UTC

[flink] branch master updated: [FLINK-13345][tests] Dump jstack output for Flink JVMs

This is an automated email from the ASF dual-hosted git repository.

gary pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/flink.git


The following commit(s) were added to refs/heads/master by this push:
     new 869ccd6  [FLINK-13345][tests] Dump jstack output for Flink JVMs
869ccd6 is described below

commit 869ccd68ac442f72e017232a6e7b91948cadb4dd
Author: Gary Yao <ga...@apache.org>
AuthorDate: Sun Jul 21 20:00:24 2019 +0200

    [FLINK-13345][tests] Dump jstack output for Flink JVMs
    
    Dump the jstack output for all Flink JVMs at the end of each Jepsen test in the
    log aggregation phase. This can be helpful for debugging deadlocks.
    
    This closes #9194.
---
 flink-jepsen/README.md                  |  3 ++-
 flink-jepsen/src/jepsen/flink/db.clj    |  7 ++++++-
 flink-jepsen/src/jepsen/flink/utils.clj | 30 ++++++++++++++++++++++++++++++
 3 files changed, 38 insertions(+), 2 deletions(-)

diff --git a/flink-jepsen/README.md b/flink-jepsen/README.md
index d329c83..5694b74 100644
--- a/flink-jepsen/README.md
+++ b/flink-jepsen/README.md
@@ -78,4 +78,5 @@ or
 depending on whether the test passed or not. If neither output is generated, the test did not finish
 properly due to problems of the environment, bugs in Jepsen or in the test suite, etc.
 
-In addition, the test directories contain all relevant log files aggregated from all hosts.
+In addition, the test directories contain all relevant log files, and the jstack output for all Flink JVMs
+aggregated from the DB nodes.
diff --git a/flink-jepsen/src/jepsen/flink/db.clj b/flink-jepsen/src/jepsen/flink/db.clj
index 4aaa3f2..8ca200f 100644
--- a/flink-jepsen/src/jepsen/flink/db.clj
+++ b/flink-jepsen/src/jepsen/flink/db.clj
@@ -129,7 +129,12 @@
 
                         db/LogFiles
                         (log-files [_ _ _]
-                          (fu/find-files! log-dir)))]
+                          (c/su
+                            (fu/dump-jstack-by-pattern! log-dir
+                                                        "TaskExecutor"
+                                                        "TaskManager"
+                                                        "ClusterEntrypoint")
+                            (fu/find-files! log-dir))))]
     (combined-db [flink-base-db db])))
 
 (defn- sorted-nodes
diff --git a/flink-jepsen/src/jepsen/flink/utils.clj b/flink-jepsen/src/jepsen/flink/utils.clj
index 5d16568..8f6f654 100644
--- a/flink-jepsen/src/jepsen/flink/utils.clj
+++ b/flink-jepsen/src/jepsen/flink/utils.clj
@@ -112,3 +112,33 @@
     ;; Remove all symlinks in /etc/service except sshd.
     ;; This is only relevant when tests are run in Docker because there sshd is started using runit.
     (meh (c/exec :find (c/lit (str "/etc/service -mindepth 1 -maxdepth 1 -type l -not -name 'sshd' -delete"))))))
+
+;;; jstack
+
+(defn- includes-any?
+  "Returns true if the string s contains at least one of the given substrs,
+  otherwise nil."
+  [s substrs]
+  (some (fn [substr] (clojure.string/includes? s substr))
+        substrs))
+
+(defn- jps!
+  "With no arguments, invokes jps through c/exec and returns a sequence with one
+  vector per output line, obtained by splitting the line on single whitespace
+  characters.
+
+  With class-name-patterns, keeps only the entries that consist of exactly two
+  fields, [pid class-name], and whose class name contains at least one of the
+  given patterns."
+  ([]
+   (let [output (clojure.string/trim (c/exec :jps))
+         lines (clojure.string/split output #"\n")]
+     (map (fn [line] (clojure.string/split line #"\s")) lines)))
+
+  ([class-name-patterns]
+   (filter (fn [[_ class-name :as fields]]
+             ;; The field-count check comes first so that entries without a
+             ;; class name are rejected before the substring match is attempted.
+             (and (= 2 (count fields))
+                  (includes-any? class-name class-name-patterns)))
+           (jps!))))
+
+(defn- write-jstack!                    ; runs jstack for a single pid via c/exec
+  [pid out-path]                         ; pid: process id as reported by jps; out-path: destination file path
+  (c/exec :jstack :-l pid :> out-path)) ; NOTE(review): :> is presumably rendered by c/exec as a shell redirect into out-path - confirm
+
+(defn dump-jstack-by-pattern!
+  "Dumps the output of jstack for all JVMs that match one of the specified patterns.
+
+  For every JVM listed by jps whose class name contains at least one of
+  class-name-patterns, the jstack output is written to
+  <out-dir>/jstack_<pid>_<class-name>.
+
+  Dumping is best-effort per JVM: if jstack fails for one process (for example
+  because it exited after jps listed it), the failure is swallowed so that the
+  remaining JVMs are still dumped and the caller can carry on."
+  [out-dir & class-name-patterns]
+  (doseq [[pid class-name] (jps! class-name-patterns)]
+    (let [out-path (str out-dir "/jstack_" pid "_" class-name)]
+      ;; meh returns exceptions instead of throwing them; a failed dump for one
+      ;; pid must not abort the dumps for the remaining pids.
+      (meh (write-jstack! pid out-path)))))