You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@qpid.apache.org by ac...@apache.org on 2009/08/10 23:10:53 UTC

svn commit: r802927 - in /qpid/trunk: .gitignore qpid/cpp/src/Makefile.am qpid/cpp/src/cluster.mk qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp qpid/cpp/src/tests/cluster.mk qpid/cpp/src/tests/test_watchdog

Author: aconway
Date: Mon Aug 10 21:10:53 2009
New Revision: 802927

URL: http://svn.apache.org/viewvc?rev=802927&view=rev
Log:
Watchdog feature to remove unresponsive cluster nodes.

In some instances (e.g. while resolving an error) it's possible for a
hung process to hang the entire cluster as they wait for its response.
The cluster can handle terminated processes but hung processes present
a problem.

If the watchdog plugin is loaded and --watchdog-interval is set then
the broker forks a child process that runs a very simple watchdog
program, and starts a timer in the broker process to signal the watchdog
every interval/2 seconds. The watchdog kills its parent if it does not
receive a signal for interval seconds. This allows a stuck broker to be 
removed from the cluster so other cluster members can continue.

Added:
    qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp
    qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp
    qpid/trunk/qpid/cpp/src/tests/test_watchdog   (with props)
Modified:
    qpid/trunk/.gitignore
    qpid/trunk/qpid/cpp/src/Makefile.am
    qpid/trunk/qpid/cpp/src/cluster.mk
    qpid/trunk/qpid/cpp/src/tests/cluster.mk

Modified: qpid/trunk/.gitignore
URL: http://svn.apache.org/viewvc/qpid/trunk/.gitignore?rev=802927&r1=802926&r2=802927&view=diff
==============================================================================
--- qpid/trunk/.gitignore (original)
+++ qpid/trunk/.gitignore Mon Aug 10 21:10:53 2009
@@ -20,7 +20,7 @@
 qpid/cpp/libtool
 qpidc.spec
 qpid/cpp/src/gen/
-*.mk
+*gen.mk
 *.timestamp
 rgen.timestamp
 *.pcl

Modified: qpid/trunk/qpid/cpp/src/Makefile.am
URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/Makefile.am?rev=802927&r1=802926&r2=802927&view=diff
==============================================================================
--- qpid/trunk/qpid/cpp/src/Makefile.am (original)
+++ qpid/trunk/qpid/cpp/src/Makefile.am Mon Aug 10 21:10:53 2009
@@ -115,6 +115,7 @@
 # Destination for intalled programs and tests defined here
 #
 qpidexecdir = $(libexecdir)/qpid
+AM_CXXFLAGS += -DQPID_EXEC_DIR=\"$(qpidexecdir)\"
 qpidexec_PROGRAMS =
 qpidexec_SCRIPTS =
 qpidtestdir = $(qpidexecdir)/tests

Modified: qpid/trunk/qpid/cpp/src/cluster.mk
URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/cluster.mk?rev=802927&r1=802926&r2=802927&view=diff
==============================================================================
--- qpid/trunk/qpid/cpp/src/cluster.mk (original)
+++ qpid/trunk/qpid/cpp/src/cluster.mk Mon Aug 10 21:10:53 2009
@@ -89,4 +89,13 @@
 cluster_la_CXXFLAGS = $(AM_CXXFLAGS) -fno-strict-aliasing
 cluster_la_LDFLAGS = $(PLUGINLDFLAGS)
 
+# The watchdog plugin and helper executable
+dmodule_LTLIBRARIES += watchdog.la
+watchdog_la_SOURCES = qpid/cluster/WatchDogPlugin.cpp
+watchdog_la_LIBADD = libqpidbroker.la
+watchdog_la_LDFLAGS = $(PLUGINLDFLAGS)
+
+qpidexec_PROGRAMS += qpidd_watchdog
+qpidd_watchdog_SOURCES = qpid/cluster/qpidd_watchdog.cpp
+
 endif				# HAVE_LIBCPG

Added: qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp
URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp?rev=802927&view=auto
==============================================================================
--- qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp (added)
+++ qpid/trunk/qpid/cpp/src/qpid/cluster/WatchDogPlugin.cpp Mon Aug 10 21:10:53 2009
@@ -0,0 +1,114 @@
+/*
+ *
+ * Copyright (c) 2006 The Apache Software Foundation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "qpid/Plugin.h"
+#include "qpid/Options.h"
+#include "qpid/log/Statement.h"
+#include "qpid/broker/Broker.h"
+#include "qpid/sys/Timer.h"
+#include "qpid/sys/Fork.h"
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <signal.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+namespace qpid {
+namespace cluster {
+
+using broker::Broker;
+
+/** Settings for the watchdog plugin, filled in from the command line. */
+struct Settings {
+    Settings() : interval(0) {}
+    int interval;               // Seconds; 0 (the default) disables the watchdog.
+};
+
+/** Command line options for the watchdog plugin; values are stored in a Settings. */
+struct WatchDogOptions : public qpid::Options {
+    Settings& settings;
+
+    WatchDogOptions(Settings& s) : settings(s) {
+        // The help text is built from adjacent literals, which the compiler
+        // joins. A backslash line continuation inside a single literal would
+        // pull the next source line's leading whitespace into the message.
+        addOptions()
+            ("watchdog-interval", optValue(settings.interval, "N"),
+             "broker is automatically killed if it is hung for more than "
+             "N seconds. 0 disables watchdog.");
+    }
+};
+
+/**
+ * Timer task that sends the keepalive signal (SIGUSR1) to the watchdog
+ * process. Each firing first schedules a fresh task on the same timer,
+ * so the keepalive repeats at half the watchdog interval.
+ */
+struct WatchDogTask : public sys::TimerTask {
+    int pid;                    // Process that receives the keepalive.
+    sys::Timer& timer;          // Timer on which the next task is queued.
+    int interval;               // Watchdog interval, as given on the command line.
+
+    WatchDogTask(int watchdog, sys::Timer& t, int seconds)
+        : TimerTask(seconds*sys::TIME_SEC/2),
+          pid(watchdog),
+          timer(t),
+          interval(seconds)
+    {}
+
+    /** Queue the next keepalive task, then signal the watchdog. */
+    void fire() {
+        timer.add(
+            new WatchDogTask(pid, timer, interval));
+        QPID_LOG(debug, "Sending keepalive signal to watchdog");
+        ::kill(pid, SIGUSR1);
+    }
+};
+
+/**
+ * Broker plugin that forks a watchdog helper process and arranges for the
+ * broker's timer to send it keepalive signals.
+ *
+ * The watchdog is only started when the plugin target is a Broker and
+ * --watchdog-interval is non-zero.
+ */
+struct WatchDogPlugin : public qpid::Plugin, public qpid::sys::Fork {
+    Settings settings;
+    WatchDogOptions options;
+    Broker* broker;
+    int watchdogPid;            // 0 until a watchdog has been forked.
+
+    WatchDogPlugin() : options(settings), broker(0), watchdogPid(0) {}
+
+    /** Terminate and reap the watchdog child, if one was started. */
+    ~WatchDogPlugin() {
+        // With no child there is nothing to signal, and waitpid(0, ...)
+        // would wait on any child in the process group rather than on
+        // the watchdog, so both calls are guarded.
+        if (watchdogPid) {
+            ::kill(watchdogPid, SIGTERM);
+            ::waitpid(watchdogPid, 0, 0);
+        }
+    }
+
+    Options* getOptions() { return &options; }
+
+    /** Fork the watchdog if the target is a Broker and an interval is set. */
+    void earlyInitialize(qpid::Plugin::Target& target) {
+        // The cast yields 0 for targets that are not a Broker.
+        broker = dynamic_cast<Broker*>(&target);
+        if (broker && settings.interval) {
+            QPID_LOG(notice, "Starting watchdog process with interval of " <<
+                     settings.interval << " seconds");
+            fork();
+        }
+    }
+
+    void initialize(Target&) {}
+
+  protected:
+
+    /**
+     * Child of fork: replace this process with the watchdog program,
+     * passing the interval as its only argument. QPID_WATCHDOG_EXE
+     * overrides the installed location of the program.
+     */
+    void child() {
+        const char* watchdog = ::getenv("QPID_WATCHDOG_EXE"); // For use in tests
+        if (!watchdog) watchdog=QPID_EXEC_DIR "/qpidd_watchdog";
+        std::string interval = boost::lexical_cast<std::string>(settings.interval);
+        // The argument list of a variadic exec call must end with a null
+        // pointer of type char*; a bare NULL is not guaranteed to be one.
+        ::execl(watchdog, watchdog, interval.c_str(), static_cast<char*>(0));
+        // execl only returns on failure.
+        QPID_LOG(critical, "Failed to exec watchdog program " << watchdog );
+        // Take the broker down too -- presumably so that it never runs
+        // without the watchdog it was configured with.
+        ::kill(::getppid(), SIGKILL);
+        // _exit rather than exit: this is a forked copy of the broker, and
+        // exit would run the atexit handlers and static destructors that
+        // belong to the parent.
+        ::_exit(1);
+    }
+
+    /** Parent of fork: record the child and start sending keepalives. */
+    void parent(int pid) {
+        watchdogPid = pid;
+        broker->getTimer().add(
+            new WatchDogTask(watchdogPid, broker->getTimer(), settings.interval));
+        // TODO aconway 2009-08-10: to be extra safe, we could monitor
+        // the watchdog child and re-start it if it exits.
+    }
+};
+
+static WatchDogPlugin instance; // Static initialization.
+
+}} // namespace qpid::cluster

Added: qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp
URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp?rev=802927&view=auto
==============================================================================
--- qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp (added)
+++ qpid/trunk/qpid/cpp/src/qpid/cluster/qpidd_watchdog.cpp Mon Aug 10 21:10:53 2009
@@ -0,0 +1,60 @@
+/*
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ */
+#include <sys/types.h>
+#include <sys/time.h>
+#include <signal.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <limits.h>
+
+long timeout;                   // Seconds allowed between keepalive signals.
+static pid_t parentPid = 0;     // Parent at start-up: the process being watched.
+
+/**
+ * SIGALRM handler: no keepalive arrived within the timeout, so kill the
+ * watched process and exit with status 1.
+ *
+ * If the original parent has already gone, this process has been
+ * re-parented and getppid() names an unrelated process (init or a
+ * sub-reaper). Only the parent recorded at start-up is ever signalled,
+ * and only while it is still our parent.
+ */
+void killParent(int) {
+    if (::getppid() == parentPid) {
+        ::kill(parentPid, SIGKILL);
+        ::fprintf(stderr, "Watchdog killed unresponsive broker, pid=%d\n",
+                  static_cast<int>(parentPid));
+    }
+    ::exit(1);
+}
+
+/**
+ * SIGUSR1 handler: (re)arm the one-shot real-time timer so that it
+ * expires timeout seconds from now. If the timer cannot be set the
+ * parent is killed, since it could no longer be watched.
+ */
+void resetTimer(int) {
+    struct ::itimerval itval = { { 0, 0 }, { timeout, 0 } };
+    if (::setitimer(ITIMER_REAL, &itval, 0) !=0) {
+        ::perror("Watchdog failed to set timer");
+        killParent(0);          // Does not return.
+    }
+}
+
+/** Simple watchdog program: kill parent process if timeout
+ * expires without a SIGUSR1.
+ * The watchdog plugin stops it with SIGTERM when the broker shuts down.
+ * Args: timeout in seconds.
+ */
+int main(int argc, char** argv) {
+    // Remember who started us before anything else can happen to them.
+    parentPid = ::getppid();
+    if(argc != 2 || (timeout = atoi(argv[1])) == 0) {
+        ::fprintf(stderr, "Usage: %s <timeout_seconds>\n", argv[0]);
+        ::exit(1);
+    }
+    ::signal(SIGUSR1, resetTimer);
+    ::signal(SIGALRM, killParent);
+    resetTimer(0);              // Start the first countdown.
+    // All further work happens in the signal handlers.
+    while (true) { sleep(INT_MAX); }
+}

Modified: qpid/trunk/qpid/cpp/src/tests/cluster.mk
URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/tests/cluster.mk?rev=802927&r1=802926&r2=802927&view=diff
==============================================================================
--- qpid/trunk/qpid/cpp/src/tests/cluster.mk (original)
+++ qpid/trunk/qpid/cpp/src/tests/cluster.mk Mon Aug 10 21:10:53 2009
@@ -29,44 +29,46 @@
 
 
 # ais_check checks pre-requisites for cluster tests and runs them if ok.
-TESTS += \
-    ais_check \
-	run_cluster_tests \
-	federated_cluster_test \
+TESTS +=					\
+	ais_check				\
+	test_watchdog				\
+	run_cluster_tests			\
+	federated_cluster_test			\
 	clustered_replication_test
-	
-EXTRA_DIST += \
-	ais_check \
-	start_cluster \
-	stop_cluster \
-	restart_cluster \
-	cluster_python_tests \
-	cluster_python_tests_failing.txt \
-  	federated_cluster_test \
-  	clustered_replication_test \
-  	run_cluster_tests \
-  	run_long_cluster_tests \
-  	testlib.py \
-  	cluster_tests.py \
-  	long_cluster_tests.py
-  	
-
-LONG_TESTS += \
-	run_long_cluster_tests \
-	start_cluster \
-	cluster_python_tests \
+
+EXTRA_DIST +=					\
+	ais_check				\
+	start_cluster				\
+	stop_cluster				\
+	restart_cluster				\
+	cluster_python_tests			\
+	cluster_python_tests_failing.txt	\
+	federated_cluster_test			\
+	clustered_replication_test		\
+	run_cluster_tests			\
+	run_long_cluster_tests			\
+	testlib.py				\
+	cluster_tests.py			\
+	long_cluster_tests.py
+
+LONG_TESTS +=					\
+	run_long_cluster_tests			\
+	start_cluster				\
+	cluster_python_tests			\
 	stop_cluster
 
 qpidtest_PROGRAMS += cluster_test
-cluster_test_SOURCES = \
-  	cluster_test.cpp \
-  	unit_test.cpp \
-  	ClusterFixture.cpp \
-  	ClusterFixture.h \
-  	ForkedBroker.h \
-  	ForkedBroker.cpp \
-  	PartialFailure.cpp \
-  	ClusterFailover.cpp
+
+cluster_test_SOURCES =				\
+	cluster_test.cpp			\
+	unit_test.cpp				\
+	ClusterFixture.cpp			\
+	ClusterFixture.h			\
+	ForkedBroker.h				\
+	ForkedBroker.cpp			\
+	PartialFailure.cpp			\
+	ClusterFailover.cpp
+
 cluster_test_LDADD=$(lib_client) $(lib_broker) -lboost_unit_test_framework
 
 qpidtest_SCRIPTS += run_cluster_tests cluster_tests.py run_long_cluster_tests long_cluster_tests.py testlib.py

Added: qpid/trunk/qpid/cpp/src/tests/test_watchdog
URL: http://svn.apache.org/viewvc/qpid/trunk/qpid/cpp/src/tests/test_watchdog?rev=802927&view=auto
==============================================================================
--- qpid/trunk/qpid/cpp/src/tests/test_watchdog (added)
+++ qpid/trunk/qpid/cpp/src/tests/test_watchdog Mon Aug 10 21:10:53 2009
@@ -0,0 +1,16 @@
+#!/bin/sh
+# Tests for the watchdog plug-in
+
+# Start a broker with  watchdog, freeze it with kill -STOP, verify that it is killed.
+export QPID_WATCHDOG_EXE=$PWD/../qpidd_watchdog
+PORT=`../qpidd -dp0 --no-data-dir --auth=no --no-module-dir --load-module $PWD/../.libs/watchdog.so --log-to-file=qpidd_watchdog.log --watchdog-interval 1`
+# Without these checks a broker that never started would look like a
+# broker that was successfully killed.
+if test -z "$PORT"; then
+    echo "Failed to start broker."
+    exit 1
+fi
+PID=`../qpidd -cp $PORT`
+if test -z "$PID"; then
+    echo "Failed to get broker pid."
+    exit 1
+fi
+kill -STOP $PID
+sleep 2
+
+if kill -0 $PID 2>/dev/null; then
+    echo "Hung process did not die."
+    # The broker is stopped, so a plain TERM would stay pending; use KILL.
+    kill -9 $PID
+    # Report the failure: the status of kill above would otherwise be
+    # the status of the test.
+    exit 1
+fi
+exit 0

Propchange: qpid/trunk/qpid/cpp/src/tests/test_watchdog
------------------------------------------------------------------------------
    svn:executable = *



---------------------------------------------------------------------
Apache Qpid - AMQP Messaging Implementation
Project:      http://qpid.apache.org
Use/Interact: mailto:commits-subscribe@qpid.apache.org