You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@apisix.apache.org by sp...@apache.org on 2021/10/14 08:03:35 UTC
[apisix] branch master updated: feat: etcd cluster single node failure APISIX startup failure (#5158)
This is an automated email from the ASF dual-hosted git repository.
spacewander pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git
The following commit(s) were added to refs/heads/master by this push:
new a413014 feat: etcd cluster single node failure APISIX startup failure (#5158)
a413014 is described below
commit a413014debc00b79566a8607ca4a340bd76d9068
Author: 帅进超 <sh...@gmail.com>
AuthorDate: Thu Oct 14 16:03:26 2021 +0800
feat: etcd cluster single node failure APISIX startup failure (#5158)
---
apisix/cli/etcd.lua | 48 ++++++++++++++++++++++++++----------------
t/cli/test_etcd_healthcheck.sh | 42 +++++++++++++++++++++++++++++++++---
2 files changed, 69 insertions(+), 21 deletions(-)
diff --git a/apisix/cli/etcd.lua b/apisix/cli/etcd.lua
index 3cdaaa8..4595ec5 100644
--- a/apisix/cli/etcd.lua
+++ b/apisix/cli/etcd.lua
@@ -32,6 +32,8 @@ local tonumber = tonumber
local str_format = string.format
local str_sub = string.sub
local table_concat = table.concat
+local table_insert = table.insert
+local io_stderr = io.stderr
local _M = {}
@@ -187,6 +189,7 @@ function _M.init(env, args)
end
-- check the etcd cluster version
+ local etcd_healthy_hosts = {}
for index, host in ipairs(yaml_conf.etcd.host) do
local version_url = host .. "/version"
local errmsg
@@ -206,29 +209,38 @@ function _M.init(env, args)
version_url, err, retry_time))
end
- if not res then
- errmsg = str_format("request etcd endpoint \'%s\' error, %s\n", version_url, err)
- util.die(errmsg)
- end
+ if res then
+ local body, _, err = dkjson.decode(res)
+ if err or (body and not body["etcdcluster"]) then
+ errmsg = str_format("got malformed version message: \"%s\" from etcd \"%s\"\n", res,
+ version_url)
+ util.die(errmsg)
+ end
- local body, _, err = dkjson.decode(res)
- if err or (body and not body["etcdcluster"]) then
- errmsg = str_format("got malformed version message: \"%s\" from etcd \"%s\"\n", res,
- version_url)
- util.die(errmsg)
- end
+ local cluster_version = body["etcdcluster"]
+ if compare_semantic_version(cluster_version, env.min_etcd_version) then
+ util.die("etcd cluster version ", cluster_version,
+ " is less than the required version ", env.min_etcd_version,
+ ", please upgrade your etcd cluster\n")
+ end
- local cluster_version = body["etcdcluster"]
- if compare_semantic_version(cluster_version, env.min_etcd_version) then
- util.die("etcd cluster version ", cluster_version,
- " is less than the required version ",
- env.min_etcd_version,
- ", please upgrade your etcd cluster\n")
+ table_insert(etcd_healthy_hosts, host)
+ else
+ io_stderr:write(str_format("request etcd endpoint \'%s\' error, %s\n", version_url,
+ err))
end
end
+ if #etcd_healthy_hosts <= 0 then
+ util.die("all etcd nodes are unavailable\n")
+ end
+
+ if (#etcd_healthy_hosts / host_count * 100) <= 50 then
+ util.die("the etcd cluster needs at least 50% and above healthy nodes\n")
+ end
+
local etcd_ok = false
- for index, host in ipairs(yaml_conf.etcd.host) do
+ for index, host in ipairs(etcd_healthy_hosts) do
local is_success = true
local errmsg
@@ -358,7 +370,7 @@ function _M.init(env, args)
end
if not etcd_ok then
- util.die("none of the configured etcd works well")
+ util.die("none of the configured etcd works well\n")
end
end
diff --git a/t/cli/test_etcd_healthcheck.sh b/t/cli/test_etcd_healthcheck.sh
index 75dd78d..bc7e3dc 100755
--- a/t/cli/test_etcd_healthcheck.sh
+++ b/t/cli/test_etcd_healthcheck.sh
@@ -41,7 +41,7 @@ etcd:
docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml up -d
-# Check apisix not got effected when one etcd node disconnected
+# case 1: Check apisix not got effected when one etcd node disconnected
make init && make run
docker stop ${ETCD_NAME_0}
@@ -69,7 +69,7 @@ make stop
echo "passed: apisix not got effected when one etcd node disconnected"
-# Check when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected
+# case 2: Check when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected
make init && make run
docker stop ${ETCD_NAME_0} && docker stop ${ETCD_NAME_1} && docker stop ${ETCD_NAME_2}
@@ -84,7 +84,7 @@ fi
docker start ${ETCD_NAME_0} && docker start ${ETCD_NAME_1} && docker start ${ETCD_NAME_2}
-# sleep till etcd health check try to check again
+# case 3: sleep till etcd health check try to check again
current_time=$(date +%s)
sleep_seconds=$(( $sleep_till - $current_time + 3))
if [ "$sleep_seconds" -gt 0 ]; then
@@ -102,3 +102,39 @@ fi
make stop
echo "passed: when all etcd nodes disconnected, apisix trying to reconnect with backoff, and could successfully recover when reconnected"
+
+# case 4: stop one etcd node (result: start successful)
+docker stop ${ETCD_NAME_0}
+
+out=$(make init 2>&1)
+if echo "$out" | grep "23790" | grep "connection refused"; then
+ echo "passed: APISIX successfully to start, stop only one etcd node"
+else
+ echo "failed: stop only one etcd node APISIX should start normally"
+ exit 1
+fi
+
+# case 5: stop two etcd nodes (result: start failure)
+docker stop ${ETCD_NAME_1}
+
+out=$(make init 2>&1 || true)
+if echo "$out" | grep "23791" | grep "connection refused"; then
+ echo "passed: APISIX failed to start, etcd cluster must have two or more healthy nodes"
+else
+ echo "failed: two etcd nodes have been stopped, APISIX should fail to start"
+ exit 1
+fi
+
+# case 6: stop all etcd nodes (result: start failure)
+docker stop ${ETCD_NAME_2}
+
+out=$(make init 2>&1 || true)
+if echo "$out" | grep "23792" | grep "connection refused"; then
+ echo "passed: APISIX failed to start, all etcd nodes have stopped"
+else
+ echo "failed: all etcd nodes have stopped, APISIX should not be able to start"
+ exit 1
+fi
+
+# stop etcd docker container
+docker-compose -f ./t/cli/docker-compose-etcd-cluster.yaml down