You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@apisix.apache.org by me...@apache.org on 2021/02/27 14:17:29 UTC

[apisix] branch master updated: fix(chash): ensure retry can try every node (#3651)

This is an automated email from the ASF dual-hosted git repository.

membphis pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/apisix.git


The following commit(s) were added to refs/heads/master by this push:
     new e58a55f  fix(chash): ensure retry can try every node (#3651)
e58a55f is described below

commit e58a55fbc6e5ad2007ff206ac44096945c869390
Author: 罗泽轩 <sp...@gmail.com>
AuthorDate: Sat Feb 27 22:17:22 2021 +0800

    fix(chash): ensure retry can try every node (#3651)
    
    Previously the default number of retry is equal to the number of node,
    but the same node will be tried again according to its weight.
    
    Also ensure the same picker will be used in the whole request,
    especially during the retry.
---
 apisix/balancer.lua       |  8 +++++--
 apisix/balancer/chash.lua | 39 ++++++++++++++++++++++++++++--
 t/admin/balancer.t        |  2 ++
 t/node/chash-balance.t    | 60 +++++++++++++++++++++++++++++++++++++++++++++++
 t/node/healthcheck.t      |  2 +-
 5 files changed, 106 insertions(+), 5 deletions(-)

diff --git a/apisix/balancer.lua b/apisix/balancer.lua
index 2540f2f..4be56fc 100644
--- a/apisix/balancer.lua
+++ b/apisix/balancer.lua
@@ -161,8 +161,12 @@ local function pick_server(route, ctx)
         version = version .. "#" .. checker.status_ver
     end
 
-    local server_picker = lrucache_server_picker(key, version,
-                            create_server_picker, up_conf, checker)
+    -- the same picker will be used in the whole request, especially during the retry
+    local server_picker = ctx.server_picker
+    if not server_picker then
+        server_picker = lrucache_server_picker(key, version,
+                                               create_server_picker, up_conf, checker)
+    end
     if not server_picker then
         return nil, "failed to fetch server picker"
     end
diff --git a/apisix/balancer/chash.lua b/apisix/balancer/chash.lua
index df1568a..f9dbdbb 100644
--- a/apisix/balancer/chash.lua
+++ b/apisix/balancer/chash.lua
@@ -22,6 +22,9 @@ local str_gsub    = string.gsub
 local pairs = pairs
 
 
+local CONSISTENT_POINTS = 160   -- points per server, taken from `resty.chash`
+
+
 local _M = {}
 
 
@@ -62,27 +65,59 @@ end
 function _M.new(up_nodes, upstream)
     local str_null = str_char(0)
 
+    local nodes_count = 0
+    local safe_limit = 0
     local servers, nodes = {}, {}
     for serv, weight in pairs(up_nodes) do
         local id = str_gsub(serv, ":", str_null)
 
+        nodes_count = nodes_count + 1
+        safe_limit = safe_limit + weight
         servers[id] = serv
         nodes[id] = weight
     end
+    safe_limit = safe_limit * CONSISTENT_POINTS
 
     local picker = resty_chash:new(nodes)
     return {
         upstream = upstream,
         get = function (ctx)
             local id
-            if ctx.balancer_try_count > 1 and ctx.chash_last_server_index then
-                id, ctx.chash_last_server_index = picker:next(ctx.chash_last_server_index)
+            if ctx.balancer_tried_servers then
+                if ctx.balancer_tried_servers_count == nodes_count then
+                    return nil, "all upstream servers tried"
+                end
+
+                -- the 'safe_limit' is a best effort limit to prevent infinite loop caused by bug
+                for i = 1, safe_limit do
+                    id, ctx.chash_last_server_index = picker:next(ctx.chash_last_server_index)
+                    if not ctx.balancer_tried_servers[servers[id]] then
+                        break
+                    end
+                end
             else
                 local chash_key = fetch_chash_hash_key(ctx, upstream)
                 id, ctx.chash_last_server_index = picker:find(chash_key)
             end
             -- core.log.warn("chash id: ", id, " val: ", servers[id])
             return servers[id]
+        end,
+        after_balance = function (ctx, before_retry)
+            if not before_retry then
+                if ctx.balancer_tried_servers then
+                    core.tablepool.release("balancer_tried_servers", ctx.balancer_tried_servers)
+                    ctx.balancer_tried_servers = nil
+                end
+
+                return nil
+            end
+
+            if not ctx.balancer_tried_servers then
+                ctx.balancer_tried_servers = core.tablepool.fetch("balancer_tried_servers", 0, 2)
+            end
+
+            ctx.balancer_tried_servers[ctx.balancer_server] = true
+            ctx.balancer_tried_servers_count = (ctx.balancer_tried_servers_count or 0) + 1
         end
     }
 end
diff --git a/t/admin/balancer.t b/t/admin/balancer.t
index b9a76c5..d1b9027 100644
--- a/t/admin/balancer.t
+++ b/t/admin/balancer.t
@@ -52,6 +52,8 @@ add_block_preprocessor(sub {
         for _, key in ipairs(keys) do
             ngx.say("host: ", key, " count: ", res[key])
         end
+
+        ctx.server_picker = nil
     end
 _EOC_
     $block->set_value("init_by_lua_block", $init_by_lua_block);
diff --git a/t/node/chash-balance.t b/t/node/chash-balance.t
index 02eee5f..dfde2aa 100644
--- a/t/node/chash-balance.t
+++ b/t/node/chash-balance.t
@@ -496,3 +496,63 @@ GET /t
 --- error_code_like: ^(?:50\d)$
 --- error_log
 failed to find valid upstream server, no valid upstream node
+
+
+
+=== TEST 13: set route(ensure retry can try every node)
+--- config
+    location /t {
+        content_by_lua_block {
+            local t = require("lib.test_admin").test
+            local code, body = t('/apisix/admin/routes/1',
+                 ngx.HTTP_PUT,
+                 [[{
+                    "uri": "/server_port",
+                    "upstream": {
+                        "key": "arg_device_id",
+                        "type": "chash",
+                        "nodes": {
+                            "127.0.0.1:1979": 1000,
+                            "127.0.0.1:1980": 1
+                        }
+                    }
+                }]]
+                )
+
+            if code >= 300 then
+                ngx.status = code
+            end
+            ngx.say(body)
+        }
+    }
+--- request
+GET /t
+--- response_body
+passed
+--- no_error_log
+[error]
+
+
+
+=== TEST 14: hit routes
+--- config
+    location /t {
+        content_by_lua_block {
+            local http = require "resty.http"
+            local uri = "http://127.0.0.1:" .. ngx.var.server_port
+                        .. "/server_port?device_id=1"
+
+            local httpc = http.new()
+            local res, err = httpc:request_uri(uri, {method = "GET"})
+            if not res then
+                ngx.say(err)
+                return
+            end
+
+            ngx.say(res.status)
+        }
+    }
+--- request
+GET /t
+--- response_body
+200
diff --git a/t/node/healthcheck.t b/t/node/healthcheck.t
index d9bacb8..9f77a43 100644
--- a/t/node/healthcheck.t
+++ b/t/node/healthcheck.t
@@ -488,7 +488,7 @@ qr{.*http://127.0.0.1:1960/server_port.*
 .*http://127.0.0.1:1961/server_port.*
 .*http://127.0.0.1:1961/server_port.*
 .*http://127.0.0.1:1961/server_port.*
-.*http://127.0.0.1:1960/server_port.*}
+.*http://127.0.0.1:1961/server_port.*}
 --- timeout: 10