You are viewing a plain text version of this content. The canonical link for it is here.
Posted to notifications@couchdb.apache.org by GitBox <gi...@apache.org> on 2022/09/19 21:52:21 UTC

[GitHub] [couchdb] nickva opened a new pull request, #4182: Explicitly maintain a fully connected cluster

nickva opened a new pull request, #4182:
URL: https://github.com/apache/couchdb/pull/4182

   Previously, it was possible for the nodes to disconnect and for that state to persist util the nodes restarted. Some fabric requests could reconnect the nodes as a side-effect of sending remote messages, but most of the fabric requests currently start a rexi monitor, which immediately delivers a `rexi_DOWN` message to the coordinator for worker nodes not in the `[node() | nodes()]` list. That happens before `erlang:send/2,3` gets called so there is nothing there to eventually reconnect the nodes.
   
   To avoid relying on the random request reconnecting the cluster, use an explicit monitor process. It does the initial connections, as well as periodically maintains them.
   
   [1] https://github.com/apache/couchdb/blob/main/src/rexi/src/rexi_monitor.erl#L45
   
   


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976949880


##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],
+    Nodes = nodes(),
+    NotConnected = [N || N <- Expected, not lists:member(N, Nodes)],
+    connect(NotConnected, Log).
+
+connect([], _Log) ->
+    ok;
+connect([N | Rest], Log) ->
+    ConnectRes = ?MODULE:connect_node(N),
+    log(Log, ConnectRes, N),
+    connect(Rest, Log).
+
+log(true, true, Node) ->
+    couch_log:warning("~s : reconnected to ~s", [?MODULE, Node]),
+    ok;
+log(_, _, _) ->
+    % Failed to connect or we don't want to log it
+    ok.
+
+wait_msec() ->
+    IntervalSec = config:get_integer("cluster", "reconnect_interval_sec", 37),
+    IntervalMSec = IntervalSec * 1000,
+    IntervalMSec + jitter(IntervalMSec).
+
+jitter(Interval) ->
+    Jitter = round(Interval * 0.25),

Review Comment:
   Agree



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976945806


##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).

Review Comment:
   Good idea!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] jaydoane commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
jaydoane commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976897571


##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],
+    Nodes = nodes(),
+    NotConnected = [N || N <- Expected, not lists:member(N, Nodes)],
+    connect(NotConnected, Log).
+
+connect([], _Log) ->
+    ok;
+connect([N | Rest], Log) ->
+    ConnectRes = ?MODULE:connect_node(N),
+    log(Log, ConnectRes, N),
+    connect(Rest, Log).
+
+log(true, true, Node) ->
+    couch_log:warning("~s : reconnected to ~s", [?MODULE, Node]),
+    ok;
+log(_, _, _) ->
+    % Failed to connect or we don't want to log it
+    ok.
+
+wait_msec() ->
+    IntervalSec = config:get_integer("cluster", "reconnect_interval_sec", 37),
+    IntervalMSec = IntervalSec * 1000,
+    IntervalMSec + jitter(IntervalMSec).
+
+jitter(Interval) ->
+    Jitter = round(Interval * 0.25),

Review Comment:
   Maybe extract the 0.25 into e.g. `JITTER_PERCENT` define?



##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).

Review Comment:
   Is this file missing a license preamble?
   
   Also, would a sentence explaining what the module does be helpful?



##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],
+    Nodes = nodes(),
+    NotConnected = [N || N <- Expected, not lists:member(N, Nodes)],
+    connect(NotConnected, Log).
+
+connect([], _Log) ->
+    ok;
+connect([N | Rest], Log) ->
+    ConnectRes = ?MODULE:connect_node(N),
+    log(Log, ConnectRes, N),
+    connect(Rest, Log).
+
+log(true, true, Node) ->
+    couch_log:warning("~s : reconnected to ~s", [?MODULE, Node]),

Review Comment:
   Is "reconnected" correct when the cluster first forms?



##########
src/mem3/test/eunit/mem3_distribution_test.erl:
##########
@@ -0,0 +1,62 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(mem3_distribution_test).
+
+-include_lib("couch/include/couch_eunit.hrl").
+
+-define(TDEF_FE(Name), fun(Arg) -> {atom_to_list(Name), ?_test(Name(Arg))} end).
+-define(MOD, mem3_distribution).
+
+setup() ->
+    Ctx = test_util:start_couch([mem3]),
+    meck:new(mem3, [passthrough]),
+    meck:new(mem3_distribution, [passthrough]),
+    Ctx.
+
+teardown(Ctx) ->
+    meck:unload(),
+    test_util:stop_couch(Ctx).
+
+mem3_distribution_test_() ->
+    {
+        foreach,
+        fun setup/0,
+        fun teardown/1,
+        [
+            ?TDEF_FE(periodic_scheduler_works),
+            ?TDEF_FE(connect_to_unconnected_nodes)
+        ]
+    }.
+
+periodic_scheduler_works(_) ->
+    St = sys:get_state(?MOD),
+    {st, TRef} = St,
+    TVal = erlang:read_timer(TRef),
+    ?assert(is_integer(TVal)),
+    ?assert(TVal > 0),
+    ?assert(TVal =< 70000),
+    {noreply, St1} = ?MOD:handle_info(connect, St),
+    {st, TRef1} = St1,
+    ?assertNotEqual(TRef, TRef1),
+    TVal1 = erlang:read_timer(TRef1),
+    ?assert(is_integer(TVal1)),
+    ?assertNotEqual(TVal, TVal1).

Review Comment:
   Isn't it possible that these could sometimes be equal?



##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],
+    Nodes = nodes(),
+    NotConnected = [N || N <- Expected, not lists:member(N, Nodes)],

Review Comment:
   I'm assuming it's not worth the extra conversions to use sets here, although they would at first blush appear to have the right semantics.



##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],

Review Comment:
   Is it worth caching e.g. `ThisNode = node()` so we don't have to keep calling a function for each iteration here?



##########
src/mem3/test/eunit/mem3_distribution_test.erl:
##########
@@ -0,0 +1,62 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(mem3_distribution_test).
+
+-include_lib("couch/include/couch_eunit.hrl").
+
+-define(TDEF_FE(Name), fun(Arg) -> {atom_to_list(Name), ?_test(Name(Arg))} end).
+-define(MOD, mem3_distribution).
+
+setup() ->
+    Ctx = test_util:start_couch([mem3]),
+    meck:new(mem3, [passthrough]),
+    meck:new(mem3_distribution, [passthrough]),
+    Ctx.
+
+teardown(Ctx) ->
+    meck:unload(),
+    test_util:stop_couch(Ctx).
+
+mem3_distribution_test_() ->
+    {
+        foreach,
+        fun setup/0,
+        fun teardown/1,
+        [
+            ?TDEF_FE(periodic_scheduler_works),
+            ?TDEF_FE(connect_to_unconnected_nodes)
+        ]
+    }.
+
+periodic_scheduler_works(_) ->
+    St = sys:get_state(?MOD),
+    {st, TRef} = St,
+    TVal = erlang:read_timer(TRef),
+    ?assert(is_integer(TVal)),
+    ?assert(TVal > 0),
+    ?assert(TVal =< 70000),
+    {noreply, St1} = ?MOD:handle_info(connect, St),
+    {st, TRef1} = St1,
+    ?assertNotEqual(TRef, TRef1),
+    TVal1 = erlang:read_timer(TRef1),
+    ?assert(is_integer(TVal1)),
+    ?assertNotEqual(TVal, TVal1).
+
+connect_to_unconnected_nodes(_) ->
+    Nodes = ['foo', 'bar'],
+    meck:expect(mem3, nodes, 0, Nodes),
+    meck:reset(?MOD),
+    % Simulate connect timer expiry
+    ?MOD ! connect,
+    meck:wait(?MOD, connect_node, [foo], 5000),
+    meck:wait(?MOD, connect_node, [bar], 5000).

Review Comment:
   Do you think it's worth also checking whether these result in "reconnected" log messages? Maybe I'm misunderstanding the code, but I was expecting this test to cause two warning "reconnected" log entries, but I don't see them in `.eunit/couch.log`, nor does coverage indicate they ran:
   <img width="814" alt="Screen Shot 2022-09-21 at 12 53 34 PM" src="https://user-images.githubusercontent.com/51209/191599149-f38d3cf9-6ea6-4f78-b58b-2ff22ab84b67.png">
   Perhaps the test finishes before the logging would happen?
   
   Ah, never mind, I see that it's only expected to log if `connect_node` returns true, which it would not for made up `foo` and `bar` nodes, unless it was also mocked.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva merged pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva merged PR #4182:
URL: https://github.com/apache/couchdb/pull/4182


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#issuecomment-1289495776

   This issue fixes https://github.com/apache/couchdb/issues/3241


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976949880


##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],
+    Nodes = nodes(),
+    NotConnected = [N || N <- Expected, not lists:member(N, Nodes)],
+    connect(NotConnected, Log).
+
+connect([], _Log) ->
+    ok;
+connect([N | Rest], Log) ->
+    ConnectRes = ?MODULE:connect_node(N),
+    log(Log, ConnectRes, N),
+    connect(Rest, Log).
+
+log(true, true, Node) ->
+    couch_log:warning("~s : reconnected to ~s", [?MODULE, Node]),
+    ok;
+log(_, _, _) ->
+    % Failed to connect or we don't want to log it
+    ok.
+
+wait_msec() ->
+    IntervalSec = config:get_integer("cluster", "reconnect_interval_sec", 37),
+    IntervalMSec = IntervalSec * 1000,
+    IntervalMSec + jitter(IntervalMSec).
+
+jitter(Interval) ->
+    Jitter = round(Interval * 0.25),

Review Comment:
   Agree. Added a defined for it.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976965906


##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],

Review Comment:
   It's only a 1-2 microseconds worth so it might be better to save an extra line of code?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976946792


##########
src/mem3/test/eunit/mem3_distribution_test.erl:
##########
@@ -0,0 +1,62 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(mem3_distribution_test).
+
+-include_lib("couch/include/couch_eunit.hrl").
+
+-define(TDEF_FE(Name), fun(Arg) -> {atom_to_list(Name), ?_test(Name(Arg))} end).
+-define(MOD, mem3_distribution).
+
+setup() ->
+    Ctx = test_util:start_couch([mem3]),
+    meck:new(mem3, [passthrough]),
+    meck:new(mem3_distribution, [passthrough]),
+    Ctx.
+
+teardown(Ctx) ->
+    meck:unload(),
+    test_util:stop_couch(Ctx).
+
+mem3_distribution_test_() ->
+    {
+        foreach,
+        fun setup/0,
+        fun teardown/1,
+        [
+            ?TDEF_FE(periodic_scheduler_works),
+            ?TDEF_FE(connect_to_unconnected_nodes)
+        ]
+    }.
+
+periodic_scheduler_works(_) ->
+    St = sys:get_state(?MOD),
+    {st, TRef} = St,
+    TVal = erlang:read_timer(TRef),
+    ?assert(is_integer(TVal)),
+    ?assert(TVal > 0),
+    ?assert(TVal =< 70000),
+    {noreply, St1} = ?MOD:handle_info(connect, St),
+    {st, TRef1} = St1,
+    ?assertNotEqual(TRef, TRef1),
+    TVal1 = erlang:read_timer(TRef1),
+    ?assert(is_integer(TVal1)),
+    ?assertNotEqual(TVal, TVal1).

Review Comment:
   Ah good point. Just checking for a different timer ref is probably enough here. Updated.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976973585


##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],
+    Nodes = nodes(),
+    NotConnected = [N || N <- Expected, not lists:member(N, Nodes)],

Review Comment:
   ```erlang
       Expected = ordsets:from_list([N || N <- mem3:nodes(), N =/= node()]),
       Connected = ordsets:from_list(nodes()),
       NotConnected = ordsets:subtract(Expected, Connected),
   ```
   
   Looks better, thanks for the suggestion!



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976945806


##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).

Review Comment:
   Good idea. Added a license preamble and a description of the module.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976946792


##########
src/mem3/test/eunit/mem3_distribution_test.erl:
##########
@@ -0,0 +1,62 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(mem3_distribution_test).
+
+-include_lib("couch/include/couch_eunit.hrl").
+
+-define(TDEF_FE(Name), fun(Arg) -> {atom_to_list(Name), ?_test(Name(Arg))} end).
+-define(MOD, mem3_distribution).
+
+setup() ->
+    Ctx = test_util:start_couch([mem3]),
+    meck:new(mem3, [passthrough]),
+    meck:new(mem3_distribution, [passthrough]),
+    Ctx.
+
+teardown(Ctx) ->
+    meck:unload(),
+    test_util:stop_couch(Ctx).
+
+mem3_distribution_test_() ->
+    {
+        foreach,
+        fun setup/0,
+        fun teardown/1,
+        [
+            ?TDEF_FE(periodic_scheduler_works),
+            ?TDEF_FE(connect_to_unconnected_nodes)
+        ]
+    }.
+
+periodic_scheduler_works(_) ->
+    St = sys:get_state(?MOD),
+    {st, TRef} = St,
+    TVal = erlang:read_timer(TRef),
+    ?assert(is_integer(TVal)),
+    ?assert(TVal > 0),
+    ?assert(TVal =< 70000),
+    {noreply, St1} = ?MOD:handle_info(connect, St),
+    {st, TRef1} = St1,
+    ?assertNotEqual(TRef, TRef1),
+    TVal1 = erlang:read_timer(TRef1),
+    ?assert(is_integer(TVal1)),
+    ?assertNotEqual(TVal, TVal1).

Review Comment:
   Ah good point. Just checking for a different timer ref is probably enough here



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#issuecomment-1251872241

   To also test manually can start a 3 node cluster. Then in remsh disconnect the nodes ex. `net_kernel:disconnect('node2@127.0.0.1').`. Then `nodes()` should not contain the disconnected nodes. Might have do this a few times or disconnect more nodes. Then after a less than a minute nodes should start reconnecting to each and this log message should appear on **one** of the nodes:
   
   ```
   [warning] node3@127.0.0.1 <0.397.0> -------- mem3_distribution : reconnected to node2@127.0.0.1
   ```


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976948272


##########
src/mem3/test/eunit/mem3_distribution_test.erl:
##########
@@ -0,0 +1,62 @@
+% Licensed under the Apache License, Version 2.0 (the "License"); you may not
+% use this file except in compliance with the License. You may obtain a copy of
+% the License at
+%
+%   http://www.apache.org/licenses/LICENSE-2.0
+%
+% Unless required by applicable law or agreed to in writing, software
+% distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+% WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
+% License for the specific language governing permissions and limitations under
+% the License.
+
+-module(mem3_distribution_test).
+
+-include_lib("couch/include/couch_eunit.hrl").
+
+-define(TDEF_FE(Name), fun(Arg) -> {atom_to_list(Name), ?_test(Name(Arg))} end).
+-define(MOD, mem3_distribution).
+
+setup() ->
+    Ctx = test_util:start_couch([mem3]),
+    meck:new(mem3, [passthrough]),
+    meck:new(mem3_distribution, [passthrough]),
+    Ctx.
+
+teardown(Ctx) ->
+    meck:unload(),
+    test_util:stop_couch(Ctx).
+
+mem3_distribution_test_() ->
+    {
+        foreach,
+        fun setup/0,
+        fun teardown/1,
+        [
+            ?TDEF_FE(periodic_scheduler_works),
+            ?TDEF_FE(connect_to_unconnected_nodes)
+        ]
+    }.
+
+periodic_scheduler_works(_) ->
+    St = sys:get_state(?MOD),
+    {st, TRef} = St,
+    TVal = erlang:read_timer(TRef),
+    ?assert(is_integer(TVal)),
+    ?assert(TVal > 0),
+    ?assert(TVal =< 70000),
+    {noreply, St1} = ?MOD:handle_info(connect, St),
+    {st, TRef1} = St1,
+    ?assertNotEqual(TRef, TRef1),
+    TVal1 = erlang:read_timer(TRef1),
+    ?assert(is_integer(TVal1)),
+    ?assertNotEqual(TVal, TVal1).
+
+connect_to_unconnected_nodes(_) ->
+    Nodes = ['foo', 'bar'],
+    meck:expect(mem3, nodes, 0, Nodes),
+    meck:reset(?MOD),
+    % Simulate connect timer expiry
+    ?MOD ! connect,
+    meck:wait(?MOD, connect_node, [foo], 5000),
+    meck:wait(?MOD, connect_node, [bar], 5000).

Review Comment:
   A good idea, I'll make it `true` and `false` 



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976947338


##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],
+    Nodes = nodes(),
+    NotConnected = [N || N <- Expected, not lists:member(N, Nodes)],
+    connect(NotConnected, Log).
+
+connect([], _Log) ->
+    ok;
+connect([N | Rest], Log) ->
+    ConnectRes = ?MODULE:connect_node(N),
+    log(Log, ConnectRes, N),
+    connect(Rest, Log).
+
+log(true, true, Node) ->
+    couch_log:warning("~s : reconnected to ~s", [?MODULE, Node]),

Review Comment:
   yeah, because on first connect we pass in `Log=false`.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org


[GitHub] [couchdb] nickva commented on a diff in pull request #4182: Explicitly maintain a fully connected cluster

Posted by GitBox <gi...@apache.org>.
nickva commented on code in PR #4182:
URL: https://github.com/apache/couchdb/pull/4182#discussion_r976949675


##########
src/mem3/src/mem3_distribution.erl:
##########
@@ -0,0 +1,76 @@
+-module(mem3_distribution).
+
+-behaviour(gen_server).
+
+-export([
+    start_link/0,
+    connect_node/1
+]).
+
+-export([
+    init/1,
+    handle_call/3,
+    handle_cast/2,
+    handle_info/2,
+    code_change/3
+]).
+
+-record(st, {
+    tref
+}).
+
+start_link() ->
+    gen_server:start_link({local, ?MODULE}, ?MODULE, [], []).
+
+connect_node(Node) when is_atom(Node) ->
+    net_kernel:connect_node(Node).
+
+init(_) ->
+    connect(false),
+    {ok, #st{tref = erlang:send_after(wait_msec(), self(), connect)}}.
+
+handle_call(Msg, _From, #st{} = St) ->
+    {stop, {bad_call, Msg}, {bad_call, Msg}, St}.
+
+handle_cast(Msg, #st{} = St) ->
+    {stop, {bad_cast, Msg}, St}.
+
+handle_info(connect, #st{} = St) ->
+    erlang:cancel_timer(St#st.tref),
+    ok = connect(true),
+    {noreply, St#st{tref = erlang:send_after(wait_msec(), self(), connect)}};
+handle_info(Msg, St) ->
+    {stop, {bad_info, Msg}, St}.
+
+code_change(_OldVsn, #st{} = St, _Extra) ->
+    {ok, St}.
+
+connect(Log) ->
+    Expected = [N || N <- mem3:nodes(), N =/= node()],
+    Nodes = nodes(),
+    NotConnected = [N || N <- Expected, not lists:member(N, Nodes)],

Review Comment:
   It might work, yeah. I'll give it a try.



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: notifications-unsubscribe@couchdb.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org