You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@couchdb.apache.org by ja...@apache.org on 2020/04/26 22:22:53 UTC

[couchdb] 01/01: Improve robustness of couch expiring cache test

This is an automated email from the ASF dual-hosted git repository.

jaydoane pushed a commit to branch improve-expiring-cache-test-robustness
in repository https://gitbox.apache.org/repos/asf/couchdb.git

commit 042e401cc92fd74c3e7e60521ff392d136fa782f
Author: Jay Doane <ja...@apache.org>
AuthorDate: Sun Apr 26 15:22:04 2020 -0700

    Improve robustness of couch expiring cache test
    
    In its current incarnation, the so-called "simple lifecycle" test is
    prone to numerous failures in the CI system [1], doubtless because it's
    riddled with race conditions. The original author makes many assumptions
    about how quickly an (actual, unmocked) FDB instance will respond to a
    request.
    
    The primary goal is to stop failing CI builds, while other
    considerations include: keeping the run time of the test as low as
    possible, keeping the code coverage high, and documenting the known
    races.
    
    Specifically:
    - Increase the `stale` and `expired` times by a factor of 5 to decrease
    sensitivity to poor FDB performance.
    - Change default timer from `erlang:system_time/1` to `os:timestamp` on
    the assumption that the latter is less prone to warping [2].
    - Decrease the period of the cache server reaper by half to increase
    accuracy of eviction time.
    - Inline and modify the `test_util:wait` code to make the timer
    explicit, and emphasize that `timer:delay/1` only works with millisecond
    resolution.
    - Don't fail the test if it can't get a fresh lookup immediately after
    insertion, but let it continue on to the next race, at least to the
    point of expiration and deletion, which continue to be asserted.
    - Factor `Timeout` and `Interval` to allow declarations near the other
    hard-coded parameters.
    - Move cache server `Opts` into `setup/0` and eliminate `start_link/0`.
    - Double the overall test timeout to 20 seconds.
    
    This has soaked for hundreds of runs on a 5 year old laptop, but the
    real test is the CI system.
    
    Should this test continue to fail CI builds, additional improvements
    could include mocking the timer and/or FDB layer to eliminate the
    variability of an integrated system.
    
    [1] https://ci-couchdb.apache.org/blue/organizations/jenkins/jenkins-cm1%2FPullRequests/detail/PR-2813/10/pipeline
    [2] http://erlang.org/doc/apps/erts/time_correction.html#terminology
---
 .../src/couch_expiring_cache_server.erl            |  10 +-
 .../test/couch_expiring_cache_tests.erl            | 101 ++++++++++++++-------
 2 files changed, 77 insertions(+), 34 deletions(-)

diff --git a/src/couch_expiring_cache/src/couch_expiring_cache_server.erl b/src/couch_expiring_cache/src/couch_expiring_cache_server.erl
index eb74e6e..74c432e 100644
--- a/src/couch_expiring_cache/src/couch_expiring_cache_server.erl
+++ b/src/couch_expiring_cache/src/couch_expiring_cache_server.erl
@@ -17,6 +17,7 @@
 -callback start_link() -> {ok, pid()} | ignore | {error, term()}.
 
 -export([
+    now_ts/0,
     start_link/2
 ]).
 
@@ -80,10 +81,10 @@ handle_info(remove_expired, St) ->
         largest_elapsed := LargestElapsed
     } = St,
 
-    NowTS = erlang:system_time(?TIME_UNIT),
+    NowTS = now_ts(),
     OldestTS = max(OldestTS0,
         couch_expiring_cache_fdb:clear_range_to(Name, NowTS, BatchSize)),
-    Elapsed = erlang:system_time(?TIME_UNIT) - NowTS,
+    Elapsed = now_ts() - NowTS,
 
     {noreply, St#{
         timer_ref := schedule_remove_expired(Period, MaxJitter),
@@ -108,6 +109,11 @@ code_change(_OldVsn, St, _Extra) ->
     {ok, St}.
 
 
+now_ts() ->
+    {Mega, Sec, Micro} = os:timestamp(),
+    ((Mega * 1000000) + Sec) * 1000 + Micro div 1000.
+
+
 %% Private
 
 
diff --git a/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl b/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl
index 2e06fcc..0780b88 100644
--- a/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl
+++ b/src/couch_expiring_cache/test/couch_expiring_cache_tests.erl
@@ -18,16 +18,9 @@
 -include_lib("couch_expiring_cache/include/couch_expiring_cache.hrl").
 
 
--define(CACHE_NAME, <<"test">>).
+-define(CACHE_NAME, atom_to_binary(?MODULE, utf8)).
 
-
-start_link() ->
-    Opts = #{
-        cache_name => ?CACHE_NAME,
-        period => 20,
-        max_jitter => 0
-    },
-    couch_expiring_cache_server:start_link(?MODULE, Opts).
+-define(FOREVER, 576460752303423488). % max int 64 bit
 
 
 couch_expiring_cache_basic_test_() ->
@@ -56,7 +49,11 @@ teardown_couch(Ctx) ->
 
 
 setup() ->
-    {ok, Pid} = start_link(),
+    Opts = #{
+        cache_name => ?CACHE_NAME,
+        period => 10,
+        max_jitter => 0},
+    {ok, Pid} = couch_expiring_cache_server:start_link(?MODULE, Opts),
     true = unlink(Pid),
     #{pid => Pid}.
 
@@ -66,10 +63,18 @@ teardown(#{pid := Pid}) ->
 
 
 simple_lifecycle(_) ->
-    {timeout, 10, ?_test(begin
-        Now = erlang:system_time(?TIME_UNIT),
-        StaleTS = Now + 100,
-        ExpiresTS = Now + 200,
+    % The entire test is racing against FDB being faster than timeout seconds
+    {timeout, 20, ?_test(begin
+        Start = couch_expiring_cache_server:now_ts(),
+        % Race Alert!
+        % We're betting on FDB returning a lookup faster than these:
+        Stale = 500,
+        Expires = 1000,
+        Timeout = 5000,
+        Interval = 5,
+
+        StaleTS = Start + Stale,
+        ExpiresTS = Start + Expires,
         Name = ?CACHE_NAME,
         Key = <<"key">>,
         Val = <<"val">>,
@@ -77,34 +82,66 @@ simple_lifecycle(_) ->
         ?assertEqual(ok, couch_expiring_cache_fdb:clear_all(Name)),
         ?assertEqual(not_found, couch_expiring_cache:lookup(Name, Key)),
         ?assertEqual([], entries(Name)),
-        ?assertEqual(ok,
-            couch_expiring_cache:insert(Name, Key, Val, StaleTS, ExpiresTS)),
-        ?assertEqual({fresh, Val}, couch_expiring_cache:lookup(Name, Key)),
-        ok = wait_lookup(Name, Key, {stale, Val}),
+        ?assertEqual(ok, couch_expiring_cache:insert(Name, Key, Val,
+            StaleTS, ExpiresTS)),
+        ok = attempt_fresh_and_stale_lookups(Name, Key, Timeout, Interval),
 
         % Refresh the existing key with updated timestamps
-        ?assertEqual(ok,
-            couch_expiring_cache:insert(Name, Key, Val,
-                StaleTS + 100, ExpiresTS + 100)),
-        ?assertEqual({fresh, Val}, couch_expiring_cache:lookup(Name, Key)),
+        Refresh = couch_expiring_cache_server:now_ts(),
+        ?assertEqual(ok, couch_expiring_cache:insert(Name, Key, Val,
+            Refresh + Stale, Refresh + Expires)),
+        ok = attempt_fresh_and_stale_lookups(Name, Key, Timeout, Interval),
         ?assertEqual(1, length(entries(Name))),
-        ok = wait_lookup(Name, Key, {stale, Val}),
-        ok = wait_lookup(Name, Key, expired),
-        ok = wait_lookup(Name, Key, not_found),
-        ?assertEqual([], entries(Name)),
-        ?assertEqual(not_found, couch_expiring_cache:lookup(Name, Key))
+        % These last 2 are also races, betting on FDB to be reasonably
+        % fast on the home stretch
+        ok = wait_lookup(Name, Key, expired, Timeout, Interval),
+        ok = wait_lookup(Name, Key, not_found, Timeout, Interval),
+        ?assertEqual([], entries(Name))
     end)}.
 
 
+% In this race we're betting on FDB to take less than `Stale` and then
+% `Expired` milliseconds to respond
+attempt_fresh_and_stale_lookups(Name, Key, Timeout, Interval) ->
+    case couch_expiring_cache:lookup(Name, Key) of
+        {fresh, Val} ->
+            % We won that race, let's bet on another!
+            ok = wait_lookup(Name, Key, {stale, Val}, Timeout, Interval);
+        _ ->
+            % Unlucky! But don't fail the test just yet...
+            ok
+    end.
+
+
 entries(Name) ->
-    FarFuture = erlang:system_time(?TIME_UNIT) * 2,
-    couch_expiring_cache_fdb:get_range_to(Name, FarFuture, _Limit=100).
+    couch_expiring_cache_fdb:get_range_to(Name, ?FOREVER, _Limit=100).
 
 
-wait_lookup(Name, Key, Expect) ->
-    test_util:wait(fun() ->
+% This lookup races against Timeout
+wait_lookup(Name, Key, Expect, Timeout, Interval) ->
+    wait(fun() ->
         case couch_expiring_cache:lookup(Name, Key) of
             Expect -> ok;
             _ -> wait
         end
-    end, _Timeout = 1000, _PollingInterval = 10).
+    end, Timeout, Interval).
+
+
+wait(Fun, Timeout, Delay) ->
+    Now = couch_expiring_cache_server:now_ts(),
+    wait(Fun, Timeout, Delay, Now, Now).
+
+
+wait(_Fun, Timeout, _Delay, Started, Prev) when Prev - Started > Timeout ->
+    timeout;
+
+wait(Fun, Timeout, Delay, Started, _Prev) ->
+    case Fun() of
+        wait ->
+            % http://erlang.org/doc/man/timer.html#sleep-1
+            ok = timer:sleep(Delay), % always millisecond
+            wait(Fun, Timeout, Delay, Started,
+                couch_expiring_cache_server:now_ts());
+        Else ->
+            Else
+    end.