You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficcontrol.apache.org by ra...@apache.org on 2018/11/06 20:51:45 UTC

[trafficcontrol] 01/04: Add an exponential time backoff object for use in traffic_monitor.

This is an automated email from the ASF dual-hosted git repository.

rawlin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git

commit bc41a6e6e5ce07c89ffdfd8b6078ae029a669d45
Author: John Rushford <jr...@apache.org>
AuthorDate: Mon Oct 29 21:46:31 2018 +0000

    Add an exponential time backoff object for use in traffic_monitor.
---
 lib/go-util/backoff.go               | 105 +++++++++++++++++++++++++++++++++++
 lib/go-util/backoff_test.go          | 105 +++++++++++++++++++++++++++++++++++
 lib/go-util/num_test.go              |   1 -
 traffic_monitor/config/config.go     |  23 ++++----
 traffic_monitor/manager/opsconfig.go |  27 ++++++++-
 5 files changed, 247 insertions(+), 14 deletions(-)

diff --git a/lib/go-util/backoff.go b/lib/go-util/backoff.go
new file mode 100644
index 0000000..94576c7
--- /dev/null
+++ b/lib/go-util/backoff.go
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+// This file provides a backoff object that returns an exponentially increasing
+// time.Duration value that may be used to sleep between retries.
+
+package util
+
+import (
+	"fmt"
+	"math"
+	"math/rand"
+	"time"
+)
+
+// DefaultFactor may be used by applications for the factor argument.
+const DefaultFactor = 2.0
+
+type backoff struct {
+	attempt float64
+	Factor  float64
+	Min     time.Duration
+	Max     time.Duration
+	rgen    *rand.Rand
+}
+
+type Backoff interface {
+	BackoffDuration() time.Duration
+	Reset()
+}
+
+func NewBackoff(min time.Duration, max time.Duration, factor float64) (Backoff, error) {
+
+	// verify arguments; invalid values are rejected with an error.
+	if min < 1 {
+		return nil, fmt.Errorf("'min: %v, is invalid.  min must be greater than '1'", min)
+	}
+	if max <= min {
+		return nil, fmt.Errorf("'max: %v, is invalid.  max must be greater than 'min'", max)
+	}
+	if factor <= 1.0 {
+		return nil, fmt.Errorf("'factor: %v, is invalid.  factor must be greater than '1'", factor)
+	}
+
+	src := rand.NewSource(time.Now().UTC().UnixNano())
+
+	return &backoff{
+		attempt: 0,
+		Factor:  factor,
+		Min:     min,
+		Max:     max,
+		rgen:    rand.New(src),
+	}, nil
+}
+
+// jitter returns minFloat plus a random fraction of (durFloat - minFloat).
+func (b *backoff) jitter(durFloat float64, minFloat float64) float64 {
+	return b.rgen.Float64()*(durFloat-minFloat) + minFloat
+}
+
+func (b *backoff) Reset() {
+	b.attempt = 0
+}
+
+// BackoffDuration calculates and returns the next backoff duration, never less than Min or more than Max.
+func (b *backoff) BackoffDuration() time.Duration {
+
+	minFloat := float64(b.Min)
+	durFloat := minFloat * math.Pow(b.Factor, b.attempt)
+	b.attempt++
+
+	// add jitter
+	durFloat += b.jitter(durFloat, minFloat)
+
+	// reached the max duration return max
+	if durFloat >= float64(b.Max) {
+		return b.Max
+	}
+
+	dur := time.Duration(durFloat)
+
+	if dur < b.Min {
+		return b.Min
+	}
+	if dur > b.Max {
+		return b.Max
+	}
+	return dur
+}
diff --git a/lib/go-util/backoff_test.go b/lib/go-util/backoff_test.go
new file mode 100644
index 0000000..e300d68
--- /dev/null
+++ b/lib/go-util/backoff_test.go
@@ -0,0 +1,105 @@
+package util
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import (
+	"testing"
+	"time"
+)
+
+func TestNewBackoff(t *testing.T) {
+	min := 500 * time.Millisecond
+	max := 600 * time.Millisecond
+	fac := DefaultFactor
+
+	bo, err := NewBackoff(min, max, fac)
+
+	if err != nil {
+		t.Errorf("un-expected error. min: %v, max: %v, fac: %v - %v", min, max, fac, err)
+	}
+
+	if bo == nil {
+		t.Errorf("un-expected error bo is nil, min: %v, max: %v, fac: %v - %v", min, max, fac, err)
+	}
+
+	bo, err = NewBackoff(0, 0, 0)
+
+	if err == nil {
+		t.Errorf("Expected errors passing invalid arguments to NewBackOff()")
+	}
+
+	if bo != nil {
+		t.Errorf("un-expected  error passing invalid arguments to NewBackOff(), 'bo' should be nil")
+	}
+}
+
+func TestReset(t *testing.T) {
+	bo, err := NewBackoff(1, 2000000000, DefaultFactor)
+
+	if err != nil {
+		t.Errorf("un-expected error: %v", err)
+	}
+
+	val1 := bo.BackoffDuration()
+	val2 := bo.BackoffDuration()
+
+	if val2 < val1 {
+		t.Errorf("expected val1: %v is less than val2: %v", val1, val2)
+	}
+
+	bo.Reset()
+
+	val3 := bo.BackoffDuration()
+
+	if val3 > val2 {
+		t.Errorf("expected val3: %v is less than val2: %v", val3, val2)
+	}
+}
+
+func TestBackoffDuration(t *testing.T) {
+	min := 100 * time.Nanosecond
+	max := 20000 * time.Nanosecond
+	fac := DefaultFactor
+	bo, err := NewBackoff(min, max, fac)
+
+	if err != nil {
+		t.Errorf("un-expected error: %v", err)
+	}
+	dur := min
+
+	val := bo.BackoffDuration()
+	if val <= min {
+		t.Errorf("unexpected duration calculation, val: %v is less than min: %v", val, min)
+	}
+
+	// 8 iterations with the default settings.
+	for i := 0; i < 8; i++ {
+		val = bo.BackoffDuration()
+		if val < dur {
+			t.Errorf("unexpected duration calculation, iteration: %v, val: %v  <= dur: %v", i, val, dur)
+		}
+		dur = val
+	}
+
+	// after the calls above, the backoff should have saturated, so val should equal max.
+	if val != max {
+		t.Errorf("unexpected duration calculation, val: %v  != : max: %v", val, max)
+	}
+}
diff --git a/lib/go-util/num_test.go b/lib/go-util/num_test.go
index dd37986..06e8aa2 100644
--- a/lib/go-util/num_test.go
+++ b/lib/go-util/num_test.go
@@ -17,7 +17,6 @@ package util
 // When adding symbols, document the RFC and section they correspond to.
 
 import (
-	"encoding/json"
 	"reflect"
 	"testing"
 )
diff --git a/traffic_monitor/config/config.go b/traffic_monitor/config/config.go
index 8bb2daa..43097df 100644
--- a/traffic_monitor/config/config.go
+++ b/traffic_monitor/config/config.go
@@ -39,6 +39,8 @@ const (
 	LogLocationNull = "null"
 	//StaticFileDir is the directory that contains static html and js files.
 	StaticFileDir = "/opt/traffic_monitor/static/"
+	// FixedRetryInterval is a fallback retry interval to use if the Backoff intervals are misconfigured.
+	FixedRetryInterval = 10000 * time.Millisecond
 )
 
 // Config is the configuration for the application. It includes myriad data, such as polling intervals and log locations.
@@ -64,7 +66,8 @@ type Config struct {
 	HealthToStatRatio            uint64        `json:"health_to_stat_ratio"`
 	StaticFileDir                string        `json:"static_file_dir"`
 	CRConfigHistoryCount         uint64        `json:"crconfig_history_count"`
-	TrafficOpsRetryInterval      time.Duration `json:"-"`
+	TrafficOpsMinRetryInterval   time.Duration `json:"-"`
+	TrafficOpsMaxRetryInterval   time.Duration `json:"-"`
 }
 
 func (c Config) ErrorLog() log.LogLocation   { return log.LogLocation(c.LogLocationError) }
@@ -96,7 +99,8 @@ var DefaultConfig = Config{
 	HealthToStatRatio:            4,
 	StaticFileDir:                StaticFileDir,
 	CRConfigHistoryCount:         20000,
-	TrafficOpsRetryInterval:      3 * time.Second,
+	TrafficOpsMinRetryInterval:   100 * time.Millisecond,
+	TrafficOpsMaxRetryInterval:   60000 * time.Millisecond,
 }
 
 // MarshalJSON marshals custom millisecond durations. Aliasing inspired by http://choly.ca/post/go-json-marshalling/
@@ -141,7 +145,8 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 		StatFlushIntervalMs            *uint64 `json:"stat_flush_interval_ms"`
 		ServeReadTimeoutMs             *uint64 `json:"serve_read_timeout_ms"`
 		ServeWriteTimeoutMs            *uint64 `json:"serve_write_timeout_ms"`
-		TrafficOpsRetryIntervalSec     *uint64 `json:"traffic_ops_retry_interval_sec"`
+		TrafficOpsMinRetryIntervalMs   *uint64 `json:"traffic_ops_min_retry_interval_ms"`
+		TrafficOpsMaxRetryIntervalMs   *uint64 `json:"traffic_ops_max_retry_interval_ms"`
 		*Alias
 	}{
 		Alias: (*Alias)(c),
@@ -180,13 +185,11 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 	if aux.PeerOptimistic != nil {
 		c.PeerOptimistic = *aux.PeerOptimistic
 	}
-	if aux.TrafficOpsRetryIntervalSec != nil {
-		if *aux.TrafficOpsRetryIntervalSec <= 0 {
-			log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", *aux.TrafficOpsRetryIntervalSec)
-			c.TrafficOpsRetryInterval = 3 * time.Second
-		} else {
-			c.TrafficOpsRetryInterval = time.Duration(*aux.TrafficOpsRetryIntervalSec) * time.Second
-		}
+	if aux.TrafficOpsMinRetryIntervalMs != nil {
+		c.TrafficOpsMinRetryInterval = time.Duration(*aux.TrafficOpsMinRetryIntervalMs) * time.Millisecond
+	}
+	if aux.TrafficOpsMaxRetryIntervalMs != nil {
+		c.TrafficOpsMaxRetryInterval = time.Duration(*aux.TrafficOpsMaxRetryIntervalMs) * time.Millisecond
 	}
 	return nil
 }
diff --git a/traffic_monitor/manager/opsconfig.go b/traffic_monitor/manager/opsconfig.go
index 5beed93..d0c44b6 100644
--- a/traffic_monitor/manager/opsconfig.go
+++ b/traffic_monitor/manager/opsconfig.go
@@ -29,6 +29,7 @@ import (
 	"golang.org/x/sys/unix"
 
 	"github.com/apache/trafficcontrol/lib/go-log"
+	"github.com/apache/trafficcontrol/lib/go-util"
 	"github.com/apache/trafficcontrol/traffic_monitor/config"
 	"github.com/apache/trafficcontrol/traffic_monitor/datareq"
 	"github.com/apache/trafficcontrol/traffic_monitor/handler"
@@ -138,12 +139,22 @@ func StartOpsConfigManager(
 
 		// fixed an issue here where traffic_monitor loops forever, doing nothing useful if traffic_ops is down,
 		// and would never logging in again.  since traffic_monitor  is just starting up here, keep retrying until traffic_ops is reachable and a session can be established.
+		backoff, err := util.NewBackoff(cfg.TrafficOpsMinRetryInterval, cfg.TrafficOpsMaxRetryInterval, util.DefaultFactor)
+		if err != nil {
+			log.Errorf("possible invalid backoff arguments, will use a fixed sleep interval: %v", err)
+		}
 		for {
 			realToSession, toAddr, err = to.LoginWithAgent(newOpsConfig.Url, newOpsConfig.Username, newOpsConfig.Password, newOpsConfig.Insecure, staticAppData.UserAgent, useCache, trafficOpsRequestTimeout)
 			if err != nil {
 				handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops (%v): %s\n", toAddr, err))
-				log.Errorf("cfg.TrafficOpsRetryInterval: %v", cfg.TrafficOpsRetryInterval)
-				time.Sleep(cfg.TrafficOpsRetryInterval)
+				if backoff != nil {
+					duration := backoff.BackoffDuration()
+					log.Errorf("retrying in %v\n", duration)
+					time.Sleep(duration)
+				} else {
+					log.Errorf("retrying in %v\n", config.FixedRetryInterval)
+					time.Sleep(config.FixedRetryInterval)
+				}
 				continue
 			} else {
 				toSession.Set(realToSession)
@@ -162,10 +173,20 @@ func StartOpsConfigManager(
 
 		// fixed an issue when traffic_monitor receives corrupt data, CRConfig, from traffic_ops.
 		// Will loop and retry until a good CRConfig is received from traffic_ops
+		if backoff != nil {
+			backoff.Reset()
+		}
 		for {
 			if err := toData.Fetch(toSession, newOpsConfig.CdnName); err != nil {
 				handleErr(fmt.Errorf("Error getting Traffic Ops data: %v\n", err))
-				time.Sleep(cfg.TrafficOpsRetryInterval)
+				if backoff != nil {
+					duration := backoff.BackoffDuration()
+					log.Errorf("retrying in %v\n", duration)
+					time.Sleep(duration)
+				} else {
+					log.Errorf("retrying in %v\n", config.FixedRetryInterval)
+					time.Sleep(config.FixedRetryInterval)
+				}
 				continue
 			}
 			break