You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@trafficcontrol.apache.org by GitBox <gi...@apache.org> on 2018/11/03 22:41:07 UTC

[GitHub] dg4prez closed pull request #2985: Backport #2921 to 3.0.x

dg4prez closed pull request #2985: Backport #2921 to 3.0.x
URL: https://github.com/apache/trafficcontrol/pull/2985
 
 
   

This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:

As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):

diff --git a/traffic_monitor/config/config.go b/traffic_monitor/config/config.go
index a93016756..8bb2daab2 100644
--- a/traffic_monitor/config/config.go
+++ b/traffic_monitor/config/config.go
@@ -64,6 +64,7 @@ type Config struct {
 	HealthToStatRatio            uint64        `json:"health_to_stat_ratio"`
 	StaticFileDir                string        `json:"static_file_dir"`
 	CRConfigHistoryCount         uint64        `json:"crconfig_history_count"`
+	TrafficOpsRetryInterval      time.Duration `json:"-"`
 }
 
 func (c Config) ErrorLog() log.LogLocation   { return log.LogLocation(c.LogLocationError) }
@@ -95,6 +96,7 @@ var DefaultConfig = Config{
 	HealthToStatRatio:            4,
 	StaticFileDir:                StaticFileDir,
 	CRConfigHistoryCount:         20000,
+	TrafficOpsRetryInterval:      3 * time.Second,
 }
 
 // MarshalJSON marshals custom millisecond durations. Aliasing inspired by http://choly.ca/post/go-json-marshalling/
@@ -139,6 +141,7 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 		StatFlushIntervalMs            *uint64 `json:"stat_flush_interval_ms"`
 		ServeReadTimeoutMs             *uint64 `json:"serve_read_timeout_ms"`
 		ServeWriteTimeoutMs            *uint64 `json:"serve_write_timeout_ms"`
+		TrafficOpsRetryIntervalSec     *uint64 `json:"traffic_ops_retry_interval_sec"`
 		*Alias
 	}{
 		Alias: (*Alias)(c),
@@ -177,6 +180,14 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 	if aux.PeerOptimistic != nil {
 		c.PeerOptimistic = *aux.PeerOptimistic
 	}
+	if aux.TrafficOpsRetryIntervalSec != nil {
+		if *aux.TrafficOpsRetryIntervalSec <= 0 {
+			log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", *aux.TrafficOpsRetryIntervalSec)
+			c.TrafficOpsRetryInterval = 3 * time.Second
+		} else {
+			c.TrafficOpsRetryInterval = time.Duration(*aux.TrafficOpsRetryIntervalSec) * time.Second
+		}
+	}
 	return nil
 }
 
diff --git a/traffic_monitor/manager/opsconfig.go b/traffic_monitor/manager/opsconfig.go
index c31584366..5beed937f 100644
--- a/traffic_monitor/manager/opsconfig.go
+++ b/traffic_monitor/manager/opsconfig.go
@@ -23,6 +23,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"io/ioutil"
+	"net"
 	"time"
 
 	"golang.org/x/sys/unix"
@@ -132,13 +133,23 @@ func StartOpsConfigManager(
 		// TODO config? parameter?
 		useCache := false
 		trafficOpsRequestTimeout := time.Second * time.Duration(10)
-
-		realToSession, toAddr, err := to.LoginWithAgent(newOpsConfig.Url, newOpsConfig.Username, newOpsConfig.Password, newOpsConfig.Insecure, staticAppData.UserAgent, useCache, trafficOpsRequestTimeout)
-		if err != nil {
-			handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops (%v): %s\n", toAddr, err))
-			return
+		var realToSession *to.Session
+		var toAddr net.Addr
+
+		// Fixed an issue here where traffic_monitor would loop forever, doing nothing useful, if traffic_ops was down,
+		// and would never log in again. Since traffic_monitor is just starting up here, keep retrying until traffic_ops is reachable and a session can be established.
+		for {
+			realToSession, toAddr, err = to.LoginWithAgent(newOpsConfig.Url, newOpsConfig.Username, newOpsConfig.Password, newOpsConfig.Insecure, staticAppData.UserAgent, useCache, trafficOpsRequestTimeout)
+			if err != nil {
+				handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops (%v): %s\n", toAddr, err))
+				log.Errorf("cfg.TrafficOpsRetryInterval: %v", cfg.TrafficOpsRetryInterval)
+				time.Sleep(cfg.TrafficOpsRetryInterval)
+				continue
+			} else {
+				toSession.Set(realToSession)
+				break
+			}
 		}
-		toSession.Set(realToSession)
 
 		if cdn, err := getMonitorCDN(realToSession, staticAppData.Hostname); err != nil {
 			handleErr(fmt.Errorf("getting CDN name from Traffic Ops, using config CDN '%s': %s\n", newOpsConfig.CdnName, err))
@@ -149,9 +160,15 @@ func StartOpsConfigManager(
 			newOpsConfig.CdnName = cdn
 		}
 
-		if err := toData.Fetch(toSession, newOpsConfig.CdnName); err != nil {
-			handleErr(fmt.Errorf("Error getting Traffic Ops data: %v\n", err))
-			return
+		// Fixed an issue where traffic_monitor receives corrupt data (a bad CRConfig) from traffic_ops.
+		// Loop and retry until a good CRConfig is received from traffic_ops.
+		for {
+			if err := toData.Fetch(toSession, newOpsConfig.CdnName); err != nil {
+				handleErr(fmt.Errorf("Error getting Traffic Ops data: %v\n", err))
+				time.Sleep(cfg.TrafficOpsRetryInterval)
+				continue
+			}
+			break
 		}
 
 		// These must be in a goroutine, because the monitorConfigPoller tick sends to a channel this select listens for. Thus, if we block on sends to the monitorConfigPoller, we have a livelock race condition.


 

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
users@infra.apache.org


With regards,
Apache Git Services