You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficcontrol.apache.org by dg...@apache.org on 2018/11/03 22:41:10 UTC

[trafficcontrol] branch 3.0.x updated (62bbcd9 -> 90ae2ef)

This is an automated email from the ASF dual-hosted git repository.

dgelinas pushed a change to branch 3.0.x
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git.


    from 62bbcd9  Fix the CHANGELOG.md release links
     new 3f2c198  Fixes issues where when traffic monitor is starting up and is unable to contact traffic_ops and obtain valid CRConfig data, traffic_monitor will loop forever doing nothing useful.  This PR fixes the issue so that during startup, traffic_monitor will continuously retry reaching traffic_ops.  Upon successful login and a good CRConfig fetch, traffic_monitor will resume its startup and begin polling caches without intervention.
     new 90ae2ef  fix error message formatting to dereference the TrafficOpsRetryIntervalSec pointer

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 traffic_monitor/config/config.go     | 11 +++++++++++
 traffic_monitor/manager/opsconfig.go | 35 ++++++++++++++++++++++++++---------
 2 files changed, 37 insertions(+), 9 deletions(-)


[trafficcontrol] 02/02: fix error message formatting to dereference the TrafficOpsRetryIntervalSec pointer

Posted by dg...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

dgelinas pushed a commit to branch 3.0.x
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git

commit 90ae2ef3db4c3a1ecf8ddfc5daf79c20aa78bc9b
Author: John Rushford <jr...@apache.org>
AuthorDate: Tue Oct 23 16:42:04 2018 +0000

    fix error message formatting to dereference the TrafficOpsRetryIntervalSec pointer
    
    (cherry picked from commit c5619f93c94968a34f98e15e62f9744d3c3e1da7)
---
 traffic_monitor/config/config.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/traffic_monitor/config/config.go b/traffic_monitor/config/config.go
index 673c75d..8bb2daa 100644
--- a/traffic_monitor/config/config.go
+++ b/traffic_monitor/config/config.go
@@ -182,7 +182,7 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 	}
 	if aux.TrafficOpsRetryIntervalSec != nil {
 		if *aux.TrafficOpsRetryIntervalSec <= 0 {
-			log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", aux.TrafficOpsRetryIntervalSec)
+			log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", *aux.TrafficOpsRetryIntervalSec)
 			c.TrafficOpsRetryInterval = 3 * time.Second
 		} else {
 			c.TrafficOpsRetryInterval = time.Duration(*aux.TrafficOpsRetryIntervalSec) * time.Second


[trafficcontrol] 01/02: Fixes issues where when traffic monitor is starting up and is unable to contact traffic_ops and obtain valid CRConfig data, traffic_monitor will loop forever doing nothing useful. This PR fixes the issue so that during startup, traffic_monitor will continuously retry reaching traffic_ops. Upon successful login and a good CRConfig fetch, traffic_monitor will resume its startup and begin polling caches without intervention.

Posted by dg...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

dgelinas pushed a commit to branch 3.0.x
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git

commit 3f2c198714ee5fb3b21d6613d2ad8de926b0a2ea
Author: John Rushford <jr...@apache.org>
AuthorDate: Fri Oct 12 15:39:02 2018 +0000

    Fixes issues where when traffic monitor is starting up and is unable
    to contact traffic_ops and obtain valid CRConfig data, traffic_monitor
    will loop forever doing nothing useful.  This PR fixes the issue so
    that during startup, traffic_monitor will continuously retry reaching
    traffic_ops.  Upon successful login and a good CRConfig fetch,
    traffic_monitor will resume its startup and begin polling caches
    without intervention.
    
    (cherry picked from commit 436ba8dd3b39510c4201424d6f63846654e5d40d)
---
 traffic_monitor/config/config.go     | 11 +++++++++++
 traffic_monitor/manager/opsconfig.go | 35 ++++++++++++++++++++++++++---------
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/traffic_monitor/config/config.go b/traffic_monitor/config/config.go
index a930167..673c75d 100644
--- a/traffic_monitor/config/config.go
+++ b/traffic_monitor/config/config.go
@@ -64,6 +64,7 @@ type Config struct {
 	HealthToStatRatio            uint64        `json:"health_to_stat_ratio"`
 	StaticFileDir                string        `json:"static_file_dir"`
 	CRConfigHistoryCount         uint64        `json:"crconfig_history_count"`
+	TrafficOpsRetryInterval      time.Duration `json:"-"`
 }
 
 func (c Config) ErrorLog() log.LogLocation   { return log.LogLocation(c.LogLocationError) }
@@ -95,6 +96,7 @@ var DefaultConfig = Config{
 	HealthToStatRatio:            4,
 	StaticFileDir:                StaticFileDir,
 	CRConfigHistoryCount:         20000,
+	TrafficOpsRetryInterval:      3 * time.Second,
 }
 
 // MarshalJSON marshals custom millisecond durations. Aliasing inspired by http://choly.ca/post/go-json-marshalling/
@@ -139,6 +141,7 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 		StatFlushIntervalMs            *uint64 `json:"stat_flush_interval_ms"`
 		ServeReadTimeoutMs             *uint64 `json:"serve_read_timeout_ms"`
 		ServeWriteTimeoutMs            *uint64 `json:"serve_write_timeout_ms"`
+		TrafficOpsRetryIntervalSec     *uint64 `json:"traffic_ops_retry_interval_sec"`
 		*Alias
 	}{
 		Alias: (*Alias)(c),
@@ -177,6 +180,14 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 	if aux.PeerOptimistic != nil {
 		c.PeerOptimistic = *aux.PeerOptimistic
 	}
+	if aux.TrafficOpsRetryIntervalSec != nil {
+		if *aux.TrafficOpsRetryIntervalSec <= 0 {
+			log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", aux.TrafficOpsRetryIntervalSec)
+			c.TrafficOpsRetryInterval = 3 * time.Second
+		} else {
+			c.TrafficOpsRetryInterval = time.Duration(*aux.TrafficOpsRetryIntervalSec) * time.Second
+		}
+	}
 	return nil
 }
 
diff --git a/traffic_monitor/manager/opsconfig.go b/traffic_monitor/manager/opsconfig.go
index c315843..5beed93 100644
--- a/traffic_monitor/manager/opsconfig.go
+++ b/traffic_monitor/manager/opsconfig.go
@@ -23,6 +23,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"io/ioutil"
+	"net"
 	"time"
 
 	"golang.org/x/sys/unix"
@@ -132,13 +133,23 @@ func StartOpsConfigManager(
 		// TODO config? parameter?
 		useCache := false
 		trafficOpsRequestTimeout := time.Second * time.Duration(10)
-
-		realToSession, toAddr, err := to.LoginWithAgent(newOpsConfig.Url, newOpsConfig.Username, newOpsConfig.Password, newOpsConfig.Insecure, staticAppData.UserAgent, useCache, trafficOpsRequestTimeout)
-		if err != nil {
-			handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops (%v): %s\n", toAddr, err))
-			return
+		var realToSession *to.Session
+		var toAddr net.Addr
+
+		// fixed an issue here where traffic_monitor loops forever, doing nothing useful if traffic_ops is down,
+		// and would never log in again.  Since traffic_monitor is just starting up here, keep retrying until traffic_ops is reachable and a session can be established.
+		for {
+			realToSession, toAddr, err = to.LoginWithAgent(newOpsConfig.Url, newOpsConfig.Username, newOpsConfig.Password, newOpsConfig.Insecure, staticAppData.UserAgent, useCache, trafficOpsRequestTimeout)
+			if err != nil {
+				handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops (%v): %s\n", toAddr, err))
+				log.Errorf("cfg.TrafficOpsRetryInterval: %v", cfg.TrafficOpsRetryInterval)
+				time.Sleep(cfg.TrafficOpsRetryInterval)
+				continue
+			} else {
+				toSession.Set(realToSession)
+				break
+			}
 		}
-		toSession.Set(realToSession)
 
 		if cdn, err := getMonitorCDN(realToSession, staticAppData.Hostname); err != nil {
 			handleErr(fmt.Errorf("getting CDN name from Traffic Ops, using config CDN '%s': %s\n", newOpsConfig.CdnName, err))
@@ -149,9 +160,15 @@ func StartOpsConfigManager(
 			newOpsConfig.CdnName = cdn
 		}
 
-		if err := toData.Fetch(toSession, newOpsConfig.CdnName); err != nil {
-			handleErr(fmt.Errorf("Error getting Traffic Ops data: %v\n", err))
-			return
+		// fixed an issue when traffic_monitor receives corrupt data, CRConfig, from traffic_ops.
+		// Will loop and retry until a good CRConfig is received from traffic_ops
+		for {
+			if err := toData.Fetch(toSession, newOpsConfig.CdnName); err != nil {
+				handleErr(fmt.Errorf("Error getting Traffic Ops data: %v\n", err))
+				time.Sleep(cfg.TrafficOpsRetryInterval)
+				continue
+			}
+			break
 		}
 
 		// These must be in a goroutine, because the monitorConfigPoller tick sends to a channel this select listens for. Thus, if we block on sends to the monitorConfigPoller, we have a livelock race condition.