You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficcontrol.apache.org by ra...@apache.org on 2018/10/23 16:53:37 UTC

[trafficcontrol] branch master updated (30d5c83 -> c5619f9)

This is an automated email from the ASF dual-hosted git repository.

rawlin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git.


    from 30d5c83  Tabs look more like tabs; bigger top margin; moved misc stats
     new 436ba8d  Fixes an issue where, when traffic monitor is starting up and is unable to contact traffic_ops and obtain valid CRConfig data, traffic_monitor will loop forever doing nothing useful.  This PR fixes the issue so that during startup, traffic_monitor will continuously retry reaching traffic_ops.  Upon successful login and a good CRConfig fetch, traffic_monitor will resume its startup and begin polling caches without intervention.
     new c5619f9  fix error message formatting to dereference the TrafficOpsRetryIntervalSec pointer

The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails.  The revisions
listed as "add" were already present in the repository and have only
been added to this reference.


Summary of changes:
 traffic_monitor/config/config.go     | 11 +++++++++++
 traffic_monitor/manager/opsconfig.go | 35 ++++++++++++++++++++++++++---------
 2 files changed, 37 insertions(+), 9 deletions(-)


[trafficcontrol] 02/02: fix error message formatting to dereference the TrafficOpsRetryIntervalSec pointer

Posted by ra...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

rawlin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git

commit c5619f93c94968a34f98e15e62f9744d3c3e1da7
Author: John Rushford <jr...@apache.org>
AuthorDate: Tue Oct 23 16:42:04 2018 +0000

    fix error message formatting to dereference the TrafficOpsRetryIntervalSec pointer
---
 traffic_monitor/config/config.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/traffic_monitor/config/config.go b/traffic_monitor/config/config.go
index 673c75d..8bb2daa 100644
--- a/traffic_monitor/config/config.go
+++ b/traffic_monitor/config/config.go
@@ -182,7 +182,7 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 	}
 	if aux.TrafficOpsRetryIntervalSec != nil {
 		if *aux.TrafficOpsRetryIntervalSec <= 0 {
-			log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", aux.TrafficOpsRetryIntervalSec)
+			log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", *aux.TrafficOpsRetryIntervalSec)
 			c.TrafficOpsRetryInterval = 3 * time.Second
 		} else {
 			c.TrafficOpsRetryInterval = time.Duration(*aux.TrafficOpsRetryIntervalSec) * time.Second


[trafficcontrol] 01/02: Fixes an issue where, when traffic monitor is starting up and is unable to contact traffic_ops and obtain valid CRConfig data, traffic_monitor will loop forever doing nothing useful. This PR fixes the issue so that during startup, traffic_monitor will continuously retry reaching traffic_ops. Upon successful login and a good CRConfig fetch, traffic_monitor will resume its startup and begin polling caches without intervention.

Posted by ra...@apache.org.
This is an automated email from the ASF dual-hosted git repository.

rawlin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git

commit 436ba8dd3b39510c4201424d6f63846654e5d40d
Author: John Rushford <jr...@apache.org>
AuthorDate: Fri Oct 12 15:39:02 2018 +0000

    Fixes an issue where, when traffic monitor is starting up and is
    unable to contact traffic_ops and obtain valid CRConfig data,
    traffic_monitor will loop forever doing nothing useful.  This PR fixes
    the issue so that during startup, traffic_monitor will continuously
    retry reaching traffic_ops.  Upon successful login and a good CRConfig
    fetch, traffic_monitor will resume its startup and begin polling
    caches without intervention.
---
 traffic_monitor/config/config.go     | 11 +++++++++++
 traffic_monitor/manager/opsconfig.go | 35 ++++++++++++++++++++++++++---------
 2 files changed, 37 insertions(+), 9 deletions(-)

diff --git a/traffic_monitor/config/config.go b/traffic_monitor/config/config.go
index a930167..673c75d 100644
--- a/traffic_monitor/config/config.go
+++ b/traffic_monitor/config/config.go
@@ -64,6 +64,7 @@ type Config struct {
 	HealthToStatRatio            uint64        `json:"health_to_stat_ratio"`
 	StaticFileDir                string        `json:"static_file_dir"`
 	CRConfigHistoryCount         uint64        `json:"crconfig_history_count"`
+	TrafficOpsRetryInterval      time.Duration `json:"-"`
 }
 
 func (c Config) ErrorLog() log.LogLocation   { return log.LogLocation(c.LogLocationError) }
@@ -95,6 +96,7 @@ var DefaultConfig = Config{
 	HealthToStatRatio:            4,
 	StaticFileDir:                StaticFileDir,
 	CRConfigHistoryCount:         20000,
+	TrafficOpsRetryInterval:      3 * time.Second,
 }
 
 // MarshalJSON marshals custom millisecond durations. Aliasing inspired by http://choly.ca/post/go-json-marshalling/
@@ -139,6 +141,7 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 		StatFlushIntervalMs            *uint64 `json:"stat_flush_interval_ms"`
 		ServeReadTimeoutMs             *uint64 `json:"serve_read_timeout_ms"`
 		ServeWriteTimeoutMs            *uint64 `json:"serve_write_timeout_ms"`
+		TrafficOpsRetryIntervalSec     *uint64 `json:"traffic_ops_retry_interval_sec"`
 		*Alias
 	}{
 		Alias: (*Alias)(c),
@@ -177,6 +180,14 @@ func (c *Config) UnmarshalJSON(data []byte) error {
 	if aux.PeerOptimistic != nil {
 		c.PeerOptimistic = *aux.PeerOptimistic
 	}
+	if aux.TrafficOpsRetryIntervalSec != nil {
+		if *aux.TrafficOpsRetryIntervalSec <= 0 {
+			log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", aux.TrafficOpsRetryIntervalSec)
+			c.TrafficOpsRetryInterval = 3 * time.Second
+		} else {
+			c.TrafficOpsRetryInterval = time.Duration(*aux.TrafficOpsRetryIntervalSec) * time.Second
+		}
+	}
 	return nil
 }
 
diff --git a/traffic_monitor/manager/opsconfig.go b/traffic_monitor/manager/opsconfig.go
index c315843..5beed93 100644
--- a/traffic_monitor/manager/opsconfig.go
+++ b/traffic_monitor/manager/opsconfig.go
@@ -23,6 +23,7 @@ import (
 	"encoding/json"
 	"fmt"
 	"io/ioutil"
+	"net"
 	"time"
 
 	"golang.org/x/sys/unix"
@@ -132,13 +133,23 @@ func StartOpsConfigManager(
 		// TODO config? parameter?
 		useCache := false
 		trafficOpsRequestTimeout := time.Second * time.Duration(10)
-
-		realToSession, toAddr, err := to.LoginWithAgent(newOpsConfig.Url, newOpsConfig.Username, newOpsConfig.Password, newOpsConfig.Insecure, staticAppData.UserAgent, useCache, trafficOpsRequestTimeout)
-		if err != nil {
-			handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops (%v): %s\n", toAddr, err))
-			return
+		var realToSession *to.Session
+		var toAddr net.Addr
+
+		// Fixes an issue where traffic_monitor would loop forever, doing nothing useful, if traffic_ops was down,
+		// and would never log in again. Since traffic_monitor is just starting up here, keep retrying until traffic_ops is reachable and a session can be established.
+		for {
+			realToSession, toAddr, err = to.LoginWithAgent(newOpsConfig.Url, newOpsConfig.Username, newOpsConfig.Password, newOpsConfig.Insecure, staticAppData.UserAgent, useCache, trafficOpsRequestTimeout)
+			if err != nil {
+				handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops (%v): %s\n", toAddr, err))
+				log.Errorf("cfg.TrafficOpsRetryInterval: %v", cfg.TrafficOpsRetryInterval)
+				time.Sleep(cfg.TrafficOpsRetryInterval)
+				continue
+			} else {
+				toSession.Set(realToSession)
+				break
+			}
 		}
-		toSession.Set(realToSession)
 
 		if cdn, err := getMonitorCDN(realToSession, staticAppData.Hostname); err != nil {
 			handleErr(fmt.Errorf("getting CDN name from Traffic Ops, using config CDN '%s': %s\n", newOpsConfig.CdnName, err))
@@ -149,9 +160,15 @@ func StartOpsConfigManager(
 			newOpsConfig.CdnName = cdn
 		}
 
-		if err := toData.Fetch(toSession, newOpsConfig.CdnName); err != nil {
-			handleErr(fmt.Errorf("Error getting Traffic Ops data: %v\n", err))
-			return
+		// fixed an issue when traffic_monitor receives corrupt data, CRConfig, from traffic_ops.
+		// Will loop and retry until a good CRConfig is received from traffic_ops
+		for {
+			if err := toData.Fetch(toSession, newOpsConfig.CdnName); err != nil {
+				handleErr(fmt.Errorf("Error getting Traffic Ops data: %v\n", err))
+				time.Sleep(cfg.TrafficOpsRetryInterval)
+				continue
+			}
+			break
 		}
 
 		// These must be in a goroutine, because the monitorConfigPoller tick sends to a channel this select listens for. Thus, if we block on sends to the monitorConfigPoller, we have a livelock race condition.