You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficcontrol.apache.org by ra...@apache.org on 2018/10/23 16:53:37 UTC
[trafficcontrol] branch master updated (30d5c83 -> c5619f9)
This is an automated email from the ASF dual-hosted git repository.
rawlin pushed a change to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git.
from 30d5c83 Tabs look more like tabs; bigger top margin; moved misc stats
new 436ba8d Fixes issues where when traffic monitor is starting up and is unable to contact traffic_ops and obtain valid CRConfig data, traffic_monitor will loop forever doing nothing useful. This PR fixes the issue so that during startup, traffic_monitor will continuously retry reaching traffic_ops. Upon successful login and a good CRConfig fetch, traffic_monitor will resume its startup and begin polling caches without intervention.
new c5619f9 fix error message formatting to dereference the TrafficOpsRetryIntervalSec pointer
The 2 revisions listed above as "new" are entirely new to this
repository and will be described in separate emails. The revisions
listed as "add" were already present in the repository and have only
been added to this reference.
Summary of changes:
traffic_monitor/config/config.go | 11 +++++++++++
traffic_monitor/manager/opsconfig.go | 35 ++++++++++++++++++++++++++---------
2 files changed, 37 insertions(+), 9 deletions(-)
[trafficcontrol] 02/02: fix error message formatting to dereference
the TrafficOpsRetryIntervalSec pointer
Posted by ra...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
rawlin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git
commit c5619f93c94968a34f98e15e62f9744d3c3e1da7
Author: John Rushford <jr...@apache.org>
AuthorDate: Tue Oct 23 16:42:04 2018 +0000
fix error message formatting to dereference the TrafficOpsRetryIntervalSec pointer
---
traffic_monitor/config/config.go | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/traffic_monitor/config/config.go b/traffic_monitor/config/config.go
index 673c75d..8bb2daa 100644
--- a/traffic_monitor/config/config.go
+++ b/traffic_monitor/config/config.go
@@ -182,7 +182,7 @@ func (c *Config) UnmarshalJSON(data []byte) error {
}
if aux.TrafficOpsRetryIntervalSec != nil {
if *aux.TrafficOpsRetryIntervalSec <= 0 {
- log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", aux.TrafficOpsRetryIntervalSec)
+ log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", *aux.TrafficOpsRetryIntervalSec)
c.TrafficOpsRetryInterval = 3 * time.Second
} else {
c.TrafficOpsRetryInterval = time.Duration(*aux.TrafficOpsRetryIntervalSec) * time.Second
[trafficcontrol] 01/02: Fixes issues where when traffic monitor is
starting up and is unable to contact traffic_ops and obtain valid CRConfig
data,
traffic_monitor will loop forever doing nothing useful. This PR fixes the
issue so that during startup,
traffic_monitor will continuously retry reaching traffic_ops. Upon successful
login and a good CRConfig fetch,
traffic_monitor will resume its startup and begin polling caches without
intervention.
Posted by ra...@apache.org.
This is an automated email from the ASF dual-hosted git repository.
rawlin pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git
commit 436ba8dd3b39510c4201424d6f63846654e5d40d
Author: John Rushford <jr...@apache.org>
AuthorDate: Fri Oct 12 15:39:02 2018 +0000
Fixes issues where when traffic monitor is starting up and is unable
to contact traffic_ops and obtain valid CRConfig data, traffic_monitor
will loop forever doing nothing useful. This PR fixes the issue so
that during startup, traffic_monitor will continuously retry reaching
traffic_ops. Upon successful login and a good CRConfig fetch,
traffic_monitor will resume its startup and begin polling caches
without intervention.
---
traffic_monitor/config/config.go | 11 +++++++++++
traffic_monitor/manager/opsconfig.go | 35 ++++++++++++++++++++++++++---------
2 files changed, 37 insertions(+), 9 deletions(-)
diff --git a/traffic_monitor/config/config.go b/traffic_monitor/config/config.go
index a930167..673c75d 100644
--- a/traffic_monitor/config/config.go
+++ b/traffic_monitor/config/config.go
@@ -64,6 +64,7 @@ type Config struct {
HealthToStatRatio uint64 `json:"health_to_stat_ratio"`
StaticFileDir string `json:"static_file_dir"`
CRConfigHistoryCount uint64 `json:"crconfig_history_count"`
+ TrafficOpsRetryInterval time.Duration `json:"-"`
}
func (c Config) ErrorLog() log.LogLocation { return log.LogLocation(c.LogLocationError) }
@@ -95,6 +96,7 @@ var DefaultConfig = Config{
HealthToStatRatio: 4,
StaticFileDir: StaticFileDir,
CRConfigHistoryCount: 20000,
+ TrafficOpsRetryInterval: 3 * time.Second,
}
// MarshalJSON marshals custom millisecond durations. Aliasing inspired by http://choly.ca/post/go-json-marshalling/
@@ -139,6 +141,7 @@ func (c *Config) UnmarshalJSON(data []byte) error {
StatFlushIntervalMs *uint64 `json:"stat_flush_interval_ms"`
ServeReadTimeoutMs *uint64 `json:"serve_read_timeout_ms"`
ServeWriteTimeoutMs *uint64 `json:"serve_write_timeout_ms"`
+ TrafficOpsRetryIntervalSec *uint64 `json:"traffic_ops_retry_interval_sec"`
*Alias
}{
Alias: (*Alias)(c),
@@ -177,6 +180,14 @@ func (c *Config) UnmarshalJSON(data []byte) error {
if aux.PeerOptimistic != nil {
c.PeerOptimistic = *aux.PeerOptimistic
}
+ if aux.TrafficOpsRetryIntervalSec != nil {
+ if *aux.TrafficOpsRetryIntervalSec <= 0 {
+ log.Errorf("The 'traffic_ops_retry_interval_sec: %v' setting is incorrect, needs to be a positive number of seconds, using default of 3 seconds", aux.TrafficOpsRetryIntervalSec)
+ c.TrafficOpsRetryInterval = 3 * time.Second
+ } else {
+ c.TrafficOpsRetryInterval = time.Duration(*aux.TrafficOpsRetryIntervalSec) * time.Second
+ }
+ }
return nil
}
diff --git a/traffic_monitor/manager/opsconfig.go b/traffic_monitor/manager/opsconfig.go
index c315843..5beed93 100644
--- a/traffic_monitor/manager/opsconfig.go
+++ b/traffic_monitor/manager/opsconfig.go
@@ -23,6 +23,7 @@ import (
"encoding/json"
"fmt"
"io/ioutil"
+ "net"
"time"
"golang.org/x/sys/unix"
@@ -132,13 +133,23 @@ func StartOpsConfigManager(
// TODO config? parameter?
useCache := false
trafficOpsRequestTimeout := time.Second * time.Duration(10)
-
- realToSession, toAddr, err := to.LoginWithAgent(newOpsConfig.Url, newOpsConfig.Username, newOpsConfig.Password, newOpsConfig.Insecure, staticAppData.UserAgent, useCache, trafficOpsRequestTimeout)
- if err != nil {
- handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops (%v): %s\n", toAddr, err))
- return
+ var realToSession *to.Session
+ var toAddr net.Addr
+
+ // fixed an issue here where traffic_monitor loops forever, doing nothing useful if traffic_ops is down,
+ // and would never log in again. Since traffic_monitor is just starting up here, keep retrying until traffic_ops is reachable and a session can be established.
+ for {
+ realToSession, toAddr, err = to.LoginWithAgent(newOpsConfig.Url, newOpsConfig.Username, newOpsConfig.Password, newOpsConfig.Insecure, staticAppData.UserAgent, useCache, trafficOpsRequestTimeout)
+ if err != nil {
+ handleErr(fmt.Errorf("MonitorConfigPoller: error instantiating Session with traffic_ops (%v): %s\n", toAddr, err))
+ log.Errorf("cfg.TrafficOpsRetryInterval: %v", cfg.TrafficOpsRetryInterval)
+ time.Sleep(cfg.TrafficOpsRetryInterval)
+ continue
+ } else {
+ toSession.Set(realToSession)
+ break
+ }
}
- toSession.Set(realToSession)
if cdn, err := getMonitorCDN(realToSession, staticAppData.Hostname); err != nil {
handleErr(fmt.Errorf("getting CDN name from Traffic Ops, using config CDN '%s': %s\n", newOpsConfig.CdnName, err))
@@ -149,9 +160,15 @@ func StartOpsConfigManager(
newOpsConfig.CdnName = cdn
}
- if err := toData.Fetch(toSession, newOpsConfig.CdnName); err != nil {
- handleErr(fmt.Errorf("Error getting Traffic Ops data: %v\n", err))
- return
+ // fixed an issue when traffic_monitor receives corrupt data, CRConfig, from traffic_ops.
+ // Will loop and retry until a good CRConfig is received from traffic_ops
+ for {
+ if err := toData.Fetch(toSession, newOpsConfig.CdnName); err != nil {
+ handleErr(fmt.Errorf("Error getting Traffic Ops data: %v\n", err))
+ time.Sleep(cfg.TrafficOpsRetryInterval)
+ continue
+ }
+ break
}
// These must be in a goroutine, because the monitorConfigPoller tick sends to a channel this select listens for. Thus, if we block on sends to the monitorConfigPoller, we have a livelock race condition.