You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficcontrol.apache.org by mi...@apache.org on 2019/06/18 21:51:00 UTC

[trafficcontrol] branch master updated: Fix Monitor Thresholds (#3646)

This is an automated email from the ASF dual-hosted git repository.

mitchell852 pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/trafficcontrol.git


The following commit(s) were added to refs/heads/master by this push:
     new 139d796  Fix Monitor Thresholds (#3646)
139d796 is described below

commit 139d796ba0e5fa4dd8b32c46bd685eaead0cdd59
Author: Robert Butts <ro...@users.noreply.github.com>
AuthorDate: Tue Jun 18 15:50:54 2019 -0600

    Fix Monitor Thresholds (#3646)
    
    * Fix TM Thresholds
    
    This fixes the Monitor racing and the health poll marking Available
    caches which were marked Unavailable by the Stat poll.
    
    There was already logic to prevent that, but it wasn't accounting
    for the Calculated stats, so the Health poll didn't check Calculated
    stats for thresholds, but it does "have" those stats, so it thought
    it could mark them available again.
    
    This fixes the Health poll to also check thresholds for Calculated
    stats, which not only fixes the race, but makes thresholds be
    marked up and down faster with the quicker Health poll.
    
    * Add TM unit test for kbps threshold
    
    * Add Changelog TM Thresholds fix line
---
 CHANGELOG.md                         |   1 +
 traffic_monitor/health/cache.go      |  61 +++-----------
 traffic_monitor/health/cache_test.go | 149 +++++++++++++++++++++++++++++++++++
 traffic_monitor/manager/health.go    |   4 +-
 traffic_monitor/manager/stat.go      |   3 +-
 5 files changed, 168 insertions(+), 50 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index bd9f5d5..182e190 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -53,6 +53,7 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 - Issue #3605: Fixed Traffic Monitor custom ports in health polling URL.
 - Issue 3587: Fixed Traffic Ops Golang reverse proxy and Riak logs to be consistent with the format of other error logs.
 - Database migrations have been collapsed. Rollbacks to migrations that previously existed are no longer possible.
+- Issue #3646: Fixed Traffic Monitor Thresholds.
 
 ## [3.0.0] - 2018-10-30
 ### Added
diff --git a/traffic_monitor/health/cache.go b/traffic_monitor/health/cache.go
index ac2c7dd..35c6872 100644
--- a/traffic_monitor/health/cache.go
+++ b/traffic_monitor/health/cache.go
@@ -92,19 +92,6 @@ func GetVitals(newResult *cache.Result, prevResult *cache.Result, mc *tc.Traffic
 	// log.Infoln(newResult.Id, "BytesOut", newResult.Vitals.BytesOut, "BytesIn", newResult.Vitals.BytesIn, "Kbps", newResult.Vitals.KbpsOut, "max", newResult.Vitals.MaxKbpsOut)
 }
 
-// EvalCache returns whether the given cache should be marked available, a string describing why, and which stat exceeded a threshold. The `stats` may be nil, for pollers which don't poll stats.
-// The availability of EvalCache MAY NOT be used to directly set the cache's local availability, because the threshold stats may not be part of the poller which produced the result. Rather, if the cache was previously unavailable from a threshold, it must be verified that threshold stat is in the results before setting the cache to available.
-// TODO change to return a `cache.AvailableStatus`
-func EvalCache(result cache.ResultInfo, mc *tc.TrafficMonitorConfigMap) (bool, string, string) {
-	serverInfo, ok := mc.TrafficServer[string(result.ID)]
-	if !ok {
-		log.Errorf("Cache %v missing from from Traffic Ops Monitor Config - treating as OFFLINE\n", result.ID)
-		return false, "ERROR - server missing in Traffic Ops monitor config", ""
-	}
-	serverStatus := tc.CacheStatusFromString(serverInfo.ServerStatus)
-	return EvalCacheWithStatusInfo(result, mc, serverStatus, serverInfo)
-}
-
 func EvalCacheWithStatusInfo(result cache.ResultInfo, mc *tc.TrafficMonitorConfigMap, status tc.CacheStatus, serverInfo tc.TrafficServer) (bool, string, string) {
 	availability := AvailableStr
 	if !result.Available {
@@ -134,8 +121,9 @@ const UnavailableStr = "unavailable"
 
 // EvalCache returns whether the given cache should be marked available, a string describing why, and which stat exceeded a threshold. The `stats` may be nil, for pollers which don't poll stats.
 // The availability of EvalCache MAY NOT be used to directly set the cache's local availability, because the threshold stats may not be part of the poller which produced the result. Rather, if the cache was previously unavailable from a threshold, it must be verified that threshold stat is in the results before setting the cache to available.
+// The resultStats may be nil, and if so, won't be checked for thresholds. For example, the Health poller doesn't have Stats.
 // TODO change to return a `cache.AvailableStatus`
-func EvalCacheWithStats(result cache.ResultInfo, resultStats threadsafe.ResultStatValHistory, mc *tc.TrafficMonitorConfigMap) (bool, string, string) {
+func EvalCache(result cache.ResultInfo, resultStats *threadsafe.ResultStatValHistory, mc *tc.TrafficMonitorConfigMap) (bool, string, string) {
 	serverInfo, ok := mc.TrafficServer[string(result.ID)]
 	if !ok {
 		log.Errorf("Cache %v missing from from Traffic Ops Monitor Config - treating as OFFLINE\n", result.ID)
@@ -166,6 +154,9 @@ func EvalCacheWithStats(result cache.ResultInfo, resultStats threadsafe.ResultSt
 			dummyCombinedstate := tc.IsAvailable{} // the only stats which use combinedState are things like isAvailable, which don't make sense to ever be thresholds.
 			resultStat = computedStatF(result, serverInfo, serverProfile, dummyCombinedstate)
 		} else {
+			if resultStats == nil {
+				continue
+			}
 			resultStatHistory := resultStats.Load(stat)
 			if len(resultStatHistory) == 0 {
 				continue
@@ -187,43 +178,17 @@ func EvalCacheWithStats(result cache.ResultInfo, resultStats threadsafe.ResultSt
 	return avail, eventDescVal, eventMsg
 }
 
-func CalcAvailabilityWithStats(results []cache.Result, pollerName string, statResultHistory threadsafe.ResultStatHistory, mc tc.TrafficMonitorConfigMap, toData todata.TOData, localCacheStatusThreadsafe threadsafe.CacheAvailableStatus, localStates peer.CRStatesThreadsafe, events ThreadsafeEvents) {
+// CalcAvailabilityWithStats calculates the availability of each cache in results.
+// statResultHistory may be nil, in which case stats won't be used to calculate availability.
+func CalcAvailability(results []cache.Result, pollerName string, statResultHistory *threadsafe.ResultStatHistory, mc tc.TrafficMonitorConfigMap, toData todata.TOData, localCacheStatusThreadsafe threadsafe.CacheAvailableStatus, localStates peer.CRStatesThreadsafe, events ThreadsafeEvents) {
 	localCacheStatuses := localCacheStatusThreadsafe.Get().Copy()
+	statResults := (*threadsafe.ResultStatValHistory)(nil)
 	for _, result := range results {
-		statResults := statResultHistory.LoadOrStore(result.ID)
-		isAvailable, whyAvailable, unavailableStat := EvalCacheWithStats(cache.ToInfo(result), statResults, &mc)
-
-		// if the cache is now Available, and was previously unavailable due to a threshold, make sure this poller contains the stat which exceeded the threshold.
-		if previousStatus, hasPreviousStatus := localCacheStatuses[result.ID]; isAvailable && hasPreviousStatus && !previousStatus.Available && previousStatus.UnavailableStat != "" {
-			if !result.HasStat(previousStatus.UnavailableStat) {
-				return
-			}
+		if statResultHistory != nil {
+			statResultsVal := statResultHistory.LoadOrStore(result.ID)
+			statResults = &statResultsVal
 		}
-		localCacheStatuses[result.ID] = cache.AvailableStatus{
-			Available:       isAvailable,
-			Status:          mc.TrafficServer[string(result.ID)].ServerStatus,
-			Why:             whyAvailable,
-			UnavailableStat: unavailableStat,
-			Poller:          pollerName,
-		} // TODO move within localStates?
-
-		if available, ok := localStates.GetCache(result.ID); !ok || available.IsAvailable != isAvailable {
-			log.Infof("Changing state for %s was: %t now: %t because %s poller: %v error: %v", result.ID, available.IsAvailable, isAvailable, whyAvailable, pollerName, result.Error)
-			events.Add(Event{Time: Time(time.Now()), Description: whyAvailable + " (" + pollerName + ")", Name: string(result.ID), Hostname: string(result.ID), Type: toData.ServerTypes[result.ID].String(), Available: isAvailable})
-		}
-
-		localStates.SetCache(result.ID, tc.IsAvailable{IsAvailable: isAvailable})
-	}
-	calculateDeliveryServiceState(toData.DeliveryServiceServers, localStates, toData)
-	localCacheStatusThreadsafe.Set(localCacheStatuses)
-}
-
-// CalcAvailability calculates the availability of the cache, from the given result. Availability is stored in `localCacheStatus` and `localStates`, and if the status changed an event is added to `events`. statResultHistory may be nil, for pollers which don't poll stats.
-// TODO add tc for poller names?
-func CalcAvailability(results []cache.Result, pollerName string, mc tc.TrafficMonitorConfigMap, toData todata.TOData, localCacheStatusThreadsafe threadsafe.CacheAvailableStatus, localStates peer.CRStatesThreadsafe, events ThreadsafeEvents) {
-	localCacheStatuses := localCacheStatusThreadsafe.Get().Copy()
-	for _, result := range results {
-		isAvailable, whyAvailable, unavailableStat := EvalCache(cache.ToInfo(result), &mc)
+		isAvailable, whyAvailable, unavailableStat := EvalCache(cache.ToInfo(result), statResults, &mc)
 
 		// if the cache is now Available, and was previously unavailable due to a threshold, make sure this poller contains the stat which exceeded the threshold.
 		if previousStatus, hasPreviousStatus := localCacheStatuses[result.ID]; isAvailable && hasPreviousStatus && !previousStatus.Available && previousStatus.UnavailableStat != "" {
diff --git a/traffic_monitor/health/cache_test.go b/traffic_monitor/health/cache_test.go
new file mode 100644
index 0000000..a81506c
--- /dev/null
+++ b/traffic_monitor/health/cache_test.go
@@ -0,0 +1,149 @@
+package health
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import (
+	"strings"
+	"testing"
+	"time"
+
+	"github.com/apache/trafficcontrol/lib/go-tc"
+	"github.com/apache/trafficcontrol/traffic_monitor/cache"
+	"github.com/apache/trafficcontrol/traffic_monitor/peer"
+	"github.com/apache/trafficcontrol/traffic_monitor/threadsafe"
+	"github.com/apache/trafficcontrol/traffic_monitor/todata"
+)
+
+func TestCalcAvailabilityThresholds(t *testing.T) {
+	result := cache.Result{
+		ID:    "myCacheName",
+		Error: nil,
+		Astats: cache.Astats{
+			Ats: map[string]interface{}{},
+			System: cache.AstatsSystem{
+				InfName:           "bond0",
+				InfSpeed:          20000,
+				ProcNetDev:        "bond0: 1234567891011121 123456789101    0    5    0     0          0  9876543 12345678910111213 1234567891011    0 1234    0     0       0          0",
+				ProcLoadavg:       "5.43 4.32 3.21 3/1234 32109",
+				ConfigLoadRequest: 9,
+				LastReloadRequest: 1559237772,
+				ConfigReloads:     1,
+				LastReload:        1559237773,
+				AstatsLoad:        1559237774,
+				NotAvailable:      false,
+			},
+		},
+		Time:            time.Now(),
+		RequestTime:     time.Second,
+		Vitals:          cache.Vitals{},
+		PollID:          42,
+		PollFinished:    make(chan uint64, 1),
+		PrecomputedData: cache.PrecomputedData{},
+		Available:       true,
+	}
+	GetVitals(&result, nil, nil)
+
+	prevResult := cache.Result{
+		Time:   result.Time.Add(time.Second * -1),
+		Vitals: cache.Vitals{BytesOut: result.Vitals.BytesOut - 1250000000}, // 10 gigabits
+	}
+	GetVitals(&result, &prevResult, nil)
+
+	statResultHistory := (*threadsafe.ResultStatHistory)(nil)
+	mc := tc.TrafficMonitorConfigMap{
+		TrafficServer: map[string]tc.TrafficServer{
+			string(result.ID): {
+				ServerStatus: string(tc.CacheStatusReported),
+				Profile:      "myProfileName",
+			},
+		},
+		Profile: map[string]tc.TMProfile{},
+	}
+	mc.Profile[mc.TrafficServer[string(result.ID)].Profile] = tc.TMProfile{
+		Name: mc.TrafficServer[string(result.ID)].Profile,
+		Parameters: tc.TMParameters{
+			Thresholds: map[string]tc.HealthThreshold{
+				"availableBandwidthInKbps": tc.HealthThreshold{
+					Val:        15000000,
+					Comparator: ">",
+				},
+			},
+		},
+	}
+
+	toData := todata.TOData{
+		ServerTypes:            map[tc.CacheName]tc.CacheType{},
+		DeliveryServiceServers: map[tc.DeliveryServiceName][]tc.CacheName{},
+		ServerCachegroups:      map[tc.CacheName]tc.CacheGroupName{},
+	}
+	toData.ServerTypes[result.ID] = tc.CacheTypeEdge
+	toData.DeliveryServiceServers["myDS"] = []tc.CacheName{result.ID}
+	toData.ServerCachegroups[result.ID] = "myCG"
+
+	localCacheStatusThreadsafe := threadsafe.NewCacheAvailableStatus()
+	localStates := peer.NewCRStatesThreadsafe()
+	events := NewThreadsafeEvents(200)
+
+	// test that a normal stat poll over the kbps threshold marks down
+
+	pollerName := "stat"
+	results := []cache.Result{result}
+	CalcAvailability(results, pollerName, statResultHistory, mc, toData, localCacheStatusThreadsafe, localStates, events)
+
+	localCacheStatuses := localCacheStatusThreadsafe.Get()
+	if localCacheStatus, ok := localCacheStatuses[result.ID]; !ok {
+		t.Fatalf("expected: localCacheStatus[cacheName], actual: missing")
+	} else if localCacheStatus.Available {
+		t.Fatalf("localCacheStatus.Available over kbps threshold expected: false, actual: true")
+	} else if localCacheStatus.Status != string(tc.CacheStatusReported) {
+		t.Fatalf("localCacheStatus.Status expected %v actual %v", "todo", localCacheStatus.Status)
+	} else if localCacheStatus.UnavailableStat != "availableBandwidthInKbps" {
+		t.Fatalf("localCacheStatus.UnavailableStat expected %v actual %v", "availableBandwidthInKbps", localCacheStatus.UnavailableStat)
+	} else if localCacheStatus.Poller != pollerName {
+		t.Fatalf("localCacheStatus.Poller expected %v actual %v", pollerName, localCacheStatus.Poller)
+	} else if !strings.Contains(localCacheStatus.Why, "availableBandwidthInKbps too low") {
+		t.Fatalf("localCacheStatus.Why expected 'availableBandwidthInKbps too low' actual %v", localCacheStatus.Why)
+	}
+
+	// test that the health poll didn't override the stat poll threshold markdown and mark available
+	// https://github.com/apache/trafficcontrol/issues/3646
+
+	healthResult := result
+	healthResult.Astats.System.ProcNetDev = "bond0: 1234567891011121 123456789101    0    5    0     0          0  9876543 12345680160111212 1234567891011    0 1234    0     0       0          0" // 10Gb more than result
+	GetVitals(&healthResult, &result, nil)
+	healthPollerName := "health"
+	healthResults := []cache.Result{healthResult}
+	CalcAvailability(healthResults, healthPollerName, nil, mc, toData, localCacheStatusThreadsafe, localStates, events)
+
+	localCacheStatuses = localCacheStatusThreadsafe.Get()
+	if localCacheStatus, ok := localCacheStatuses[result.ID]; !ok {
+		t.Fatalf("expected: localCacheStatus[cacheName], actual: missing")
+	} else if localCacheStatus.Available {
+		t.Fatalf("localCacheStatus.Available over kbps threshold expected: false, actual: true")
+	} else if localCacheStatus.Status != string(tc.CacheStatusReported) {
+		t.Fatalf("localCacheStatus.Status expected %v actual %v", "tc.CacheStatusReported", localCacheStatus.Status)
+	} else if localCacheStatus.UnavailableStat != "availableBandwidthInKbps" {
+		t.Fatalf("localCacheStatus.UnavailableStat expected %v actual %v", "availableBandwidthInKbps", localCacheStatus.UnavailableStat)
+	} else if localCacheStatus.Poller != healthPollerName {
+		t.Fatalf("localCacheStatus.Poller expected %v actual %v", healthPollerName, localCacheStatus.Poller)
+	} else if !strings.Contains(localCacheStatus.Why, "availableBandwidthInKbps too low") {
+		t.Fatalf("localCacheStatus.Why expected 'availableBandwidthInKbps too low' actual %v", localCacheStatus.Why)
+	}
+}
diff --git a/traffic_monitor/manager/health.go b/traffic_monitor/manager/health.go
index 6fad8aa..60a17eb 100644
--- a/traffic_monitor/manager/health.go
+++ b/traffic_monitor/manager/health.go
@@ -184,7 +184,9 @@ func processHealthResult(
 		healthHistoryCopy[healthResult.ID] = pruneHistory(append([]cache.Result{healthResult}, healthHistoryCopy[healthResult.ID]...), maxHistory)
 	}
 
-	health.CalcAvailability(results, "health", monitorConfigCopy, toDataCopy, localCacheStatusThreadsafe, localStates, events)
+	pollerName := "health"
+	statResultHistoryNil := (*threadsafe.ResultStatHistory)(nil) // health poller doesn't have stats
+	health.CalcAvailability(results, pollerName, statResultHistoryNil, monitorConfigCopy, toDataCopy, localCacheStatusThreadsafe, localStates, events)
 
 	healthHistory.Set(healthHistoryCopy)
 	// TODO determine if we should combineCrStates() here
diff --git a/traffic_monitor/manager/stat.go b/traffic_monitor/manager/stat.go
index 8c9d706..413795d 100644
--- a/traffic_monitor/manager/stat.go
+++ b/traffic_monitor/manager/stat.go
@@ -318,7 +318,8 @@ func processStatResults(
 		lastStats.Set(*lastStatsCopy)
 	}
 
-	health.CalcAvailabilityWithStats(results, "stat", statResultHistoryThreadsafe, mc, toData, localCacheStatusThreadsafe, localStates, events)
+	pollerName := "stat"
+	health.CalcAvailability(results, pollerName, &statResultHistoryThreadsafe, mc, toData, localCacheStatusThreadsafe, localStates, events)
 	combineState()
 
 	endTime := time.Now()