You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@trafficcontrol.apache.org by GitBox <gi...@apache.org> on 2022/10/24 19:06:14 UTC

[GitHub] [trafficcontrol] zrhoffman commented on a diff in pull request #7096: Add health client parent health

zrhoffman commented on code in PR #7096:
URL: https://github.com/apache/trafficcontrol/pull/7096#discussion_r1003601203


##########
tc-health-client/config/config_test.go:
##########
@@ -37,7 +37,7 @@ func TestLoadConfig(t *testing.T) {
 	}
 
 	cfg := Cfg{
-		TrafficMonitors:        make(map[string]bool, 0),
+		// TrafficMonitors:        make(map[string]bool, 0),

Review Comment:
   Does this commented need to be here?



##########
tc-health-client/sar/ephemeralportholder.go:
##########
@@ -0,0 +1,77 @@
+package sar
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import (
+	"errors"
+	"net"
+	"strconv"
+)
+
+// EphemeralPortHolder serves 2 purposes: it gets an ephemeral TCP port, and it holds
+// onto the port so the OS doesn't assign it to any other app.
+//
+// It listens on :0 thereby getting a socket on an ephemeral port.
+// It continues listening but never reading from the socket until Close is called.
+type EphemeralPortHolder struct {
+	listener net.Listener
+	port     int
+}
+
+// GetAndHoldEphemeralPort gets an ephemeral port, and listens on it to prevent
+// the OS assigning the port to other apps.
+// Close must be called on the returned EphemeralPortHolder to stop listening.
+func GetAndHoldEphemeralPort(addr string) (*EphemeralPortHolder, error) {
+	addr += ":0"
+	listener, err := net.Listen("tcp", addr)
+	if err != nil {
+		return nil, errors.New("listening: " + err.Error())
+	}
+
+	// get the port now, so EphemeralPortHolder.Port() doesn't need to return an error
+
+	listenAddr := listener.Addr().String()
+	ipPort := SplitLast(listenAddr, ":")
+	if len(ipPort) < 1 {
+		return nil, errors.New("malformed addr '" + listenAddr + "', should have been ip:port") // should never happen
+	}
+	portStr := ipPort[1]
+	port, err := strconv.Atoi(portStr)
+	if err != nil {
+		return nil, errors.New("malformed addr '" + listenAddr + "' port was not an integer") // should never happen
+	}
+	if port > 65535 || port < 0 {

Review Comment:
   Nit: `65535` can be `math.MaxUint16`



##########
tc-health-client/tmagent/markdownservice.go:
##########
@@ -0,0 +1,484 @@
+package tmagent
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import (
+	"bytes"
+	"errors"
+	"os/exec"
+	"path/filepath"
+	"strconv"
+	"time"
+
+	"github.com/apache/trafficcontrol/lib/go-log"
+	"github.com/apache/trafficcontrol/lib/go-tc"
+	"github.com/apache/trafficcontrol/lib/go-util"
+	"github.com/apache/trafficcontrol/tc-health-client/config"
+)
+
+// ParentStatus contains the trafficserver 'HostStatus' fields that
+// are necessary to interface with the trafficserver 'traffic_ctl' command.
+type ParentStatus struct {
+	Fqdn                 string
+	ActiveReason         bool
+	LocalReason          bool
+	ManualReason         bool
+	LastTmPoll           int64
+	UnavailablePollCount int
+	MarkUpPollCount      int
+}
+
+// used to get the overall parent availablity from the
+// HostStatus markdown reasons.  all markdown reasons
+// must be true for a parent to be considered available.
+func (p ParentStatus) available(reasonCode string) bool {
+	rc := false
+
+	switch reasonCode {
+	case "active":
+		rc = p.ActiveReason
+	case "local":
+		rc = p.LocalReason
+	case "manual":
+		rc = p.ManualReason
+	}
+	return rc
+}
+
+// used to log that a parent's status is either UP or
+// DOWN based upon the HostStatus reason codes.  to
+// be considered UP, all reason codes must be 'true'.
+func (p ParentStatus) Status() string {
+	if !p.ActiveReason {
+		return "DOWN"
+	} else if !p.LocalReason {
+		return "DOWN"
+	} else if !p.ManualReason {
+		return "DOWN"
+	}
+	return "UP"
+}
+
+type StatusReason int
+
+// these are the HostStatus reason codes used within
+// trafficserver.
+const (
+	ACTIVE StatusReason = iota
+	LOCAL
+	MANUAL
+)
+
+// used for logging a parent's HostStatus reason code
+// setting.
+func (s StatusReason) String() string {
+	switch s {
+	case ACTIVE:
+		return "ACTIVE"
+	case LOCAL:
+		return "LOCAL"
+	case MANUAL:
+		return "MANUAL"
+	}
+	return "UNDEFINED"
+}
+
+// MarkdownServer contains symbols for manipulating a running MarkdownService
+// started by StartMarkdownService.
+type MarkdownServer struct {
+	// Shutdown signals the MarkdownService to stop its goroutines and release all resources.
+	//
+	// It is not buffered. Writes will block until the markdown service is done with any
+	// ongoing markdown operation. Once a write returns, the MarkdownService will begin
+	// shutdown.
+	// TODO: add signal to callers when shutdown has finished.
+	Shutdown chan<- struct{}
+
+	// UpdateHealth signals to the markdown service that health status has changed, and it should process markdowns again.
+	UpdateHealth func()
+}
+
+func StartMarkdownService(pi *ParentInfo, pollInterval time.Duration) *MarkdownServer {
+	doneCh := make(chan struct{})
+	updateCh := make(chan struct{})
+	updateHealthF := func() {
+		select {
+		case updateCh <- struct{}{}:
+		default:
+		}
+	}
+
+	go markdownServicePoll(pi, pollInterval, doneCh, updateCh)
+	return &MarkdownServer{Shutdown: doneCh, UpdateHealth: updateHealthF}
+}
+
+func markdownServicePoll(pi *ParentInfo, pollInterval time.Duration, doneChan <-chan struct{}, updateChan <-chan struct{}) {
+	for {
+		select {
+		case <-doneChan:
+			return
+		default:
+			<-updateChan
+			start := time.Now()
+			doMarkdown(pi)
+			log.Infof("poll-status poll=markdown ms=%v\n", int(time.Since(start)/time.Millisecond))
+			time.Sleep(pollInterval)
+		}
+	}
+}
+
+// decideHealthy is the algorithm for deciding whether a cache is healthy.
+// It takes the cache hostname, and all health results, and returns a single boolean decision.
+func decideHealthy(
+	markdownMethods map[config.HealthMethod]struct{},
+	parentFQDN string,
+	tmHealth *TrafficMonitorHealth,
+	parentHealthL4 *ParentHealth,
+	parentHealthL7 *ParentHealth,
+	parentServiceHealth *ParentServiceHealth,
+) bool {
+	// TODO use hostname:port? Parents can be healthy on one port/service but not another.
+	// The ATS markdown command can't mark down per-port as of this writing, but
+	// we can at least calculate it, to log that data, and be able to easily
+	// mark down host:port when ATS adds that.
+
+	// note the CacheStatuses has v4 and v6 data,
+	// but ATS also can't mark down hosts per IP version yet,
+	// so we just use the generic/old non-IP-versioned IsAvailable field.
+
+	// TODO make decision algorithm/pessimism/ratio/etc configurable
+
+	tmHealthy := (*bool)(nil)
+	l4Healthy := (*bool)(nil)
+	l7Healthy := (*bool)(nil)
+	recursiveHealthy := (*bool)(nil)
+
+	l4Health, hasL4Health := parentHealthL4.ParentHealthPollResults[parentFQDN]
+	l7Health, hasL7Health := parentHealthL7.ParentHealthPollResults[parentFQDN]
+
+	if _, use := markdownMethods[config.HealthMethodTrafficMonitor]; use && tmHealth.CacheStatuses != nil {
+		parentHostName := parseFqdn(parentFQDN)
+		tmHealthy = util.BoolPtr(tmHealth.CacheStatuses[tc.CacheName(parentHostName)].IsAvailable)
+	}
+	if _, use := markdownMethods[config.HealthMethodParentL4]; use && parentHealthL4 != nil && hasL4Health && len(parentHealthL4.ParentHealthPollResults) > 0 {
+		l4Healthy = util.BoolPtr(l4Health.Healthy)
+	}
+	if _, use := markdownMethods[config.HealthMethodParentL7]; use && parentHealthL7 != nil && hasL7Health && len(parentHealthL7.ParentHealthPollResults) > 0 {
+		l7Healthy = util.BoolPtr(l7Health.Healthy)
+	}
+	if _, use := markdownMethods[config.HealthMethodParentService]; use && parentServiceHealth != nil && parentServiceHealth.ParentServiceHealthPollResults != nil {
+		recursiveHealthy = util.BoolPtr(decideRecursiveHealthy(parentServiceHealth.ParentServiceHealthPollResults[parentFQDN].ParentServiceHealth))
+	}
+
+	printbp := func(bp *bool) string {
+		if bp == nil {
+			return "nil"
+		}
+		return strconv.FormatBool(*bp)
+	}
+
+	// TODO wait for all polls to have results, before marking down?
+	//      Maybe it doesn't matter for pessimistic health?
+
+	log.Infof("decide-healthy host=%v tm=%v l4=%v l7=%v svc=%v\n", parentFQDN, printbp(tmHealthy), printbp(l4Healthy), printbp(l7Healthy), printbp(recursiveHealthy))
+
+	// nilOrTrue is a helper func that returns true if the given *bool is nil or true.
+	// This is used for pessimistic health, but if the health doesn't exist for the parent
+	// for a particular health type poller, we don't want to mark it unhealthy.
+	nilOrTrue := func(bp *bool) bool {
+		return bp == nil || *bp
+	}
+
+	// pessimistic: if any health mechanism is unhealthy, consider the host unhealthy
+	return nilOrTrue(tmHealthy) && nilOrTrue(l4Healthy) && nilOrTrue(l7Healthy) && nilOrTrue(recursiveHealthy)
+}
+
+// decideRecursiveHealthy is the algorithm for deciding whether a parent is healthy,
+// based on a heuristic of whether it can get to most parents
+// It takes the cache hostname, and all health results, and returns a single boolean decision.
+//
+// Note this is a heuristic because ATS doesn't currently have the ability to mark down
+// a parent hostname for a single remap.
+// If and when ATS has the ability to mark down parents per-remap, we can mark down parents
+// just for remaps whose origins aren't available on that parent.
+func decideRecursiveHealthy(parentServiceHealth *ParentServiceHealth) bool {
+	// TODO get direct vs final parent data from strategies.yaml and parent.config
+	//      Because we really want to heuristically check only the final parents (i.e. origins)
+	//      are available on our direct parent, not any other origins it has that aren't assigned
+	//      to this cache.
+
+	// TODO make configurable
+	const ratioToConsiderHealthy = 0.5
+
+	parentHealthy := map[string]bool{}
+
+	// TODO this is currently pessimistic: if any health mechanism is unhealthy, consider unhealthy.
+	//      make health decision configurable.
+
+	if parentServiceHealth == nil || parentServiceHealth.ParentServiceHealthPollResults == nil {
+		log.Warnln("decideRecursiveHealthy got nil parents, returning healthy")
+		return true
+	}
+
+	for parentFQDN, recursiveHealth := range parentServiceHealth.ParentServiceHealthPollResults {
+		if recursiveHealth.ParentHealthL4 != nil && !recursiveHealth.ParentHealthL4.Healthy {
+			parentHealthy[parentFQDN] = false
+			continue
+		}
+		if recursiveHealth.ParentHealthL7 != nil && !recursiveHealth.ParentHealthL7.Healthy {
+			parentHealthy[parentFQDN] = false
+			continue
+		}
+		if !decideRecursiveHealthy(recursiveHealth.ParentServiceHealth.ParentServiceHealthPollResults[parentFQDN].ParentServiceHealth) {
+			parentHealthy[parentFQDN] = false
+			continue
+		}
+		parentHealthy[parentFQDN] = true
+	}
+
+	numParents := 0
+	numParentsHealthy := 0
+	for _, healthy := range parentHealthy {
+		numParents++
+		if healthy {
+			numParentsHealthy++
+		}
+	}
+	healthyRatio := float64(numParentsHealthy) / float64(numParents)
+	return healthyRatio >= ratioToConsiderHealthy
+}
+
+type parent struct {
+	host string
+	fqdn string
+}
+
+// getParentFQDNs returns the FQDN of all parents in all health types.
+// This is necessary, because some health polls might not have parents, so we need to combine them all
+func getParentFQDNs(pi *ParentInfo, tmh *TrafficMonitorHealth, l4h *ParentHealth, l7h *ParentHealth, sh *ParentServiceHealth) []string {
+
+	// put FQDns in a set first, to remove duplicates
+	fqdnSet := map[string]struct{}{}
+
+	for cacheName, _ := range tmh.CacheStatuses {
+		hostName := string(cacheName)
+		fqdn, ok := pi.ParentHostFQDNs.Load(hostName)
+		if !ok {
+			// this should be normal. Many parents in TM won't be parents of this cache
+			continue
+		}
+		fqdnSet[fqdn] = struct{}{}
+	}
+
+	for fqdn, _ := range l4h.ParentHealthPollResults {
+		fqdnSet[fqdn] = struct{}{}
+	}
+	for fqdn, _ := range l7h.ParentHealthPollResults {
+		fqdnSet[fqdn] = struct{}{}
+	}
+	for fqdn, _ := range sh.ParentServiceHealthPollResults {
+		// we don't need recursive health, just the direct parent. We only need direct parents to mark down.
+		fqdnSet[fqdn] = struct{}{}
+	}
+
+	fqdns := make([]string, 0, len(fqdnSet))
+	for fqdn, _ := range fqdnSet {
+		fqdns = append(fqdns, fqdn)
+	}
+	return fqdns
+}
+
+// HealthSafetyRatio is the ratio of unhealthy parents after which no parents will be marked down.
+//
+// This is a safety mechanism: if for any reason most or all parents are marked down, something
+// is seriously wrong, possibly with the health code itself, and therefore don't mark any parents down,
+const HealthSafetyRatio = 0.3 // TODO make configurable?
+
+func doMarkdown(pi *ParentInfo) {
+	cfg := pi.Cfg.Get()
+
+	tmHealth := pi.TrafficMonitorHealth.Get()
+	parentHealthL4 := pi.ParentHealthL4.Get()
+	parentHealthL7 := pi.ParentHealthL7.Get()
+	parentServiceHealth := pi.ParentServiceHealth.Get()
+
+	parentFQDNs := getParentFQDNs(pi, tmHealth, parentHealthL4, parentHealthL7, parentServiceHealth)
+	unhealthyNum := 0
+	newCacheHealth := map[string]bool{} // map[fqdn]healthy
+	for _, fqdn := range parentFQDNs {
+		newAvailable := decideHealthy(pi.MarkdownMethods, fqdn, tmHealth, parentHealthL4, parentHealthL7, parentServiceHealth)
+		if !newAvailable {
+			unhealthyNum++
+		}
+		newCacheHealth[fqdn] = newAvailable
+	}
+
+	unhealthyParentsExceedSafetyRatio := (float64(unhealthyNum) / float64(len(newCacheHealth))) > HealthSafetyRatio
+	log.Infof("markdown Unhealthy parents %v/%v safety ratio %v\n", unhealthyNum, len(newCacheHealth), HealthSafetyRatio)
+	if unhealthyParentsExceedSafetyRatio {
+		log.Errorf("Unhealthy parents %v/%v exceed safety ratio %v!! Marking all parents up!!\n", unhealthyNum, len(newCacheHealth), HealthSafetyRatio)
+	}
+
+	for _, fqdn := range parentFQDNs {
+		isAvailable := tc.IsAvailable{}
+		{
+			hostName := parseFqdn(fqdn)
+			isAvailable = tmHealth.CacheStatuses[tc.CacheName(hostName)]
+		}
+		parentStatus, ok := pi.LoadParentStatus(fqdn)
+		if !ok {
+			continue // TODO warn? error? is this normal?
+		}
+
+		// update the polling time
+		parentStatus.LastTmPoll = tmHealth.PollTime.Unix()
+
+		newAvailable := newCacheHealth[fqdn]
+		if unhealthyParentsExceedSafetyRatio {
+			newAvailable = true
+		}
+		// newAvailable := isAvailable.IsAvailable
+		oldAvailable := parentStatus.available(cfg.ReasonCode)
+		if oldAvailable != newAvailable {
+			// do not mark down if the configuration disables mark downs.
+			if !cfg.EnableActiveMarkdowns && !newAvailable {
+				log.Infof("TM reports that %s is not available and should be marked DOWN but, mark downs are disabled by configuration", fqdn)
+			} else {
+				if newParentStatus, err := markParent(cfg, parentStatus, isAvailable.Status, newAvailable); err != nil {
+					log.Errorln(err.Error())
+				} else {
+					log.Infoln("TM reports that '" + parentStatus.Fqdn + "' is not available so marked DOWN")
+					parentStatus = newParentStatus
+				}
+			}
+		}
+
+		// if the host is available clear the unavailable poll count if not 0.
+		if oldAvailable && newAvailable {
+			if parentStatus.UnavailablePollCount > 0 {
+				log.Debugf("resetting the UnavailablePollCount for '%s' from %d to 0",
+					fqdn, parentStatus.UnavailablePollCount)
+				parentStatus.UnavailablePollCount = 0
+			}
+		}
+
+		{
+			parentHostName := parseFqdn(parentStatus.Fqdn)
+			pi.ParentHostFQDNs.Store(parentHostName, parentStatus.Fqdn)
+		}
+
+		// even if the status wasn't updated, we always need to update the poll time on the ParentStatus
+		pi.StoreParentStatus(parentStatus.Fqdn, parentStatus)
+	}
+
+}
+
+// markParent is used to mark a parent as up or down in the trafficserver HostStatus subsystem.
+// It takes a parent, modifies it by marking available and setting reasons and other data,
+// and returns the modified/marked parent.
+func markParent(cfg *config.Cfg, pv ParentStatus, cacheStatus string, available bool) (ParentStatus, error) {
+	var hostAvailable bool
+
+	hostName := parseFqdn(pv.Fqdn)
+
+	activeReason := pv.ActiveReason
+	localReason := pv.LocalReason
+	unavailablePollCount := pv.UnavailablePollCount
+	markUpPollCount := pv.MarkUpPollCount
+
+	log.Debugf("hostName: %s, UnavailablePollCount: %d, available: %v", hostName, unavailablePollCount, available)
+
+	if !available { // unavailable
+		unavailablePollCount += 1
+		if unavailablePollCount < cfg.UnavailablePollThreshold {
+			log.Infof("TM indicates %s is unavailable but the UnavailablePollThreshold has not been reached", hostName)
+			hostAvailable = true
+		} else {
+			// marking the host down
+			if err := execTrafficCtl(pv.Fqdn, false, cfg.ReasonCode, cfg.TrafficServerBinDir); err != nil {
+				log.Errorln(err)
+				return ParentStatus{}, err
+			}
+
+			hostAvailable = false
+			// reset the poll counts
+			markUpPollCount = 0
+			unavailablePollCount = 0
+			log.Infof("marked parent %s DOWN, cache status was: %s\n", hostName, cacheStatus)
+		}
+	} else { // available
+		// marking the host up
+		markUpPollCount += 1
+		if markUpPollCount < cfg.MarkUpPollThreshold {
+			log.Infof("TM indicates %s is available but the MarkUpPollThreshold has not been reached", hostName)
+			hostAvailable = false
+		} else {
+			if err := execTrafficCtl(pv.Fqdn, true, cfg.ReasonCode, cfg.TrafficServerBinDir); err != nil {
+				log.Errorln(err)
+				return ParentStatus{}, err
+			}
+			hostAvailable = true
+			// reset the poll counts
+			unavailablePollCount = 0
+			markUpPollCount = 0
+			log.Infof("marked parent %s UP, cache status was: %s\n", hostName, cacheStatus)
+		}
+	}
+
+	// update parent info
+	reason := cfg.ReasonCode
+	switch reason {
+	case "active":
+		activeReason = hostAvailable
+	case "local":
+		localReason = hostAvailable
+	}
+	// save updates
+	pv.ActiveReason = activeReason
+	pv.LocalReason = localReason
+	pv.UnavailablePollCount = unavailablePollCount
+	pv.MarkUpPollCount = markUpPollCount
+	// pi.Parents[hostName] = pv

Review Comment:
   Should this commented line be here?



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscribe@trafficcontrol.apache.org

For queries about this service, please contact Infrastructure at:
users@infra.apache.org