You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by zw...@apache.org on 2022/04/05 20:12:22 UTC
[trafficserver] 01/02: Added metrics to the rate limit plugin and document the new options (#8395)

This is an automated email from the ASF dual-hosted git repository.

zwoop pushed a commit to branch 9.2.x
in repository https://gitbox.apache.org/repos/asf/trafficserver.git

commit 8ea2f4c003549034f4d32593763881ab67b0119c
Author: Jeff Elsloo <el...@users.noreply.github.com>
AuthorDate: Tue Oct 26 08:48:10 2021 -0600

    Added metrics to the rate limit plugin and document the new options (#8395)
    
    * Added metrics to the rate limit plugin and documented the new options.
    
    * * Addressed feedback in PR review
    
    * Fixed calculation for metric length that was counting too many things due to a prior implementation and lack of cleanup of this specific line of code
    
    (cherry picked from commit a3f04cb6dc12c0d192f4abd981b5ff09647484f4)
---
 doc/admin-guide/plugins/rate_limit.en.rst       | 134 ++++++++++++++++++++++++
 plugins/experimental/rate_limit/limiter.h       |  81 ++++++++++++++
 plugins/experimental/rate_limit/rate_limit.cc   |   5 +-
 plugins/experimental/rate_limit/sni_limiter.cc  |  10 ++
 plugins/experimental/rate_limit/sni_limiter.h   |   2 +
 plugins/experimental/rate_limit/sni_selector.cc |   4 +
 plugins/experimental/rate_limit/txn_limiter.cc  |  14 +++
 plugins/experimental/rate_limit/utilities.cc    |  46 ++++++++
 plugins/experimental/rate_limit/utilities.h     |   1 +
 9 files changed, 296 insertions(+), 1 deletion(-)

diff --git a/doc/admin-guide/plugins/rate_limit.en.rst b/doc/admin-guide/plugins/rate_limit.en.rst
index f50377223..ef3f3246f 100644
--- a/doc/admin-guide/plugins/rate_limit.en.rst
+++ b/doc/admin-guide/plugins/rate_limit.en.rst
@@ -80,6 +80,18 @@ are available:
    An optional `max-age` for how long a transaction can sit in the delay queue.
    The value (default 0) is the age in milliseconds.
 
+.. option:: --prefix
+
+   An optional metric prefix to use instead of the default (plugin.rate_limiter).
+
+.. option:: --tag
+
+   An optional metric tag to use instead of the default. When a tag is not specified
+   the plugin will use the scheme and FQDN, followed by the port when it is non-standard
+   for the scheme. For example, a default plugin tag might be "https.example.com" or
+   "http.example.com:8080"; note that in the latter example, the non-standard scheme and
+   port combination led to ":8080" being appended to the string.
+
 Global Plugin
 -------------
 
@@ -122,6 +134,61 @@ The following options are available:
    An optional `max-age` for how long a transaction can sit in the delay queue.
    The value (default 0) is the age in milliseconds.
 
+.. option:: --prefix
+
+   An optional metric prefix to use instead of the default (plugin.rate_limiter).
+
+.. option:: --tag
+
+   An optional metric tag to use instead of the default. When a tag is not specified
+   the plugin will use the FQDN of the SNI associated with each rate limiter instance
+   created during plugin initialization.
+
+Metrics
+-------
+Metric names are generated either using defaults or user-supplied values. In either
+case, the format of the metric names is as follows:
+
+   ``prefix.type.tag.metric``
+
+A user can specify their own prefixes and tags, but not types or metrics.
+
+``prefix``
+   The default prefix for all metrics is `plugin.rate_limiter`.
+
+``type``
+   There are two types of metrics: `sni` and `remap`. Each type corresponds with the
+   type of configuration used to generate the metric. The global configuration is for
+   rate limiting requests during TLS negotiation, hence, the type of ``sni``. Similarly
+   ``remap`` connotes a remap configuration.
+
+``tag``
+   By default the metric tag is derived from a description that is set conditionally.
+   When configured in global mode, the ``SNI`` argument allows a comma separated list
+   of FQDNs that require rate limiting. Each FQDN is associated with an instance of
+   the rate limiter, and the description of each limiter is set to the FQDN.
+
+   When configured on a remap, the plugin will generate a description based on the
+   configuration. When the scheme and port number are standard, the port is omitted
+   from the generated description, however, when the scheme and port combination are
+   non-standard, the port is appended. For example, a standard scheme and port would
+   lead to a description of ``http.example.com`` or ``https.example.com`` but if a
+   non-standard port was used, a description might be ``https.example.com:8443`` or
+   ``http.example.com:8080``. This approach allows each limiter to increment metrics
+   for the correct remaps.
+
+``metric``
+   There are four metrics that may be incremented, depending on which action the plugin takes:
+
+   ============== ===================================================================
+   Metric         Definition
+   ============== ===================================================================
+   ``queued``     Request queued due to being at the limit but under the queue limit.
+   ``rejected``   Request rejected due to being over the defined limits.
+   ``expired``    Queued connection is too old to be resumed and is rejected.
+   ``resumed``    Queued connection is resumed.
+   ============== ===================================================================
+
 Examples
 --------
 
@@ -158,3 +225,70 @@ In this case, the response would look like this when the queue is full: ::
     Content-Language: en
     Retry-After: 3600
     Content-Length: 207
+
+Metric Examples
+---------------
+The following examples show the metric names that result from various settings
+using a hypothetical domain of example.com with both global and remap configurations.
+Note that in this example the remap configuration contains both TLS and non-TLS
+remap rules.
+
+Defaults:
+::
+
+   plugin.rate_limiter.sni.example.com.queued
+   plugin.rate_limiter.sni.example.com.rejected
+   plugin.rate_limiter.sni.example.com.expired
+   plugin.rate_limiter.sni.example.com.resumed
+
+   plugin.rate_limiter.remap.https.example.com.queued
+   plugin.rate_limiter.remap.https.example.com.rejected
+   plugin.rate_limiter.remap.https.example.com.expired
+   plugin.rate_limiter.remap.https.example.com.resumed
+
+   plugin.rate_limiter.remap.http.example.com.queued
+   plugin.rate_limiter.remap.http.example.com.rejected
+   plugin.rate_limiter.remap.http.example.com.expired
+   plugin.rate_limiter.remap.http.example.com.resumed
+
+Defaults with non-standard scheme+port combinations in the remap rules:
+::
+
+   plugin.rate_limiter.sni.example.com.queued
+   plugin.rate_limiter.sni.example.com.rejected
+   plugin.rate_limiter.sni.example.com.expired
+   plugin.rate_limiter.sni.example.com.resumed
+
+   plugin.rate_limiter.remap.https.example.com:8443.queued
+   plugin.rate_limiter.remap.https.example.com:8443.rejected
+   plugin.rate_limiter.remap.https.example.com:8443.expired
+   plugin.rate_limiter.remap.https.example.com:8443.resumed
+
+   plugin.rate_limiter.remap.http.example.com:8080.queued
+   plugin.rate_limiter.remap.http.example.com:8080.rejected
+   plugin.rate_limiter.remap.http.example.com:8080.expired
+   plugin.rate_limiter.remap.http.example.com:8080.resumed
+
+With:
+  * ``--prefix=limiter`` on the global configuration
+  * ``--tag=tls.example.com`` on the global configuration
+  * ``@pparam=--prefix=limiter`` on the remap configurations
+  * ``@pparam=--tag=secure.example.com`` on the TLS-enabled remap configuration
+  * ``@pparam=--tag=insecure.example.com`` on the non-TLS-enabled remap configuration
+
+::
+
+   limiter.sni.tls.example.com.queued
+   limiter.sni.tls.example.com.rejected
+   limiter.sni.tls.example.com.expired
+   limiter.sni.tls.example.com.resumed
+
+   limiter.remap.secure.example.com.queued
+   limiter.remap.secure.example.com.rejected
+   limiter.remap.secure.example.com.expired
+   limiter.remap.secure.example.com.resumed
+
+   limiter.remap.insecure.example.com.queued
+   limiter.remap.insecure.example.com.rejected
+   limiter.remap.insecure.example.com.expired
+   limiter.remap.insecure.example.com.resumed
diff --git a/plugins/experimental/rate_limit/limiter.h b/plugins/experimental/rate_limit/limiter.h
index 4d56ffc95..9c4f4b0cd 100644
--- a/plugins/experimental/rate_limit/limiter.h
+++ b/plugins/experimental/rate_limit/limiter.h
@@ -31,6 +31,39 @@
 constexpr auto QUEUE_DELAY_TIME = std::chrono::milliseconds{200}; // Examine the queue every 200ms
 using QueueTime                 = std::chrono::time_point<std::chrono::system_clock>;
 
+enum {
+  RATE_LIMITER_TYPE_SNI = 0,
+  RATE_LIMITER_TYPE_REMAP,
+  RATE_LIMITER_TYPE_MAX // number of limiter types, not a type itself
+};
+
+// name of each limiter type as it appears in a metric name, indexed by RATE_LIMITER_TYPE_*
+static const char *types[] = {
+  "sni",
+  "remap",
+};
+static_assert(sizeof(types) / sizeof(types[0]) == RATE_LIMITER_TYPE_MAX, "types[] needs one entry per limiter type");
+
+// no metric for requests we accept; accepted requests should be counted under their usual metrics
+enum {
+  RATE_LIMITER_METRIC_QUEUED = 0,
+  RATE_LIMITER_METRIC_REJECTED,
+  RATE_LIMITER_METRIC_EXPIRED,
+  RATE_LIMITER_METRIC_RESUMED,
+  RATE_LIMITER_METRIC_MAX // number of metrics, not a metric itself
+};
+
+// last component of each metric name, indexed by RATE_LIMITER_METRIC_*
+static const char *suffixes[] = {
+  "queued",
+  "rejected",
+  "expired",
+  "resumed",
+};
+static_assert(sizeof(suffixes) / sizeof(suffixes[0]) == RATE_LIMITER_METRIC_MAX, "suffixes[] needs one entry per metric");
+
+static const char *RATE_LIMITER_METRIC_PREFIX = "plugin.rate_limiter";
+
 ///////////////////////////////////////////////////////////////////////////////
 // Base class for all limiters
 //
@@ -139,6 +172,50 @@ public:
     }
   }
 
+  // Establish the stat for every RATE_LIMITER_METRIC_* value of this limiter. Stat names have
+  // the form "<prefix>.<type>.<tag|description>.<suffix>", where an explicit tag takes precedence
+  // over the description. The stat IDs are stored in _metrics; an entry is left as TS_ERROR when
+  // its stat could neither be found nor created.
+  void
+  initializeMetrics(uint type)
+  {
+    TSReleaseAssert(type < RATE_LIMITER_TYPE_MAX);
+
+    std::string metric_prefix = prefix + "." + types[type];
+
+    if (!tag.empty()) {
+      metric_prefix.append("." + tag);
+    } else if (!description.empty()) {
+      metric_prefix.append("." + description);
+    }
+
+    for (int i = 0; i < RATE_LIMITER_METRIC_MAX; i++) {
+      // std::string owns the name, so there is no buffer size arithmetic or TSfree() to get wrong
+      const std::string metric = metric_prefix + "." + suffixes[i];
+
+      _metrics[i] = TS_ERROR;
+
+      // re-use an existing stat of this name; only create one when the lookup fails
+      if (TSStatFindName(metric.c_str(), &_metrics[i]) == TS_ERROR) {
+        _metrics[i] = TSStatCreate(metric.c_str(), TS_RECORDDATATYPE_INT, TS_STAT_NON_PERSISTENT, TS_STAT_SYNC_SUM);
+      }
+
+      if (_metrics[i] != TS_ERROR) {
+        TSDebug(PLUGIN_NAME, "established metric '%s' as ID %d", metric.c_str(), _metrics[i]);
+      } else {
+        TSError("failed to create metric '%s'", metric.c_str());
+      }
+    }
+  }
+
+  void
+  incrementMetric(uint metric) // add one to the stat selected by a RATE_LIMITER_METRIC_* value
+  {
+    if (metric < RATE_LIMITER_METRIC_MAX && _metrics[metric] != TS_ERROR) { // ignore a bad index or a stat that was never established
+      TSStatIntIncrement(_metrics[metric], 1);
+    }
+  }
+
   // Initialize a new instance of this rate limiter
   bool initialize(int argc, const char *argv[]);
 
@@ -147,6 +224,8 @@ public:
   unsigned max_queue                = UINT_MAX; // No queue limit, but if sets will give an immediate error if at max
   std::chrono::milliseconds max_age = std::chrono::milliseconds::zero(); // Max age (ms) in the queue
   std::string description           = "";
+  std::string prefix                = RATE_LIMITER_METRIC_PREFIX; // metric prefix, i.e.: plugin.rate_limiter
+  std::string tag                   = "";                         // optional tag to append to the prefix (prefix.tag)
 
 private:
   std::atomic<unsigned> _active = 0; // Current active number of txns. This has to always stay <= limit above
@@ -154,4 +233,6 @@ private:
 
   TSMutex _queue_lock, _active_lock; // Resource locks
   std::deque<QueueItem> _queue;      // Queue for the pending TXN's. ToDo: Should also move (see below)
+
+  int _metrics[RATE_LIMITER_METRIC_MAX];
 };
diff --git a/plugins/experimental/rate_limit/rate_limit.cc b/plugins/experimental/rate_limit/rate_limit.cc
index 8220f55d7..a3c94d094 100644
--- a/plugins/experimental/rate_limit/rate_limit.cc
+++ b/plugins/experimental/rate_limit/rate_limit.cc
@@ -29,7 +29,7 @@
 #include "sni_limiter.h"
 
 ///////////////////////////////////////////////////////////////////////////////
-// As a global plugin, things works a little difference since we don't setup
+// As a global plugin, things work a little differently since we don't set up
 // per transaction or via remap.config.
 extern int gVCIdx;
 
@@ -113,6 +113,9 @@ TSRemapNewInstance(int argc, char *argv[], void **ih, char * /* errbuf ATS_UNUSE
 {
   TxnRateLimiter *limiter = new TxnRateLimiter();
 
+  // set the description based on the pristine remap URL prior to advancing the pointer below
+  limiter->description = getDescriptionFromUrl(argv[0]);
+
   // argv contains the "to" and "from" URLs. Skip the first so that the
   // second one poses as the program name.
   --argc;
diff --git a/plugins/experimental/rate_limit/sni_limiter.cc b/plugins/experimental/rate_limit/sni_limiter.cc
index d1a5e0586..b63c50b1d 100644
--- a/plugins/experimental/rate_limit/sni_limiter.cc
+++ b/plugins/experimental/rate_limit/sni_limiter.cc
@@ -54,12 +54,14 @@ sni_limit_cont(TSCont contp, TSEvent event, void *edata)
           TSVConnReenableEx(vc, TS_EVENT_ERROR);
           TSDebug(PLUGIN_NAME, "Rejecting connection, we're at capacity and queue is full");
           TSUserArgSet(vc, gVCIdx, nullptr);
+          limiter->incrementMetric(RATE_LIMITER_METRIC_REJECTED);
 
           return TS_ERROR;
         } else {
           TSUserArgSet(vc, gVCIdx, reinterpret_cast<void *>(limiter));
           limiter->push(vc, contp);
           TSDebug(PLUGIN_NAME, "Queueing the VC, we are at capacity");
+          limiter->incrementMetric(RATE_LIMITER_METRIC_QUEUED);
         }
       } else {
         // Not at limit on the handshake, we can re-enable
@@ -103,6 +105,8 @@ SniRateLimiter::initialize(int argc, const char *argv[])
     {const_cast<char *>("limit"), required_argument, nullptr, 'l'},
     {const_cast<char *>("queue"), required_argument, nullptr, 'q'},
     {const_cast<char *>("maxage"), required_argument, nullptr, 'm'},
+    {const_cast<char *>("prefix"), required_argument, nullptr, 'p'},
+    {const_cast<char *>("tag"), required_argument, nullptr, 't'},
     // EOF
     {nullptr, no_argument, nullptr, '\0'},
   };
@@ -120,6 +124,12 @@ SniRateLimiter::initialize(int argc, const char *argv[])
     case 'm':
       this->max_age = std::chrono::milliseconds(strtol(optarg, nullptr, 10));
       break;
+    case 'p':
+      this->prefix = std::string(optarg);
+      break;
+    case 't':
+      this->tag = std::string(optarg);
+      break;
     }
     if (opt == -1) {
       break;
diff --git a/plugins/experimental/rate_limit/sni_limiter.h b/plugins/experimental/rate_limit/sni_limiter.h
index ea3581b9b..3889a0819 100644
--- a/plugins/experimental/rate_limit/sni_limiter.h
+++ b/plugins/experimental/rate_limit/sni_limiter.h
@@ -35,6 +35,8 @@ public:
     limit     = src.limit;
     max_queue = src.max_queue;
     max_age   = src.max_age;
+    prefix    = src.prefix;
+    tag       = src.tag;
   }
 
   bool initialize(int argc, const char *argv[]);
diff --git a/plugins/experimental/rate_limit/sni_selector.cc b/plugins/experimental/rate_limit/sni_selector.cc
index 60fc2ee85..d41b4df06 100644
--- a/plugins/experimental/rate_limit/sni_selector.cc
+++ b/plugins/experimental/rate_limit/sni_selector.cc
@@ -41,6 +41,7 @@ sni_queue_cont(TSCont cont, TSEvent event, void *edata)
       (void)contp; // Ugly, but silences some compilers.
       TSDebug(PLUGIN_NAME, "SNI=%s: Enabling queued VC after %ldms", key.data(), static_cast<long>(delay.count()));
       TSVConnReenable(vc);
+      limiter->incrementMetric(RATE_LIMITER_METRIC_RESUMED);
     }
 
     // Kill any queued VCs if they are too old
@@ -55,6 +56,7 @@ sni_queue_cont(TSCont cont, TSEvent event, void *edata)
         (void)contp;
         TSDebug(PLUGIN_NAME, "Queued VC is too old (%ldms), erroring out", static_cast<long>(age.count()));
         TSVConnReenableEx(vc, TS_EVENT_ERROR);
+        limiter->incrementMetric(RATE_LIMITER_METRIC_EXPIRED);
       }
     }
   }
@@ -73,6 +75,8 @@ SniSelector::insert(std::string_view sni, SniRateLimiter *limiter)
     TSDebug(PLUGIN_NAME, "Added global limiter for SNI=%s (limit=%u, queue=%u, max_age=%ldms)", sni.data(), limiter->limit,
             limiter->max_queue, static_cast<long>(limiter->max_age.count()));
 
+    limiter->initializeMetrics(RATE_LIMITER_TYPE_SNI);
+
     return true;
   }
 
diff --git a/plugins/experimental/rate_limit/txn_limiter.cc b/plugins/experimental/rate_limit/txn_limiter.cc
index f5b0951e0..6e3366588 100644
--- a/plugins/experimental/rate_limit/txn_limiter.cc
+++ b/plugins/experimental/rate_limit/txn_limiter.cc
@@ -40,6 +40,7 @@ txn_limit_cont(TSCont cont, TSEvent event, void *edata)
 
   case TS_EVENT_HTTP_POST_REMAP:
     limiter->push(static_cast<TSHttpTxn>(edata), cont);
+    limiter->incrementMetric(RATE_LIMITER_METRIC_QUEUED);
     return TS_EVENT_NONE;
     break;
 
@@ -47,6 +48,7 @@ txn_limit_cont(TSCont cont, TSEvent event, void *edata)
     retryAfter(static_cast<TSHttpTxn>(edata), limiter->retry);
     TSContDestroy(cont); // We are done with this continuation now
     TSHttpTxnReenable(static_cast<TSHttpTxn>(edata), TS_EVENT_HTTP_CONTINUE);
+    limiter->incrementMetric(RATE_LIMITER_METRIC_REJECTED);
     return TS_EVENT_CONTINUE;
     break;
 
@@ -74,6 +76,7 @@ txn_queue_cont(TSCont cont, TSEvent event, void *edata)
     // Since this was a delayed transaction, we need to add the TXN_CLOSE hook to free the slot when done
     TSHttpTxnHookAdd(txnp, TS_HTTP_TXN_CLOSE_HOOK, contp);
     TSHttpTxnReenable(txnp, TS_EVENT_HTTP_CONTINUE);
+    limiter->incrementMetric(RATE_LIMITER_METRIC_RESUMED);
   }
 
   // Kill any queued txns if they are too old
@@ -90,6 +93,7 @@ txn_queue_cont(TSCont cont, TSEvent event, void *edata)
       TSHttpTxnStatusSet(txnp, static_cast<TSHttpStatus>(limiter->error));
       TSHttpTxnHookAdd(txnp, TS_HTTP_SEND_RESPONSE_HDR_HOOK, contp);
       TSHttpTxnReenable(txnp, TS_EVENT_HTTP_ERROR);
+      limiter->incrementMetric(RATE_LIMITER_METRIC_EXPIRED);
     }
   }
 
@@ -109,6 +113,8 @@ TxnRateLimiter::initialize(int argc, const char *argv[])
     {const_cast<char *>("retry"), required_argument, nullptr, 'r'},
     {const_cast<char *>("header"), required_argument, nullptr, 'h'},
     {const_cast<char *>("maxage"), required_argument, nullptr, 'm'},
+    {const_cast<char *>("prefix"), required_argument, nullptr, 'p'},
+    {const_cast<char *>("tag"), required_argument, nullptr, 't'},
     // EOF
     {nullptr, no_argument, nullptr, '\0'},
   };
@@ -135,6 +141,12 @@ TxnRateLimiter::initialize(int argc, const char *argv[])
     case 'h':
       this->header = optarg;
       break;
+    case 'p':
+      this->prefix = std::string(optarg);
+      break;
+    case 't':
+      this->tag = std::string(optarg);
+      break;
     }
     if (opt == -1) {
       break;
@@ -148,6 +160,8 @@ TxnRateLimiter::initialize(int argc, const char *argv[])
     _action = TSContScheduleEveryOnPool(_queue_cont, QUEUE_DELAY_TIME.count(), TS_THREAD_POOL_TASK);
   }
 
+  this->initializeMetrics(RATE_LIMITER_TYPE_REMAP);
+
   return true;
 }
 
diff --git a/plugins/experimental/rate_limit/utilities.cc b/plugins/experimental/rate_limit/utilities.cc
index c648d98c1..0838689c0 100644
--- a/plugins/experimental/rate_limit/utilities.cc
+++ b/plugins/experimental/rate_limit/utilities.cc
@@ -70,3 +70,49 @@ retryAfter(TSHttpTxn txnp, unsigned retry)
     }
   }
 }
+
+///////////////////////////////////////////////////////////////////////////////
+// Parse a URL to obtain a description for use with metrics when no user
+// provided tag is available. This is used by the remap side of the plugin,
+// while the SNI side uses the FQDN associated with each limiter instance
+// which is obtained from the list of SNIs in the global plugin configuration.
+// Returns "<scheme>.<host>", plus ":<port>" unless the scheme/port pair is
+// http/80 or https/443, or an empty string when the URL can not be parsed.
+//
+std::string
+getDescriptionFromUrl(const char *url)
+{
+  TSMBuffer const buf = TSMBufferCreate();
+  TSMLoc url_loc      = nullptr;
+
+  const int url_len = strlen(url);
+  std::string description;
+
+  if (TS_SUCCESS == TSUrlCreate(buf, &url_loc) && TS_PARSE_DONE == TSUrlParse(buf, url_loc, &url, url + url_len)) {
+    int host_len = 0, scheme_len = 0;
+    const char *s  = TSUrlSchemeGet(buf, url_loc, &scheme_len);
+    const char *h  = TSUrlHostGet(buf, url_loc, &host_len);
+    const int port = TSUrlPortGet(buf, url_loc);
+
+    // guard against a null pointer from either getter, which must not reach std::string(ptr, len)
+    const std::string hostname = h != nullptr ? std::string(h, host_len) : std::string();
+    const std::string scheme   = s != nullptr ? std::string(s, scheme_len) : std::string();
+
+    TSDebug(PLUGIN_NAME, "scheme = %s, host = %s, port = %d", scheme.c_str(), hostname.c_str(), port);
+
+    description = scheme + "." + hostname;
+
+    // only append the port when it is non-standard; the scheme must match exactly, not just as a prefix
+    if (!(scheme == TS_URL_SCHEME_HTTP && port == 80) && !(scheme == TS_URL_SCHEME_HTTPS && port == 443)) {
+      description.append(":" + std::to_string(port));
+    }
+  }
+
+  if (url_loc != nullptr) {
+    TSHandleMLocRelease(buf, nullptr, url_loc);
+  }
+
+  TSMBufferDestroy(buf);
+
+  return description;
+}
diff --git a/plugins/experimental/rate_limit/utilities.h b/plugins/experimental/rate_limit/utilities.h
index 0ff58bff3..e936912e6 100644
--- a/plugins/experimental/rate_limit/utilities.h
+++ b/plugins/experimental/rate_limit/utilities.h
@@ -26,3 +26,4 @@ constexpr char const PLUGIN_NAME[] = "rate_limit";
 
 void delayHeader(TSHttpTxn txnp, std::string &header, std::chrono::milliseconds delay);
 void retryAfter(TSHttpTxn txnp, unsigned retry);
+std::string getDescriptionFromUrl(const char *url);