You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by ti...@apache.org on 2024/02/16 19:13:31 UTC

(pinot) branch master updated: Add metrics for SEGMENTS_WITH_LESS_REPLICAS monitoring (#12336)

This is an automated email from the ASF dual-hosted git repository.

tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git


The following commit(s) were added to refs/heads/master by this push:
     new 9eaa3a1364 Add metrics for SEGMENTS_WITH_LESS_REPLICAS monitoring (#12336)
9eaa3a1364 is described below

commit 9eaa3a1364b0ad8882b83b8d692c161c19ad31a0
Author: lnbest0707-uber <10...@users.noreply.github.com>
AuthorDate: Fri Feb 16 11:13:25 2024 -0800

    Add metrics for SEGMENTS_WITH_LESS_REPLICAS monitoring (#12336)
    
    * Add metrics for no-HA segments monitoring
    
    Summary:
    Add metrics to monitor any segments running with only one replica.
    This could help us monitor the reliability risk during node replacement.
    
    * Fix UT
    
    * Track segments with fewer than nIdeal replicas (i.e. at most nIdeal - 1) instead of segments with exactly 1 replica
    
    * Improve log message
    
    * Improve variable naming
---
 .../apache/pinot/common/metrics/ControllerGauge.java  |  3 +++
 .../pinot/controller/helix/SegmentStatusChecker.java  | 19 ++++++++++++++++---
 .../controller/helix/SegmentStatusCheckerTest.java    | 16 ++++++++++++++++
 3 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
index ca8c141447..82e86c55a9 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
@@ -40,6 +40,9 @@ public enum ControllerGauge implements AbstractMetrics.Gauge {
   // ideal state
   PERCENT_SEGMENTS_AVAILABLE("segments", false),
 
+  // Number of segments running with fewer replicas in the external view than expected
+  SEGMENTS_WITH_LESS_REPLICAS("segments", false),
+
   SEGMENT_COUNT("SegmentCount", false),
 
   // Number of segments including the replaced segments which are specified in the segment lineage entries and cannot
diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java
index f4121506a1..5b543e4319 100644
--- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java
+++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java
@@ -242,6 +242,7 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
     int nReplicasExternal = -1; // Keeps track of minimum number of replicas in external view
     int nErrors = 0; // Keeps track of number of segments in error state
     int nOffline = 0; // Keeps track of number segments with no online replicas
+    int nNumOfReplicasLessThanIdeal = 0; // Keeps track of number of segments running with less than expected replicas
     int nSegments = 0; // Counts number of segments
     long tableCompressedSize = 0; // Tracks the total compressed segment size in deep store per table
     for (String partitionName : segmentsExcludeReplaced) {
@@ -303,6 +304,10 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
           LOGGER.warn("Segment {} of table {} has no online replicas", partitionName, tableNameWithType);
         }
         nOffline++;
+      } else if (nReplicas < nReplicasIdealMax) {
+        LOGGER.debug("Segment {} of table {} is running with {} replicas which is less than the expected values {}",
+            partitionName, tableNameWithType, nReplicas, nReplicasIdealMax);
+        nNumOfReplicasLessThanIdeal++;
       }
       nReplicasExternal =
           ((nReplicasExternal > nReplicas) || (nReplicasExternal == -1)) ? nReplicas : nReplicasExternal;
@@ -315,6 +320,8 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
     _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.PERCENT_OF_REPLICAS,
         (nReplicasIdealMax > 0) ? (nReplicasExternal * 100 / nReplicasIdealMax) : 100);
     _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_IN_ERROR_STATE, nErrors);
+    _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS,
+        nNumOfReplicasLessThanIdeal);
     _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE,
         (nSegments > 0) ? (nSegments - nOffline) * 100 / nSegments : 100);
     _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.TABLE_COMPRESSED_SIZE,
@@ -323,9 +330,13 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
     if (nOffline > 0) {
       LOGGER.warn("Table {} has {} segments with no online replicas", tableNameWithType, nOffline);
     }
+    if (nNumOfReplicasLessThanIdeal > 0) {
+      LOGGER.warn("Table {} has {} segments with number of replicas less than the replication factor",
+          tableNameWithType, nNumOfReplicasLessThanIdeal);
+    }
     if (nReplicasExternal < nReplicasIdealMax) {
-      LOGGER.warn("Table {} has {} replicas, below replication threshold :{}", tableNameWithType, nReplicasExternal,
-          nReplicasIdealMax);
+      LOGGER.warn("Table {} has at least one segment running with only {} replicas, below replication threshold :{}",
+          tableNameWithType, nReplicasExternal, nReplicasIdealMax);
     }
 
     if (tableType == TableType.REALTIME && tableConfig != null) {
@@ -346,13 +357,13 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.NUMBER_OF_REPLICAS);
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.PERCENT_OF_REPLICAS);
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE);
-
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.IDEALSTATE_ZNODE_SIZE);
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.IDEALSTATE_ZNODE_BYTE_SIZE);
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENT_COUNT);
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENT_COUNT_INCLUDING_REPLACED);
 
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_IN_ERROR_STATE);
+    _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS);
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE);
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.TABLE_DISABLED);
     _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.TABLE_CONSUMPTION_PAUSED);
@@ -371,6 +382,8 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
     _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.NUMBER_OF_REPLICAS, Long.MIN_VALUE);
     _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_OF_REPLICAS, Long.MIN_VALUE);
     _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.SEGMENTS_IN_ERROR_STATE, Long.MIN_VALUE);
+    _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS,
+        Long.MIN_VALUE);
     _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE, Long.MIN_VALUE);
   }
 
diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java
index 731f1f33d5..99991b3d4c 100644
--- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java
+++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java
@@ -163,6 +163,8 @@ public class SegmentStatusCheckerTest {
             ControllerGauge.SEGMENT_COUNT_INCLUDING_REPLACED), 5);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
             ControllerGauge.SEGMENTS_IN_ERROR_STATE), 1);
+    Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
+        ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 2);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
             ControllerGauge.NUMBER_OF_REPLICAS), 2);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
@@ -248,6 +250,8 @@ public class SegmentStatusCheckerTest {
             ControllerGauge.REPLICATION_FROM_CONFIG), 3);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
             ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0);
+    Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
+        ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
             ControllerGauge.NUMBER_OF_REPLICAS), 3);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
@@ -346,6 +350,8 @@ public class SegmentStatusCheckerTest {
     _segmentStatusChecker.run();
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
             ControllerGauge.SEGMENTS_IN_ERROR_STATE), 1);
+    Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
+        ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 2);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
             ControllerGauge.NUMBER_OF_REPLICAS), 0);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
@@ -403,6 +409,8 @@ public class SegmentStatusCheckerTest {
     _segmentStatusChecker.run();
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
             ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0);
+    Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
+        ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
             ControllerGauge.NUMBER_OF_REPLICAS), 0);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
@@ -446,6 +454,8 @@ public class SegmentStatusCheckerTest {
     _segmentStatusChecker.run();
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
             ControllerGauge.SEGMENTS_IN_ERROR_STATE), Long.MIN_VALUE);
+    Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
+        ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), Long.MIN_VALUE);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
             ControllerGauge.NUMBER_OF_REPLICAS), Long.MIN_VALUE);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
@@ -538,6 +548,8 @@ public class SegmentStatusCheckerTest {
     _segmentStatusChecker.run();
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
             ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0);
+    Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
+        ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
             ControllerGauge.NUMBER_OF_REPLICAS), 2);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
@@ -593,6 +605,8 @@ public class SegmentStatusCheckerTest {
     _segmentStatusChecker.run();
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
         ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0);
+    Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
+        ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
             ControllerGauge.NUMBER_OF_REPLICAS), 1);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
@@ -806,6 +820,8 @@ public class SegmentStatusCheckerTest {
     _segmentStatusChecker.start();
     _segmentStatusChecker.run();
 
+    Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
+        ControllerGauge.SEGMENTS_IN_ERROR_STATE), Long.MIN_VALUE);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
         ControllerGauge.SEGMENTS_IN_ERROR_STATE), Long.MIN_VALUE);
     Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,


---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org