You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pinot.apache.org by ti...@apache.org on 2024/02/16 19:13:31 UTC
(pinot) branch master updated: Add metrics for SEGMENTS_WITH_LESS_REPLICAS monitoring (#12336)
This is an automated email from the ASF dual-hosted git repository.
tingchen pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/pinot.git
The following commit(s) were added to refs/heads/master by this push:
new 9eaa3a1364 Add metrics for SEGMENTS_WITH_LESS_REPLICAS monitoring (#12336)
9eaa3a1364 is described below
commit 9eaa3a1364b0ad8882b83b8d692c161c19ad31a0
Author: lnbest0707-uber <10...@users.noreply.github.com>
AuthorDate: Fri Feb 16 11:13:25 2024 -0800
Add metrics for SEGMENTS_WITH_LESS_REPLICAS monitoring (#12336)
* Add metrics for no-HA segments monitoring
Summary:
Add metrics to monitor any segments running with only one replica.
This could help us monitor the reliability risk during node replacement.
* Fix UT
* Track segments with fewer than nIdeal replicas (i.e. at most nIdeal - 1) instead of only segments with exactly 1 replica
* Improve log message
* Improve variable naming
---
.../apache/pinot/common/metrics/ControllerGauge.java | 3 +++
.../pinot/controller/helix/SegmentStatusChecker.java | 19 ++++++++++++++++---
.../controller/helix/SegmentStatusCheckerTest.java | 16 ++++++++++++++++
3 files changed, 35 insertions(+), 3 deletions(-)
diff --git a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
index ca8c141447..82e86c55a9 100644
--- a/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
+++ b/pinot-common/src/main/java/org/apache/pinot/common/metrics/ControllerGauge.java
@@ -40,6 +40,9 @@ public enum ControllerGauge implements AbstractMetrics.Gauge {
// ideal state
PERCENT_SEGMENTS_AVAILABLE("segments", false),
+ // Number of segments running with less than expected replicas in external view
+ SEGMENTS_WITH_LESS_REPLICAS("segments", false),
+
SEGMENT_COUNT("SegmentCount", false),
// Number of segments including the replaced segments which are specified in the segment lineage entries and cannot
diff --git a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java
index f4121506a1..5b543e4319 100644
--- a/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java
+++ b/pinot-controller/src/main/java/org/apache/pinot/controller/helix/SegmentStatusChecker.java
@@ -242,6 +242,7 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
int nReplicasExternal = -1; // Keeps track of minimum number of replicas in external view
int nErrors = 0; // Keeps track of number of segments in error state
int nOffline = 0; // Keeps track of number segments with no online replicas
+ int nNumOfReplicasLessThanIdeal = 0; // Keeps track of number of segments running with less than expected replicas
int nSegments = 0; // Counts number of segments
long tableCompressedSize = 0; // Tracks the total compressed segment size in deep store per table
for (String partitionName : segmentsExcludeReplaced) {
@@ -303,6 +304,10 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
LOGGER.warn("Segment {} of table {} has no online replicas", partitionName, tableNameWithType);
}
nOffline++;
+ } else if (nReplicas < nReplicasIdealMax) {
+ LOGGER.debug("Segment {} of table {} is running with {} replicas which is less than the expected values {}",
+ partitionName, tableNameWithType, nReplicas, nReplicasIdealMax);
+ nNumOfReplicasLessThanIdeal++;
}
nReplicasExternal =
((nReplicasExternal > nReplicas) || (nReplicasExternal == -1)) ? nReplicas : nReplicasExternal;
@@ -315,6 +320,8 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
_controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.PERCENT_OF_REPLICAS,
(nReplicasIdealMax > 0) ? (nReplicasExternal * 100 / nReplicasIdealMax) : 100);
_controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_IN_ERROR_STATE, nErrors);
+ _controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS,
+ nNumOfReplicasLessThanIdeal);
_controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE,
(nSegments > 0) ? (nSegments - nOffline) * 100 / nSegments : 100);
_controllerMetrics.setValueOfTableGauge(tableNameWithType, ControllerGauge.TABLE_COMPRESSED_SIZE,
@@ -323,9 +330,13 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
if (nOffline > 0) {
LOGGER.warn("Table {} has {} segments with no online replicas", tableNameWithType, nOffline);
}
+ if (nNumOfReplicasLessThanIdeal > 0) {
+ LOGGER.warn("Table {} has {} segments with number of replicas less than the replication factor",
+ tableNameWithType, nNumOfReplicasLessThanIdeal);
+ }
if (nReplicasExternal < nReplicasIdealMax) {
- LOGGER.warn("Table {} has {} replicas, below replication threshold :{}", tableNameWithType, nReplicasExternal,
- nReplicasIdealMax);
+ LOGGER.warn("Table {} has at least one segment running with only {} replicas, below replication threshold :{}",
+ tableNameWithType, nReplicasExternal, nReplicasIdealMax);
}
if (tableType == TableType.REALTIME && tableConfig != null) {
@@ -346,13 +357,13 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.NUMBER_OF_REPLICAS);
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.PERCENT_OF_REPLICAS);
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE);
-
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.IDEALSTATE_ZNODE_SIZE);
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.IDEALSTATE_ZNODE_BYTE_SIZE);
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENT_COUNT);
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENT_COUNT_INCLUDING_REPLACED);
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_IN_ERROR_STATE);
+ _controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS);
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE);
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.TABLE_DISABLED);
_controllerMetrics.removeTableGauge(tableNameWithType, ControllerGauge.TABLE_CONSUMPTION_PAUSED);
@@ -371,6 +382,8 @@ public class SegmentStatusChecker extends ControllerPeriodicTask<SegmentStatusCh
_controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.NUMBER_OF_REPLICAS, Long.MIN_VALUE);
_controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_OF_REPLICAS, Long.MIN_VALUE);
_controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.SEGMENTS_IN_ERROR_STATE, Long.MIN_VALUE);
+ _controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS,
+ Long.MIN_VALUE);
_controllerMetrics.setValueOfTableGauge(tableName, ControllerGauge.PERCENT_SEGMENTS_AVAILABLE, Long.MIN_VALUE);
}
diff --git a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java
index 731f1f33d5..99991b3d4c 100644
--- a/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java
+++ b/pinot-controller/src/test/java/org/apache/pinot/controller/helix/SegmentStatusCheckerTest.java
@@ -163,6 +163,8 @@ public class SegmentStatusCheckerTest {
ControllerGauge.SEGMENT_COUNT_INCLUDING_REPLACED), 5);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
ControllerGauge.SEGMENTS_IN_ERROR_STATE), 1);
+ Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
+ ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 2);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
ControllerGauge.NUMBER_OF_REPLICAS), 2);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
@@ -248,6 +250,8 @@ public class SegmentStatusCheckerTest {
ControllerGauge.REPLICATION_FROM_CONFIG), 3);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0);
+ Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
+ ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
ControllerGauge.NUMBER_OF_REPLICAS), 3);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
@@ -346,6 +350,8 @@ public class SegmentStatusCheckerTest {
_segmentStatusChecker.run();
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
ControllerGauge.SEGMENTS_IN_ERROR_STATE), 1);
+ Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
+ ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 2);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
ControllerGauge.NUMBER_OF_REPLICAS), 0);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
@@ -403,6 +409,8 @@ public class SegmentStatusCheckerTest {
_segmentStatusChecker.run();
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0);
+ Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
+ ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
ControllerGauge.NUMBER_OF_REPLICAS), 0);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
@@ -446,6 +454,8 @@ public class SegmentStatusCheckerTest {
_segmentStatusChecker.run();
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
ControllerGauge.SEGMENTS_IN_ERROR_STATE), Long.MIN_VALUE);
+ Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
+ ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), Long.MIN_VALUE);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
ControllerGauge.NUMBER_OF_REPLICAS), Long.MIN_VALUE);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
@@ -538,6 +548,8 @@ public class SegmentStatusCheckerTest {
_segmentStatusChecker.run();
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0);
+ Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
+ ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
ControllerGauge.NUMBER_OF_REPLICAS), 2);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, externalView.getId(),
@@ -593,6 +605,8 @@ public class SegmentStatusCheckerTest {
_segmentStatusChecker.run();
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
ControllerGauge.SEGMENTS_IN_ERROR_STATE), 0);
+ Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
+ ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), 0);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
ControllerGauge.NUMBER_OF_REPLICAS), 1);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
@@ -806,6 +820,8 @@ public class SegmentStatusCheckerTest {
_segmentStatusChecker.start();
_segmentStatusChecker.run();
+ Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
+ ControllerGauge.SEGMENTS_WITH_LESS_REPLICAS), Long.MIN_VALUE);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
ControllerGauge.SEGMENTS_IN_ERROR_STATE), Long.MIN_VALUE);
Assert.assertEquals(MetricValueUtils.getTableGaugeValue(_controllerMetrics, tableName,
---------------------------------------------------------------------
To unsubscribe, e-mail: commits-unsubscribe@pinot.apache.org
For additional commands, e-mail: commits-help@pinot.apache.org