You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@helix.apache.org by jx...@apache.org on 2018/11/02 20:53:13 UTC
helix git commit: [HELIX-785] Record helix latency instead of user
latency in top state handoff metrics
Repository: helix
Updated Branches:
refs/heads/master 45009e2c4 -> ccf263c91
[HELIX-785] Record helix latency instead of user latency in top state handoff metrics
Project: http://git-wip-us.apache.org/repos/asf/helix/repo
Commit: http://git-wip-us.apache.org/repos/asf/helix/commit/ccf263c9
Tree: http://git-wip-us.apache.org/repos/asf/helix/tree/ccf263c9
Diff: http://git-wip-us.apache.org/repos/asf/helix/diff/ccf263c9
Branch: refs/heads/master
Commit: ccf263c9110fa5af54f1c1cabd8b5a2af64d473e
Parents: 45009e2
Author: Harry Zhang <hr...@linkedin.com>
Authored: Fri Nov 2 11:30:15 2018 -0700
Committer: Harry Zhang <hr...@linkedin.com>
Committed: Fri Nov 2 13:52:15 2018 -0700
----------------------------------------------------------------------
.../stages/TopStateHandoffReportStage.java | 14 +++---
.../monitoring/mbeans/ClusterStatusMonitor.java | 4 +-
.../monitoring/mbeans/ResourceMonitor.java | 16 +++---
.../mbeans/TestTopStateHandoffMetrics.java | 52 ++++++++++----------
.../resources/TestTopStateHandoffMetrics.json | 12 ++---
5 files changed, 48 insertions(+), 50 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
----------------------------------------------------------------------
diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
index 5bfd5e6..d86e430 100644
--- a/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
+++ b/helix-core/src/main/java/org/apache/helix/controller/stages/TopStateHandoffReportStage.java
@@ -287,13 +287,13 @@ public class TopStateHandoffReportStage extends AbstractBaseStage {
}
long duration = endTime - startTime;
- long userLatency = fromTopStateUserLatency + toTopStateuserLatency;
+ long helixLatency = duration - fromTopStateUserLatency - toTopStateuserLatency;
// We always treat such top state handoff as graceful as if top state handoff is triggered
// by instance crash, we cannot observe the entire handoff process within 1 pipeline run
- logMissingTopStateInfo(duration, userLatency, true, partition.getPartitionName());
+ logMissingTopStateInfo(duration, helixLatency, true, partition.getPartitionName());
if (clusterStatusMonitor != null) {
clusterStatusMonitor
- .updateMissingTopStateDurationStats(resourceName, duration, userLatency, true, true);
+ .updateMissingTopStateDurationStats(resourceName, duration, helixLatency, true, true);
}
}
@@ -468,17 +468,17 @@ public class TopStateHandoffReportStage extends AbstractBaseStage {
if (handOffStartTime > 0 && handOffEndTime - handOffStartTime <= threshold) {
long duration = handOffEndTime - handOffStartTime;
- long userLatency = fromTopStateUserLatency + toTopStateUserLatency;
+ long helixLatency = duration - fromTopStateUserLatency - toTopStateUserLatency;
// It is possible that during controller leader switch, we lost previous master information
// and use current time to approximate missing top state start time. If we see the actual
// user latency is larger than the duration we estimated, we use user latency to start with
- duration = Math.max(duration, userLatency);
+ duration = Math.max(duration, helixLatency);
boolean isGraceful = record.isGracefulHandoff();
- logMissingTopStateInfo(duration, userLatency, isGraceful, partition.getPartitionName());
+ logMissingTopStateInfo(duration, helixLatency, isGraceful, partition.getPartitionName());
if (clusterStatusMonitor != null) {
clusterStatusMonitor
- .updateMissingTopStateDurationStats(resourceName, duration, userLatency, isGraceful,
+ .updateMissingTopStateDurationStats(resourceName, duration, helixLatency, isGraceful,
true);
}
}
http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
----------------------------------------------------------------------
diff --git a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
index 803bd3c..a980d3c 100644
--- a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
+++ b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ClusterStatusMonitor.java
@@ -467,12 +467,12 @@ public class ClusterStatusMonitor implements ClusterStatusMonitorMBean {
}
public synchronized void updateMissingTopStateDurationStats(String resourceName,
- long totalDuration, long userLatency, boolean isGraceful, boolean succeeded) {
+ long totalDuration, long helixLatency, boolean isGraceful, boolean succeeded) {
ResourceMonitor resourceMonitor = getOrCreateResourceMonitor(resourceName);
if (resourceMonitor != null) {
resourceMonitor.updateStateHandoffStats(ResourceMonitor.MonitorState.TOP_STATE, totalDuration,
- userLatency, isGraceful, succeeded);
+ helixLatency, isGraceful, succeeded);
}
}
http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
----------------------------------------------------------------------
diff --git a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
index fb9a779..c66ae2d 100644
--- a/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
+++ b/helix-core/src/main/java/org/apache/helix/monitoring/mbeans/ResourceMonitor.java
@@ -71,7 +71,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
// Histograms
private HistogramDynamicMetric _partitionTopStateHandoffDurationGauge;
- private HistogramDynamicMetric _partitionTopStateHandoffUserLatencyGauge;
+ private HistogramDynamicMetric _partitionTopStateHandoffHelixLatencyGauge;
private HistogramDynamicMetric _partitionTopStateNonGracefulHandoffDurationGauge;
private SimpleDynamicMetric<String> _rebalanceState;
@@ -101,7 +101,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
attributeList.add(_failedTopStateHandoffCounter);
attributeList.add(_maxSinglePartitionTopStateHandoffDuration);
attributeList.add(_partitionTopStateHandoffDurationGauge);
- attributeList.add(_partitionTopStateHandoffUserLatencyGauge);
+ attributeList.add(_partitionTopStateHandoffHelixLatencyGauge);
attributeList.add(_partitionTopStateNonGracefulHandoffDurationGauge);
attributeList.add(_totalMessageReceived);
attributeList.add(_numPendingStateTransitions);
@@ -142,8 +142,8 @@ public class ResourceMonitor extends DynamicMBeanProvider {
new HistogramDynamicMetric("PartitionTopStateHandoffDurationGauge", new Histogram(
new SlidingTimeWindowArrayReservoir(DEFAULT_RESET_INTERVAL_MS, TimeUnit.MILLISECONDS)));
- _partitionTopStateHandoffUserLatencyGauge =
- new HistogramDynamicMetric("PartitionTopStateHandoffUserLatencyGauge", new Histogram(
+ _partitionTopStateHandoffHelixLatencyGauge =
+ new HistogramDynamicMetric("PartitionTopStateHandoffHelixLatencyGauge", new Histogram(
new SlidingTimeWindowArrayReservoir(DEFAULT_RESET_INTERVAL_MS, TimeUnit.MILLISECONDS)));
_partitionTopStateNonGracefulHandoffDurationGauge =
new HistogramDynamicMetric("PartitionTopStateNonGracefulHandoffGauge", new Histogram(
@@ -202,8 +202,8 @@ public class ResourceMonitor extends DynamicMBeanProvider {
return _partitionTopStateNonGracefulHandoffDurationGauge;
}
- public HistogramDynamicMetric getPartitionTopStateHandoffUserLatencyGauge() {
- return _partitionTopStateHandoffUserLatencyGauge;
+ public HistogramDynamicMetric getPartitionTopStateHandoffHelixLatencyGauge() {
+ return _partitionTopStateHandoffHelixLatencyGauge;
}
public long getFailedTopStateHandoffCounter() {
@@ -345,7 +345,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
}
public void updateStateHandoffStats(MonitorState monitorState, long totalDuration,
- long userLatency, boolean isGraceful, boolean succeeded) {
+ long helixLatency, boolean isGraceful, boolean succeeded) {
switch (monitorState) {
case TOP_STATE:
if (succeeded) {
@@ -354,7 +354,7 @@ public class ResourceMonitor extends DynamicMBeanProvider {
.updateValue(_successfulTopStateHandoffDurationCounter.getValue() + totalDuration);
if (isGraceful) {
_partitionTopStateHandoffDurationGauge.updateValue(totalDuration);
- _partitionTopStateHandoffUserLatencyGauge.updateValue(userLatency);
+ _partitionTopStateHandoffHelixLatencyGauge.updateValue(helixLatency);
} else {
_partitionTopStateNonGracefulHandoffDurationGauge.updateValue(totalDuration);
}
http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
----------------------------------------------------------------------
diff --git a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
index c1cfce0..5550386 100644
--- a/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
+++ b/helix-core/src/test/java/org/apache/helix/monitoring/mbeans/TestTopStateHandoffMetrics.java
@@ -52,7 +52,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
private static final ObjectMapper OBJECT_MAPPER = new ObjectMapper();
private static final String NON_GRACEFUL_HANDOFF_DURATION = "PartitionTopStateNonGracefulHandoffGauge.Max";
private static final String GRACEFUL_HANDOFF_DURATION = "PartitionTopStateHandoffDurationGauge.Max";
- private static final String HANDOFF_USER_LATENCY = "PartitionTopStateHandoffUserLatencyGauge.Max";
+ private static final String HANDOFF_HELIX_LATENCY = "PartitionTopStateHandoffHelixLatencyGauge.Max";
private static final Range<Long> DURATION_ZERO = Range.closed(0L, 0L);
private TestConfig config;
@@ -93,7 +93,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
final Map<String, CurrentStateInfo> finalCurrentState;
final long duration;
final boolean isGraceful;
- final long userLatency;
+ final long helixLatency;
@JsonCreator
public TestCaseConfig(
@@ -101,14 +101,14 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
@JsonProperty("MissingTopStates") Map<String, CurrentStateInfo> missing,
@JsonProperty("HandoffCurrentStates") Map<String, CurrentStateInfo> handoff,
@JsonProperty("Duration") long d,
- @JsonProperty("UserLatency") long user,
+ @JsonProperty("HelixLatency") long helix,
@JsonProperty("IsGraceful") boolean graceful
) {
initialCurrentStates = initial;
currentStateWithMissingTopState = missing;
finalCurrentState = handoff;
duration = d;
- userLatency = user;
+ helixLatency = helix;
isGraceful = graceful;
}
}
@@ -170,7 +170,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
// since this is a single top state handoff observed within 1 pipeline, we treat it as graceful,
// and only record user latency for transiting to master
Range<Long> expectedDuration = Range.closed(1500L, 1500L);
- Range<Long> expectedUserLatency = Range.closed(1000L, 1000L);
+ Range<Long> expectedHelixLatency = Range.closed(500L, 500L);
runStageAndVerify(
cfg.initialCurrentStates, cfg.currentStateWithMissingTopState, cfg.finalCurrentState,
new MissingStatesDataCacheInject() {
@@ -181,8 +181,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
}, 1, 0,
expectedDuration,
DURATION_ZERO,
- expectedDuration,
- expectedUserLatency
+ expectedDuration, expectedHelixLatency
);
event.addAttribute(AttributeName.LastRebalanceFinishTimeStamp.name(),
TopStateHandoffReportStage.TIMESTAMP_NOT_RECORDED);
@@ -235,26 +234,24 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
// in such case, reportTopStateMissing will use current system time as missing top state
// start time, and we assume it is a graceful handoff, and only "to top state" user latency
// will be recorded
- long userLatency = 0;
- for (CurrentStateInfo info : cfg.currentStateWithMissingTopState.values()) {
- if (info.previousState.equals("MASTER")) {
- userLatency = cfg.userLatency - (info.endTime - info.startTime);
- }
- info.previousState = "OFFLINE";
- }
+ long helixLatency = 1000;
+ long userLatency = 1000;
for (CurrentStateInfo states : cfg.finalCurrentState.values()) {
if (states.currentState.equals("MASTER")) {
- states.endTime = System.currentTimeMillis();
- states.startTime = System.currentTimeMillis() - userLatency;
+ states.endTime = System.currentTimeMillis() + helixLatency + userLatency;
+ states.startTime = System.currentTimeMillis() + helixLatency;
break;
}
}
+
+ // actual timestamp when running the stage will be later than current time, so the expected
+ // helix latency will be less than the mocked helix latency
runStageAndVerify(Collections.EMPTY_MAP, cfg.currentStateWithMissingTopState,
cfg.finalCurrentState, null, 1, 0,
- Range.closed(userLatency, userLatency),
+ Range.closed(0L, helixLatency + userLatency),
DURATION_ZERO,
- Range.closed(userLatency, userLatency),
- Range.closed(userLatency, userLatency)
+ Range.closed(0L, helixLatency + userLatency),
+ Range.closed(0L, helixLatency)
);
}
@@ -277,6 +274,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
userLatency = info.endTime - info.startTime;
}
}
+ long helixLatency = durationToVerify - userLatency;
// No initialCurrentStates means no input can be used as the clue of the previous master.
// in this case, we will treat the handoff as graceful and only to-master user latency
@@ -311,7 +309,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
Range.closed(durationToVerify, durationToVerify),
DURATION_ZERO,
Range.closed(durationToVerify, durationToVerify),
- Range.closed(userLatency, userLatency));
+ Range.closed(helixLatency, helixLatency));
}
@DataProvider(name = "successCurrentStateInput")
@@ -347,11 +345,11 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
Range<Long> duration = Range.closed(cfg.duration, cfg.duration);
Range<Long> expectedDuration = cfg.isGraceful ? duration : DURATION_ZERO;
Range<Long> expectedNonGracefulDuration = cfg.isGraceful ? DURATION_ZERO : duration;
- Range<Long> expectedUserLatency =
- cfg.isGraceful ? Range.closed(cfg.userLatency, cfg.userLatency) : DURATION_ZERO;
+ Range<Long> expectedHelixLatency =
+ cfg.isGraceful ? Range.closed(cfg.helixLatency, cfg.helixLatency) : DURATION_ZERO;
runStageAndVerify(cfg.initialCurrentStates, cfg.currentStateWithMissingTopState,
cfg.finalCurrentState, null, expectFail ? 0 : 1, expectFail ? 1 : 0, expectedDuration, expectedNonGracefulDuration,
- expectedDuration, expectedUserLatency);
+ expectedDuration, expectedHelixLatency);
}
private Map<String, CurrentState> generateCurrentStateMap(
@@ -410,7 +408,7 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
Range<Long> expectedDuration,
Range<Long> expectedNonGracefulDuration,
Range<Long> expectedMaxDuration,
- Range<Long> expectedUserLatency
+ Range<Long> expectedHelixLatency
) {
runPipeLine(initialCurrentStates, missingTopStates, handOffCurrentStates, inject);
ClusterStatusMonitor clusterStatusMonitor =
@@ -424,12 +422,12 @@ public class TestTopStateHandoffMetrics extends BaseStageTest {
.getAttributeValue(GRACEFUL_HANDOFF_DURATION).longValue();
long nonGraceful = monitor.getPartitionTopStateNonGracefulHandoffDurationGauge()
.getAttributeValue(NON_GRACEFUL_HANDOFF_DURATION).longValue();
- long user = monitor.getPartitionTopStateHandoffUserLatencyGauge()
- .getAttributeValue(HANDOFF_USER_LATENCY).longValue();
+ long helix = monitor.getPartitionTopStateHandoffHelixLatencyGauge()
+ .getAttributeValue(HANDOFF_HELIX_LATENCY).longValue();
long max = monitor.getMaxSinglePartitionTopStateHandoffDurationGauge();
Assert.assertTrue(expectedDuration.contains(graceful));
Assert.assertTrue(expectedNonGracefulDuration.contains(nonGraceful));
- Assert.assertTrue(expectedUserLatency.contains(user));
+ Assert.assertTrue(expectedHelixLatency.contains(helix));
Assert.assertTrue(expectedMaxDuration.contains(max));
}
http://git-wip-us.apache.org/repos/asf/helix/blob/ccf263c9/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
----------------------------------------------------------------------
diff --git a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
index 0b39a1d..c1e579f 100644
--- a/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
+++ b/helix-core/src/test/resources/TestTopStateHandoffMetrics.json
@@ -62,7 +62,7 @@
}
},
"Duration": 7000,
- "UserLatency": 5000,
+ "HelixLatency": 2000,
"IsGraceful": true
},
{
@@ -127,7 +127,7 @@
}
},
"Duration": 14000,
- "UserLatency": 4000,
+ "HelixLatency": 10000,
"IsGraceful": true
}
],
@@ -182,7 +182,7 @@
}
},
"Duration": 7000,
- "UserLatency": 5000,
+ "HelixLatency": 2000,
"IsGraceful": false
}
],
@@ -249,7 +249,7 @@
}
},
"Duration": 0,
- "UserLatency": 0,
+ "HelixLatency": 0,
"IsGraceful": false
},
{
@@ -314,7 +314,7 @@
}
},
"Duration": 0,
- "UserLatency": 0,
+ "HelixLatency": 0,
"IsGraceful": false
}
],
@@ -364,7 +364,7 @@
},
"Duration": 3000,
- "UserLatency": 2000,
+ "HelixLatency": 1000,
"IsGraceful": true
}
]