You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@pulsar.apache.org by pe...@apache.org on 2021/02/18 15:57:09 UTC

[pulsar] 15/27: Fix testBrokerSelectionForAntiAffinityGroup by increasing OverloadedThreshold (#9393)

This is an automated email from the ASF dual-hosted git repository.

penghui pushed a commit to branch branch-2.7
in repository https://gitbox.apache.org/repos/asf/pulsar.git

commit 05d414d642ccdec029f4deee8aafa2dbac4e4de8
Author: Michael Marshall <47...@users.noreply.github.com>
AuthorDate: Wed Feb 10 18:45:20 2021 -0700

    Fix testBrokerSelectionForAntiAffinityGroup by increasing OverloadedThreshold (#9393)
    
    Fixes: #6368
    
    Flaky-test: `AntiAffinityNamespaceGroupTest.testBrokerSelectionForAntiAffinityGroup`
    
    TestClass is flaky. The testMethod test method fails sporadically.
    
    See #6368 for example failures as well as for my analysis and detailed justification for this change.
    
    In short, by setting the `LoadBalancerBrokerOverloadedThresholdPercentage` to `100`, we remove the main edge case that allows two namespaces in the same anti-affinity group to get placed on the same broker.
    
    Note that I am assuming the following method will never return a value greater than 1, which could lead to test failure. https://github.com/apache/pulsar/blob/85f3ff4edbaa10c7894af8ad823cbce37b13829c/pulsar-common/src/main/java/org/apache/pulsar/policies/data/loadbalancer/LocalBrokerData.java#L214-L217
    
    (cherry picked from commit 610b17d6433b0894a541421841b525741f8d5d40)
---
 .../loadbalance/impl/GenericBrokerHostUsageImpl.java  | 19 ++++++++++++-------
 .../loadbalance/impl/LinuxBrokerHostUsageImpl.java    |  7 ++++---
 .../loadbalance/AntiAffinityNamespaceGroupTest.java   | 11 +++++++++++
 3 files changed, 27 insertions(+), 10 deletions(-)

diff --git a/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/GenericBrokerHostUsageImpl.java b/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/GenericBrokerHostUsageImpl.java
index 9d3de7b..14c670c 100644
--- a/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/GenericBrokerHostUsageImpl.java
+++ b/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/GenericBrokerHostUsageImpl.java
@@ -52,8 +52,13 @@ public class GenericBrokerHostUsageImpl implements BrokerHostUsage {
         this.systemBean = (OperatingSystemMXBean) ManagementFactory.getOperatingSystemMXBean();
         this.usage = new SystemResourceUsage();
         this.totalCpuLimit = getTotalCpuLimit();
-        executorService.scheduleAtFixedRate(this::checkCpuLoad, 0, CPU_CHECK_MILLIS, TimeUnit.MILLISECONDS);
-        executorService.scheduleAtFixedRate(this::doCalculateBrokerHostUsage, 0, hostUsageCheckIntervalMin, TimeUnit.MINUTES);
+
+        // Call now to initialize values before the constructor returns
+        calculateBrokerHostUsage();
+        executorService.scheduleAtFixedRate(this::checkCpuLoad, CPU_CHECK_MILLIS,
+                CPU_CHECK_MILLIS, TimeUnit.MILLISECONDS);
+        executorService.scheduleAtFixedRate(this::doCalculateBrokerHostUsage, hostUsageCheckIntervalMin,
+                hostUsageCheckIntervalMin, TimeUnit.MINUTES);
     }
 
     @Override
@@ -61,7 +66,7 @@ public class GenericBrokerHostUsageImpl implements BrokerHostUsage {
         return usage;
     }
 
-    private void checkCpuLoad() {
+    private synchronized void checkCpuLoad() {
         cpuUsageSum += systemBean.getSystemCpuLoad();
         cpuUsageCount++;
     }
@@ -84,7 +89,10 @@ public class GenericBrokerHostUsageImpl implements BrokerHostUsage {
         return 100 * Runtime.getRuntime().availableProcessors();
     }
 
-    private double getTotalCpuUsage() {
+    private synchronized double getTotalCpuUsage() {
+        if (cpuUsageCount == 0) {
+            return 0;
+        }
         double cpuUsage = cpuUsageSum / cpuUsageCount;
         cpuUsageSum = 0d;
         cpuUsageCount = 0;
@@ -92,9 +100,6 @@ public class GenericBrokerHostUsageImpl implements BrokerHostUsage {
     }
 
     private ResourceUsage getCpuUsage() {
-        if (cpuUsageCount == 0) {
-            return new ResourceUsage(0, totalCpuLimit);
-        }
         return new ResourceUsage(getTotalCpuUsage() * totalCpuLimit, totalCpuLimit);
     }
 
diff --git a/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LinuxBrokerHostUsageImpl.java b/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LinuxBrokerHostUsageImpl.java
index 992b743..e18ff91 100644
--- a/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LinuxBrokerHostUsageImpl.java
+++ b/pulsar-broker/src/main/java/org/apache/pulsar/broker/loadbalance/impl/LinuxBrokerHostUsageImpl.java
@@ -78,8 +78,6 @@ public class LinuxBrokerHostUsageImpl implements BrokerHostUsage {
         this.lastCollection = 0L;
         this.usage = new SystemResourceUsage();
         this.overrideBrokerNicSpeedGbps = overrideBrokerNicSpeedGbps;
-        executorService.scheduleAtFixedRate(this::calculateBrokerHostUsage, 0,
-                hostUsageCheckIntervalMin, TimeUnit.MINUTES);
 
         boolean isCGroupsEnabled = false;
         try {
@@ -87,9 +85,12 @@ public class LinuxBrokerHostUsageImpl implements BrokerHostUsage {
         } catch (Exception e) {
             log.warn("Failed to check cgroup CPU usage file: {}", e.getMessage());
         }
-
         this.isCGroupsEnabled = isCGroupsEnabled;
+
+        // Call now to initialize values before the constructor returns
         calculateBrokerHostUsage();
+        executorService.scheduleAtFixedRate(this::calculateBrokerHostUsage, hostUsageCheckIntervalMin,
+                hostUsageCheckIntervalMin, TimeUnit.MINUTES);
     }
 
     @Override
diff --git a/pulsar-broker/src/test/java/org/apache/pulsar/broker/loadbalance/AntiAffinityNamespaceGroupTest.java b/pulsar-broker/src/test/java/org/apache/pulsar/broker/loadbalance/AntiAffinityNamespaceGroupTest.java
index 6551172..2286477 100644
--- a/pulsar-broker/src/test/java/org/apache/pulsar/broker/loadbalance/AntiAffinityNamespaceGroupTest.java
+++ b/pulsar-broker/src/test/java/org/apache/pulsar/broker/loadbalance/AntiAffinityNamespaceGroupTest.java
@@ -113,6 +113,8 @@ public class AntiAffinityNamespaceGroupTest {
         config1.setFailureDomainsEnabled(true);
         config1.setLoadBalancerEnabled(true);
         config1.setAdvertisedAddress("localhost");
+        // Don't want overloaded threshold to affect namespace placement
+        config1.setLoadBalancerBrokerOverloadedThresholdPercentage(400);
         createCluster(bkEnsemble.getZkClient(), config1);
         pulsar1 = new PulsarService(config1);
         pulsar1.start();
@@ -130,6 +132,8 @@ public class AntiAffinityNamespaceGroupTest {
         config2.setBrokerServicePort(Optional.of(0));
         config2.setFailureDomainsEnabled(true);
         config2.setAdvertisedAddress("localhost");
+        // Don't want overloaded threshold to affect namespace placement
+        config2.setLoadBalancerBrokerOverloadedThresholdPercentage(400);
         pulsar2 = new PulsarService(config2);
         pulsar2.start();
 
@@ -366,6 +370,13 @@ public class AntiAffinityNamespaceGroupTest {
     /**
      * It verifies anti-affinity with failure domain enabled with 2 brokers.
      *
+     * Note: in this class's setup method, the LoadBalancerBrokerOverloadedThresholdPercentage
+     * is set to 400 to ensure that the overloaded logic doesn't affect the broker selection.
+     * Without that configuration, two namespaces in the same anti-affinity group could
+     * be placed on the same broker. The CPU usage can be over 100%, and if we run with more
+     * than 4 cores, it could conceivably be above 400%. If this test becomes flaky again,
+     * look at the logs to see if there is a mention of an overloaded broker.
+     *
      * <pre>
      * 1. Register brokers to domain: domain-1: broker1 & domain-2: broker2
      * 2. Load-Manager receives a watch and updates brokerToDomain cache with new domain data