You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@helix.apache.org by xy...@apache.org on 2023/10/24 21:42:10 UTC

[helix] branch ApplicationClusterManager updated: Refactor stoppable check logic for enhanced zone analysis (#2654)

This is an automated email from the ASF dual-hosted git repository.

xyuanlu pushed a commit to branch ApplicationClusterManager
in repository https://gitbox.apache.org/repos/asf/helix.git


The following commit(s) were added to refs/heads/ApplicationClusterManager by this push:
     new 9f4000611 Refactor stoppable check logic for enhanced zone analysis (#2654)
9f4000611 is described below

commit 9f4000611c9aa59b9bbd65d2d50667237e278535
Author: Xiaxuan Gao <32...@users.noreply.github.com>
AuthorDate: Tue Oct 24 14:42:04 2023 -0700

    Refactor stoppable check logic for enhanced zone analysis (#2654)
    
    Refactor the zone-based stoppable check logic and add support to randomly select zone order for the zone-based stoppable check.
---
 .../StoppableInstancesSelector.java                | 232 +++++++++++++++++++++
 .../server/resources/helix/InstancesAccessor.java  | 110 +++-------
 2 files changed, 261 insertions(+), 81 deletions(-)

diff --git a/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java
new file mode 100644
index 000000000..dafe1ab2d
--- /dev/null
+++ b/helix-rest/src/main/java/org/apache/helix/rest/clusterMaintenanceService/StoppableInstancesSelector.java
@@ -0,0 +1,232 @@
+package org.apache.helix.rest.clusterMaintenanceService;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.LinkedHashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.TreeSet;
+import java.util.stream.Collectors;
+
+import com.fasterxml.jackson.databind.node.ArrayNode;
+import com.fasterxml.jackson.databind.node.JsonNodeFactory;
+import com.fasterxml.jackson.databind.node.ObjectNode;
+import org.apache.commons.lang3.NotImplementedException;
+import org.apache.helix.rest.server.json.cluster.ClusterTopology;
+import org.apache.helix.rest.server.json.instance.StoppableCheck;
+
+public class StoppableInstancesSelector {
+  // This type does not belong to real HealthCheck failed reason. Also, if we add this type
+  // to HealthCheck enum, it could introduce more unnecessary check step since the InstanceServiceImpl
+  // loops all the types to do corresponding checks.
+  private final static String INSTANCE_NOT_EXIST = "HELIX:INSTANCE_NOT_EXIST";
+  private String _clusterId;
+  private List<String> _orderOfZone;
+  private String _customizedInput;
+  private ArrayNode _stoppableInstances;
+  private ObjectNode _failedStoppableInstances;
+  private MaintenanceManagementService _maintenanceService;
+  private ClusterTopology _clusterTopology;
+
+  public StoppableInstancesSelector(String clusterId, List<String> orderOfZone,
+      String customizedInput, ArrayNode stoppableInstances, ObjectNode failedStoppableInstances,
+      MaintenanceManagementService maintenanceService, ClusterTopology clusterTopology) {
+    _clusterId = clusterId;
+    _orderOfZone = orderOfZone;
+    _customizedInput = customizedInput;
+    _stoppableInstances = stoppableInstances;
+    _failedStoppableInstances = failedStoppableInstances;
+    _maintenanceService = maintenanceService;
+    _clusterTopology = clusterTopology;
+  }
+
+  /**
+   * Evaluates and collects stoppable instances within a specified or determined zone based on the order of zones.
+   * If _orderOfZone is specified, the method targets the first non-empty zone; otherwise, it targets the zone with
+   * the highest instance count. The method iterates through instances, performing stoppable checks, and records
+   * reasons for non-stoppability.
+   *
+   * @param instances A list of instance to be evaluated.
+   * @throws IOException
+   */
+  public void getStoppableInstancesInSingleZone(List<String> instances) throws IOException {
+    List<String> zoneBasedInstance =
+        getZoneBasedInstances(instances, _clusterTopology.toZoneMapping());
+    Map<String, StoppableCheck> instancesStoppableChecks =
+        _maintenanceService.batchGetInstancesStoppableChecks(_clusterId, zoneBasedInstance,
+            _customizedInput);
+    for (Map.Entry<String, StoppableCheck> instanceStoppableCheck : instancesStoppableChecks.entrySet()) {
+      String instance = instanceStoppableCheck.getKey();
+      StoppableCheck stoppableCheck = instanceStoppableCheck.getValue();
+      if (!stoppableCheck.isStoppable()) {
+        ArrayNode failedReasonsNode = _failedStoppableInstances.putArray(instance);
+        for (String failedReason : stoppableCheck.getFailedChecks()) {
+          failedReasonsNode.add(JsonNodeFactory.instance.textNode(failedReason));
+        }
+      } else {
+        _stoppableInstances.add(instance);
+      }
+    }
+    // Adding following logic to check whether instances exist or not. An instance exist could be
+    // checking following scenario:
+    // 1. Instance got dropped. (InstanceConfig is gone.)
+    // 2. Instance name has typo.
+
+    // If we dont add this check, the instance, which does not exist, will be disappeared from
+    // result since Helix skips instances for instances not in the selected zone. User may get
+    // confused with the output.
+    Set<String> nonSelectedInstances = new HashSet<>(instances);
+    nonSelectedInstances.removeAll(_clusterTopology.getAllInstances());
+    for (String nonSelectedInstance : nonSelectedInstances) {
+      ArrayNode failedReasonsNode = _failedStoppableInstances.putArray(nonSelectedInstance);
+      failedReasonsNode.add(JsonNodeFactory.instance.textNode(INSTANCE_NOT_EXIST));
+    }
+  }
+
+  public void getStoppableInstancesCrossZones() {
+    // TODO: Add implementation to enable cross zone stoppable check.
+    throw new NotImplementedException("Not Implemented");
+  }
+
+  /**
+   * Determines the order of zones. If an order is provided by the user, it will be used directly.
+   * Otherwise, zones will be ordered by their associated instance count in descending order.
+   *
+   * If `random` is true, the order of zones will be randomized regardless of any previous order.
+   *
+   * @param random Indicates whether to randomize the order of zones.
+   */
+  public void calculateOrderOfZone(boolean random) {
+    if (_orderOfZone == null) {
+      _orderOfZone =
+          new ArrayList<>(getOrderedZoneToInstancesMap(_clusterTopology.toZoneMapping()).keySet());
+    }
+
+    if (_orderOfZone.isEmpty()) {
+      return;
+    }
+
+    if (random) {
+      Collections.shuffle(_orderOfZone);
+    }
+  }
+
+  /**
+   * Get instances belongs to the first zone. If the zone is already empty, Helix will iterate zones
+   * by order until find the zone contains instances.
+   *
+   * The order of zones can directly come from user input. If user did not specify it, Helix will order
+   * zones by the number of associated instances in descending order.
+   *
+   * @param instances
+   * @param zoneMapping
+   * @return
+   */
+  private List<String> getZoneBasedInstances(List<String> instances,
+      Map<String, Set<String>> zoneMapping) {
+    if (_orderOfZone.isEmpty()) {
+      return _orderOfZone;
+    }
+
+    Set<String> instanceSet = null;
+    for (String zone : _orderOfZone) {
+      instanceSet = new TreeSet<>(instances);
+      Set<String> currentZoneInstanceSet = new HashSet<>(zoneMapping.get(zone));
+      instanceSet.retainAll(currentZoneInstanceSet);
+      if (instanceSet.size() > 0) {
+        return new ArrayList<>(instanceSet);
+      }
+    }
+
+    return Collections.EMPTY_LIST;
+  }
+
+  /**
+   * Returns a map from zone to instances set, ordered by the number of instances in each zone
+   * in descending order.
+   *
+   * @param zoneMapping A map from zone to instances set
+   * @return An ordered map from zone to instances set, with zones having more instances appearing first.
+   */
+  private Map<String, Set<String>> getOrderedZoneToInstancesMap(
+      Map<String, Set<String>> zoneMapping) {
+    return zoneMapping.entrySet().stream()
+        .sorted((e1, e2) -> e2.getValue().size() - e1.getValue().size()).collect(
+            Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue,
+                (existing, replacement) -> existing, LinkedHashMap::new));
+  }
+
+  public static class StoppableInstancesSelectorBuilder {
+    private String _clusterId;
+    private List<String> _orderOfZone;
+    private String _customizedInput;
+    private ArrayNode _stoppableInstances;
+    private ObjectNode _failedStoppableInstances;
+    private MaintenanceManagementService _maintenanceService;
+    private ClusterTopology _clusterTopology;
+
+    public StoppableInstancesSelectorBuilder setClusterId(String clusterId) {
+      _clusterId = clusterId;
+      return this;
+    }
+
+    public StoppableInstancesSelectorBuilder setOrderOfZone(List<String> orderOfZone) {
+      _orderOfZone = orderOfZone;
+      return this;
+    }
+
+    public StoppableInstancesSelectorBuilder setCustomizedInput(String customizedInput) {
+      _customizedInput = customizedInput;
+      return this;
+    }
+
+    public StoppableInstancesSelectorBuilder setStoppableInstances(ArrayNode stoppableInstances) {
+      _stoppableInstances = stoppableInstances;
+      return this;
+    }
+
+    public StoppableInstancesSelectorBuilder setFailedStoppableInstances(ObjectNode failedStoppableInstances) {
+      _failedStoppableInstances = failedStoppableInstances;
+      return this;
+    }
+
+    public StoppableInstancesSelectorBuilder setMaintenanceService(
+        MaintenanceManagementService maintenanceService) {
+      _maintenanceService = maintenanceService;
+      return this;
+    }
+
+    public StoppableInstancesSelectorBuilder setClusterTopology(ClusterTopology clusterTopology) {
+      _clusterTopology = clusterTopology;
+      return this;
+    }
+
+    public StoppableInstancesSelector build() {
+      return new StoppableInstancesSelector(_clusterId, _orderOfZone,
+          _customizedInput, _stoppableInstances, _failedStoppableInstances, _maintenanceService,
+          _clusterTopology);
+    }
+  }
+}
diff --git a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
index 87be72b96..8a2120270 100644
--- a/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
+++ b/helix-rest/src/main/java/org/apache/helix/rest/server/resources/helix/InstancesAccessor.java
@@ -20,13 +20,10 @@ package org.apache.helix.rest.server.resources.helix;
  */
 
 import java.io.IOException;
-import java.util.ArrayList;
 import java.util.Collections;
-import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
-import java.util.TreeSet;
 import javax.ws.rs.DefaultValue;
 import javax.ws.rs.GET;
 import javax.ws.rs.POST;
@@ -49,6 +46,7 @@ import org.apache.helix.model.ClusterConfig;
 import org.apache.helix.model.InstanceConfig;
 import org.apache.helix.rest.clusterMaintenanceService.MaintenanceManagementService;
 import org.apache.helix.rest.common.HttpConstants;
+import org.apache.helix.rest.clusterMaintenanceService.StoppableInstancesSelector;
 import org.apache.helix.rest.server.filters.ClusterAuth;
 import org.apache.helix.rest.server.json.cluster.ClusterTopology;
 import org.apache.helix.rest.server.json.instance.StoppableCheck;
@@ -63,10 +61,6 @@ import org.slf4j.LoggerFactory;
 @Path("/clusters/{clusterId}/instances")
 public class InstancesAccessor extends AbstractHelixResource {
   private final static Logger _logger = LoggerFactory.getLogger(InstancesAccessor.class);
-  // This type does not belongs to real HealthCheck failed reason. Also if we add this type
-  // to HealthCheck enum, it could introduce more unnecessary check step since the InstanceServiceImpl
-  // loops all the types to do corresponding checks.
-  private final static String INSTANCE_NOT_EXIST = "HELIX:INSTANCE_NOT_EXIST";
   public enum InstancesProperties {
     instances,
     online,
@@ -80,7 +74,8 @@ public class InstancesAccessor extends AbstractHelixResource {
 
   public enum InstanceHealthSelectionBase {
     instance_based,
-    zone_based
+    zone_based,
+    cross_zone_based
   }
 
   @ResponseMetered(name = HttpConstants.READ_REQUEST)
@@ -153,7 +148,8 @@ public class InstancesAccessor extends AbstractHelixResource {
       @QueryParam("command") String command,
       @QueryParam("continueOnFailures") boolean continueOnFailures,
       @QueryParam("skipZKRead") boolean skipZKRead,
-      @QueryParam("skipHealthCheckCategories") String skipHealthCheckCategories, String content) {
+      @QueryParam("skipHealthCheckCategories") String skipHealthCheckCategories,
+      @DefaultValue("false") @QueryParam("random") boolean random, String content) {
     Command cmd;
     try {
       cmd = Command.valueOf(command);
@@ -198,7 +194,7 @@ public class InstancesAccessor extends AbstractHelixResource {
           break;
         case stoppable:
           return batchGetStoppableInstances(clusterId, node, skipZKRead, continueOnFailures,
-              skipHealthCheckCategorySet);
+              skipHealthCheckCategorySet, random);
         default:
           _logger.error("Unsupported command :" + command);
           return badRequest("Unsupported command :" + command);
@@ -215,8 +211,8 @@ public class InstancesAccessor extends AbstractHelixResource {
   }
 
   private Response batchGetStoppableInstances(String clusterId, JsonNode node, boolean skipZKRead,
-      boolean continueOnFailures, Set<StoppableCheck.Category> skipHealthCheckCategories)
-      throws IOException {
+      boolean continueOnFailures, Set<StoppableCheck.Category> skipHealthCheckCategories,
+      boolean random) throws IOException {
     try {
       // TODO: Process input data from the content
       InstancesAccessor.InstanceHealthSelectionBase selectionBase =
@@ -237,6 +233,12 @@ public class InstancesAccessor extends AbstractHelixResource {
         orderOfZone = OBJECT_MAPPER.readValue(
             node.get(InstancesAccessor.InstancesProperties.zone_order.name()).toString(),
             OBJECT_MAPPER.getTypeFactory().constructCollectionType(List.class, String.class));
+        if (!orderOfZone.isEmpty() && random) {
+          String message =
+              "Both 'orderOfZone' and 'random' parameters are set. Please specify only one option.";
+          _logger.error(message);
+          return badRequest(message);
+        }
       }
 
       // Prepare output result
@@ -253,40 +255,23 @@ public class InstancesAccessor extends AbstractHelixResource {
       ClusterService clusterService =
           new ClusterServiceImpl(getDataAccssor(clusterId), getConfigAccessor());
       ClusterTopology clusterTopology = clusterService.getClusterTopology(clusterId);
+      StoppableInstancesSelector stoppableInstancesSelector =
+          new StoppableInstancesSelector.StoppableInstancesSelectorBuilder()
+              .setClusterId(clusterId)
+              .setOrderOfZone(orderOfZone)
+              .setCustomizedInput(customizedInput)
+              .setStoppableInstances(stoppableInstances)
+              .setFailedStoppableInstances(failedStoppableInstances)
+              .setMaintenanceService(maintenanceService)
+              .setClusterTopology(clusterTopology)
+              .build();
+      stoppableInstancesSelector.calculateOrderOfZone(random);
       switch (selectionBase) {
         case zone_based:
-          List<String> zoneBasedInstance =
-              getZoneBasedInstances(instances, orderOfZone, clusterTopology.toZoneMapping());
-          Map<String, StoppableCheck> instancesStoppableChecks =
-              maintenanceService.batchGetInstancesStoppableChecks(clusterId, zoneBasedInstance,
-                  customizedInput);
-          for (Map.Entry<String, StoppableCheck> instanceStoppableCheck : instancesStoppableChecks.entrySet()) {
-            String instance = instanceStoppableCheck.getKey();
-            StoppableCheck stoppableCheck = instanceStoppableCheck.getValue();
-            if (!stoppableCheck.isStoppable()) {
-              ArrayNode failedReasonsNode = failedStoppableInstances.putArray(instance);
-              for (String failedReason : stoppableCheck.getFailedChecks()) {
-                failedReasonsNode.add(JsonNodeFactory.instance.textNode(failedReason));
-              }
-            } else {
-              stoppableInstances.add(instance);
-            }
-          }
-          // Adding following logic to check whether instances exist or not. An instance exist could be
-          // checking following scenario:
-          // 1. Instance got dropped. (InstanceConfig is gone.)
-          // 2. Instance name has typo.
-
-          // If we dont add this check, the instance, which does not exist, will be disappeared from
-          // result since Helix skips instances for instances not in the selected zone. User may get
-          // confused with the output.
-          Set<String> nonSelectedInstances = new HashSet<>(instances);
-          nonSelectedInstances.removeAll(clusterTopology.getAllInstances());
-          for (String nonSelectedInstance : nonSelectedInstances) {
-            ArrayNode failedReasonsNode = failedStoppableInstances.putArray(nonSelectedInstance);
-            failedReasonsNode.add(JsonNodeFactory.instance.textNode(INSTANCE_NOT_EXIST));
-          }
-
+          stoppableInstancesSelector.getStoppableInstancesInSingleZone(instances);
+          break;
+        case cross_zone_based:
+          stoppableInstancesSelector.getStoppableInstancesCrossZones();
           break;
         case instance_based:
         default:
@@ -304,41 +289,4 @@ public class InstancesAccessor extends AbstractHelixResource {
       throw e;
     }
   }
-
-  /**
-   * Get instances belongs to the first zone. If the zone is already empty, Helix will iterate zones
-   * by order until find the zone contains instances.
-   *
-   * The order of zones can directly come from user input. If user did not specify it, Helix will order
-   * zones with alphabetical order.
-   *
-   * @param instances
-   * @param orderedZones
-   * @return
-   */
-  private List<String> getZoneBasedInstances(List<String> instances, List<String> orderedZones,
-      Map<String, Set<String>> zoneMapping) {
-
-    // If the orderedZones is not specified, we will order all zones in alphabetical order.
-    if (orderedZones == null) {
-      orderedZones = new ArrayList<>(zoneMapping.keySet());
-      Collections.sort(orderedZones);
-    }
-
-    if (orderedZones.isEmpty()) {
-      return orderedZones;
-    }
-
-    Set<String> instanceSet = null;
-    for (String zone : orderedZones) {
-      instanceSet = new TreeSet<>(instances);
-      Set<String> currentZoneInstanceSet = new HashSet<>(zoneMapping.get(zone));
-      instanceSet.retainAll(currentZoneInstanceSet);
-      if (instanceSet.size() > 0) {
-        return new ArrayList<>(instanceSet);
-      }
-    }
-
-    return Collections.EMPTY_LIST;
-  }
 }