You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2021/05/10 15:34:45 UTC

[lucene-solr] branch branch_8x updated: SOLR-15300: Report collection and shard "health" state in CLUSTERSTATUS response.

This is an automated email from the ASF dual-hosted git repository.

ab pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new 651b44f  SOLR-15300: Report collection and shard "health" state in CLUSTERSTATUS response.
651b44f is described below

commit 651b44f210896f3bcfcf856dada68aafccba0865
Author: Andrzej Bialecki <ab...@apache.org>
AuthorDate: Mon May 10 17:34:16 2021 +0200

    SOLR-15300: Report collection and shard "health" state in CLUSTERSTATUS response.
---
 solr/CHANGES.txt                                   |  2 +
 .../apache/solr/handler/admin/ClusterStatus.java   | 70 +++++++++++++++++++++-
 .../cloud/api/collections/TestCollectionAPI.java   | 65 ++++++++++++++++++++
 .../src/cluster-node-management.adoc               | 22 +++++++
 4 files changed, 156 insertions(+), 3 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index d144204..74c92b0 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -21,6 +21,8 @@ New Features
 
 * SOLR-15365: Improved Grafana dashboard for Prometheus Exporter with new Solr Cluster row (janhoy)
 
+* SOLR-15300: Report collection and shard "health" state in CLUSTERSTATUS response. (ab, janhoy)
+
 Improvements
 ---------------------
 * SOLR-15081: Metrics for a core: add SolrCloud "isLeader" and "replicaState".  (David Smiley)
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/ClusterStatus.java b/solr/core/src/java/org/apache/solr/handler/admin/ClusterStatus.java
index e9a404a..80c2b2d 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/ClusterStatus.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/ClusterStatus.java
@@ -49,6 +49,48 @@ public class ClusterStatus {
   private final ZkNodeProps message;
   private final String collection; // maybe null
 
+  /** Shard / collection health state; constants are declared from best (GREEN) to worst (RED). */
+  public enum Health {
+    /** All replicas up, leader exists. */
+    GREEN,
+    /** More than half, but not all, replicas up; leader exists. */
+    YELLOW,
+    /** At least one, but no more than half, of the replicas up; leader exists. */
+    ORANGE,
+    /** No leader, or no replicas up. */
+    RED;
+
+    public static final float ORANGE_LEVEL = 0.5f; // fraction of replicas up at or below which a shard is at best ORANGE
+    public static final float RED_LEVEL = 0.0f; // fraction of replicas up at or below which a shard is RED
+    /** Derives a shard's health from the fraction of its replicas that are up and whether it has a leader; without a leader the result is always RED. */
+    public static Health calcShardHealth(float fractionReplicasUp, boolean hasLeader) {
+      if (hasLeader) {
+        if (fractionReplicasUp == 1.0f) { // exact match: anything short of all replicas up is not GREEN
+          return GREEN;
+        } else if (fractionReplicasUp > ORANGE_LEVEL) {
+          return YELLOW;
+        } else if (fractionReplicasUp > RED_LEVEL) {
+          return ORANGE;
+        } else {
+          return RED;
+        }
+      } else {
+        return RED;
+      }
+    }
+
+    /** Combine multiple states into one. Always reports the worst state; an empty collection yields GREEN. */
+    public static Health combine(Collection<Health> states) {
+      Health res = GREEN;
+      for (Health state : states) {
+        if (state.ordinal() > res.ordinal()) { // a higher ordinal means a worse state (relies on declaration order)
+          res = state;
+        }
+      }
+      return res;
+    }
+  }
+
   public ClusterStatus(ZkStateReader zkStateReader, ZkNodeProps props) {
     this.zkStateReader = zkStateReader;
     this.message = props;
@@ -260,17 +302,39 @@ public class ClusterStatus {
     final Map<String, Map<String,Object>> shards = collection != null
         ? (Map<String, Map<String,Object>>)collection.getOrDefault("shards", Collections.emptyMap())
         : Collections.emptyMap();
-    shards.values().forEach(s -> {
+    final List<Health> healthStates = new ArrayList<>(shards.size());
+    shards.forEach((shardName, s) -> {
       final Map<String, Map<String,Object>> replicas =
           (Map<String, Map<String,Object>>)s.getOrDefault("replicas", Collections.emptyMap());
-      replicas.values().forEach(r -> {
+      int[] totalVsActive = new int[2];
+      boolean hasLeader = false;
+      for(Map<String, Object> r : replicas.values()) {
+        totalVsActive[0]++;
+        boolean active = false;
+        if (Replica.State.ACTIVE.toString().equals(r.get("state"))) {
+          totalVsActive[1]++;
+          active = true;
+        }
+        if ("true".equals(r.get("leader")) && active) {
+          hasLeader = true;
+        }
         String nodeName = (String)r.get(ZkStateReader.NODE_NAME_PROP);
         if (nodeName != null) {
           // UI needs the base_url set
           r.put(ZkStateReader.BASE_URL_PROP, UrlScheme.INSTANCE.getBaseUrlForNodeName(nodeName));
         }
-      });
+      }
+      float ratioActive;
+      if (totalVsActive[0] == 0) {
+        ratioActive = 0.0f;
+      } else {
+        ratioActive = (float) totalVsActive[1] / totalVsActive[0];
+      }
+      Health health = Health.calcShardHealth(ratioActive, hasLeader);
+      s.put("health", health.toString());
+      healthStates.add(health);
     });
+    collection.put("health", Health.combine(healthStates).toString());
     return collection;
   }
 }
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionAPI.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionAPI.java
index d174cae..74e9654 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionAPI.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionAPI.java
@@ -22,10 +22,15 @@ import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicLong;
 
 import com.google.common.collect.Lists;
+import org.apache.solr.client.solrj.cloud.SolrCloudManager;
+import org.apache.solr.client.solrj.embedded.JettySolrRunner;
 import org.apache.solr.client.solrj.impl.BaseHttpSolrClient.RemoteSolrException;
+import org.apache.solr.client.solrj.impl.SolrClientCloudManager;
+import org.apache.solr.cloud.CloudUtil;
 import org.apache.solr.cloud.ZkTestServer;
 import org.apache.solr.client.solrj.SolrQuery;
 import org.apache.solr.client.solrj.SolrRequest;
@@ -45,10 +50,12 @@ import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.SolrZkClient;
 import org.apache.solr.common.cloud.ZkConfigManager;
+import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.common.params.CollectionParams;
 import org.apache.solr.common.params.ModifiableSolrParams;
 import org.apache.solr.common.params.ShardParams;
 import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.Utils;
 import org.apache.zookeeper.KeeperException;
 import org.junit.Test;
 
@@ -369,6 +376,64 @@ public class TestCollectionAPI extends ReplicaPropertiesBase {
     }
   }
 
+  @SuppressWarnings({"unchecked"}) // NOTE(review): private, and no call site is visible in this diff - confirm the test is actually invoked
+  private void clusterStatusWithCollectionHealthState() throws Exception { // checks CLUSTERSTATUS "health" before, during and after stopping one node
+    try (CloudSolrClient client = createCloudClient(null)) {
+      final CollectionAdminRequest.ClusterStatus request = new CollectionAdminRequest.ClusterStatus();
+      request.setCollectionName(COLLECTION_NAME);
+      NamedList<Object> rsp = request.process(client).getResponse();
+      NamedList<Object> cluster = (NamedList<Object>) rsp.get("cluster");
+      assertNotNull("Cluster state should not be null", cluster);
+      Map<String, Object> collection = (Map<String, Object>) Utils.getObjectByPath(cluster, false, "collections/" + COLLECTION_NAME);
+      assertEquals("collection health", "GREEN", collection.get("health")); // baseline: everything is expected to be GREEN before any node is stopped
+      Map<String, Object> shardStatus = (Map<String, Object>) collection.get("shards");
+      assertEquals(2, shardStatus.size());
+      String health = (String) Utils.getObjectByPath(shardStatus, false, "shard1/health");
+      assertEquals("shard1 health", "GREEN", health);
+      health = (String) Utils.getObjectByPath(shardStatus, false, "shard2/health");
+      assertEquals("shard2 health", "GREEN", health);
+      // NOTE(review): the node stopped below is only restarted on the success path (no try/finally); a failed assertion leaves it down
+      // bring some replicas down: stop the Jetty that chaosMonkey returns for shard1, index 0, and wait until it leaves live nodes
+      JettySolrRunner jetty = chaosMonkey.getShard("shard1", 0);
+      String nodeName = jetty.getNodeName();
+      jetty.stop();
+      ZkStateReader zkStateReader = client.getZkStateReader();
+      zkStateReader.waitForLiveNodes(30, TimeUnit.SECONDS, (o, n) -> n != null && !n.contains(nodeName));
+      // while the node is down, the collection and at least one of the two shards must report a non-GREEN health
+      rsp = request.process(client).getResponse();
+      collection = (Map<String, Object>) Utils.getObjectByPath(rsp, false, "cluster/collections/" + COLLECTION_NAME);
+      assertFalse("collection health should not be GREEN", "GREEN".equals(collection.get("health")));
+      shardStatus = (Map<String, Object>) collection.get("shards");
+      assertEquals(2, shardStatus.size());
+      String health1 = (String) Utils.getObjectByPath(shardStatus, false, "shard1/health");
+      String health2 = (String) Utils.getObjectByPath(shardStatus, false, "shard2/health");
+      assertTrue("shard1=" + health1 + ", shard2=" + health2, !"GREEN".equals(health1) || !"GREEN".equals(health2)); // does not pin down which shard degrades
+      // NOTE(review): cloudManager created below is never closed in this method - confirm whether it needs to be
+      // bring them up again, then wait for the node to be live and for every replica to be active before re-checking
+      jetty.start();
+      SolrCloudManager cloudManager = new SolrClientCloudManager(null, client);
+      zkStateReader.waitForLiveNodes(30, TimeUnit.SECONDS, (o, n) -> n != null && n.contains(nodeName));
+      CloudUtil.waitForState(cloudManager, COLLECTION_NAME, 30, TimeUnit.SECONDS, (liveNodes, coll) -> {
+        for (Replica r : coll.getReplicas()) {
+          if (!r.isActive(liveNodes)) {
+            return false;
+          }
+        }
+        return true;
+      });
+      rsp = request.process(client).getResponse();
+      collection = (Map<String, Object>) Utils.getObjectByPath(rsp, false, "cluster/collections/" + COLLECTION_NAME);
+      assertEquals("collection health", "GREEN", collection.get("health")); // after recovery everything must be GREEN again
+      shardStatus = (Map<String, Object>) collection.get("shards");
+      assertEquals(2, shardStatus.size());
+      health = (String) Utils.getObjectByPath(shardStatus, false, "shard1/health");
+      assertEquals("shard1 health", "GREEN", health);
+      health = (String) Utils.getObjectByPath(shardStatus, false, "shard2/health");
+      assertEquals("shard2 health", "GREEN", health);
+
+    }
+  }
+
 
   private void listCollection() throws IOException, SolrServerException {
     try (CloudSolrClient client = createCloudClient(null)) {
diff --git a/solr/solr-ref-guide/src/cluster-node-management.adoc b/solr/solr-ref-guide/src/cluster-node-management.adoc
index 1bc31fd..7a13227 100644
--- a/solr/solr-ref-guide/src/cluster-node-management.adoc
+++ b/solr/solr-ref-guide/src/cluster-node-management.adoc
@@ -28,6 +28,25 @@ Fetch the cluster status including collections, shards, replicas, configuration
 
 `/admin/collections?action=CLUSTERSTATUS`
 
+Additionally, this command reports a `health` status for each collection and shard, in
+order to make it easier to monitor the operational state of the collections. The
+following health state values are defined, ordered from best to worst, based on
+the percentage of active replicas (`active`):
+
+`GREEN`::
+`active == 100%`, all replicas are active and there's a shard leader.
+`YELLOW`::
+`100% > active > 50%`, AND there's a shard leader.
+`ORANGE`::
+`50% >= active > 0%`, AND there's a shard leader.
+`RED`::
+No active replicas *OR* there's no shard leader.
+
+The collection health state is reported as the worst state of any of its shards, e.g. for
+a collection with all shards GREEN except for one YELLOW, the collection health will be
+reported as YELLOW.
+
+
 === CLUSTERSTATUS Parameters
 
 `collection`::
@@ -67,6 +86,7 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS
           "shard1":{
             "range":"80000000-ffffffff",
             "state":"active",
+            "health": "GREEN",
             "replicas":{
               "core_node1":{
                 "state":"active",
@@ -82,6 +102,7 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS
           "shard2":{
             "range":"0-7fffffff",
             "state":"active",
+            "health": "GREEN",
             "replicas":{
               "core_node2":{
                 "state":"active",
@@ -100,6 +121,7 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS
         "znodeVersion": 11,
         "autoCreated":"true",
         "configName" : "my_config",
+        "health": "GREEN",
         "aliases":["both_collections"]
       },
       "collection2":{