You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ab...@apache.org on 2021/05/10 15:34:45 UTC
[lucene-solr] branch branch_8x updated: SOLR-15300: Report
collection and shard "health" state in CLUSTERSTATUS response.
This is an automated email from the ASF dual-hosted git repository.
ab pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git
The following commit(s) were added to refs/heads/branch_8x by this push:
new 651b44f SOLR-15300: Report collection and shard "health" state in CLUSTERSTATUS response.
651b44f is described below
commit 651b44f210896f3bcfcf856dada68aafccba0865
Author: Andrzej Bialecki <ab...@apache.org>
AuthorDate: Mon May 10 17:34:16 2021 +0200
SOLR-15300: Report collection and shard "health" state in CLUSTERSTATUS response.
---
solr/CHANGES.txt | 2 +
.../apache/solr/handler/admin/ClusterStatus.java | 70 +++++++++++++++++++++-
.../cloud/api/collections/TestCollectionAPI.java | 65 ++++++++++++++++++++
.../src/cluster-node-management.adoc | 22 +++++++
4 files changed, 156 insertions(+), 3 deletions(-)
diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index d144204..74c92b0 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -21,6 +21,8 @@ New Features
* SOLR-15365: Improved Grafana dashboard for Prometheus Exporter with new Solr Cluster row (janhoy)
+* SOLR-15300: Report collection and shard "health" state in CLUSTERSTATUS response. (ab, janhoy)
+
Improvements
---------------------
* SOLR-15081: Metrics for a core: add SolrCloud "isLeader" and "replicaState". (David Smiley)
diff --git a/solr/core/src/java/org/apache/solr/handler/admin/ClusterStatus.java b/solr/core/src/java/org/apache/solr/handler/admin/ClusterStatus.java
index e9a404a..80c2b2d 100644
--- a/solr/core/src/java/org/apache/solr/handler/admin/ClusterStatus.java
+++ b/solr/core/src/java/org/apache/solr/handler/admin/ClusterStatus.java
@@ -49,6 +49,48 @@ public class ClusterStatus {
private final ZkNodeProps message;
private final String collection; // maybe null
+  /**
+   * Shard / collection health state. Constants are declared from best to worst;
+   * {@link #combine(Collection)} relies on that declaration order.
+   */
+  public enum Health {
+    /** All replicas active, leader exists. */
+    GREEN,
+    /** More than half, but not all, replicas active; leader exists. */
+    YELLOW,
+    /** At least one, but no more than half, of the replicas active; leader exists. */
+    ORANGE,
+    /** No leader or no active replicas. */
+    RED;
+
+    /** Active-replica fraction at or below which a shard is no better than ORANGE. */
+    public static final float ORANGE_LEVEL = 0.5f;
+    /** Active-replica fraction at or below which a shard is RED. */
+    public static final float RED_LEVEL = 0.0f;
+
+    /**
+     * Classify a shard from its fraction of active replicas and leader presence.
+     * @param fractionReplicasUp fraction of active replicas, expected in [0, 1]
+     * @param hasLeader whether the shard has a leader
+     * @return RED without a leader; otherwise the level matching the fraction
+     */
+    public static Health calcShardHealth(float fractionReplicasUp, boolean hasLeader) {
+      if (!hasLeader || !(fractionReplicasUp > RED_LEVEL)) {
+        return RED; // no leader, nothing active, or NaN
+      }
+      // ">=" rather than "==" so rounding slightly above 1.0 is not demoted to YELLOW
+      if (fractionReplicasUp >= 1.0f) {
+        return GREEN;
+      }
+      return fractionReplicasUp > ORANGE_LEVEL ? YELLOW : ORANGE;
+    }
+
+    /** Combine multiple states into one: the worst state wins; GREEN if empty. */
+    public static Health combine(Collection<Health> states) {
+      return states.isEmpty() ? GREEN : Collections.max(states);
+    }
+  }
+
public ClusterStatus(ZkStateReader zkStateReader, ZkNodeProps props) {
this.zkStateReader = zkStateReader;
this.message = props;
@@ -260,17 +302,39 @@ public class ClusterStatus {
final Map<String, Map<String,Object>> shards = collection != null
? (Map<String, Map<String,Object>>)collection.getOrDefault("shards", Collections.emptyMap())
: Collections.emptyMap();
- shards.values().forEach(s -> {
+ final List<Health> healthStates = new ArrayList<>(shards.size());
+ shards.forEach((shardName, s) -> {
final Map<String, Map<String,Object>> replicas =
(Map<String, Map<String,Object>>)s.getOrDefault("replicas", Collections.emptyMap());
- replicas.values().forEach(r -> {
+ int[] totalVsActive = new int[2];
+ boolean hasLeader = false;
+ for(Map<String, Object> r : replicas.values()) {
+ totalVsActive[0]++;
+ boolean active = false;
+ if (Replica.State.ACTIVE.toString().equals(r.get("state"))) {
+ totalVsActive[1]++;
+ active = true;
+ }
+ if ("true".equals(r.get("leader")) && active) {
+ hasLeader = true;
+ }
String nodeName = (String)r.get(ZkStateReader.NODE_NAME_PROP);
if (nodeName != null) {
// UI needs the base_url set
r.put(ZkStateReader.BASE_URL_PROP, UrlScheme.INSTANCE.getBaseUrlForNodeName(nodeName));
}
- });
+ }
+ float ratioActive;
+ if (totalVsActive[0] == 0) {
+ ratioActive = 0.0f;
+ } else {
+ ratioActive = (float) totalVsActive[1] / totalVsActive[0];
+ }
+ Health health = Health.calcShardHealth(ratioActive, hasLeader);
+ s.put("health", health.toString());
+ healthStates.add(health);
});
+ collection.put("health", Health.combine(healthStates).toString());
return collection;
}
}
diff --git a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionAPI.java b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionAPI.java
index d174cae..74e9654 100644
--- a/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionAPI.java
+++ b/solr/core/src/test/org/apache/solr/cloud/api/collections/TestCollectionAPI.java
@@ -22,10 +22,15 @@ import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
+import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicLong;
import com.google.common.collect.Lists;
+import org.apache.solr.client.solrj.cloud.SolrCloudManager;
+import org.apache.solr.client.solrj.embedded.JettySolrRunner;
import org.apache.solr.client.solrj.impl.BaseHttpSolrClient.RemoteSolrException;
+import org.apache.solr.client.solrj.impl.SolrClientCloudManager;
+import org.apache.solr.cloud.CloudUtil;
import org.apache.solr.cloud.ZkTestServer;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrRequest;
@@ -45,10 +50,12 @@ import org.apache.solr.common.cloud.Replica;
import org.apache.solr.common.cloud.Slice;
import org.apache.solr.common.cloud.SolrZkClient;
import org.apache.solr.common.cloud.ZkConfigManager;
+import org.apache.solr.common.cloud.ZkStateReader;
import org.apache.solr.common.params.CollectionParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.ShardParams;
import org.apache.solr.common.util.NamedList;
+import org.apache.solr.common.util.Utils;
import org.apache.zookeeper.KeeperException;
import org.junit.Test;
@@ -369,6 +376,64 @@ public class TestCollectionAPI extends ReplicaPropertiesBase {
}
}
+  /**
+   * Checks the "health" values reported by CLUSTERSTATUS: GREEN with all nodes up, not GREEN
+   * after one node is stopped, GREEN again once it is restarted and all replicas are active.
+   * NOTE(review): this private method must be called from the test entry point to run - confirm.
+   */
+  @SuppressWarnings({"unchecked"})
+  private void clusterStatusWithCollectionHealthState() throws Exception {
+    try (CloudSolrClient client = createCloudClient(null)) {
+      final CollectionAdminRequest.ClusterStatus statusRequest = new CollectionAdminRequest.ClusterStatus();
+      statusRequest.setCollectionName(COLLECTION_NAME);
+      final String collectionPath = "cluster/collections/" + COLLECTION_NAME;
+
+      // initial state: everything is expected to be GREEN
+      NamedList<Object> response = statusRequest.process(client).getResponse();
+      NamedList<Object> clusterState = (NamedList<Object>) response.get("cluster");
+      assertNotNull("Cluster state should not be null", clusterState);
+      Map<String, Object> collState =
+          (Map<String, Object>) Utils.getObjectByPath(clusterState, false, "collections/" + COLLECTION_NAME);
+      assertEquals("collection health", "GREEN", collState.get("health"));
+      Map<String, Object> shardsState = (Map<String, Object>) collState.get("shards");
+      assertEquals(2, shardsState.size());
+      assertEquals("shard1 health", "GREEN", (String) Utils.getObjectByPath(shardsState, false, "shard1/health"));
+      assertEquals("shard2 health", "GREEN", (String) Utils.getObjectByPath(shardsState, false, "shard2/health"));
+
+      // stop one node and wait until it is no longer listed as a live node
+      JettySolrRunner stoppedJetty = chaosMonkey.getShard("shard1", 0);
+      final String stoppedNode = stoppedJetty.getNodeName();
+      stoppedJetty.stop();
+      ZkStateReader reader = client.getZkStateReader();
+      reader.waitForLiveNodes(30, TimeUnit.SECONDS, (oldNodes, newNodes) -> newNodes != null && !newNodes.contains(stoppedNode));
+
+      response = statusRequest.process(client).getResponse();
+      collState = (Map<String, Object>) Utils.getObjectByPath(response, false, collectionPath);
+      assertFalse("collection health should not be GREEN", "GREEN".equals(collState.get("health")));
+      shardsState = (Map<String, Object>) collState.get("shards");
+      assertEquals(2, shardsState.size());
+      String shard1Health = (String) Utils.getObjectByPath(shardsState, false, "shard1/health");
+      String shard2Health = (String) Utils.getObjectByPath(shardsState, false, "shard2/health");
+      assertFalse("shard1=" + shard1Health + ", shard2=" + shard2Health,
+          "GREEN".equals(shard1Health) && "GREEN".equals(shard2Health));
+
+      // restart the node and wait until every replica is active again
+      stoppedJetty.start();
+      SolrCloudManager cloudManager = new SolrClientCloudManager(null, client);
+      reader.waitForLiveNodes(30, TimeUnit.SECONDS, (oldNodes, newNodes) -> newNodes != null && newNodes.contains(stoppedNode));
+      CloudUtil.waitForState(cloudManager, COLLECTION_NAME, 30, TimeUnit.SECONDS,
+          (liveNodes, coll) -> coll.getReplicas().stream().allMatch(r -> r.isActive(liveNodes)));
+
+      response = statusRequest.process(client).getResponse();
+      collState = (Map<String, Object>) Utils.getObjectByPath(response, false, collectionPath);
+      assertEquals("collection health", "GREEN", collState.get("health"));
+      shardsState = (Map<String, Object>) collState.get("shards");
+      assertEquals(2, shardsState.size());
+      assertEquals("shard1 health", "GREEN", (String) Utils.getObjectByPath(shardsState, false, "shard1/health"));
+      assertEquals("shard2 health", "GREEN", (String) Utils.getObjectByPath(shardsState, false, "shard2/health"));
+    }
+  }
+
private void listCollection() throws IOException, SolrServerException {
try (CloudSolrClient client = createCloudClient(null)) {
diff --git a/solr/solr-ref-guide/src/cluster-node-management.adoc b/solr/solr-ref-guide/src/cluster-node-management.adoc
index 1bc31fd..7a13227 100644
--- a/solr/solr-ref-guide/src/cluster-node-management.adoc
+++ b/solr/solr-ref-guide/src/cluster-node-management.adoc
@@ -28,6 +28,25 @@ Fetch the cluster status including collections, shards, replicas, configuration
`/admin/collections?action=CLUSTERSTATUS`
+Additionally, this command reports a `health` status for each collection and shard to
+make it easier to monitor the operational state of the collections. The following
+health state values are defined, ordered from best to worst, based on the percentage
+of active replicas (`active`):
+
+`GREEN`::
+`active == 100%`, all replicas are active and there's a shard leader.
+`YELLOW`::
+`100% > active > 50%`, AND there's a shard leader.
+`ORANGE`::
+`50% >= active > 0%`, AND there's a shard leader.
+`RED`::
+No active replicas *OR* there's no shard leader.
+
+The collection health state is reported as the worst state of any of its shards, e.g.,
+for a collection whose shards are all GREEN except for one YELLOW shard, the collection
+health is reported as YELLOW.
+
+
=== CLUSTERSTATUS Parameters
`collection`::
@@ -67,6 +86,7 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS
"shard1":{
"range":"80000000-ffffffff",
"state":"active",
+ "health": "GREEN",
"replicas":{
"core_node1":{
"state":"active",
@@ -82,6 +102,7 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS
"shard2":{
"range":"0-7fffffff",
"state":"active",
+ "health": "GREEN",
"replicas":{
"core_node2":{
"state":"active",
@@ -100,6 +121,7 @@ http://localhost:8983/solr/admin/collections?action=CLUSTERSTATUS
"znodeVersion": 11,
"autoCreated":"true",
"configName" : "my_config",
+ "health": "GREEN",
"aliases":["both_collections"]
},
"collection2":{