Posted to commits@solr.apache.org by ds...@apache.org on 2024/03/15 02:06:26 UTC

(solr) 01/02: SOLR-16403: ClusterSingleton to remove inactive Shards (#1926)

This is an automated email from the ASF dual-hosted git repository.

dsmiley pushed a commit to branch branch_9x
in repository https://gitbox.apache.org/repos/asf/solr.git

commit 4098d6ae3e298efdfa219d5a2629d69f60c5aff9
Author: pjmcarthur <92...@users.noreply.github.com>
AuthorDate: Thu Mar 14 12:49:48 2024 -0700

    SOLR-16403: ClusterSingleton to remove inactive Shards (#1926)
    
    ClusterSingleton that periodically removes state=INACTIVE shards.  These occur from shard splits.
    
    Co-authored-by: Paul McArthur <pm...@proton.me>
    (cherry picked from commit ca58f1aa90b351b69b0ec4184adbaaca03978573)
---
 solr/CHANGES.txt                                   |   2 +
 .../cluster/maintenance/InactiveShardRemover.java  | 219 ++++++++++++++++++++
 .../maintenance/InactiveShardRemoverConfig.java    |  60 ++++++
 .../solr/cluster/maintenance/package-info.java     |  19 ++
 .../org/apache/solr/cloud/DeleteShardTest.java     |  52 +----
 .../maintenance/InactiveShardRemoverTest.java      | 225 +++++++++++++++++++++
 .../pages/cluster-singleton-plugins.adoc           |  93 +++++++++
 .../java/org/apache/solr/cloud/ShardTestUtil.java  |  57 ++++++
 8 files changed, 686 insertions(+), 41 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 0a034818332..0cbbc4ea70e 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -11,6 +11,8 @@ New Features
 
 * SOLR-599: Add a new SolrJ client using the JDK’s built-in Http Client.  (James Dyer)
 
+* SOLR-16403: A new cluster singleton plugin to automatically remove inactive shards. (Paul McArthur, David Smiley)
+
 Improvements
 ---------------------
 * SOLR-17119: When registering or updating a ConfigurablePlugin through the `/cluster/plugin` API,
diff --git a/solr/core/src/java/org/apache/solr/cluster/maintenance/InactiveShardRemover.java b/solr/core/src/java/org/apache/solr/cluster/maintenance/InactiveShardRemover.java
new file mode 100644
index 00000000000..177663d1140
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/cluster/maintenance/InactiveShardRemover.java
@@ -0,0 +1,219 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cluster.maintenance;
+
+import com.google.common.annotations.VisibleForTesting;
+import java.lang.invoke.MethodHandles;
+import java.util.Collection;
+import java.util.HashSet;
+import java.util.Objects;
+import java.util.concurrent.Executors;
+import java.util.concurrent.ScheduledExecutorService;
+import java.util.concurrent.TimeUnit;
+import java.util.stream.Collectors;
+import org.apache.solr.api.ConfigurablePlugin;
+import org.apache.solr.client.solrj.SolrResponse;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.cloud.ClusterSingleton;
+import org.apache.solr.common.cloud.ClusterState;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.ZkStateReader;
+import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
+import org.apache.solr.core.CoreContainer;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * This Cluster Singleton can be configured to periodically find and remove {@link
+ * org.apache.solr.common.cloud.Slice.State#INACTIVE} Shards that are left behind after a Shard is
+ * split.
+ */
+public class InactiveShardRemover
+    implements ClusterSingleton, ConfigurablePlugin<InactiveShardRemoverConfig> {
+
+  private static final Logger log = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());
+
+  public static final String PLUGIN_NAME = ".inactive-shard-remover";
+
+  static class DeleteActor {
+
+    private final CoreContainer coreContainer;
+
+    DeleteActor(final CoreContainer coreContainer) {
+      this.coreContainer = coreContainer;
+    }
+
+    void delete(final Slice slice) {
+      CollectionAdminRequest.DeleteShard deleteRequest =
+          CollectionAdminRequest.deleteShard(slice.getCollection(), slice.getName());
+      try {
+        SolrResponse response =
+            coreContainer.getZkController().getSolrCloudManager().request(deleteRequest);
+        if (response.getException() != null) {
+          throw response.getException();
+        }
+      } catch (Exception e) {
+        log.warn("An exception occurred when deleting an inactive shard", e);
+      }
+    }
+  }
+
+  private State state = State.STOPPED;
+
+  private final CoreContainer coreContainer;
+
+  private final DeleteActor deleteActor;
+
+  private ScheduledExecutorService executor;
+
+  private long scheduleIntervalSeconds;
+
+  private long ttlSeconds;
+
+  private int maxDeletesPerCycle;
+
+  /** Constructor invoked via Reflection */
+  public InactiveShardRemover(final CoreContainer cc) {
+    this(cc, new DeleteActor(cc));
+  }
+
+  public InactiveShardRemover(final CoreContainer cc, final DeleteActor deleteActor) {
+    this.coreContainer = cc;
+    this.deleteActor = deleteActor;
+  }
+
+  @Override
+  public void configure(final InactiveShardRemoverConfig cfg) {
+    Objects.requireNonNull(cfg, "config must be specified");
+    cfg.validate();
+    this.scheduleIntervalSeconds = cfg.scheduleIntervalSeconds;
+    this.maxDeletesPerCycle = cfg.maxDeletesPerCycle;
+    this.ttlSeconds = cfg.ttlSeconds;
+  }
+
+  @Override
+  public String getName() {
+    return PLUGIN_NAME;
+  }
+
+  @Override
+  public State getState() {
+    return state;
+  }
+
+  @Override
+  public void start() throws Exception {
+    state = State.STARTING;
+    executor = Executors.newScheduledThreadPool(1, new SolrNamedThreadFactory(PLUGIN_NAME));
+    executor.scheduleAtFixedRate(
+        this::deleteInactiveSlices,
+        scheduleIntervalSeconds,
+        scheduleIntervalSeconds,
+        TimeUnit.SECONDS);
+    state = State.RUNNING;
+  }
+
+  @Override
+  public void stop() {
+    if (state == State.RUNNING) {
+      state = State.STOPPING;
+      ExecutorUtil.shutdownNowAndAwaitTermination(executor);
+    }
+    state = State.STOPPED;
+  }
+
+  @VisibleForTesting
+  void deleteInactiveSlices() {
+    final ClusterState clusterState = coreContainer.getZkController().getClusterState();
+    Collection<Slice> inactiveSlices =
+        clusterState.getCollectionsMap().values().stream()
+            .flatMap(v -> collectInactiveSlices(v).stream())
+            .collect(Collectors.toSet());
+
+    if (log.isInfoEnabled()) {
+      log.info(
+          "Found {} inactive Shards to delete, {} will be deleted",
+          inactiveSlices.size(),
+          Math.min(inactiveSlices.size(), maxDeletesPerCycle));
+    }
+
+    inactiveSlices.stream().limit(maxDeletesPerCycle).forEach(this::deleteShard);
+  }
+
+  private Collection<Slice> collectInactiveSlices(final DocCollection docCollection) {
+    final Collection<Slice> slices = new HashSet<>(docCollection.getSlices());
+    slices.removeAll(docCollection.getActiveSlices());
+    return slices.stream().filter(this::isExpired).collect(Collectors.toSet());
+  }
+
+  private void deleteShard(final Slice s) {
+    deleteActor.delete(s);
+  }
+
+  /**
+   * An Inactive Shard is expired if it has not undergone a state change in the period of time
+   * defined by {@link InactiveShardRemover#ttlSeconds}. If it is expired, it is eligible for
+   * removal.
+   */
+  private boolean isExpired(final Slice slice) {
+
+    final String collectionName = slice.getCollection();
+    final String sliceName = slice.getName();
+
+    if (slice.getState() != Slice.State.INACTIVE) {
+      return false;
+    }
+
+    final String lastChangeTimestamp = slice.getStr(ZkStateReader.STATE_TIMESTAMP_PROP);
+    if (lastChangeTimestamp == null || lastChangeTimestamp.isEmpty()) {
+      log.warn(
+          "Collection {} Shard {} has no last change timestamp and will not be deleted",
+          collectionName,
+          sliceName);
+      return false;
+    }
+
+    final long epochTimestampNs;
+    try {
+      epochTimestampNs = Long.parseLong(lastChangeTimestamp);
+    } catch (NumberFormatException e) {
+      log.warn(
+          "Collection {} Shard {} has an invalid last change timestamp and will not be deleted",
+          collectionName,
+          sliceName);
+      return false;
+    }
+
+    long currentEpochTimeNs =
+        coreContainer.getZkController().getSolrCloudManager().getTimeSource().getEpochTimeNs();
+    long delta = TimeUnit.NANOSECONDS.toSeconds(currentEpochTimeNs - epochTimestampNs);
+
+    boolean expired = delta >= ttlSeconds;
+    if (log.isDebugEnabled()) {
+      log.debug(
+          "collection {} shard {} last state change {} seconds ago. Expired={}",
+          slice.getCollection(),
+          slice.getName(),
+          delta,
+          expired);
+    }
+    return expired;
+  }
+}
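
The expiry rule above boils down to this: the shard's last state-change time is stored in cluster state under `ZkStateReader.STATE_TIMESTAMP_PROP` as epoch nanoseconds in string form, and a shard counts as expired once that timestamp is at least `ttlSeconds` old. A minimal standalone sketch of just that arithmetic, decoupled from `Slice` and `CoreContainer` (the class and method names here are illustrative, not part of the commit):

[source,java]
----
import java.util.concurrent.TimeUnit;

// Illustrative only; not part of the commit.
public class ShardExpirySketch {

  // True if a shard whose last state change happened at lastChangeEpochNs
  // (epoch nanoseconds, stored as a string under ZkStateReader.STATE_TIMESTAMP_PROP)
  // is at least ttlSeconds old.
  static boolean isExpired(String lastChangeEpochNs, long nowEpochNs, long ttlSeconds) {
    final long lastChangeNs;
    try {
      lastChangeNs = Long.parseLong(lastChangeEpochNs);
    } catch (NumberFormatException e) {
      return false; // unparsable timestamps are never treated as expired
    }
    long ageSeconds = TimeUnit.NANOSECONDS.toSeconds(nowEpochNs - lastChangeNs);
    return ageSeconds >= ttlSeconds;
  }

  public static void main(String[] args) {
    long nowNs = System.currentTimeMillis() * 1_000_000L;
    long thirtyMinutesAgoNs = nowNs - TimeUnit.MINUTES.toNanos(30);
    // A shard last changed 30 minutes ago with a 15 minute (900 s) TTL is expired.
    System.out.println(isExpired(Long.toString(thirtyMinutesAgoNs), nowNs, 900)); // true
  }
}
----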
diff --git a/solr/core/src/java/org/apache/solr/cluster/maintenance/InactiveShardRemoverConfig.java b/solr/core/src/java/org/apache/solr/cluster/maintenance/InactiveShardRemoverConfig.java
new file mode 100644
index 00000000000..22465e82359
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/cluster/maintenance/InactiveShardRemoverConfig.java
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cluster.maintenance;
+
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.annotation.JsonProperty;
+import org.apache.solr.common.util.ReflectMapWriter;
+
+public class InactiveShardRemoverConfig implements ReflectMapWriter {
+
+  public static final long DEFAULT_SCHEDULE_INTERVAL_SECONDS = 900L; // 15 minutes
+
+  public static final long DEFAULT_TTL_SECONDS = 900L; // 15 minutes
+
+  public static final int DEFAULT_MAX_DELETES_PER_CYCLE = 20;
+
+  @JsonProperty public long scheduleIntervalSeconds;
+
+  @JsonProperty public long ttlSeconds;
+
+  @JsonProperty public int maxDeletesPerCycle;
+
+  /** Default constructor required for deserialization */
+  public InactiveShardRemoverConfig() {
+    this(DEFAULT_SCHEDULE_INTERVAL_SECONDS, DEFAULT_TTL_SECONDS, DEFAULT_MAX_DELETES_PER_CYCLE);
+  }
+
+  public InactiveShardRemoverConfig(
+      final long scheduleIntervalSeconds, final long ttlSeconds, final int maxDeletesPerCycle) {
+    this.scheduleIntervalSeconds = scheduleIntervalSeconds;
+    this.ttlSeconds = ttlSeconds;
+    this.maxDeletesPerCycle = maxDeletesPerCycle;
+  }
+
+  public void validate() {
+    if (scheduleIntervalSeconds <= 0) {
+      throw new SolrException(
+          SolrException.ErrorCode.BAD_REQUEST, "scheduleIntervalSeconds must be greater than 0");
+    }
+    if (maxDeletesPerCycle <= 0) {
+      throw new SolrException(
+          SolrException.ErrorCode.BAD_REQUEST, "maxDeletesPerCycle must be greater than 0");
+    }
+  }
+}
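
A quick sketch of how the config class above is exercised, assuming the Solr core classes are on the classpath; the values match the ref-guide example added later in this commit, and the class name is illustrative:

[source,java]
----
import org.apache.solr.cluster.maintenance.InactiveShardRemoverConfig;

// Illustrative only; not part of the commit.
public class InactiveShardRemoverConfigSketch {
  public static void main(String[] args) {
    // Run every hour, delete INACTIVE shards older than 30 minutes, at most 20 per cycle.
    InactiveShardRemoverConfig cfg = new InactiveShardRemoverConfig(3600L, 1800L, 20);
    cfg.validate(); // throws SolrException(BAD_REQUEST) if the interval or maxDeletesPerCycle is <= 0

    // ttlSeconds is not range-checked: a TTL of 0 makes INACTIVE shards
    // eligible for deletion on the very next cycle.
    new InactiveShardRemoverConfig(1L, 0L, 1).validate(); // passes
  }
}
----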
diff --git a/solr/core/src/java/org/apache/solr/cluster/maintenance/package-info.java b/solr/core/src/java/org/apache/solr/cluster/maintenance/package-info.java
new file mode 100644
index 00000000000..3001cb775c3
--- /dev/null
+++ b/solr/core/src/java/org/apache/solr/cluster/maintenance/package-info.java
@@ -0,0 +1,19 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/** Cluster Singleton plugins that are used to perform maintenance tasks within the cluster. */
+package org.apache.solr.cluster.maintenance;
diff --git a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
index 63bd18ea46d..91ab353be1b 100644
--- a/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/DeleteShardTest.java
@@ -18,17 +18,12 @@ package org.apache.solr.cloud;
 
 import java.io.IOException;
 import org.apache.solr.client.solrj.SolrServerException;
-import org.apache.solr.client.solrj.cloud.DistributedQueue;
 import org.apache.solr.client.solrj.request.CollectionAdminRequest;
 import org.apache.solr.client.solrj.request.CoreStatus;
-import org.apache.solr.cloud.overseer.OverseerAction;
-import org.apache.solr.common.MapWriter;
 import org.apache.solr.common.cloud.DocCollection;
 import org.apache.solr.common.cloud.Replica;
 import org.apache.solr.common.cloud.Slice;
 import org.apache.solr.common.cloud.Slice.State;
-import org.apache.solr.common.cloud.ZkNodeProps;
-import org.apache.solr.common.cloud.ZkStateReader;
 import org.apache.solr.util.FileUtils;
 import org.junit.After;
 import org.junit.Before;
@@ -68,7 +63,7 @@ public class DeleteShardTest extends SolrCloudTestCase {
             CollectionAdminRequest.deleteShard(collection, "shard1")
                 .process(cluster.getSolrClient()));
 
-    setSliceState(collection, "shard1", Slice.State.INACTIVE);
+    ShardTestUtil.setSliceState(cluster, collection, "shard1", Slice.State.INACTIVE);
 
     // Can delete an INACTIVE shard
     CollectionAdminRequest.deleteShard(collection, "shard1").process(cluster.getSolrClient());
@@ -76,46 +71,12 @@ public class DeleteShardTest extends SolrCloudTestCase {
         "Expected 'shard1' to be removed", collection, (n, c) -> c.getSlice("shard1") == null);
 
     // Can delete a shard under construction
-    setSliceState(collection, "shard2", Slice.State.CONSTRUCTION);
+    ShardTestUtil.setSliceState(cluster, collection, "shard2", Slice.State.CONSTRUCTION);
     CollectionAdminRequest.deleteShard(collection, "shard2").process(cluster.getSolrClient());
     waitForState(
         "Expected 'shard2' to be removed", collection, (n, c) -> c.getSlice("shard2") == null);
   }
 
-  protected void setSliceState(String collection, String slice, State state) throws Exception {
-
-    // TODO can this be encapsulated better somewhere?
-    MapWriter m =
-        ew ->
-            ew.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower())
-                .put(slice, state.toString())
-                .put(ZkStateReader.COLLECTION_PROP, collection);
-    final Overseer overseer = cluster.getOpenOverseer();
-    if (overseer.getDistributedClusterStateUpdater().isDistributedStateUpdate()) {
-      overseer
-          .getDistributedClusterStateUpdater()
-          .doSingleStateUpdate(
-              DistributedClusterStateUpdater.MutatingCommand.SliceUpdateShardState,
-              new ZkNodeProps(m),
-              cluster.getOpenOverseer().getSolrCloudManager(),
-              cluster.getOpenOverseer().getZkStateReader());
-    } else {
-      DistributedQueue inQueue =
-          cluster
-              .getJettySolrRunner(0)
-              .getCoreContainer()
-              .getZkController()
-              .getOverseer()
-              .getStateUpdateQueue();
-      inQueue.offer(m);
-    }
-
-    waitForState(
-        "Expected shard " + slice + " to be in state " + state,
-        collection,
-        (n, c) -> c.getSlice(slice).getState() == state);
-  }
-
   @Test
   public void testDirectoryCleanupAfterDeleteShard() throws IOException, SolrServerException {
 
@@ -162,4 +123,13 @@ public class DeleteShardTest extends SolrCloudTestCase {
         "Instance directory still exists", FileUtils.fileExists(coreStatus.getInstanceDirectory()));
     assertTrue("Data directory still exists", FileUtils.fileExists(coreStatus.getDataDirectory()));
   }
+
+  private void setSliceState(String collectionName, String shardId, Slice.State state)
+      throws Exception {
+    ShardTestUtil.setSliceState(cluster, collectionName, shardId, state);
+    waitForState(
+        "Expected shard " + shardId + " to be in state " + state,
+        collectionName,
+        (n, c) -> c.getSlice(shardId).getState() == state);
+  }
 }
diff --git a/solr/core/src/test/org/apache/solr/cluster/maintenance/InactiveShardRemoverTest.java b/solr/core/src/test/org/apache/solr/cluster/maintenance/InactiveShardRemoverTest.java
new file mode 100644
index 00000000000..fb8f2b95715
--- /dev/null
+++ b/solr/core/src/test/org/apache/solr/cluster/maintenance/InactiveShardRemoverTest.java
@@ -0,0 +1,225 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.solr.cluster.maintenance;
+
+import static org.apache.solr.client.solrj.SolrRequest.METHOD.POST;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import org.apache.solr.client.solrj.SolrServerException;
+import org.apache.solr.client.solrj.request.CollectionAdminRequest;
+import org.apache.solr.client.solrj.request.V2Request;
+import org.apache.solr.client.solrj.request.beans.PluginMeta;
+import org.apache.solr.client.solrj.response.V2Response;
+import org.apache.solr.cloud.ShardTestUtil;
+import org.apache.solr.cloud.SolrCloudTestCase;
+import org.apache.solr.common.SolrException;
+import org.apache.solr.common.cloud.DocCollection;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.util.TimeSource;
+import org.apache.solr.core.CoreContainer;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+public class InactiveShardRemoverTest extends SolrCloudTestCase {
+
+  @BeforeClass
+  public static void setupCluster() throws Exception {
+    configureCluster(1)
+        .addConfig(
+            "conf", TEST_PATH().resolve("configsets").resolve("cloud-minimal").resolve("conf"))
+        .configure();
+  }
+
+  @Test
+  public void testDeleteInactiveShard() throws Exception {
+
+    addPlugin(new InactiveShardRemoverConfig(1, 0, 2));
+    try {
+      final String collectionName = "testDeleteInactiveShard";
+      createCollection(collectionName, 1);
+
+      final String sliceName =
+          new ArrayList<>(getCollectionState(collectionName).getSlices()).get(0).getName();
+      ShardTestUtil.setSliceState(cluster, collectionName, sliceName, Slice.State.INACTIVE);
+
+      waitForState(
+          "Waiting for inactive shard to be deleted",
+          collectionName,
+          clusterShape(0, 0),
+          5,
+          TimeUnit.SECONDS);
+    } finally {
+      removePlugin();
+    }
+  }
+
+  @Test
+  public void testTtl() throws Exception {
+
+    final int ttlSeconds = 1 + random().nextInt(5);
+    final TimeSource timeSource = cluster.getOpenOverseer().getSolrCloudManager().getTimeSource();
+
+    addPlugin(new InactiveShardRemoverConfig(1, ttlSeconds, 1));
+    try {
+      final String collectionName = "testTtl";
+      createCollection(collectionName, 1);
+
+      final String sliceName =
+          new ArrayList<>(getCollectionState(collectionName).getSlices()).get(0).getName();
+      ShardTestUtil.setSliceState(cluster, collectionName, sliceName, Slice.State.INACTIVE);
+      waitForState(
+          "Expected shard " + sliceName + " to be in state " + Slice.State.INACTIVE,
+          collectionName,
+          (n, c) -> c.getSlice(sliceName).getState() == Slice.State.INACTIVE);
+
+      final long ttlStart = timeSource.getTimeNs();
+
+      waitForState(
+          "Waiting for InactiveShardRemover to delete inactive shard",
+          collectionName,
+          clusterShape(0, 0),
+          ttlSeconds + 5,
+          TimeUnit.SECONDS);
+
+      final long ttlEnd = timeSource.getTimeNs();
+      final long ttlPeriodSeconds = TimeUnit.NANOSECONDS.toSeconds(ttlEnd - ttlStart);
+
+      assertTrue(ttlPeriodSeconds >= ttlSeconds);
+    } finally {
+      removePlugin();
+    }
+  }
+
+  public void testMaxShardsToDeletePerCycle() throws Exception {
+
+    final CoreContainer cc = cluster.getOpenOverseer().getCoreContainer();
+
+    final int maxDeletesPerCycle = 5;
+    final InactiveShardRemover remover = new InactiveShardRemover(cc);
+    remover.configure(new InactiveShardRemoverConfig(1, 0, maxDeletesPerCycle));
+
+    // Remove across multiple collections
+    final String collection1 = "testMaxShardsToDeletePerCycle-1";
+    final String collection2 = "testMaxShardsToDeletePerCycle-2";
+    final int shardsPerCollection = 10;
+    final int totalShards = 2 * shardsPerCollection;
+
+    createCollection(collection1, shardsPerCollection);
+    createCollection(collection2, shardsPerCollection);
+
+    setAllShardsInactive(collection1);
+    setAllShardsInactive(collection2);
+
+    int cycle = 0;
+    int shardsDeleted = 0;
+    while (shardsDeleted < totalShards) {
+      cycle++;
+      remover.deleteInactiveSlices();
+      DocCollection coll1 = getCollectionState(collection1);
+      DocCollection coll2 = getCollectionState(collection2);
+
+      int remainingShards = coll1.getSlices().size() + coll2.getSlices().size();
+      if (remainingShards != totalShards - maxDeletesPerCycle * cycle) {
+        System.out.println(coll1);
+        System.out.println(coll2);
+      }
+      assertEquals(totalShards - maxDeletesPerCycle * cycle, remainingShards);
+      shardsDeleted = totalShards - remainingShards;
+    }
+  }
+
+  @Test
+  public void testConfigValidation() {
+
+    try {
+      new InactiveShardRemoverConfig(0, 0, 1).validate();
+      fail("Expected validation error for scheduleIntervalSeconds=0");
+    } catch (SolrException e) {
+      assertEquals(SolrException.ErrorCode.BAD_REQUEST.code, e.code());
+    }
+
+    try {
+      new InactiveShardRemoverConfig(1, 0, 0).validate();
+      fail("Expected validation error for maxDeletesPerCycle=0");
+    } catch (SolrException e) {
+      assertEquals(SolrException.ErrorCode.BAD_REQUEST.code, e.code());
+    }
+  }
+
+  private static void addPlugin(final InactiveShardRemoverConfig config)
+      throws SolrServerException, IOException {
+    PluginMeta plugin = pluginMeta(config);
+    pluginRequest(Collections.singletonMap("add", plugin));
+  }
+
+  private static void removePlugin() throws SolrServerException, IOException {
+    pluginRequest(Collections.singletonMap("remove", InactiveShardRemover.PLUGIN_NAME));
+  }
+
+  private static void pluginRequest(Map<String, Object> payload)
+      throws SolrServerException, IOException {
+    V2Request req =
+        new V2Request.Builder("/cluster/plugin").withMethod(POST).withPayload(payload).build();
+    V2Response rsp = req.process(cluster.getSolrClient());
+    assertEquals(0, rsp.getStatus());
+  }
+
+  private static PluginMeta pluginMeta(final InactiveShardRemoverConfig config) {
+    PluginMeta plugin = pluginMeta();
+    plugin.config = config;
+    return plugin;
+  }
+
+  private static PluginMeta pluginMeta() {
+    PluginMeta plugin = new PluginMeta();
+    plugin.klass = InactiveShardRemover.class.getName();
+    plugin.name = InactiveShardRemover.PLUGIN_NAME;
+    return plugin;
+  }
+
+  private void createCollection(final String collectionName, final int numShards)
+      throws SolrServerException, IOException {
+    CollectionAdminRequest.createCollection(collectionName, "conf", numShards, 1)
+        .process(cluster.getSolrClient());
+
+    cluster.waitForActiveCollection(collectionName, numShards, numShards);
+  }
+
+  private void setAllShardsInactive(final String collectionName) {
+    DocCollection collection = getCollectionState(collectionName);
+    collection.getSlices().stream()
+        .filter(s -> s.getState() != Slice.State.INACTIVE)
+        .forEach(
+            s -> {
+              try {
+                ShardTestUtil.setSliceState(
+                    cluster, s.getCollection(), s.getName(), Slice.State.INACTIVE);
+                waitForState(
+                    "Expected shard " + s + " to be in state " + Slice.State.INACTIVE,
+                    collection.getName(),
+                    (n, c) -> c.getSlice(s.getName()).getState() == Slice.State.INACTIVE);
+              } catch (Exception e) {
+                throw new RuntimeException(e);
+              }
+            });
+  }
+}
diff --git a/solr/solr-ref-guide/modules/configuration-guide/pages/cluster-singleton-plugins.adoc b/solr/solr-ref-guide/modules/configuration-guide/pages/cluster-singleton-plugins.adoc
new file mode 100644
index 00000000000..5afdcb0d18a
--- /dev/null
+++ b/solr/solr-ref-guide/modules/configuration-guide/pages/cluster-singleton-plugins.adoc
@@ -0,0 +1,93 @@
+= Cluster Singleton Plugins
+:toclevels: 3
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+The Solr distribution includes some Cluster Singleton plugins.
+Additional plugins can be added; they must implement the `ClusterSingleton` interface.
+The configuration entry may also contain a `config` element if the plugin implements the `ConfigurablePlugin` interface.
+
+== Plugin Configuration
+Cluster Singleton plugins can be configured in two ways, either by using the xref:cluster-plugins.adoc[cluster plugins API] or by declaring them in xref:configuring-solr-xml.adoc[solr.xml].
+
+All cluster plugins must be declared using the same method. It is not possible to declare some plugins in solr.xml and use the cluster plugins API to manage other plugins.
+
+== Cluster Singleton Plugins Included with Solr
+Solr includes the following plugins out-of-the-box.
+
+=== Inactive Shard Remover
+This plugin periodically finds and deletes shards that are in the `INACTIVE` state.
+Shards become `INACTIVE` when they are split and the documents they contain are subsequently managed by two or more sub-shards.
+
+Configuration using the cluster plugin API:
+[source,bash]
+----
+curl -X POST -H 'Content-type: application/json' -d '{
+    "add":{
+        "name": ".inactive-shard-remover",
+        "class": "org.apache.solr.cluster.maintenance.InactiveShardRemover",
+        "config": {
+          "scheduleIntervalSeconds": 3600,
+          "ttlSeconds": 1800,
+          "maxDeletesPerCycle": 20
+        }
+    }}'
+  http://localhost:8983/api/cluster/plugin
+----
+
+Configuration in `solr.xml`:
+[source,xml]
+----
+<clusterSingleton name=".inactive-shard-remover" class="org.apache.solr.cluster.maintenance.InactiveShardRemover">
+  <long name="scheduleIntervalSeconds">3600</long>
+  <long name="ttlSeconds">1800</long>
+  <int name="maxDeletesPerCycle">20</int>
+</clusterSingleton>
+----
+
+NOTE: The Inactive Shard Remover plugin configuration MUST use the predefined name `.inactive-shard-remover`.
+At most one such configuration may be defined.
+
+==== Configuration
+
+`scheduleIntervalSeconds`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: `900` seconds
+|===
++
+This value determines how often the inactive shard remover will run.
+
+`ttlSeconds`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: `900` seconds
+|===
++
+This value defines the minimum period of time that a shard must be `INACTIVE` before it is considered for deletion.
+
+`maxDeletesPerCycle`::
++
+[%autowidth,frame=none]
+|===
+|Optional |Default: `20`
+|===
++
+This is the maximum number of shards that will be deleted each time the inactive shard remover runs.
+If there are more shards that could be deleted, they will be considered during the next cycle.
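
The registration shown with curl above can also be done from SolrJ, roughly the way the new `InactiveShardRemoverTest` builds its plugin request with `V2Request` and `PluginMeta`; a sketch, assuming an already-constructed cloud `SolrClient`:

[source,java]
----
import static org.apache.solr.client.solrj.SolrRequest.METHOD.POST;

import java.util.Collections;
import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.request.V2Request;
import org.apache.solr.client.solrj.request.beans.PluginMeta;
import org.apache.solr.client.solrj.response.V2Response;
import org.apache.solr.cluster.maintenance.InactiveShardRemover;
import org.apache.solr.cluster.maintenance.InactiveShardRemoverConfig;

// Illustrative only; mirrors the test code earlier in this commit.
public class RegisterInactiveShardRemoverSketch {

  static void register(SolrClient solrClient) throws Exception {
    PluginMeta plugin = new PluginMeta();
    plugin.name = InactiveShardRemover.PLUGIN_NAME; // ".inactive-shard-remover"
    plugin.klass = InactiveShardRemover.class.getName();
    plugin.config = new InactiveShardRemoverConfig(3600L, 1800L, 20);

    // POST {"add": {...}} to /api/cluster/plugin, as the curl example above does.
    V2Response rsp =
        new V2Request.Builder("/cluster/plugin")
            .withMethod(POST)
            .withPayload(Collections.singletonMap("add", plugin))
            .build()
            .process(solrClient);
    if (rsp.getStatus() != 0) {
      throw new IllegalStateException("Failed to register plugin: " + rsp);
    }
  }
}
----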
diff --git a/solr/test-framework/src/java/org/apache/solr/cloud/ShardTestUtil.java b/solr/test-framework/src/java/org/apache/solr/cloud/ShardTestUtil.java
new file mode 100644
index 00000000000..4faf9708098
--- /dev/null
+++ b/solr/test-framework/src/java/org/apache/solr/cloud/ShardTestUtil.java
@@ -0,0 +1,57 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.solr.cloud;
+
+import org.apache.solr.client.solrj.cloud.DistributedQueue;
+import org.apache.solr.cloud.overseer.OverseerAction;
+import org.apache.solr.common.MapWriter;
+import org.apache.solr.common.cloud.Slice;
+import org.apache.solr.common.cloud.ZkNodeProps;
+import org.apache.solr.common.cloud.ZkStateReader;
+
+public class ShardTestUtil {
+
+  public static void setSliceState(
+      MiniSolrCloudCluster cluster, String collection, String slice, Slice.State state)
+      throws Exception {
+
+    MapWriter m =
+        ew ->
+            ew.put(Overseer.QUEUE_OPERATION, OverseerAction.UPDATESHARDSTATE.toLower())
+                .put(slice, state.toString())
+                .put(ZkStateReader.COLLECTION_PROP, collection);
+    final Overseer overseer = cluster.getOpenOverseer();
+    if (overseer.getDistributedClusterStateUpdater().isDistributedStateUpdate()) {
+      overseer
+          .getDistributedClusterStateUpdater()
+          .doSingleStateUpdate(
+              DistributedClusterStateUpdater.MutatingCommand.SliceUpdateShardState,
+              new ZkNodeProps(m),
+              cluster.getOpenOverseer().getSolrCloudManager(),
+              cluster.getOpenOverseer().getZkStateReader());
+    } else {
+      DistributedQueue inQueue =
+          cluster
+              .getJettySolrRunner(0)
+              .getCoreContainer()
+              .getZkController()
+              .getOverseer()
+              .getStateUpdateQueue();
+      inQueue.offer(m);
+    }
+  }
+}
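
Finally, a short usage note for the utility above, mirroring how `DeleteShardTest` and `InactiveShardRemoverTest` call it; the collection and shard names are placeholders, and the sketch assumes the collection already exists in a `SolrCloudTestCase` cluster:

[source,java]
----
import org.apache.solr.cloud.ShardTestUtil;
import org.apache.solr.cloud.SolrCloudTestCase;
import org.apache.solr.common.cloud.Slice;
import org.junit.Test;

// Illustrative only; collection/shard names are placeholders.
public class ShardTestUtilUsageSketch extends SolrCloudTestCase {

  @Test
  public void testMarkShardInactive() throws Exception {
    // Flip "shard1" of "myCollection" to INACTIVE via the Overseer / distributed updater.
    ShardTestUtil.setSliceState(cluster, "myCollection", "shard1", Slice.State.INACTIVE);

    // setSliceState only submits the state update; callers still wait for it to be
    // reflected in cluster state, as the tests in this commit do.
    waitForState(
        "Expected shard1 to be INACTIVE",
        "myCollection",
        (liveNodes, coll) -> coll.getSlice("shard1").getState() == Slice.State.INACTIVE);
  }
}
----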