Posted to commits@solr.apache.org by no...@apache.org on 2022/10/26 00:10:18 UTC

[solr] branch branch_9_1 updated: SOLR-16412 : Race condition in SizeLimitedDistributedMap for cleanup (#1032)

This is an automated email from the ASF dual-hosted git repository.

noble pushed a commit to branch branch_9_1
in repository https://gitbox.apache.org/repos/asf/solr.git


The following commit(s) were added to refs/heads/branch_9_1 by this push:
     new 5294c82c3cd SOLR-16412 : Race condition in SizeLimitedDistributedMap for cleanup (#1032)
5294c82c3cd is described below

commit 5294c82c3cd535604be5fce48f86f6ecedc187bc
Author: patsonluk <pa...@users.noreply.github.com>
AuthorDate: Tue Oct 25 17:07:52 2022 -0700

    SOLR-16412 : Race condition in SizeLimitedDistributedMap for cleanup (#1032)
---
 solr/CHANGES.txt                                   |  2 +
 .../solr/cloud/SizeLimitedDistributedMap.java      | 10 ++--
 .../solr/cloud/TestSizeLimitedDistributedMap.java  | 56 ++++++++++++++++++++++
 3 files changed, 65 insertions(+), 3 deletions(-)
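
The race being fixed: the cleanup logic in SizeLimitedDistributedMap lists the
map's child znodes and deletes the oldest ones, so when two threads run the
cleanup concurrently they can both pick the same child, and the slower delete
fails with KeeperException.NoNodeException. Below is a minimal, self-contained
sketch of that failure mode, written against the plain org.apache.zookeeper.ZooKeeper
client rather than Solr's SolrZkClient wrapper; the connect string "localhost:2181",
the znode path "/race-demo", and the class name are placeholders for illustration only.

    import java.util.concurrent.CountDownLatch;
    import org.apache.zookeeper.CreateMode;
    import org.apache.zookeeper.KeeperException;
    import org.apache.zookeeper.ZooDefs;
    import org.apache.zookeeper.ZooKeeper;

    public class NoNodeRaceSketch {
      public static void main(String[] args) throws Exception {
        // Placeholder connect string; any reachable ZooKeeper ensemble would do.
        ZooKeeper zk = new ZooKeeper("localhost:2181", 10_000, event -> {});
        String path = "/race-demo"; // hypothetical znode standing in for a map child
        zk.create(path, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.PERSISTENT);

        CountDownLatch start = new CountDownLatch(1);
        Runnable cleaner = () -> {
          try {
            start.await();
            zk.delete(path, -1); // version -1: delete regardless of znode version
            System.out.println(Thread.currentThread().getName() + " deleted the node");
          } catch (KeeperException.NoNodeException e) {
            // The other cleaner won the race; before this fix the equivalent
            // exception escaped the SizeLimitedDistributedMap cleanup loop.
            System.out.println(Thread.currentThread().getName() + " lost the race: " + e);
          } catch (Exception e) {
            throw new RuntimeException(e);
          }
        };
        Thread t1 = new Thread(cleaner, "cleaner-1");
        Thread t2 = new Thread(cleaner, "cleaner-2");
        t1.start();
        t2.start();
        start.countDown(); // release both threads together to make the race likely
        t1.join();
        t2.join();
        zk.close();
      }
    }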

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index 6a5f73e3ec5..1bdb0a248b3 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -169,6 +169,8 @@ Bug Fixes
 
 * SOLR-16485: Fix NPE in ShardHandlerFactory when running in Standalone mode (Houston Putman)
 
+* SOLR-16412: Race condition could trigger error on concurrent SizeLimitedDistributedMap cleanup (Patson Luk via noble)
+
 Other Changes
 ---------------------
 * SOLR-16351: Upgrade Carrot2 to 4.4.3, upgrade randomizedtesting to 2.8.0. (Dawid Weiss)
diff --git a/solr/core/src/java/org/apache/solr/cloud/SizeLimitedDistributedMap.java b/solr/core/src/java/org/apache/solr/cloud/SizeLimitedDistributedMap.java
index 903fb7e8ebc..db36edbee1a 100644
--- a/solr/core/src/java/org/apache/solr/cloud/SizeLimitedDistributedMap.java
+++ b/solr/core/src/java/org/apache/solr/cloud/SizeLimitedDistributedMap.java
@@ -91,9 +91,13 @@ public class SizeLimitedDistributedMap extends DistributedMap {
       for (String child : children) {
         Long id = childToModificationZxid.get(child);
         if (id != null && id <= topElementMzxId) {
-          zookeeper.delete(dir + "/" + child, -1, true);
-          if (onOverflowObserver != null)
-            onOverflowObserver.onChildDelete(child.substring(PREFIX.length()));
+          try {
+            zookeeper.delete(dir + "/" + child, -1, true);
+            if (onOverflowObserver != null)
+              onOverflowObserver.onChildDelete(child.substring(PREFIX.length()));
+          } catch (KeeperException.NoNodeException ignored) {
+            // this could happen if multiple threads try to clean the same map
+          }
         }
       }
     }
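
The catch block treats an already-deleted child as a successful outcome: the goal
of the cleanup is simply that the child be gone, and when several threads race,
only the thread whose delete actually succeeded notifies the onOverflowObserver,
since the callback sits inside the try. Here is a hypothetical helper, not part of
this commit or of the Solr codebase, sketching the same pattern around the
SolrZkClient delete call used above:

    import org.apache.solr.common.cloud.SolrZkClient;
    import org.apache.zookeeper.KeeperException;

    // Hypothetical helper class for illustration; not in the Solr codebase.
    final class TolerantDelete {
      /**
       * Delete a znode, treating "node already gone" as success. Returns true
       * only if this caller performed the delete, so an overflow-observer
       * callback can be limited to the winning thread.
       */
      static boolean deleteIfPresent(SolrZkClient zookeeper, String path)
          throws InterruptedException, KeeperException {
        try {
          zookeeper.delete(path, -1, true); // -1 = any version; true = retry on conn loss
          return true;
        } catch (KeeperException.NoNodeException e) {
          return false; // another cleaner removed it first
        }
      }
    }

In the committed change this pattern is inlined directly in the deletion loop.
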
diff --git a/solr/core/src/test/org/apache/solr/cloud/TestSizeLimitedDistributedMap.java b/solr/core/src/test/org/apache/solr/cloud/TestSizeLimitedDistributedMap.java
index a086656d13c..8b92bb8d49d 100644
--- a/solr/core/src/test/org/apache/solr/cloud/TestSizeLimitedDistributedMap.java
+++ b/solr/core/src/test/org/apache/solr/cloud/TestSizeLimitedDistributedMap.java
@@ -21,7 +21,12 @@ import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.ExecutorService;
+import java.util.concurrent.Future;
 import org.apache.solr.common.cloud.SolrZkClient;
+import org.apache.solr.common.util.ExecutorUtil;
+import org.apache.solr.common.util.SolrNamedThreadFactory;
 
 public class TestSizeLimitedDistributedMap extends TestDistributedMap {
 
@@ -67,6 +72,57 @@ public class TestSizeLimitedDistributedMap extends TestDistributedMap {
     }
   }
 
+  public void testConcurrentCleanup() throws Exception {
+    final Set<String> expectedKeys = new HashSet<>();
+    final List<String> deletedItems = new LinkedList<>();
+    int numResponsesToStore = TEST_NIGHTLY ? Overseer.NUM_RESPONSES_TO_STORE : 100;
+
+    try (SolrZkClient zkClient = new SolrZkClient(zkServer.getZkHost(), 10000)) {
+      String path = getAndMakeInitialPath(zkClient);
+      DistributedMap map =
+          new SizeLimitedDistributedMap(
+              zkClient, path, numResponsesToStore, (element) -> deletedItems.add(element));
+      // fill the map to limit first
+      for (int i = 0; i < numResponsesToStore; i++) {
+        map.put("xyz_" + i, new byte[0]);
+      }
+
+      // add more elements concurrently to trigger cleanup
+      final int THREAD_COUNT = Math.min(100, numResponsesToStore);
+      List<Callable<Object>> callables = new ArrayList<>();
+      for (int i = 0; i < THREAD_COUNT; i++) {
+        final String key = "xyz_" + (numResponsesToStore + 1);
+        expectedKeys.add(key);
+        callables.add(
+            () -> {
+              map.put(key, new byte[0]);
+              return null;
+            });
+      }
+
+      ExecutorService executorService =
+          ExecutorUtil.newMDCAwareFixedThreadPool(
+              THREAD_COUNT, new SolrNamedThreadFactory("test-concurrent-cleanup"));
+      List<Future<Object>> futures = new ArrayList<>();
+      for (Callable<Object> callable : callables) {
+        futures.add(executorService.submit(callable));
+      }
+      try {
+        for (Future<Object> future : futures) {
+          future.get(); // none of them should throw exception
+        }
+        for (String expectedKey : expectedKeys) {
+          assertTrue(map.contains(expectedKey));
+        }
+        // there are no guarantees on exactly how many elements will be removed, but it
+        // should at least NOT throw an exception
+        assertTrue(!deletedItems.isEmpty());
+      } finally {
+        ExecutorUtil.shutdownAndAwaitTermination(executorService);
+      }
+    }
+  }
+
   protected DistributedMap createMap(SolrZkClient zkClient, String path) {
     return new SizeLimitedDistributedMap(zkClient, path, Overseer.NUM_RESPONSES_TO_STORE, null);
   }