You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by er...@apache.org on 2020/11/01 00:17:41 UTC

[lucene-solr] branch branch_8x updated: SOLR-14969: Prevent creating multiple cores with the same name which leads to instabilities (race condition)

This is an automated email from the ASF dual-hosted git repository.

erick pushed a commit to branch branch_8x
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git


The following commit(s) were added to refs/heads/branch_8x by this push:
     new b27c8b9  SOLR-14969: Prevent creating multiple cores with the same name which leads to instabilities (race condition)
b27c8b9 is described below

commit b27c8b90213cd388a9aa79deec7be753d10c9bb9
Author: Erick Erickson <Er...@gmail.com>
AuthorDate: Sat Oct 31 19:33:34 2020 -0400

    SOLR-14969: Prevent creating multiple cores with the same name which leads to instabilities (race condition)
---
 solr/CHANGES.txt                                   |   3 +
 .../java/org/apache/solr/core/CoreContainer.java   | 133 ++++++++++++---------
 .../org/apache/solr/core/TestCoreContainer.java    | 103 ++++++++++++++--
 3 files changed, 175 insertions(+), 64 deletions(-)

diff --git a/solr/CHANGES.txt b/solr/CHANGES.txt
index d2c8bff..0fe9149 100644
--- a/solr/CHANGES.txt
+++ b/solr/CHANGES.txt
@@ -30,6 +30,9 @@ Bug Fixes
 
 * SOLR-14940: ReplicationHandler memory leak through SolrCore.closeHooks with unstable ZK connection. (Anver Sotnikov, Mike Drob)
 
+* SOLR-14969: Prevent creating multiple cores with the same name which leads to instabilities (race condition)
+  (Erick Erickson, Andreas Hubold)
+
 Other Changes
 ---------------------
 
diff --git a/solr/core/src/java/org/apache/solr/core/CoreContainer.java b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
index a0f8c11..1ae652b 100644
--- a/solr/core/src/java/org/apache/solr/core/CoreContainer.java
+++ b/solr/core/src/java/org/apache/solr/core/CoreContainer.java
@@ -29,6 +29,7 @@ import java.util.Collection;
 import java.util.Collections;
 import java.util.Date;
 import java.util.HashMap;
+import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.List;
 import java.util.Locale;
@@ -1261,6 +1262,7 @@ public class CoreContainer {
     return create(coreName, cfg.getCoreRootDirectory().resolve(coreName), parameters, false);
   }
 
+  Set<String> inFlightCreations = new HashSet<>(); // See SOLR-14969
   /**
    * Creates a new core in a specified instance directory, publishing the core state to the cluster
    *
@@ -1270,80 +1272,96 @@ public class CoreContainer {
    * @return the newly created core
    */
   public SolrCore create(String coreName, Path instancePath, Map<String, String> parameters, boolean newCollection) {
+    SolrCore core = null;
+    boolean iAdded = false;
+    try {
+      synchronized (inFlightCreations) {
+        if (inFlightCreations.add(coreName)) {
+          iAdded = true;
+        } else {
+          String msg = "Already creating a core with name '" + coreName + "', call aborted '";
+          log.warn(msg);
+          throw new SolrException(ErrorCode.SERVER_ERROR, msg);
+        }
+      }
+      CoreDescriptor cd = new CoreDescriptor(coreName, instancePath, parameters, getContainerProperties(), getZkController());
 
-    CoreDescriptor cd = new CoreDescriptor(coreName, instancePath, parameters, getContainerProperties(), getZkController());
-
-    // TODO: There's a race here, isn't there?
-    // Since the core descriptor is removed when a core is unloaded, it should never be anywhere when a core is created.
-    if (getAllCoreNames().contains(coreName)) {
-      log.warn("Creating a core with existing name is not allowed");
-      // TODO: Shouldn't this be a BAD_REQUEST?
-      throw new SolrException(ErrorCode.SERVER_ERROR, "Core with name '" + coreName + "' already exists.");
-    }
+      // Since the core descriptor is removed when a core is unloaded, it should never be anywhere when a core is created.
+      if (getAllCoreNames().contains(coreName)) {
+        log.warn("Creating a core with existing name is not allowed: '{}'", coreName);
+        // TODO: Shouldn't this be a BAD_REQUEST?
+        throw new SolrException(ErrorCode.SERVER_ERROR, "Core with name '" + coreName + "' already exists.");
+      }
 
-    // Validate paths are relative to known locations to avoid path traversal
-    assertPathAllowed(cd.getInstanceDir());
-    assertPathAllowed(Paths.get(cd.getDataDir()));
+      // Validate paths are relative to known locations to avoid path traversal
+      assertPathAllowed(cd.getInstanceDir());
+      assertPathAllowed(Paths.get(cd.getDataDir()));
 
-    boolean preExisitingZkEntry = false;
-    try {
-      if (getZkController() != null) {
+      boolean preExisitingZkEntry = false;
+      try {
+        if (getZkController() != null) {
         if (!Overseer.isLegacy(getZkController().getZkStateReader())) {
           if (cd.getCloudDescriptor().getCoreNodeName() == null) {
             throw new SolrException(ErrorCode.SERVER_ERROR, "non legacy mode coreNodeName missing " + parameters.toString());
 
           }
+          }
+          preExisitingZkEntry = getZkController().checkIfCoreNodeNameAlreadyExists(cd);
         }
-        preExisitingZkEntry = getZkController().checkIfCoreNodeNameAlreadyExists(cd);
-      }
 
-      // Much of the logic in core handling pre-supposes that the core.properties file already exists, so create it
-      // first and clean it up if there's an error.
-      coresLocator.create(this, cd);
-
-      SolrCore core = null;
-      try {
-        solrCores.waitAddPendingCoreOps(cd.getName());
-        core = createFromDescriptor(cd, true, newCollection);
-        coresLocator.persist(this, cd); // Write out the current core properties in case anything changed when the core was created
-      } finally {
-        solrCores.removeFromPendingOps(cd.getName());
-      }
+        // Much of the logic in core handling pre-supposes that the core.properties file already exists, so create it
+        // first and clean it up if there's an error.
+        coresLocator.create(this, cd);
 
-      return core;
-    } catch (Exception ex) {
-      // First clean up any core descriptor, there should never be an existing core.properties file for any core that
-      // failed to be created on-the-fly.
-      coresLocator.delete(this, cd);
-      if (isZooKeeperAware() && !preExisitingZkEntry) {
         try {
-          getZkController().unregister(coreName, cd);
-        } catch (InterruptedException e) {
-          Thread.currentThread().interrupt();
-          SolrException.log(log, null, e);
-        } catch (KeeperException e) {
-          SolrException.log(log, null, e);
-        } catch (Exception e) {
-          SolrException.log(log, null, e);
+          solrCores.waitAddPendingCoreOps(cd.getName());
+          core = createFromDescriptor(cd, true, newCollection);
+          coresLocator.persist(this, cd); // Write out the current core properties in case anything changed when the core was created
+        } finally {
+          solrCores.removeFromPendingOps(cd.getName());
         }
-      }
 
-      Throwable tc = ex;
-      Throwable c = null;
-      do {
-        tc = tc.getCause();
-        if (tc != null) {
-          c = tc;
+        return core;
+      } catch (Exception ex) {
+        // First clean up any core descriptor, there should never be an existing core.properties file for any core that
+        // failed to be created on-the-fly.
+        coresLocator.delete(this, cd);
+        if (isZooKeeperAware() && !preExisitingZkEntry) {
+          try {
+            getZkController().unregister(coreName, cd);
+          } catch (InterruptedException e) {
+            Thread.currentThread().interrupt();
+            SolrException.log(log, null, e);
+          } catch (KeeperException e) {
+            SolrException.log(log, null, e);
+          } catch (Exception e) {
+            SolrException.log(log, null, e);
+          }
         }
-      } while (tc != null);
 
-      String rootMsg = "";
-      if (c != null) {
-        rootMsg = " Caused by: " + c.getMessage();
-      }
+        Throwable tc = ex;
+        Throwable c = null;
+        do {
+          tc = tc.getCause();
+          if (tc != null) {
+            c = tc;
+          }
+        } while (tc != null);
+
+        String rootMsg = "";
+        if (c != null) {
+          rootMsg = " Caused by: " + c.getMessage();
+        }
 
-      throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
-          "Error CREATEing SolrCore '" + coreName + "': " + ex.getMessage() + rootMsg, ex);
+        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
+            "Error CREATEing SolrCore '" + coreName + "': " + ex.getMessage() + rootMsg, ex);
+      }
+    } finally {
+      synchronized (inFlightCreations) {
+        if (iAdded) {
+          inFlightCreations.remove(coreName);
+        }
+      }
     }
   }
 
@@ -1798,6 +1816,7 @@ public class CoreContainer {
     }
 
     if (cd == null) {
+      log.warn("Cannot unload non-existent core '{}'", name);
       throw new SolrException(ErrorCode.BAD_REQUEST, "Cannot unload non-existent core [" + name + "]");
     }
 
diff --git a/solr/core/src/test/org/apache/solr/core/TestCoreContainer.java b/solr/core/src/test/org/apache/solr/core/TestCoreContainer.java
index 06914b8..5852764 100644
--- a/solr/core/src/test/org/apache/solr/core/TestCoreContainer.java
+++ b/solr/core/src/test/org/apache/solr/core/TestCoreContainer.java
@@ -20,6 +20,7 @@ import java.io.File;
 import java.io.FileOutputStream;
 import java.nio.file.Path;
 import java.nio.file.Paths;
+
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -119,22 +120,27 @@ public class TestCoreContainer extends SolrTestCaseJ4 {
     }
   }
 
+  private static class TestReloadThread extends Thread {
+    private final CoreContainer cc;
+    TestReloadThread(CoreContainer cc) {
+      this.cc = cc;
+    }
+    @Override
+    public void run() {
+      cc.reload("core1");
+    }
+  }
+
   @Test
   public void testReloadThreaded() throws Exception {
     final CoreContainer cc = init(CONFIGSETS_SOLR_XML);
     cc.create("core1", ImmutableMap.of("configSet", "minimal"));
 
-    class TestThread extends Thread {
-      @Override
-      public void run() {
-        cc.reload("core1");
-      }
-    }
 
     List<Thread> threads = new ArrayList<>();
     int numThreads = 4;
     for (int i = 0; i < numThreads; i++) {
-      threads.add(new TestThread());
+      threads.add(new TestReloadThread(cc));
     }
 
     for (Thread thread : threads) {
@@ -149,6 +155,89 @@ public class TestCoreContainer extends SolrTestCaseJ4 {
 
   }
 
+  private static class TestCreateThread extends Thread {
+    final CoreContainer cc;
+    final String coreName;
+    boolean foundExpectedError = false;
+    SolrCore core = null;
+
+    TestCreateThread(CoreContainer cc, String coreName) {
+      this.cc = cc;
+      this.coreName = coreName;
+    }
+    @Override
+    public void run() {
+      try {
+        core = cc.create(coreName, ImmutableMap.of("configSet", "minimal"));
+      } catch (SolrException e) {
+        String msg = e.getMessage();
+        foundExpectedError = msg.contains("Already creating a core with name") || msg.contains("already exists");
+      }
+    }
+
+    int verifyMe() {
+      if (foundExpectedError) {
+        assertNull("failed create should have returned null for core", core);
+        return 0;
+      } else {
+        assertNotNull("Core should not be null if there was no error", core);
+        return 1;
+      }
+    }
+  }
+
+  @Test
+  public void testCreateThreaded() throws Exception {
+    final CoreContainer cc = init(CONFIGSETS_SOLR_XML);
+    final int NUM_THREADS = 3;
+
+
+    // Try this a few times to increase the chances of failure.
+    for (int idx = 0; idx < 3; ++idx) {
+      TestCreateThread[] threads = new TestCreateThread[NUM_THREADS];
+      // Let's add a little stress in here by using the same core name each
+      // time around. The final unload in the loop should delete the core and
+      // allow the next time around to succeed.
+      // This also checks the bookkeeping in CoreContainer.create
+      // that prevents muliple simulatneous creations,
+      // currently "inFlightCreations"
+      String testName = "coreToTest";
+      for (int thread = 0; thread < NUM_THREADS; ++thread) {
+        threads[thread] = new TestCreateThread(cc, testName);
+      }
+      // only one of these should succeed.
+      for (int thread = 0; thread < NUM_THREADS; ++thread) {
+        threads[thread].start();
+      }
+
+      for (int thread = 0; thread < NUM_THREADS; ++thread) {
+        threads[thread].join();
+      }
+
+      // We can't guarantee that which thread failed, so go find it
+      int goodCount = 0;
+      for (int thread = 0; thread < NUM_THREADS; ++thread) {
+        goodCount += threads[thread].verifyMe();
+      }
+      assertEquals("Only one create should have succeeded", 1, goodCount);
+
+
+      // Check bookkeeping by removing and creating the core again, making sure
+      // we didn't leave the record of trying to create this core around.
+      // NOTE: unloading the core closes it too.
+      cc.unload(testName, true, true, true);
+      cc.create(testName, ImmutableMap.of("configSet", "minimal"));
+      // This call should fail with a different error because the core was
+      // created successfully.
+      SolrException thrown = expectThrows(SolrException.class, () ->
+          cc.create(testName, ImmutableMap.of("configSet", "minimal")));
+      assertTrue("Should have 'already exists' error", thrown.getMessage().contains("already exists"));
+
+      cc.unload(testName, true, true, true);
+    }
+    cc.shutdown();
+  }
+
   @Test
   public void testNoCores() throws Exception {