You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2020/09/01 20:33:17 UTC

[lucene-solr] 01/02: @678 Work on debugging testRetryUpdatesWhenClusterStateIsStale.

This is an automated email from the ASF dual-hosted git repository.

markrmiller pushed a commit to branch reference_impl_dev
in repository https://gitbox.apache.org/repos/asf/lucene-solr.git

commit ac8acdcd9aba5d11877730fa65b7e2891414be60
Author: markrmiller@gmail.com <ma...@gmail.com>
AuthorDate: Tue Sep 1 15:23:42 2020 -0500

    @678 Work on debugging testRetryUpdatesWhenClusterStateIsStale.
---
 .../solr/cloud/ShardLeaderElectionContext.java     | 15 +++++++-
 .../solr/cloud/ShardLeaderElectionContextBase.java |  4 +-
 .../java/org/apache/solr/cloud/ZkController.java   | 20 +++++++---
 .../src/java/org/apache/solr/core/SolrCore.java    | 44 +++++++++++++---------
 .../solrj/impl/CloudHttp2SolrClientTest.java       | 15 +++-----
 .../solrj/impl/CloudSolrClientCacheTest.java       |  1 +
 6 files changed, 63 insertions(+), 36 deletions(-)

diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
index 2f564f9..f195e8c 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContext.java
@@ -270,12 +270,15 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
           }
         }
         if (!success) {
+          if (isClosed()) {
+            return;
+          }
           rejoinLeaderElection(core);
           return;
         }
 
       }
-      if (!isClosed) {
+      if (!isClosed()) {
         try {
           if (replicaType == Replica.Type.TLOG) {
             // stop replicate from old leader
@@ -298,6 +301,10 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
                     "without being up-to-date with the previous leader", coreNodeName);
             zkController.getShardTerms(collection, shardId).setTermEqualsToLeader(coreNodeName);
           }
+          if (isClosed()) {
+            return;
+          }
+
           super.runLeaderProcess(context, weAreReplacement, 0);
 
           assert shardId != null;
@@ -316,6 +323,9 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
           try (SolrCore core = cc.getCore(coreName)) {
             if (core != null) {
+              if (isClosed()) {
+                return;
+              }
               core.getCoreDescriptor().getCloudDescriptor().setLeader(true);
               publishActiveIfRegisteredAndNotActive(core);
             } else {
@@ -347,6 +357,9 @@ final class ShardLeaderElectionContext extends ShardLeaderElectionContextBase {
 
               // we could not publish ourselves as leader - try and rejoin election
               try {
+                if (isClosed()) {
+                  return;
+                }
                 rejoinLeaderElection(core);
               } catch (Exception exc) {
                 ParWork.propegateInterrupt(e);
diff --git a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
index 57a888a..c8138f8 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ShardLeaderElectionContextBase.java
@@ -83,7 +83,7 @@ class ShardLeaderElectionContextBase extends ElectionContext {
           log.debug("Removing leader registration node on cancel: {} {}", leaderPath, version);
           List<Op> ops = new ArrayList<>(2);
           ops.add(Op.check(Paths.get(leaderPath).getParent().toString(), version));
-          ops.add(Op.check(electionPath, -1));
+          ops.add(Op.check(leaderSeqPath, -1));
           ops.add(Op.delete(leaderPath, -1));
           zkClient.multi(ops);
         } catch (KeeperException e) {
@@ -110,8 +110,6 @@ class ShardLeaderElectionContextBase extends ElectionContext {
           return;
         } catch (Exception e) {
           throw new SolrException(ErrorCode.SERVER_ERROR, "Exception canceling election", e);
-        } finally {
-          version = null;
         }
       } else {
         log.info("No version found for ephemeral leader parent node, won't remove previous leader registration.");
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index c8480c5..67d45b1 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -149,6 +149,7 @@ public class ZkController implements Closeable {
   public static final String CLUSTER_SHUTDOWN = "/cluster/shutdown";
 
   static final int WAIT_DOWN_STATES_TIMEOUT_SECONDS = 60;
+  public static final byte[] EMPTY_BYTE_ARRAY = new byte[0];
   public final int WAIT_FOR_STATE = Integer.getInteger("solr.waitForState", 10);
 
   private final boolean SKIP_AUTO_RECOVERY = Boolean.getBoolean("solrcloud.skip.autorecovery");
@@ -384,7 +385,16 @@ public class ZkController implements Closeable {
     ContextKey contextKey = new ContextKey(collection, coreNodeName);
     ElectionContext context = electionContexts.get(contextKey);
     if (context != null) {
-      context.close();
+      try {
+        context.cancelElection();
+      } catch (InterruptedException e) {
+        ParWork.propegateInterrupt(e);
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
+      } catch (KeeperException e) {
+        throw new SolrException(ErrorCode.SERVER_ERROR, e);
+      } finally {
+        context.close();
+      }
     }
   }
 
@@ -403,12 +413,12 @@ public class ZkController implements Closeable {
       public synchronized void command() {
 
         try (ParWork worker = new ParWork("disconnected", true)) {
-          worker.collect(overseerContexts);
-          worker.collect( ZkController.this.overseer);
+          worker.collect("OverseerElectionContexts", overseerContexts.values());
+          worker.collect("Overseer", ZkController.this.overseer);
           worker.collect("", () -> {
             clearZkCollectionTerms();
           });
-          worker.collect(electionContexts.values());
+          worker.collect("electionContexts", electionContexts.values());
           worker.collect("",() -> {
             markAllAsNotLeader(descriptorsSupplier);
           });
@@ -2317,7 +2327,7 @@ public class ZkController implements Closeable {
    */
   public boolean claimAsyncId(String asyncId) throws KeeperException {
     try {
-      return asyncIdsMap.putIfAbsent(asyncId, new byte[0]);
+      return asyncIdsMap.putIfAbsent(asyncId, EMPTY_BYTE_ARRAY);
     } catch (InterruptedException e) {
       ParWork.propegateInterrupt(e);
       throw new RuntimeException(e);
diff --git a/solr/core/src/java/org/apache/solr/core/SolrCore.java b/solr/core/src/java/org/apache/solr/core/SolrCore.java
index 6e3fa0a..8978bca 100644
--- a/solr/core/src/java/org/apache/solr/core/SolrCore.java
+++ b/solr/core/src/java/org/apache/solr/core/SolrCore.java
@@ -3069,24 +3069,7 @@ public final class SolrCore implements SolrInfoBean, Closeable {
       }
     }
     if (deleteInstanceDir) {
-      addCloseHook(new CloseHook() {
-        @Override
-        public void preClose(SolrCore core) {
-          // empty block
-        }
-
-        @Override
-        public void postClose(SolrCore core) {
-          if (desc != null) {
-            try {
-              FileUtils.deleteDirectory(desc.getInstanceDir().toFile());
-            } catch (IOException e) {
-              SolrException.log(log, "Failed to delete instance dir for core:"
-                  + core.getName() + " dir:" + desc.getInstanceDir());
-            }
-          }
-        }
-      });
+      addCloseHook(new SolrCoreDeleteCloseHook(desc));
     }
   }
 
@@ -3332,4 +3315,29 @@ public final class SolrCore implements SolrInfoBean, Closeable {
   public void runAsync(Runnable r) {
     ParWork.getMyPerThreadExecutor().submit(r);
   }
+
+  private static class SolrCoreDeleteCloseHook extends CloseHook {
+    private final CoreDescriptor desc;
+
+    public SolrCoreDeleteCloseHook(CoreDescriptor desc) {
+      this.desc = desc;
+    }
+
+    @Override
+    public void preClose(SolrCore core) {
+      // empty block
+    }
+
+    @Override
+    public void postClose(SolrCore core) {
+      if (desc != null) {
+        try {
+          FileUtils.deleteDirectory(desc.getInstanceDir().toFile());
+        } catch (IOException e) {
+          SolrException.log(log, "Failed to delete instance dir for core:"
+              + core.getName() + " dir:" + desc.getInstanceDir());
+        }
+      }
+    }
+  }
 }
diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudHttp2SolrClientTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudHttp2SolrClientTest.java
index a34e2c0..3ec2dea 100644
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudHttp2SolrClientTest.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudHttp2SolrClientTest.java
@@ -436,7 +436,6 @@ public class CloudHttp2SolrClientTest extends SolrCloudTestCase {
     CollectionAdminRequest.createCollection(collectionName, "conf", liveNodes, liveNodes)
         .setMaxShardsPerNode(liveNodes * liveNodes)
         .process(cluster.getSolrClient());
-    cluster.waitForActiveCollection(collectionName, liveNodes, liveNodes * liveNodes);
     // Add some new documents
     new UpdateRequest()
         .add(id, "0", "a_t", "hello1")
@@ -517,7 +516,6 @@ public class CloudHttp2SolrClientTest extends SolrCloudTestCase {
     CollectionAdminRequest.createCollection(collectionName, "conf", 1, liveNodes/3, liveNodes/3, liveNodes/3)
         .setMaxShardsPerNode(liveNodes)
         .process(cluster.getSolrClient());
-    cluster.waitForActiveCollection(collectionName, 1, liveNodes);
 
     // Add some new documents
     new UpdateRequest()
@@ -611,7 +609,6 @@ public class CloudHttp2SolrClientTest extends SolrCloudTestCase {
     try (CloudSolrClient client = SolrTestCaseJ4.getCloudSolrClient(cluster.getZkServer().getZkAddress())) {
       // important to have one replica on each node
       CollectionAdminRequest.createCollection("foo", "conf", 1, NODE_COUNT).process(client);
-      cluster.waitForActiveCollection("foo", 1, NODE_COUNT);
       client.setDefaultCollection("foo");
 
       Map<String, String> adminPathToMbean = new HashMap<>(CommonParams.ADMIN_PATHS.size());
@@ -666,8 +663,6 @@ public class CloudHttp2SolrClientTest extends SolrCloudTestCase {
 
       CollectionAdminRequest.waitForAsyncRequest(async1, client, TIMEOUT);
       CollectionAdminRequest.waitForAsyncRequest(async2, client, TIMEOUT);
-      cluster.waitForActiveCollection("multicollection1", 2, 2);
-      cluster.waitForActiveCollection("multicollection2", 2, 2);
       client.setDefaultCollection("multicollection1");
 
       List<SolrInputDocument> docs = new ArrayList<>(3);
@@ -812,7 +807,6 @@ public class CloudHttp2SolrClientTest extends SolrCloudTestCase {
   @Ignore // nocommit ~ possible regression? response doesn't contain "adds"?
   public void testVersionsAreReturned() throws Exception {
     CollectionAdminRequest.createCollection("versions_collection", "conf", 2, 1).process(cluster.getSolrClient());
-    cluster.waitForActiveCollection("versions_collection", 2, 2);
     
     // assert that "adds" are returned
     UpdateRequest updateRequest = new UpdateRequest()
@@ -898,7 +892,6 @@ public class CloudHttp2SolrClientTest extends SolrCloudTestCase {
                  CollectionAdminRequest.createCollection(COL, "conf", 1, 1)
                  .setCreateNodeSet(old_leader_node.getNodeName())
                  .process(cluster.getSolrClient()).getStatus());
-    cluster.waitForActiveCollection(COL, 1, 1);
 
     // determine the coreNodeName of only current replica
     Collection<Slice> slices = cluster.getSolrClient().getZkStateReader().getClusterState().getCollection(COL).getSlices();
@@ -926,6 +919,8 @@ public class CloudHttp2SolrClientTest extends SolrCloudTestCase {
                    .setNode(new_leader_node.getNodeName())
                    // NOTE: don't use our stale_client for this -- don't tip it off of a collection change
                    .process(cluster.getSolrClient()).getStatus());
+
+      cluster.waitForActiveCollection(COL, 1, 2);
       
       // ...and delete our original leader.
       assertEquals("Couldn't create collection", 0,
@@ -933,7 +928,10 @@ public class CloudHttp2SolrClientTest extends SolrCloudTestCase {
                    // NOTE: don't use our stale_client for this -- don't tip it off of a collection change
                    .process(cluster.getSolrClient()).getStatus());
 
-      // stale_client's collection state cache should now only point at a leader that no longer exists.
+//      Thread.currentThread().sleep(3000);
+//
+//      cluster.getZkClient().printLayout();
+      cluster.waitForActiveCollection(COL, 1, 1);
       
       // attempt a (direct) update that should succeed in spite of cached cluster state
       // pointing solely to a node that's no longer part of our collection...
@@ -974,7 +972,6 @@ public class CloudHttp2SolrClientTest extends SolrCloudTestCase {
     CollectionAdminRequest.createCollection(collectionName, "conf", liveNodes, 1, 1, pullReplicas)
         .setMaxShardsPerNode(liveNodes)
         .process(cluster.getSolrClient());
-    cluster.waitForActiveCollection(collectionName, liveNodes, liveNodes * (2 + pullReplicas));
     
     // Add some new documents
     new UpdateRequest()
diff --git a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientCacheTest.java b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientCacheTest.java
index 08a535b..b647905 100644
--- a/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientCacheTest.java
+++ b/solr/solrj/src/test/org/apache/solr/client/solrj/impl/CloudSolrClientCacheTest.java
@@ -64,6 +64,7 @@ public class CloudSolrClientCacheTest extends SolrTestCaseJ4 {
         this.c = c;
       }
 
+
       @Override
       public boolean isLazilyLoaded() {
         return true;