You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by an...@apache.org on 2016/04/21 23:11:42 UTC

[2/4] lucene-solr:branch_5_5: SOLR-8697: Add synchronization around registering as leader and canceling.

SOLR-8697: Add synchronization around registering as leader and canceling.


Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/78bed536
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/78bed536
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/78bed536

Branch: refs/heads/branch_5_5
Commit: 78bed536984dbfd4ba2f802deb58b29979d59329
Parents: f8dd98f
Author: markrmiller <ma...@apache.org>
Authored: Fri Feb 26 12:32:12 2016 -0500
Committer: Anshum Gupta <an...@apache.org>
Committed: Thu Apr 21 13:42:07 2016 -0700

----------------------------------------------------------------------
 .../org/apache/solr/cloud/ElectionContext.java  | 111 ++++++++++---------
 .../org/apache/solr/cloud/ZkController.java     |   2 +-
 .../org/apache/solr/cloud/OverseerTest.java     |   7 ++
 3 files changed, 69 insertions(+), 51 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78bed536/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
index 871dbcd..d4d02d8 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
@@ -110,8 +110,11 @@ class ShardLeaderElectionContextBase extends ElectionContext {
   protected String shardId;
   protected String collection;
   protected LeaderElector leaderElector;
-  protected volatile Integer leaderZkNodeParentVersion;
-  
+  private Integer leaderZkNodeParentVersion;
+
+  // Prevents a race between cancelling and becoming leader.
+  private final Object lock = new Object();
+
   public ShardLeaderElectionContextBase(LeaderElector leaderElector,
       final String shardId, final String collection, final String coreNodeName,
       ZkNodeProps props, ZkStateReader zkStateReader) {
@@ -138,31 +141,33 @@ class ShardLeaderElectionContextBase extends ElectionContext {
   @Override
   public void cancelElection() throws InterruptedException, KeeperException {
     super.cancelElection();
-    if (leaderZkNodeParentVersion != null) {
-      try {
-        // We need to be careful and make sure we *only* delete our own leader registration node.
-        // We do this by using a multi and ensuring the parent znode of the leader registration node
-        // matches the version we expect - there is a setData call that increments the parent's znode
-        // version whenever a leader registers.
-        log.info("Removing leader registration node on cancel: {} {}", leaderPath, leaderZkNodeParentVersion);
-        List<Op> ops = new ArrayList<>(2);
-        ops.add(Op.check(new Path(leaderPath).getParent().toString(), leaderZkNodeParentVersion));
-        ops.add(Op.delete(leaderPath, -1));
-        zkClient.multi(ops, true);
-      } catch (KeeperException.NoNodeException nne) {
-        // no problem
-        log.info("No leader registration node found to remove: {}", leaderPath);
-      } catch (KeeperException.BadVersionException bve) {
-        log.info("Cannot remove leader registration node because the current registered node is not ours: {}", leaderPath);
-        // no problem
-      } catch (InterruptedException e) {
-        throw e;
-      } catch (Exception e) {
-        SolrException.log(log, e);
+    synchronized (lock) {
+      if (leaderZkNodeParentVersion != null) {
+        try {
+          // We need to be careful and make sure we *only* delete our own leader registration node.
+          // We do this by using a multi and ensuring the parent znode of the leader registration node
+          // matches the version we expect - there is a setData call that increments the parent's znode
+          // version whenever a leader registers.
+          log.info("Removing leader registration node on cancel: {} {}", leaderPath, leaderZkNodeParentVersion);
+          List<Op> ops = new ArrayList<>(2);
+          ops.add(Op.check(new Path(leaderPath).getParent().toString(), leaderZkNodeParentVersion));
+          ops.add(Op.delete(leaderPath, -1));
+          zkClient.multi(ops, true);
+        } catch (KeeperException.NoNodeException nne) {
+          // no problem
+          log.info("No leader registration node found to remove: {}", leaderPath);
+        } catch (KeeperException.BadVersionException bve) {
+          log.info("Cannot remove leader registration node because the current registered node is not ours: {}", leaderPath);
+          // no problem
+        } catch (InterruptedException e) {
+          throw e;
+        } catch (Exception e) {
+          SolrException.log(log, e);
+        }
+        leaderZkNodeParentVersion = null;
+      } else {
+        log.info("No version found for ephemeral leader parent node, won't remove previous leader registration.");
       }
-      leaderZkNodeParentVersion = null;
-    } else {
-      log.info("No version found for ephemeral leader parent node, won't remove previous leader registration.");
     }
   }
   
@@ -179,31 +184,31 @@ class ShardLeaderElectionContextBase extends ElectionContext {
         
         @Override
         public void execute() throws InterruptedException, KeeperException {
-          log.info("Creating leader registration node {} after winning as {}", leaderPath, leaderSeqPath);
-          List<Op> ops = new ArrayList<>(2);
-          
-          // We use a multi operation to get the parent nodes version, which will
-          // be used to make sure we only remove our own leader registration node.
-          // The setData call used to get the parent version is also the trigger to
-          // increment the version. We also do a sanity check that our leaderSeqPath exists.
-          
-          ops.add(Op.check(leaderSeqPath, -1));
-          ops.add(Op.create(leaderPath, json, zkClient.getZkACLProvider().getACLsToAdd(leaderPath), CreateMode.EPHEMERAL));
-          // we set the json on the parent too for back compat, see SOLR-7844
-          ops.add(Op.setData(parent, json, -1));
-          List<OpResult> results;
-          
-          results = zkClient.multi(ops, true);
-          
-          for (OpResult result : results) {
-            if (result.getType() == ZooDefs.OpCode.setData) {
-              SetDataResult dresult = (SetDataResult) result;
-              Stat stat = dresult.getStat();
-              leaderZkNodeParentVersion = stat.getVersion();
-              return;
+          synchronized (lock) {
+            log.info("Creating leader registration node {} after winning as {}", leaderPath, leaderSeqPath);
+            List<Op> ops = new ArrayList<>(2);
+
+            // We use a multi operation to get the parent nodes version, which will
+            // be used to make sure we only remove our own leader registration node.
+            // The setData call used to get the parent version is also the trigger to
+            // increment the version. We also do a sanity check that our leaderSeqPath exists.
+
+            ops.add(Op.check(leaderSeqPath, -1));
+            ops.add(Op.create(leaderPath, Utils.toJSON(leaderProps), zkClient.getZkACLProvider().getACLsToAdd(leaderPath), CreateMode.EPHEMERAL));
+            ops.add(Op.setData(parent, null, -1));
+            List<OpResult> results;
+
+            results = zkClient.multi(ops, true);
+            for (OpResult result : results) {
+              if (result.getType() == ZooDefs.OpCode.setData) {
+                SetDataResult dresult = (SetDataResult) result;
+                Stat stat = dresult.getStat();
+                leaderZkNodeParentVersion = stat.getVersion();
+                return;
+              }
             }
+            assert leaderZkNodeParentVersion != null;
           }
-          assert leaderZkNodeParentVersion != null;
         }
       });
     } catch (Throwable t) {
@@ -226,7 +231,13 @@ class ShardLeaderElectionContextBase extends ElectionContext {
 
   public LeaderElector getLeaderElector() {
     return leaderElector;
-  }  
+  }
+
+  Integer getLeaderZkNodeParentVersion() {
+    synchronized (lock) {
+      return leaderZkNodeParentVersion;
+    }
+  }
 }
 
 // add core container and stop passing core around...

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78bed536/solr/core/src/java/org/apache/solr/cloud/ZkController.java
----------------------------------------------------------------------
diff --git a/solr/core/src/java/org/apache/solr/cloud/ZkController.java b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
index e7b5f50..cc8f7a2 100644
--- a/solr/core/src/java/org/apache/solr/cloud/ZkController.java
+++ b/solr/core/src/java/org/apache/solr/cloud/ZkController.java
@@ -2194,7 +2194,7 @@ public final class ZkController {
     // we use this version and multi to ensure *only* the current zk registered leader
     // for a shard can put a replica into LIR
     
-    Integer leaderZkNodeParentVersion = ((ShardLeaderElectionContextBase)context).leaderZkNodeParentVersion;
+    Integer leaderZkNodeParentVersion = ((ShardLeaderElectionContextBase)context).getLeaderZkNodeParentVersion();
     
     // TODO: should we do this optimistically to avoid races?
     if (zkClient.exists(znodePath, retryOnConnLoss)) {

http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/78bed536/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
----------------------------------------------------------------------
diff --git a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
index 108ccbe..98b04b4 100644
--- a/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
+++ b/solr/core/src/test/org/apache/solr/cloud/OverseerTest.java
@@ -121,6 +121,13 @@ public class OverseerTest extends SolrTestCaseJ4 {
     }
 
     public void close() {
+      for (ElectionContext ec : electionContext.values()) {
+        try {
+          ec.cancelElection();
+        } catch (Exception e) {
+          log.warn(String.format("Error cancelling election for %s", ec.id), e);
+        }
+      }
       deleteNode(ZkStateReader.LIVE_NODES_ZKNODE + "/" + nodeName);
       zkClient.close();
     }