You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2014/01/08 17:36:37 UTC

svn commit: r1556573 - in /lucene/dev/branches/branch_4x: ./ solr/ solr/core/ solr/core/src/java/org/apache/solr/cloud/ solr/solrj/ solr/solrj/src/java/org/apache/solr/common/cloud/

Author: markrmiller
Date: Wed Jan  8 16:36:37 2014
New Revision: 1556573

URL: http://svn.apache.org/r1556573
Log:
SOLR-5615: Deadlock while trying to recover after a ZK session expiration.

Modified:
    lucene/dev/branches/branch_4x/   (props changed)
    lucene/dev/branches/branch_4x/solr/   (props changed)
    lucene/dev/branches/branch_4x/solr/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_4x/solr/core/   (props changed)
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
    lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java
    lucene/dev/branches/branch_4x/solr/solrj/   (props changed)
    lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/cloud/ConnectionManager.java

Modified: lucene/dev/branches/branch_4x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/CHANGES.txt?rev=1556573&r1=1556572&r2=1556573&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_4x/solr/CHANGES.txt Wed Jan  8 16:36:37 2014
@@ -308,6 +308,10 @@ Bug Fixes
 * SOLR-5608: Don't allow a closed SolrCore to publish state to ZooKeeper.
   (Mark Miller, Shawn Heisey)
 
+* SOLR-5615: Deadlock while trying to recover after a ZK session expiration.
+  (Ramkumar Aiyengar, Mark Miller)
+  
+
 Optimizations
 ----------------------
 

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java?rev=1556573&r1=1556572&r2=1556573&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ElectionContext.java Wed Jan  8 16:36:37 2014
@@ -458,6 +458,11 @@ final class OverseerElectionContext exte
     overseer.start(id);
   }
   
+  public void cancelElection() throws InterruptedException, KeeperException {
+    super.cancelElection();
+    overseer.close();
+  }
+  
   @Override
   public void joinedElectionFired() {
     overseer.close();

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java?rev=1556573&r1=1556572&r2=1556573&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/LeaderElector.java Wed Jan  8 16:36:37 2014
@@ -64,7 +64,6 @@ public  class LeaderElector {
   
   private ZkCmdExecutor zkCmdExecutor;
 
-  // for tests
   private volatile ElectionContext context;
 
   public LeaderElector(SolrZkClient zkClient) {
@@ -72,7 +71,6 @@ public  class LeaderElector {
     zkCmdExecutor = new ZkCmdExecutor(zkClient.getZkClientTimeout());
   }
   
-  // for tests
   public ElectionContext getContext() {
     return context;
   }

Modified: lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java?rev=1556573&r1=1556572&r2=1556573&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java (original)
+++ lucene/dev/branches/branch_4x/solr/core/src/java/org/apache/solr/cloud/ZkController.java Wed Jan  8 16:36:37 2014
@@ -17,6 +17,28 @@ package org.apache.solr.cloud;
  * limitations under the License.
  */
 
+import java.io.File;
+import java.io.IOException;
+import java.io.UnsupportedEncodingException;
+import java.net.InetAddress;
+import java.net.NetworkInterface;
+import java.net.URLEncoder;
+import java.net.UnknownHostException;
+import java.util.ArrayList;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.Enumeration;
+import java.util.HashMap;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Properties;
+import java.util.Set;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeoutException;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.lang.StringUtils;
 import org.apache.solr.client.solrj.impl.HttpSolrServer;
@@ -42,7 +64,6 @@ import org.apache.solr.core.CoreDescript
 import org.apache.solr.core.SolrCore;
 import org.apache.solr.handler.component.ShardHandler;
 import org.apache.solr.update.UpdateLog;
-import org.apache.solr.update.UpdateShardHandler;
 import org.apache.zookeeper.CreateMode;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.KeeperException.NoNodeException;
@@ -51,28 +72,6 @@ import org.apache.zookeeper.data.Stat;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
-import java.io.File;
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.net.InetAddress;
-import java.net.NetworkInterface;
-import java.net.URLEncoder;
-import java.net.UnknownHostException;
-import java.util.ArrayList;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.Enumeration;
-import java.util.HashMap;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Properties;
-import java.util.Set;
-import java.util.concurrent.Future;
-import java.util.concurrent.TimeoutException;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
 /**
  * Handle ZooKeeper interactions.
  * 
@@ -140,13 +139,13 @@ public final class ZkController {
   }
   private final Map<ContextKey, ElectionContext> electionContexts = Collections.synchronizedMap(new HashMap<ContextKey, ElectionContext>());
   
-  private SolrZkClient zkClient;
-  private ZkCmdExecutor cmdExecutor;
-  private ZkStateReader zkStateReader;
+  private final SolrZkClient zkClient;
+  private final ZkCmdExecutor cmdExecutor;
+  private final ZkStateReader zkStateReader;
 
-  private LeaderElector leaderElector;
+  private final LeaderElector leaderElector;
   
-  private String zkServerAddress;          // example: 127.0.0.1:54062/solr
+  private final String zkServerAddress;          // example: 127.0.0.1:54062/solr
 
   private final String localHostPort;      // example: 54065
   private final String localHostContext;   // example: solr
@@ -227,6 +226,11 @@ public final class ZkController {
               ElectionContext context = new OverseerElectionContext(zkClient,
                   overseer, getNodeName());
               
+              ElectionContext prevContext = overseerElector.getContext();
+              if (prevContext != null) {
+                prevContext.cancelElection();
+              }
+              
               overseerElector.joinElection(context, true);
               zkStateReader.createClusterStateWatchersAndUpdate();
               
@@ -919,6 +923,17 @@ public final class ZkController {
 
 
   private void joinElection(CoreDescriptor cd, boolean afterExpiration) throws InterruptedException, KeeperException, IOException {
+    // look for old context - if we find it, cancel it
+    String collection = cd.getCloudDescriptor().getCollectionName();
+    final String coreNodeName = cd.getCloudDescriptor().getCoreNodeName();
+    
+    ContextKey contextKey = new ContextKey(collection, coreNodeName);
+    
+    ElectionContext prevContext = electionContexts.get(contextKey);
+    
+    if (prevContext != null) {
+      prevContext.cancelElection();
+    }
     
     String shardId = cd.getCloudDescriptor().getShardId();
     
@@ -928,16 +943,15 @@ public final class ZkController {
     props.put(ZkStateReader.CORE_NAME_PROP, cd.getName());
     props.put(ZkStateReader.NODE_NAME_PROP, getNodeName());
     
-    final String coreNodeName = cd.getCloudDescriptor().getCoreNodeName();
+ 
     ZkNodeProps ourProps = new ZkNodeProps(props);
-    String collection = cd.getCloudDescriptor()
-        .getCollectionName();
+
     
     ElectionContext context = new ShardLeaderElectionContext(leaderElector, shardId,
         collection, coreNodeName, ourProps, this, cc);
 
     leaderElector.setup(context);
-    electionContexts.put(new ContextKey(collection, coreNodeName), context);
+    electionContexts.put(contextKey, context);
     leaderElector.joinElection(context, false);
   }
 

Modified: lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/cloud/ConnectionManager.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/cloud/ConnectionManager.java?rev=1556573&r1=1556572&r2=1556573&view=diff
==============================================================================
--- lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/cloud/ConnectionManager.java (original)
+++ lucene/dev/branches/branch_4x/solr/solrj/src/java/org/apache/solr/common/cloud/ConnectionManager.java Wed Jan  8 16:36:37 2014
@@ -34,10 +34,9 @@ public class ConnectionManager implement
       .getLogger(ConnectionManager.class);
 
   private final String name;
-  private CountDownLatch clientConnected;
-  private KeeperState state = KeeperState.Disconnected;
+  private final CountDownLatch clientConnected = new CountDownLatch(1);
+  
   private boolean connected = false;
-  private boolean likelyExpired = true;
 
   private final ZkClientConnectionStrategy connectionStrategy;
 
@@ -48,7 +47,9 @@ public class ConnectionManager implement
   private final OnReconnect onReconnect;
   private final BeforeReconnect beforeReconnect;
 
+  private volatile KeeperState state = KeeperState.Disconnected;
   private volatile boolean isClosed = false;
+  private volatile boolean likelyExpired = true;
   
   private volatile Timer disconnectedTimer;
 
@@ -59,16 +60,16 @@ public class ConnectionManager implement
     this.zkServerAddress = zkServerAddress;
     this.onReconnect = onConnect;
     this.beforeReconnect = beforeReconnect;
-    clientConnected = new CountDownLatch(1);
   }
   
   private synchronized void connected() {
-    connected = true;
     if (disconnectedTimer != null) {
       disconnectedTimer.cancel();
       disconnectedTimer = null;
     }
+    connected = true;
     likelyExpired = false;
+    notifyAll();
   }
 
   private synchronized void disconnected() {
@@ -82,18 +83,17 @@ public class ConnectionManager implement
         
         @Override
         public void run() {
-          synchronized (ConnectionManager.this) {
-            likelyExpired = true;
-          }
+          likelyExpired = true;
         }
         
       }, (long) (client.getZkClientTimeout() * 0.90));
     }
     connected = false;
+    notifyAll();
   }
 
   @Override
-  public synchronized void process(WatchedEvent event) {
+  public void process(WatchedEvent event) {
     if (log.isInfoEnabled()) {
       log.info("Watcher " + this + " name:" + name + " got event " + event
           + " path:" + event.getPath() + " type:" + event.getType());
@@ -103,8 +103,9 @@ public class ConnectionManager implement
       log.info("Client->ZooKeeper status change trigger but we are already closed");
       return;
     }
-
+    
     state = event.getState();
+    
     if (state == KeeperState.SyncConnected) {
       connected();
       clientConnected.countDown();
@@ -117,12 +118,16 @@ public class ConnectionManager implement
       
       connected = false;
       likelyExpired = true;
+      
       log.info("Our previous ZooKeeper session was expired. Attempting to reconnect to recover relationship with ZooKeeper...");
+      
       if (beforeReconnect != null) {
         beforeReconnect.command();
       }
+      
       try {
-        connectionStrategy.reconnect(zkServerAddress, client.getZkClientTimeout(), this,
+        connectionStrategy.reconnect(zkServerAddress,
+            client.getZkClientTimeout(), this,
             new ZkClientConnectionStrategy.ZkUpdate() {
               @Override
               public void update(SolrZooKeeper keeper) {
@@ -146,12 +151,22 @@ public class ConnectionManager implement
                   throw new RuntimeException(t);
                 }
                 
+                connected();
+                
                 if (onReconnect != null) {
-                  onReconnect.command();
+                  Thread thread = new Thread() {
+                    @Override
+                    public void run() {
+                      try {
+                        onReconnect.command();
+                      } catch (Exception e) {
+                        log.warn("Exception running onReconnect command", e);
+                      }
+                    }
+                  };
+                  thread.start();
                 }
                 
-                connected();
-                
               }
             });
       } catch (Exception e) {
@@ -165,7 +180,6 @@ public class ConnectionManager implement
     } else {
       disconnected();
     }
-    notifyAll();
   }
 
   public synchronized boolean isConnected() {
@@ -185,12 +199,8 @@ public class ConnectionManager implement
       this.disconnectedTimer = null;
     }
   }
-
-  public synchronized KeeperState state() {
-    return state;
-  }
   
-  public synchronized boolean isLikelyExpired() {
+  public boolean isLikelyExpired() {
     return likelyExpired;
   }
 
@@ -241,4 +251,4 @@ public class ConnectionManager implement
           "", e);
     }
   }
-}
+}
\ No newline at end of file