You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@zookeeper.apache.org by ma...@apache.org on 2010/03/19 18:16:33 UTC

svn commit: r925339 - in /hadoop/zookeeper/branches/branch-3.2: ./ src/docs/src/documentation/content/xdocs/ src/java/main/org/apache/zookeeper/server/ src/java/test/org/apache/zookeeper/test/

Author: mahadev
Date: Fri Mar 19 17:16:33 2010
New Revision: 925339

URL: http://svn.apache.org/viewvc?rev=925339&view=rev
Log:
ZOOKEEPER-710. permanent ZSESSIONMOVED error after client app reconnects to zookeeper cluster (phunt via mahadev)

Modified:
    hadoop/zookeeper/branches/branch-3.2/CHANGES.txt
    hadoop/zookeeper/branches/branch-3.2/src/docs/src/documentation/content/xdocs/zookeeperProgrammers.xml
    hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/FinalRequestProcessor.java
    hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/NIOServerCnxn.java
    hadoop/zookeeper/branches/branch-3.2/src/java/test/org/apache/zookeeper/test/QuorumTest.java

Modified: hadoop/zookeeper/branches/branch-3.2/CHANGES.txt
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/branches/branch-3.2/CHANGES.txt?rev=925339&r1=925338&r2=925339&view=diff
==============================================================================
--- hadoop/zookeeper/branches/branch-3.2/CHANGES.txt (original)
+++ hadoop/zookeeper/branches/branch-3.2/CHANGES.txt Fri Mar 19 17:16:33 2010
@@ -1,3 +1,11 @@
+3.2 branch
+
+Backward compatible changes:
+
+BUGFIXES:
+  ZOOKEEPER-710. permanent ZSESSIONMOVED error after client app reconnects to
+  zookeeper cluster (phunt via mahadev)
+
 Release 3.2.2 - 2009-12-11
 
 Backward compatible changes:

Modified: hadoop/zookeeper/branches/branch-3.2/src/docs/src/documentation/content/xdocs/zookeeperProgrammers.xml
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/branches/branch-3.2/src/docs/src/documentation/content/xdocs/zookeeperProgrammers.xml?rev=925339&r1=925338&r2=925339&view=diff
==============================================================================
--- hadoop/zookeeper/branches/branch-3.2/src/docs/src/documentation/content/xdocs/zookeeperProgrammers.xml (original)
+++ hadoop/zookeeper/branches/branch-3.2/src/docs/src/documentation/content/xdocs/zookeeperProgrammers.xml Fri Mar 19 17:16:33 2010
@@ -479,12 +479,13 @@
       which has be reestablished on a different server. The normal cause of this error is
       a client that sends a request to a server, but the network packet gets delayed, so
       the client times out and connects to a new server. When the delayed packet arrives at
-      the first server, the old server detects that the session has moved and sends back
-      the SESSION_MOVED error. Clients normally do not see this error since they do not read
+      the first server, the old server detects that the session has moved, and closes the
+      client connection. Clients normally do not see this error since they do not read
       from those old connections. (Old connections are usually closed.) One situation in which this
-      error can be seen is when two clients try to reestablish the same connection using
+      condition can be seen is when two clients try to reestablish the same connection using
       a saved session id and password. One of the clients will reestablish the connection
-      and the second client will get the SessionMovedException.</para>
+      and the second client will be disconnected (causing the pair to attempt to re-establish
+      it's connection/session indefinitely).</para>
 
   </section>
 

Modified: hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/FinalRequestProcessor.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/FinalRequestProcessor.java?rev=925339&r1=925338&r2=925339&view=diff
==============================================================================
--- hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/FinalRequestProcessor.java (original)
+++ hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/FinalRequestProcessor.java Fri Mar 19 17:16:33 2010
@@ -27,6 +27,7 @@ import org.apache.log4j.Logger;
 import org.apache.zookeeper.KeeperException;
 import org.apache.zookeeper.ZooDefs;
 import org.apache.zookeeper.KeeperException.Code;
+import org.apache.zookeeper.KeeperException.SessionMovedException;
 import org.apache.zookeeper.ZooDefs.OpCode;
 import org.apache.zookeeper.data.ACL;
 import org.apache.zookeeper.data.Stat;
@@ -46,6 +47,7 @@ import org.apache.zookeeper.proto.SetWat
 import org.apache.zookeeper.proto.SyncRequest;
 import org.apache.zookeeper.proto.SyncResponse;
 import org.apache.zookeeper.server.DataTree.ProcessTxnResult;
+import org.apache.zookeeper.server.NIOServerCnxn;
 import org.apache.zookeeper.server.NIOServerCnxn.Factory;
 import org.apache.zookeeper.server.ZooKeeperServer.ChangeRecord;
 import org.apache.zookeeper.txn.CreateSessionTxn;
@@ -253,6 +255,17 @@ public class FinalRequestProcessor imple
                 rsp = new GetChildrenResponse(children);
                 break;
             }
+        } catch (SessionMovedException e) {
+            // session moved is a connection level error, we need to tear
+            // down the connection otw ZOOKEEPER-710 might happen
+            // ie client on slow follower starts to renew session, fails
+            // before this completes, then tries the fast follower (leader)
+            // and is successful, however the initial renew is then
+            // successfully fwd/processed by the leader and as a result
+            // the client and leader disagree on where the client is most
+            // recently attached (and therefore invalid SESSION MOVED generated)
+            ((NIOServerCnxn)request.cnxn).sendBuffer(NIOServerCnxn.closeConn);
+            return;
         } catch (KeeperException e) {
             err = e.code();
         } catch (Exception e) {

Modified: hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/NIOServerCnxn.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/NIOServerCnxn.java?rev=925339&r1=925338&r2=925339&view=diff
==============================================================================
--- hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/NIOServerCnxn.java (original)
+++ hadoop/zookeeper/branches/branch-3.2/src/java/main/org/apache/zookeeper/server/NIOServerCnxn.java Fri Mar 19 17:16:33 2010
@@ -351,20 +351,22 @@ public class NIOServerCnxn implements Wa
 
     void sendBuffer(ByteBuffer bb) {
         try {
-            // We check if write interest here because if it is NOT set, nothing
-            // is queued, so
-            // we can try to send the buffer right away without waking up the
-            // selector
-            if ((sk.interestOps() & SelectionKey.OP_WRITE) == 0) {
-                try {
-                    sock.write(bb);
-                } catch (IOException e) {
-                    // we are just doing best effort right now
+            if (bb != closeConn) {
+                // We check if write interest here because if it is NOT set, nothing
+                // is queued, so
+                // we can try to send the buffer right away without waking up the
+                // selector
+                if ((sk.interestOps() & SelectionKey.OP_WRITE) == 0) {
+                    try {
+                        sock.write(bb);
+                    } catch (IOException e) {
+                        // we are just doing best effort right now
+                    }
+                }
+                // if there is nothing left to send, we are done
+                if (bb.remaining() == 0) {
+                    return;
                 }
-            }
-            // if there is nothing left to send, we are done
-            if (bb.remaining() == 0) {
-                return;
             }
             synchronized (factory) {
                 sk.selector().wakeup();

Modified: hadoop/zookeeper/branches/branch-3.2/src/java/test/org/apache/zookeeper/test/QuorumTest.java
URL: http://svn.apache.org/viewvc/hadoop/zookeeper/branches/branch-3.2/src/java/test/org/apache/zookeeper/test/QuorumTest.java?rev=925339&r1=925338&r2=925339&view=diff
==============================================================================
--- hadoop/zookeeper/branches/branch-3.2/src/java/test/org/apache/zookeeper/test/QuorumTest.java (original)
+++ hadoop/zookeeper/branches/branch-3.2/src/java/test/org/apache/zookeeper/test/QuorumTest.java Fri Mar 19 17:16:33 2010
@@ -32,6 +32,7 @@ import org.apache.zookeeper.WatchedEvent
 import org.apache.zookeeper.Watcher;
 import org.apache.zookeeper.ZooDefs;
 import org.apache.zookeeper.ZooKeeper;
+import org.apache.zookeeper.Watcher.Event.KeeperState;
 import org.apache.zookeeper.data.Stat;
 import org.apache.zookeeper.server.quorum.FollowerHandler;
 import org.apache.zookeeper.server.quorum.Leader;
@@ -165,14 +166,14 @@ public class QuorumTest extends TestCase
     @Test
     public void testSessionMoved() throws IOException, InterruptedException, KeeperException {
         String hostPorts[] = qb.hostPort.split(",");
-        ZooKeeper zk = new DisconnectableZooKeeper(hostPorts[0], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
+        DisconnectableZooKeeper zk = new DisconnectableZooKeeper(hostPorts[0], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
             public void process(WatchedEvent event) {
             }});
         zk.create("/sessionMoveTest", new byte[0], Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         // we want to loop through the list twice
         for(int i = 0; i < hostPorts.length*2; i++) {
             // This should stomp the zk handle
-            ZooKeeper zknew = new DisconnectableZooKeeper(hostPorts[(i+1)%hostPorts.length], ClientBase.CONNECTION_TIMEOUT,
+            DisconnectableZooKeeper zknew = new DisconnectableZooKeeper(hostPorts[(i+1)%hostPorts.length], ClientBase.CONNECTION_TIMEOUT,
                     new Watcher() {public void process(WatchedEvent event) {
                     }},
                     zk.getSessionId(),
@@ -181,14 +182,23 @@ public class QuorumTest extends TestCase
             try {
                 zk.setData("/", new byte[1], -1);
                 fail("Should have lost the connection");
-            } catch(KeeperException.SessionMovedException e) {
+            } catch(KeeperException.ConnectionLossException e) {
             }
-            //zk.close();
+            zk.disconnect(); // close w/o closing session
             zk = zknew;
         }
         zk.close();
     }
     
+    private static class DiscoWatcher implements Watcher {
+        volatile boolean zkDisco = false;
+        public void process(WatchedEvent event) {
+            if (event.getState() == KeeperState.Disconnected) {
+                zkDisco = true;
+            }
+        }
+    }
+
     @Test
     /**
      * Connect to two different servers with two different handles using the same session and
@@ -196,28 +206,37 @@ public class QuorumTest extends TestCase
      */
     public void testSessionMove() throws IOException, InterruptedException, KeeperException {
         String hps[] = qb.hostPort.split(",");
-        ZooKeeper zk = new DisconnectableZooKeeper(hps[0], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
-            public void process(WatchedEvent event) {
-        }});
+        DiscoWatcher oldWatcher = new DiscoWatcher();
+        ZooKeeper zk = new DisconnectableZooKeeper(hps[0],
+                ClientBase.CONNECTION_TIMEOUT, oldWatcher);
         zk.create("/t1", new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         // This should stomp the zk handle
-        ZooKeeper zknew = new DisconnectableZooKeeper(hps[1], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
-            public void process(WatchedEvent event) {
-            }}, zk.getSessionId(), zk.getSessionPasswd());
+        DiscoWatcher watcher = new DiscoWatcher();
+        ZooKeeper zknew = new DisconnectableZooKeeper(hps[1],
+                ClientBase.CONNECTION_TIMEOUT, watcher, zk.getSessionId(),
+                zk.getSessionPasswd());
         zknew.create("/t2", new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         try {
             zk.create("/t3", new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
             fail("Should have lost the connection");
-        } catch(KeeperException.SessionMovedException e) {
+        } catch(KeeperException.ConnectionLossException e) {
+            // wait up to 30 seconds for the disco to be delivered
+            for (int i = 0; i < 30; i++) {
+                if (oldWatcher.zkDisco) {
+                    break;
+        }
+                Thread.sleep(1000);
+            }
+            assertTrue(oldWatcher.zkDisco);
         }
         
         ArrayList<ZooKeeper> toClose = new ArrayList<ZooKeeper>();
         toClose.add(zknew);
         // Let's just make sure it can still move
         for(int i = 0; i < 10; i++) {
-            zknew = new DisconnectableZooKeeper(hps[1], ClientBase.CONNECTION_TIMEOUT, new Watcher() {
-                public void process(WatchedEvent event) {
-                }}, zk.getSessionId(), zk.getSessionPasswd());
+            zknew = new DisconnectableZooKeeper(hps[1],
+                    ClientBase.CONNECTION_TIMEOUT, new DiscoWatcher(),
+                    zk.getSessionId(), zk.getSessionPasswd());
             toClose.add(zknew);
             zknew.create("/t-"+i, new byte[0], ZooDefs.Ids.OPEN_ACL_UNSAFE, CreateMode.EPHEMERAL);
         }