You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@zookeeper.apache.org by Pramod Srinivasan <pr...@gmail.com> on 2019/11/12 06:04:35 UTC

enable fast reconnect from client

I have a few rudimentary questions about the C client's reconnect behavior;
please help clarify the details.

I came across a problem where the c client would take a long time to
reconnect when the server is restarted. Looking at the client code, I see
that if the client were to issue a connect to the server and the server has
not yet started, depending on receive and send timeout thresholds we can
potentially wait up to 2/3*session timeout before we try another connect
(zookeeper_connect). In my case, the client was getting connected to the
server after 120 seconds (180 seconds is my session timeout). I saw similar
behavior during a sync operation as well. I experimented with the code a
bit and was able to cut the reconnect time from anywhere between 0 and 120
seconds down to 2-3 seconds (though I don't know whether the diff is correct).

A few questions:

- Why do we tie the ZooKeeper session timeout to the client-side reconnect
time? If the client is in the connecting state, shouldn't we attempt a
reconnect sooner?
- Is there any way to keep the session timeout large (180 seconds) but
still be able to reconnect faster?
- What is wrong with the diff below?

diff -up
/tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
src/mt_adaptor.c

---
/mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
2015-07-28
00:20:16.000000000 -0700

+++ src/mt_adaptor.c 2019-11-06 21:53:07.207761838 -0800

@@ -386,6 +386,11 @@ void *do_io(void *v)

         }

         timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);



+ if (timeout > 5000) {

+     timeout = 5000;

+     LOG_INFO(LOGCALLBACK(zh), "Setting timeout to 5 seconds");

+ }

+

         poll(fds,maxfd,timeout);

         if (fd != -1) {

             interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;

diff -up
/tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
src/zookeeper.c

---
/mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
2019-11-04
13:49:55.612389268 -0800

+++ src/zookeeper.c 2019-11-09 01:32:29.235401128 -0800

@@ -2136,8 +2136,9 @@ static socket_t zookeeper_connect(zhandl

     addr_len = sizeof(struct sockaddr_in);

 #endif



-    LOG_DEBUG(LOGCALLBACK(zh), "[zk] connect()\n");

+    LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to
%s\n",zoo_get_current_server(zh));

     rc = connect(fd, (struct sockaddr *)addr, addr_len);

+    LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to %s rc %d errno
%d\n",zoo_get_current_server(zh), rc, errno);



 #ifdef _WIN32

     get_errno();

@@ -2160,6 +2161,7 @@ int zookeeper_interest(zhandle_t *zh, so

      struct timeval *tv)

 {

     int rc = 0;

+    static int retry_connect = 0;

     struct timeval now;

     if(zh==0 || fd==0 ||interest==0 || tv==0)

         return ZBADARGUMENTS;

@@ -2185,6 +2187,21 @@ int zookeeper_interest(zhandle_t *zh, so

     tv->tv_sec = 0;

     tv->tv_usec = 0;



+    if (retry_connect) {

+        if (zh->state == ZOO_CONNECTING_STATE) {

+

+ LOG_INFO(LOGCALLBACK(zh),

+     "Retry connect to zookeeper as zh is in connecting state");

+

+ close(zh->fd);

+ zh->fd = -1;

+ zh->state = ZOO_NOTCONNECTED_STATE;

+ zh->reconfig = 1;

+ }

+

+ retry_connect = 0;

+    }

+

     if (*fd == -1) {

         /*

          * If we previously failed to connect to server pool (zh->delay ==
1)

@@ -2322,6 +2339,21 @@ int zookeeper_interest(zhandle_t *zh, so

         // choose the lesser value as the timeout

         *tv = get_timeval(min(recv_to, send_to));



+ if (zh->state == ZOO_CONNECTING_STATE) {

+

+     retry_connect = 1;

+

+            LOG_INFO(LOGCALLBACK(zh),

+     "Zookeeper in connecting state, retry in sec %ld usec %ld",

+     tv->tv_sec, tv->tv_usec);

+

+     *tv = get_timeval(1000);

+

+            LOG_INFO(LOGCALLBACK(zh),

+     "Retry connect in sec %ld usec %ld instead",

+     tv->tv_sec, tv->tv_usec);

+ }

+

         zh->next_deadline.tv_sec = now.tv_sec + tv->tv_sec;

         zh->next_deadline.tv_usec = now.tv_usec + tv->tv_usec;

         if (zh->next_deadline.tv_usec > 1000000) {

Re: enable fast reconnect from client

Posted by Pramod Srinivasan <pr...@gmail.com>.
Any thoughts on this?

On Mon, Nov 11, 2019 at 10:04 PM Pramod Srinivasan <pr...@gmail.com>
wrote:

> I have a few rudimentary questions on c-client reconnect, please help in
> clarifying the details.
>
> I came across a problem where the c client would take a long time to
> reconnect when the server is restarted. Looking at the client code, I see
> that if the client were to issue a connect to the server and the server has
> not yet started, depending on receive and send timeout thresholds we can
> potentially wait up to 2/3*session timeout before we try another connect
> (zookeeper_connect). In my case, the client was getting connected to the
> server after 120 seconds (180 seconds is my session timeout). I saw similar
> behavior during a sync operation as well. I experimented with the code a
> bit and I am able to reconnect faster from 0-120 seconds to 2-3 seconds (I
> don't know if the diff is correct)
>
> A few questions:
>
> - Why do we tie zookeeper session timeout and the client side reconnect
> time? If the client is in connecting state, shouldn't we attempt a
> reconnect faster?
> - Is there any way to keep the session time out large (180 seconds) but
> still have the ability to reconnect faster?
> - What is wrong with the diff below?
>
> diff -up
> /tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
> src/mt_adaptor.c
>
> ---
> /mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c 2015-07-28
> 00:20:16.000000000 -0700
>
> +++ src/mt_adaptor.c 2019-11-06 21:53:07.207761838 -0800
>
> @@ -386,6 +386,11 @@ void *do_io(void *v)
>
>          }
>
>          timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
>
>
>
> + if (timeout > 5000) {
>
> +     timeout = 5000;
>
> +     LOG_INFO(LOGCALLBACK(zh), "Setting timeout to 5 seconds");
>
> + }
>
> +
>
>          poll(fds,maxfd,timeout);
>
>          if (fd != -1) {
>
>              interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
>
> diff -up
> /tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
> src/zookeeper.c
>
> ---
> /mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c 2019-11-04
> 13:49:55.612389268 -0800
>
> +++ src/zookeeper.c 2019-11-09 01:32:29.235401128 -0800
>
> @@ -2136,8 +2136,9 @@ static socket_t zookeeper_connect(zhandl
>
>      addr_len = sizeof(struct sockaddr_in);
>
>  #endif
>
>
>
> -    LOG_DEBUG(LOGCALLBACK(zh), "[zk] connect()\n");
>
> +    LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to
> %s\n",zoo_get_current_server(zh));
>
>      rc = connect(fd, (struct sockaddr *)addr, addr_len);
>
> +    LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to %s rc %d errno
> %d\n",zoo_get_current_server(zh), rc, errno);
>
>
>
>  #ifdef _WIN32
>
>      get_errno();
>
> @@ -2160,6 +2161,7 @@ int zookeeper_interest(zhandle_t *zh, so
>
>       struct timeval *tv)
>
>  {
>
>      int rc = 0;
>
> +    static int retry_connect = 0;
>
>      struct timeval now;
>
>      if(zh==0 || fd==0 ||interest==0 || tv==0)
>
>          return ZBADARGUMENTS;
>
> @@ -2185,6 +2187,21 @@ int zookeeper_interest(zhandle_t *zh, so
>
>      tv->tv_sec = 0;
>
>      tv->tv_usec = 0;
>
>
>
> +    if (retry_connect) {
>
> +        if (zh->state == ZOO_CONNECTING_STATE) {
>
> +
>
> + LOG_INFO(LOGCALLBACK(zh),
>
> +     "Retry connect to zookeeper as zh is in connecting state");
>
> +
>
> + close(zh->fd);
>
> + zh->fd = -1;
>
> + zh->state = ZOO_NOTCONNECTED_STATE;
>
> + zh->reconfig = 1;
>
> + }
>
> +
>
> + retry_connect = 0;
>
> +    }
>
> +
>
>      if (*fd == -1) {
>
>          /*
>
>           * If we previously failed to connect to server pool (zh->delay
> == 1)
>
> @@ -2322,6 +2339,21 @@ int zookeeper_interest(zhandle_t *zh, so
>
>          // choose the lesser value as the timeout
>
>          *tv = get_timeval(min(recv_to, send_to));
>
>
>
> + if (zh->state == ZOO_CONNECTING_STATE) {
>
> +
>
> +     retry_connect = 1;
>
> +
>
> +            LOG_INFO(LOGCALLBACK(zh),
>
> +     "Zookeeper in connecting state, retry in sec %ld usec %ld",
>
> +     tv->tv_sec, tv->tv_usec);
>
> +
>
> +     *tv = get_timeval(1000);
>
> +
>
> +            LOG_INFO(LOGCALLBACK(zh),
>
> +     "Retry connect in sec %ld usec %ld instead",
>
> +     tv->tv_sec, tv->tv_usec);
>
> + }
>
> +
>
>          zh->next_deadline.tv_sec = now.tv_sec + tv->tv_sec;
>
>          zh->next_deadline.tv_usec = now.tv_usec + tv->tv_usec;
>
>          if (zh->next_deadline.tv_usec > 1000000) {
>

Re: enable fast reconnect from client

Posted by Pramod Srinivasan <pr...@gmail.com>.
Any thoughts on this?

On Mon, Nov 11, 2019 at 10:04 PM Pramod Srinivasan <pr...@gmail.com>
wrote:

> I have a few rudimentary questions on c-client reconnect, please help in
> clarifying the details.
>
> I came across a problem where the c client would take a long time to
> reconnect when the server is restarted. Looking at the client code, I see
> that if the client were to issue a connect to the server and the server has
> not yet started, depending on receive and send timeout thresholds we can
> potentially wait up to 2/3*session timeout before we try another connect
> (zookeeper_connect). In my case, the client was getting connected to the
> server after 120 seconds (180 seconds is my session timeout). I saw similar
> behavior during a sync operation as well. I experimented with the code a
> bit and I am able to reconnect faster from 0-120 seconds to 2-3 seconds (I
> don't know if the diff is correct)
>
> A few questions:
>
> - Why do we tie zookeeper session timeout and the client side reconnect
> time? If the client is in connecting state, shouldn't we attempt a
> reconnect faster?
> - Is there any way to keep the session time out large (180 seconds) but
> still have the ability to reconnect faster?
> - What is wrong with the diff below?
>
> diff -up
> /tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
> src/mt_adaptor.c
>
> ---
> /mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c 2015-07-28
> 00:20:16.000000000 -0700
>
> +++ src/mt_adaptor.c 2019-11-06 21:53:07.207761838 -0800
>
> @@ -386,6 +386,11 @@ void *do_io(void *v)
>
>          }
>
>          timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
>
>
>
> + if (timeout > 5000) {
>
> +     timeout = 5000;
>
> +     LOG_INFO(LOGCALLBACK(zh), "Setting timeout to 5 seconds");
>
> + }
>
> +
>
>          poll(fds,maxfd,timeout);
>
>          if (fd != -1) {
>
>              interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
>
> diff -up
> /tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
> src/zookeeper.c
>
> ---
> /mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c 2019-11-04
> 13:49:55.612389268 -0800
>
> +++ src/zookeeper.c 2019-11-09 01:32:29.235401128 -0800
>
> @@ -2136,8 +2136,9 @@ static socket_t zookeeper_connect(zhandl
>
>      addr_len = sizeof(struct sockaddr_in);
>
>  #endif
>
>
>
> -    LOG_DEBUG(LOGCALLBACK(zh), "[zk] connect()\n");
>
> +    LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to
> %s\n",zoo_get_current_server(zh));
>
>      rc = connect(fd, (struct sockaddr *)addr, addr_len);
>
> +    LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to %s rc %d errno
> %d\n",zoo_get_current_server(zh), rc, errno);
>
>
>
>  #ifdef _WIN32
>
>      get_errno();
>
> @@ -2160,6 +2161,7 @@ int zookeeper_interest(zhandle_t *zh, so
>
>       struct timeval *tv)
>
>  {
>
>      int rc = 0;
>
> +    static int retry_connect = 0;
>
>      struct timeval now;
>
>      if(zh==0 || fd==0 ||interest==0 || tv==0)
>
>          return ZBADARGUMENTS;
>
> @@ -2185,6 +2187,21 @@ int zookeeper_interest(zhandle_t *zh, so
>
>      tv->tv_sec = 0;
>
>      tv->tv_usec = 0;
>
>
>
> +    if (retry_connect) {
>
> +        if (zh->state == ZOO_CONNECTING_STATE) {
>
> +
>
> + LOG_INFO(LOGCALLBACK(zh),
>
> +     "Retry connect to zookeeper as zh is in connecting state");
>
> +
>
> + close(zh->fd);
>
> + zh->fd = -1;
>
> + zh->state = ZOO_NOTCONNECTED_STATE;
>
> + zh->reconfig = 1;
>
> + }
>
> +
>
> + retry_connect = 0;
>
> +    }
>
> +
>
>      if (*fd == -1) {
>
>          /*
>
>           * If we previously failed to connect to server pool (zh->delay
> == 1)
>
> @@ -2322,6 +2339,21 @@ int zookeeper_interest(zhandle_t *zh, so
>
>          // choose the lesser value as the timeout
>
>          *tv = get_timeval(min(recv_to, send_to));
>
>
>
> + if (zh->state == ZOO_CONNECTING_STATE) {
>
> +
>
> +     retry_connect = 1;
>
> +
>
> +            LOG_INFO(LOGCALLBACK(zh),
>
> +     "Zookeeper in connecting state, retry in sec %ld usec %ld",
>
> +     tv->tv_sec, tv->tv_usec);
>
> +
>
> +     *tv = get_timeval(1000);
>
> +
>
> +            LOG_INFO(LOGCALLBACK(zh),
>
> +     "Retry connect in sec %ld usec %ld instead",
>
> +     tv->tv_sec, tv->tv_usec);
>
> + }
>
> +
>
>          zh->next_deadline.tv_sec = now.tv_sec + tv->tv_sec;
>
>          zh->next_deadline.tv_usec = now.tv_usec + tv->tv_usec;
>
>          if (zh->next_deadline.tv_usec > 1000000) {
>