You are viewing a plain text version of this content. The canonical link for it is here.
Posted to dev@zookeeper.apache.org by Pramod Srinivasan <pr...@gmail.com> on 2019/11/12 06:04:35 UTC
enable fast reconnect from client
I have a few rudimentary questions about the C client's reconnect behavior;
please help clarify the details.
I came across a problem where the C client takes a long time to reconnect
when the server is restarted. Looking at the client code, I see that if the
client issues a connect to the server and the server has not yet started,
then, depending on the receive and send timeout thresholds, we can
potentially wait up to 2/3 * session timeout before we try another connect
(zookeeper_connect). In my case, the client was getting connected to the
server after 120 seconds (180 seconds is my session timeout). I saw similar
behavior during a sync operation as well. I experimented with the code a
bit and was able to bring the reconnect time down from 0-120 seconds to
2-3 seconds (I don't know if the diff is correct).
A few questions:
- Why do we tie the ZooKeeper session timeout to the client-side reconnect
time? If the client is in the connecting state, shouldn't we attempt a
reconnect faster?
- Is there any way to keep the session timeout large (180 seconds) but
still have the ability to reconnect faster?
- What is wrong with the diff below?
diff -up
/tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
src/mt_adaptor.c
---
/mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
2015-07-28
00:20:16.000000000 -0700
+++ src/mt_adaptor.c 2019-11-06 21:53:07.207761838 -0800
@@ -386,6 +386,11 @@ void *do_io(void *v)
}
timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
+ if (timeout > 5000) {
+ timeout = 5000;
+ LOG_INFO(LOGCALLBACK(zh), "Setting timeout to 5 seconds");
+ }
+
poll(fds,maxfd,timeout);
if (fd != -1) {
interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
diff -up
/tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
src/zookeeper.c
---
/mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
2019-11-04
13:49:55.612389268 -0800
+++ src/zookeeper.c 2019-11-09 01:32:29.235401128 -0800
@@ -2136,8 +2136,9 @@ static socket_t zookeeper_connect(zhandl
addr_len = sizeof(struct sockaddr_in);
#endif
- LOG_DEBUG(LOGCALLBACK(zh), "[zk] connect()\n");
+ LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to
%s\n",zoo_get_current_server(zh));
rc = connect(fd, (struct sockaddr *)addr, addr_len);
+ LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to %s rc %d errno
%d\n",zoo_get_current_server(zh), rc, errno);
#ifdef _WIN32
get_errno();
@@ -2160,6 +2161,7 @@ int zookeeper_interest(zhandle_t *zh, so
struct timeval *tv)
{
int rc = 0;
+ static int retry_connect = 0;
struct timeval now;
if(zh==0 || fd==0 ||interest==0 || tv==0)
return ZBADARGUMENTS;
@@ -2185,6 +2187,21 @@ int zookeeper_interest(zhandle_t *zh, so
tv->tv_sec = 0;
tv->tv_usec = 0;
+ if (retry_connect) {
+ if (zh->state == ZOO_CONNECTING_STATE) {
+
+ LOG_INFO(LOGCALLBACK(zh),
+ "Retry connect to zookeeper as zh is in connecting state");
+
+ close(zh->fd);
+ zh->fd = -1;
+ zh->state = ZOO_NOTCONNECTED_STATE;
+ zh->reconfig = 1;
+ }
+
+ retry_connect = 0;
+ }
+
if (*fd == -1) {
/*
* If we previously failed to connect to server pool (zh->delay ==
1)
@@ -2322,6 +2339,21 @@ int zookeeper_interest(zhandle_t *zh, so
// choose the lesser value as the timeout
*tv = get_timeval(min(recv_to, send_to));
+ if (zh->state == ZOO_CONNECTING_STATE) {
+
+ retry_connect = 1;
+
+ LOG_INFO(LOGCALLBACK(zh),
+ "Zookeeper in connecting state, retry in sec %ld usec %ld",
+ tv->tv_sec, tv->tv_usec);
+
+ *tv = get_timeval(1000);
+
+ LOG_INFO(LOGCALLBACK(zh),
+ "Retry connect in sec %ld usec %ld instead",
+ tv->tv_sec, tv->tv_usec);
+ }
+
zh->next_deadline.tv_sec = now.tv_sec + tv->tv_sec;
zh->next_deadline.tv_usec = now.tv_usec + tv->tv_usec;
if (zh->next_deadline.tv_usec > 1000000) {
Re: enable fast reconnect from client
Posted by Pramod Srinivasan <pr...@gmail.com>.
Any thoughts on this?
On Mon, Nov 11, 2019 at 10:04 PM Pramod Srinivasan <pr...@gmail.com>
wrote:
> I have a few rudimentary questions about the C client's reconnect behavior;
> please help clarify the details.
>
> I came across a problem where the C client takes a long time to reconnect
> when the server is restarted. Looking at the client code, I see that if the
> client issues a connect to the server and the server has not yet started,
> then, depending on the receive and send timeout thresholds, we can
> potentially wait up to 2/3 * session timeout before we try another connect
> (zookeeper_connect). In my case, the client was getting connected to the
> server after 120 seconds (180 seconds is my session timeout). I saw similar
> behavior during a sync operation as well. I experimented with the code a
> bit and was able to bring the reconnect time down from 0-120 seconds to
> 2-3 seconds (I don't know if the diff is correct).
>
> A few questions:
>
> - Why do we tie the ZooKeeper session timeout to the client-side reconnect
> time? If the client is in the connecting state, shouldn't we attempt a
> reconnect faster?
> - Is there any way to keep the session timeout large (180 seconds) but
> still have the ability to reconnect faster?
> - What is wrong with the diff below?
>
> diff -up
> /tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
> src/mt_adaptor.c
>
> ---
> /mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c 2015-07-28
> 00:20:16.000000000 -0700
>
> +++ src/mt_adaptor.c 2019-11-06 21:53:07.207761838 -0800
>
> @@ -386,6 +386,11 @@ void *do_io(void *v)
>
> }
>
> timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
>
>
>
> + if (timeout > 5000) {
>
> + timeout = 5000;
>
> + LOG_INFO(LOGCALLBACK(zh), "Setting timeout to 5 seconds");
>
> + }
>
> +
>
> poll(fds,maxfd,timeout);
>
> if (fd != -1) {
>
> interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
>
> diff -up
> /tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
> src/zookeeper.c
>
> ---
> /mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c 2019-11-04
> 13:49:55.612389268 -0800
>
> +++ src/zookeeper.c 2019-11-09 01:32:29.235401128 -0800
>
> @@ -2136,8 +2136,9 @@ static socket_t zookeeper_connect(zhandl
>
> addr_len = sizeof(struct sockaddr_in);
>
> #endif
>
>
>
> - LOG_DEBUG(LOGCALLBACK(zh), "[zk] connect()\n");
>
> + LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to
> %s\n",zoo_get_current_server(zh));
>
> rc = connect(fd, (struct sockaddr *)addr, addr_len);
>
> + LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to %s rc %d errno
> %d\n",zoo_get_current_server(zh), rc, errno);
>
>
>
> #ifdef _WIN32
>
> get_errno();
>
> @@ -2160,6 +2161,7 @@ int zookeeper_interest(zhandle_t *zh, so
>
> struct timeval *tv)
>
> {
>
> int rc = 0;
>
> + static int retry_connect = 0;
>
> struct timeval now;
>
> if(zh==0 || fd==0 ||interest==0 || tv==0)
>
> return ZBADARGUMENTS;
>
> @@ -2185,6 +2187,21 @@ int zookeeper_interest(zhandle_t *zh, so
>
> tv->tv_sec = 0;
>
> tv->tv_usec = 0;
>
>
>
> + if (retry_connect) {
>
> + if (zh->state == ZOO_CONNECTING_STATE) {
>
> +
>
> + LOG_INFO(LOGCALLBACK(zh),
>
> + "Retry connect to zookeeper as zh is in connecting state");
>
> +
>
> + close(zh->fd);
>
> + zh->fd = -1;
>
> + zh->state = ZOO_NOTCONNECTED_STATE;
>
> + zh->reconfig = 1;
>
> + }
>
> +
>
> + retry_connect = 0;
>
> + }
>
> +
>
> if (*fd == -1) {
>
> /*
>
> * If we previously failed to connect to server pool (zh->delay
> == 1)
>
> @@ -2322,6 +2339,21 @@ int zookeeper_interest(zhandle_t *zh, so
>
> // choose the lesser value as the timeout
>
> *tv = get_timeval(min(recv_to, send_to));
>
>
>
> + if (zh->state == ZOO_CONNECTING_STATE) {
>
> +
>
> + retry_connect = 1;
>
> +
>
> + LOG_INFO(LOGCALLBACK(zh),
>
> + "Zookeeper in connecting state, retry in sec %ld usec %ld",
>
> + tv->tv_sec, tv->tv_usec);
>
> +
>
> + *tv = get_timeval(1000);
>
> +
>
> + LOG_INFO(LOGCALLBACK(zh),
>
> + "Retry connect in sec %ld usec %ld instead",
>
> + tv->tv_sec, tv->tv_usec);
>
> + }
>
> +
>
> zh->next_deadline.tv_sec = now.tv_sec + tv->tv_sec;
>
> zh->next_deadline.tv_usec = now.tv_usec + tv->tv_usec;
>
> if (zh->next_deadline.tv_usec > 1000000) {
>
Re: enable fast reconnect from client
Posted by Pramod Srinivasan <pr...@gmail.com>.
Any thoughts on this?
On Mon, Nov 11, 2019 at 10:04 PM Pramod Srinivasan <pr...@gmail.com>
wrote:
> I have a few rudimentary questions about the C client's reconnect behavior;
> please help clarify the details.
>
> I came across a problem where the C client takes a long time to reconnect
> when the server is restarted. Looking at the client code, I see that if the
> client issues a connect to the server and the server has not yet started,
> then, depending on the receive and send timeout thresholds, we can
> potentially wait up to 2/3 * session timeout before we try another connect
> (zookeeper_connect). In my case, the client was getting connected to the
> server after 120 seconds (180 seconds is my session timeout). I saw similar
> behavior during a sync operation as well. I experimented with the code a
> bit and was able to bring the reconnect time down from 0-120 seconds to
> 2-3 seconds (I don't know if the diff is correct).
>
> A few questions:
>
> - Why do we tie the ZooKeeper session timeout to the client-side reconnect
> time? If the client is in the connecting state, shouldn't we attempt a
> reconnect faster?
> - Is there any way to keep the session timeout large (180 seconds) but
> still have the ability to reconnect faster?
> - What is wrong with the diff below?
>
> diff -up
> /tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c
> src/mt_adaptor.c
>
> ---
> /mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/mt_adaptor.c 2015-07-28
> 00:20:16.000000000 -0700
>
> +++ src/mt_adaptor.c 2019-11-06 21:53:07.207761838 -0800
>
> @@ -386,6 +386,11 @@ void *do_io(void *v)
>
> }
>
> timeout=tv.tv_sec * 1000 + (tv.tv_usec/1000);
>
>
>
> + if (timeout > 5000) {
>
> + timeout = 5000;
>
> + LOG_INFO(LOGCALLBACK(zh), "Setting timeout to 5 seconds");
>
> + }
>
> +
>
> poll(fds,maxfd,timeout);
>
> if (fd != -1) {
>
> interest=(fds[1].revents&POLLIN)?ZOOKEEPER_READ:0;
>
> diff -up
> /tmp/org/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c
> src/zookeeper.c
>
> ---
> /mysb/zookeeper/3.5.1-alpha-r0/zookeeper-3.5.1-alpha/src/c/src/zookeeper.c 2019-11-04
> 13:49:55.612389268 -0800
>
> +++ src/zookeeper.c 2019-11-09 01:32:29.235401128 -0800
>
> @@ -2136,8 +2136,9 @@ static socket_t zookeeper_connect(zhandl
>
> addr_len = sizeof(struct sockaddr_in);
>
> #endif
>
>
>
> - LOG_DEBUG(LOGCALLBACK(zh), "[zk] connect()\n");
>
> + LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to
> %s\n",zoo_get_current_server(zh));
>
> rc = connect(fd, (struct sockaddr *)addr, addr_len);
>
> + LOG_INFO(LOGCALLBACK(zh), "[zk] connect() to %s rc %d errno
> %d\n",zoo_get_current_server(zh), rc, errno);
>
>
>
> #ifdef _WIN32
>
> get_errno();
>
> @@ -2160,6 +2161,7 @@ int zookeeper_interest(zhandle_t *zh, so
>
> struct timeval *tv)
>
> {
>
> int rc = 0;
>
> + static int retry_connect = 0;
>
> struct timeval now;
>
> if(zh==0 || fd==0 ||interest==0 || tv==0)
>
> return ZBADARGUMENTS;
>
> @@ -2185,6 +2187,21 @@ int zookeeper_interest(zhandle_t *zh, so
>
> tv->tv_sec = 0;
>
> tv->tv_usec = 0;
>
>
>
> + if (retry_connect) {
>
> + if (zh->state == ZOO_CONNECTING_STATE) {
>
> +
>
> + LOG_INFO(LOGCALLBACK(zh),
>
> + "Retry connect to zookeeper as zh is in connecting state");
>
> +
>
> + close(zh->fd);
>
> + zh->fd = -1;
>
> + zh->state = ZOO_NOTCONNECTED_STATE;
>
> + zh->reconfig = 1;
>
> + }
>
> +
>
> + retry_connect = 0;
>
> + }
>
> +
>
> if (*fd == -1) {
>
> /*
>
> * If we previously failed to connect to server pool (zh->delay
> == 1)
>
> @@ -2322,6 +2339,21 @@ int zookeeper_interest(zhandle_t *zh, so
>
> // choose the lesser value as the timeout
>
> *tv = get_timeval(min(recv_to, send_to));
>
>
>
> + if (zh->state == ZOO_CONNECTING_STATE) {
>
> +
>
> + retry_connect = 1;
>
> +
>
> + LOG_INFO(LOGCALLBACK(zh),
>
> + "Zookeeper in connecting state, retry in sec %ld usec %ld",
>
> + tv->tv_sec, tv->tv_usec);
>
> +
>
> + *tv = get_timeval(1000);
>
> +
>
> + LOG_INFO(LOGCALLBACK(zh),
>
> + "Retry connect in sec %ld usec %ld instead",
>
> + tv->tv_sec, tv->tv_usec);
>
> + }
>
> +
>
> zh->next_deadline.tv_sec = now.tv_sec + tv->tv_sec;
>
> zh->next_deadline.tv_usec = now.tv_usec + tv->tv_usec;
>
> if (zh->next_deadline.tv_usec > 1000000) {
>