You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hawq.apache.org by "Chunling Wang (JIRA)" <ji...@apache.org> on 2017/09/21 07:22:00 UTC

[jira] [Closed] (HAWQ-568) After query finished, kill a QE but can still recv() data from this QE socket

     [ https://issues.apache.org/jira/browse/HAWQ-568?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Chunling Wang closed HAWQ-568.
------------------------------

> After query finished, kill a QE but can still recv() data from this QE socket
> -----------------------------------------------------------------------------
>
>                 Key: HAWQ-568
>                 URL: https://issues.apache.org/jira/browse/HAWQ-568
>             Project: Apache HAWQ
>          Issue Type: Bug
>          Components: Dispatcher
>    Affects Versions: 2.0.0.0-incubating
>            Reporter: Chunling Wang
>            Assignee: Lili Ma
>             Fix For: 2.0.0.0-incubating
>
>
> After a query finishes, we kill one QE while the other QEs remain in the QE pool. When we check whether the connection to the killed QE is still alive by calling recv() on its socket, recv() can still receive data.
> 1. Run a query so that some QEs remain afterwards.
> {code}
> dispatch=# select count(*) from test_dispatch as t1, test_dispatch as t2, test_dispatch as t3 where t1.id *2 = t2.id and t1.id < t3.id;
>  count
> -------
>   3725
> (1 row)
> {code}
> {code}
> $ ps -ef|grep postgres
>   501 55701     1   0  5:38下午 ??         0:00.38 /usr/local/hawq/bin/postgres -D /Users/wangchunling/hawq-data-directory/masterdd -i -M master -p 5432 --silent-mode=true
>   501 55702 55701   0  5:38下午 ??         0:00.01 postgres: port  5432, master logger process
>   501 55705 55701   0  5:38下午 ??         0:00.00 postgres: port  5432, stats collector process
>   501 55706 55701   0  5:38下午 ??         0:00.04 postgres: port  5432, writer process
>   501 55707 55701   0  5:38下午 ??         0:00.01 postgres: port  5432, checkpoint process
>   501 55708 55701   0  5:38下午 ??         0:00.00 postgres: port  5432, seqserver process
>   501 55709 55701   0  5:38下午 ??         0:00.01 postgres: port  5432, WAL Send Server process
>   501 55710 55701   0  5:38下午 ??         0:00.00 postgres: port  5432, DFS Metadata Cache process
>   501 55711 55701   0  5:38下午 ??         0:00.26 postgres: port  5432, master resource manager
>   501 55727     1   0  5:38下午 ??         0:00.52 /usr/local/hawq/bin/postgres -D /Users/wangchunling/hawq-data-directory/segmentdd -i -M segment -p 40000 --silent-mode=true
>   501 55728 55727   0  5:38下午 ??         0:00.06 postgres: port 40000, logger process
>   501 55731 55727   0  5:38下午 ??         0:00.00 postgres: port 40000, stats collector process
>   501 55732 55727   0  5:38下午 ??         0:00.04 postgres: port 40000, writer process
>   501 55733 55727   0  5:38下午 ??         0:00.01 postgres: port 40000, checkpoint process
>   501 55734 55727   0  5:38下午 ??         0:00.09 postgres: port 40000, segment resource manager
>   501 55741 55748   0  5:38下午 ??         0:00.05 postgres: port  5432, wangchunling dispatch [local] con12 cmd6 idle [local]
>   501 55743 55727   0  5:38下午 ??         0:00.36 postgres: port 40000, wangchunling dispatch 127.0.0.1(50800) con12 seg0 idle
>   501 55770 55727   0  5:43下午 ??         0:00.12 postgres: port 40000, wangchunling dispatch 127.0.0.1(50853) con12 seg0 idle
>   501 55771 55727   0  5:44下午 ??         0:00.11 postgres: port 40000, wangchunling dispatch 127.0.0.1(50855) con12 seg0 idle
>   501 55774 26980   0  5:44下午 ttys008    0:00.00 grep postgres
> {code}
> 2. Kill one QE.
> {code}
> $ kill 55771
> $ ps -ef|grep postgres
>   501 55701     1   0  5:38下午 ??         0:00.38 /usr/local/hawq/bin/postgres -D /Users/wangchunling/hawq-data-directory/masterdd -i -M master -p 5432 --silent-mode=true
>   501 55702 55701   0  5:38下午 ??         0:00.01 postgres: port  5432, master logger process
>   501 55705 55701   0  5:38下午 ??         0:00.00 postgres: port  5432, stats collector process
>   501 55706 55701   0  5:38下午 ??         0:00.04 postgres: port  5432, writer process
>   501 55707 55701   0  5:38下午 ??         0:00.01 postgres: port  5432, checkpoint process
>   501 55708 55701   0  5:38下午 ??         0:00.00 postgres: port  5432, seqserver process
>   501 55709 55701   0  5:38下午 ??         0:00.01 postgres: port  5432, WAL Send Server process
>   501 55710 55701   0  5:38下午 ??         0:00.00 postgres: port  5432, DFS Metadata Cache process
>   501 55711 55701   0  5:38下午 ??         0:00.27 postgres: port  5432, master resource manager
>   501 55727     1   0  5:38下午 ??         0:00.52 /usr/local/hawq/bin/postgres -D /Users/wangchunling/hawq-data-directory/segmentdd -i -M segment -p 40000 --silent-mode=true
>   501 55728 55727   0  5:38下午 ??         0:00.06 postgres: port 40000, logger process
>   501 55731 55727   0  5:38下午 ??         0:00.00 postgres: port 40000, stats collector process
>   501 55732 55727   0  5:38下午 ??         0:00.04 postgres: port 40000, writer process
>   501 55733 55727   0  5:38下午 ??         0:00.01 postgres: port 40000, checkpoint process
>   501 55734 55727   0  5:38下午 ??         0:00.09 postgres: port 40000, segment resource manager
>   501 55741 55748   0  5:38下午 ??         0:00.05 postgres: port  5432, wangchunling dispatch [local] con12 cmd6 idle [local]
>   501 55743 55727   0  5:38下午 ??         0:00.36 postgres: port 40000, wangchunling dispatch 127.0.0.1(50800) con12 seg0 idle
>   501 55770 55727   0  5:43下午 ??         0:00.12 postgres: port 40000, wangchunling dispatch 127.0.0.1(50853) con12 seg0 idle
>   501 55776 26980   0  5:44下午 ttys008    0:00.00 grep postgres
> {code}
> 3. Attach to QD and run query.
> {code}
> dispatch=# select count(*) from test_dispatch as t1, test_dispatch as t2, test_dispatch as t3 where t1.id *2 = t2.id and t1.id < t3.id;
> {code}
> 4. In executormgr_allocate_executor_by_name(), we can get the QE that we have just killed, and dispatch_validate_conn() checks whether it is alive by calling recv() on its socket.
> {code}
> * thread #1: tid = 0x242340, 0x000000010f5f130a postgres`executormgr_allocate_executor_by_name(name=0x00007fd2ea808320, is_writer='\0') + 42 at executormgr.c:707, queue = 'com.apple.main-thread', stop reason = step over
>     frame #0: 0x000000010f5f130a postgres`executormgr_allocate_executor_by_name(name=0x00007fd2ea808320, is_writer='\0') + 42 at executormgr.c:707
>    704 	  // running until finding a valid one or the pool becomes NULL
>    705 	  SegmentDatabaseDescriptor *desc =
>    706 	      poolmgr_get_item_by_name(executor_cache.pool, name);
> -> 707 	  while (desc != NULL && !executormgr_validate_conn(desc->conn)) {
>    708 	    desc = poolmgr_get_item_by_name(executor_cache.pool, name);
>    709 	  }
>    710 	  return desc;
> (lldb) p *desc
> (SegmentDatabaseDescriptor) $11 = {
>   segment = 0x00007fd2e9884e60
>   conn = 0x00007fd2e9701a30
>   errcode = 0
>   error_message = (data = "", len = 0, maxlen = 256)
>   motionListener = -773536088
>   backendPid = 55771
>   whoami = 0x00007fd2e95083d0 "seg0 localhost:40000 pid=55771"
> }
> (lldb) s
> Process 55741 stopped
> * thread #1: tid = 0x242340, 0x000000010f5f1cec postgres`executormgr_validate_conn(conn=0x00007fd2e9701a30) + 12 at executormgr.c:365, queue = 'com.apple.main-thread', stop reason = step in
>     frame #0: 0x000000010f5f1cec postgres`executormgr_validate_conn(conn=0x00007fd2e9701a30) + 12 at executormgr.c:365
>    362 	static bool
>    363 	executormgr_validate_conn(PGconn *conn)
>    364 	{
> -> 365 		if (conn == NULL)
>    366 			return false;
>    367 		if (!dispatch_validate_conn(conn->sock)) {
>    368 			printfPQExpBuffer(&conn->errorMessage,
> (lldb) n
> Process 55741 stopped
> * thread #1: tid = 0x242340, 0x000000010f5f1d03 postgres`executormgr_validate_conn(conn=0x00007fd2e9701a30) + 35 at executormgr.c:367, queue = 'com.apple.main-thread', stop reason = step over
>     frame #0: 0x000000010f5f1d03 postgres`executormgr_validate_conn(conn=0x00007fd2e9701a30) + 35 at executormgr.c:367
>    364 	{
>    365 		if (conn == NULL)
>    366 			return false;
> -> 367 		if (!dispatch_validate_conn(conn->sock)) {
>    368 			printfPQExpBuffer(&conn->errorMessage,
>    369 					libpq_gettext(
>    370 							"server closed the connection unexpectedly\n"
> (lldb) s
> Process 55741 stopped
> * thread #1: tid = 0x242340, 0x000000010f5ec2cb postgres`dispatch_validate_conn(sock=61) + 11 at dispatcher.c:1830, queue = 'com.apple.main-thread', stop reason = step in
>     frame #0: 0x000000010f5ec2cb postgres`dispatch_validate_conn(sock=61) + 11 at dispatcher.c:1830
>    1827	  ssize_t   ret;
>    1828	  char    buf;
>    1829
> -> 1830	  if (sock < 0)
>    1831	    return false;
>    1832
>    1833	#ifndef WIN32
> (lldb) p sock
> (pgsocket) $12 = 61
> (lldb) n
> Process 55741 stopped
> * thread #1: tid = 0x242340, 0x000000010f5ec2f1 postgres`dispatch_validate_conn(sock=61) + 49 at dispatcher.c:1834, queue = 'com.apple.main-thread', stop reason = step over
>     frame #0: 0x000000010f5ec2f1 postgres`dispatch_validate_conn(sock=61) + 49 at dispatcher.c:1834
>    1831	    return false;
>    1832
>    1833	#ifndef WIN32
> -> 1834	    ret = recv(sock, &buf, 1, MSG_PEEK|MSG_DONTWAIT);
>    1835	#else
>    1836	    ret = recv(sock, &buf, 1, MSG_PEEK|MSG_PARTIAL);
>    1837	#endif
> (lldb)
> Process 55741 stopped
> * thread #1: tid = 0x242340, 0x000000010f5ec2fd postgres`dispatch_validate_conn(sock=61) + 61 at dispatcher.c:1839, queue = 'com.apple.main-thread', stop reason = step over
>     frame #0: 0x000000010f5ec2fd postgres`dispatch_validate_conn(sock=61) + 61 at dispatcher.c:1839
>    1836	    ret = recv(sock, &buf, 1, MSG_PEEK|MSG_PARTIAL);
>    1837	#endif
>    1838
> -> 1839	  if (ret == 0) /* socket has been closed. EOF */
>    1840	    return false;
>    1841
>    1842	  if (ret > 0) /* data waiting on socket, it must be OK. */
> (lldb) p ret
> (ssize_t) $13 = 1
> {code}
> So the result of this query is:
> {code}
> dispatch=# select count(*) from test_dispatch as t1, test_dispatch as t2, test_dispatch as t3 where t1.id *2 = t2.id and t1.id < t3.id;
> ERROR:  terminating connection due to administrator command  (seg0 localhost:40000 pid=55771)
> {code}



--
This message was sent by Atlassian JIRA
(v6.4.14#64029)