You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hawq.apache.org by ml...@apache.org on 2017/02/24 02:09:12 UTC

incubator-hawq git commit: HAWQ-1342. Fixed QE process hang in shared input scan on segment node

Repository: incubator-hawq
Updated Branches:
  refs/heads/master b65d7561f -> 21d78d37a


HAWQ-1342. Fixed QE process hang in shared input scan on segment node

The basic idea for fixing this kind of hang is as follows:
(1) The segment that throws the error rolls back the whole transaction, and all related fds are closed when the transaction ends.
(2) The other segments behave as before: while waiting in select(), each one keeps looping until the specific fd is closed; the code then continues to run until it processes interrupts again somewhere else afterward (the transaction rollback sends a cancel signal).

So some previous fixes (HAWQ-166, HAWQ-1282) are changed accordingly.

Signed-off-by: Amy Bai <ab...@pivotal.io>


Project: http://git-wip-us.apache.org/repos/asf/incubator-hawq/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-hawq/commit/21d78d37
Tree: http://git-wip-us.apache.org/repos/asf/incubator-hawq/tree/21d78d37
Diff: http://git-wip-us.apache.org/repos/asf/incubator-hawq/diff/21d78d37

Branch: refs/heads/master
Commit: 21d78d37acbf009c138205c5aa1e94dd52ac4e8f
Parents: b65d756
Author: Ming Li <ml...@apache.org>
Authored: Thu Feb 23 13:22:56 2017 +0800
Committer: Ming LI <ml...@apache.org>
Committed: Fri Feb 24 10:08:22 2017 +0800

----------------------------------------------------------------------
 src/backend/executor/nodeShareInputScan.c | 49 +++++++++++---------------
 1 file changed, 21 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-hawq/blob/21d78d37/src/backend/executor/nodeShareInputScan.c
----------------------------------------------------------------------
diff --git a/src/backend/executor/nodeShareInputScan.c b/src/backend/executor/nodeShareInputScan.c
index 0f08848..74dbcb5 100644
--- a/src/backend/executor/nodeShareInputScan.c
+++ b/src/backend/executor/nodeShareInputScan.c
@@ -40,7 +40,6 @@
 
 #include "postgres.h"
 
-#include "access/xact.h"
 #include "cdb/cdbvars.h"
 #include "executor/executor.h"
 #include "executor/nodeShareInputScan.h"
@@ -641,10 +640,6 @@ read_retry:
 		goto read_retry;
 	else
 	{
-		if(fd >= 0)
-		{
-			gp_retry_close(fd);
-		}
 		elog(ERROR, "could not read from fifo: %m");
 	}
 	Assert(!"Never be here");
@@ -664,10 +659,6 @@ write_retry:
 		goto write_retry;
 	else
 	{
-		if(fd >= 0)
-		{
-			gp_retry_close(fd);
-		}
 		elog(ERROR, "could not write to fifo: %m");
 	}
 
@@ -794,7 +785,14 @@ shareinput_reader_waitready(int share_id, PlanGenerator planGen)
 		{
 			int save_errno = errno;
 			elog(LOG, "SISC READER (shareid=%d, slice=%d): Wait ready try again, errno %d ... ",
-					share_id, currentSliceId, save_errno);
+								share_id, currentSliceId, save_errno);
+			if(save_errno == EBADF)
+			{
+				/* The file description is invalid, maybe this FD has been already closed by writer in some cases
+				 * we need to break here to avoid endless loop and continue to run CHECK_FOR_INTERRUPTS.
+				 */
+				break;
+			}
 		}
 	}
 	return (void *) pctxt;
@@ -925,9 +923,12 @@ writer_wait_for_acks(ShareInput_Lk_Context *pctxt, int share_id, int xslice)
 			int save_errno = errno;
 			elog(LOG, "SISC WRITER (shareid=%d, slice=%d): notify still wait for an answer, errno %d",
 					share_id, currentSliceId, save_errno);
-			/*if error(except EINTR) happens in select, we just return to avoid endless loop*/
-			if(errno != EINTR){
-				return;
+			if(save_errno == EBADF)
+			{
+				/* The file description is invalid, maybe this FD has been already closed by writer in some cases
+				 * we need to break here to avoid endless loop and continue to run CHECK_FOR_INTERRUPTS.
+				 */
+				break;
 			}
 		}
 	}
@@ -979,21 +980,6 @@ shareinput_writer_waitdone(void *ctxt, int share_id, int nsharer_xslice)
 	while(ack_needed > 0)
 	{
 		CHECK_FOR_INTERRUPTS();
-
-		/*
-		 * Writer won't wait for data reading done notification from readers if transaction is
-		 * aborting. Readers may fail to send data reading done notification to writer in two
-		 * cases:
-		 *
-		 *    1. The transaction is aborted due to interrupts or exceptions, i.e., user cancels
-		 *       query, division by zero on some segment
-		 *
-		 *    2. Logic errors in reader which incur its unexpected exit, i.e., segmentation fault
-		 */
-		if (IsAbortInProgress())
-		{
-			break;
-		}
 	
 		MPP_FD_ZERO(&rset);
 		MPP_FD_SET(pctxt->donefd, &rset);
@@ -1024,6 +1010,13 @@ shareinput_writer_waitdone(void *ctxt, int share_id, int nsharer_xslice)
 			int save_errno = errno;
 			elog(LOG, "SISC WRITER (shareid=%d, slice=%d): wait done time out once, errno %d",
 					share_id, currentSliceId, save_errno);
+			if(save_errno == EBADF)
+			{
+				/* The file description is invalid, maybe this FD has been already closed by writer in some cases
+				 * we need to break here to avoid endless loop and continue to run CHECK_FOR_INTERRUPTS.
+				 */
+				break;
+			}
 		}
 	}