You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@hawq.apache.org by "Ming LI (JIRA)" <ji...@apache.org> on 2017/02/13 07:02:41 UTC

[jira] [Created] (HAWQ-1324) Query cancel cause segment to go into Crash recovery

Ming LI created HAWQ-1324:
-----------------------------

             Summary: Query cancel cause segment to go into Crash recovery
                 Key: HAWQ-1324
                 URL: https://issues.apache.org/jira/browse/HAWQ-1324
             Project: Apache HAWQ
          Issue Type: Bug
            Reporter: Ming LI
            Assignee: Ed Espino


A query was cancelled due to this connection issue to HDFS on Isilon. Seg26 then went into crash recovery due to a INSERT query being cancelled. What should be the expected behaviour when HDFS becomes unavailable and a Query fails due to HDFS unavailability.
Below is the HDFS error
{code}
2017-01-04 03:04:08.382615 JST,"carund","dwhrun",p574246,th1862944896,"192.168.10.12","47554",2017-01-04 03:03:08 JST,0,con198952,,seg29,,,,,"FATAL","08006","connection to client lost",,,,,,,0,,"postgres.c",3518,
2017-01-04 03:04:08.420099 JST,,,p755778,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","3rd party error log:
2017-01-04 03:04:08.419969, p574222, th140507423066240, ERROR Handle Exception: NamenodeImpl.cpp: 670: Unexpected error: status: STATUS_FILE_NOT_AVAILABLE = 0xC0000467 Path: hawq_default/16385/16563/802748/26 with path=
""/hawq_default/16385/16563/802748/26"", clientname=libhdfs3_client_random_866998528_count_1_pid_574222_tid_140507423066240
@ Hdfs::Internal::UnWrapper<Hdfs::UnresolvedLinkException, Hdfs::HdfsIOException, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing , Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing>::unwrap(char const, int)
@ Hdfs::Internal::UnWrapper<Hdfs::FileNotFoundException, Hdfs::UnresolvedLinkException, Hdfs::HdfsIOException, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Not hing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing, Hdfs::Internal::Nothing>::unwrap(char const, int)
@ Hdfs::Internal::NamenodeImpl::fsync(std::string const&, std::string const&)
@ Hdfs::Internal::NamenodeProxy::fsync(std::string const&, std::string const&)
@ Hdfs::Internal::OutputStreamImpl::closePipeline()
@ Hdfs::Internal::OutputStreamImpl::close()
@ hdfsCloseFile
@ gpfs_hdfs_closefile
@ HdfsCloseFile
@ HdfsFileClose
@ CleanupTempFiles
@ AbortTransaction
@ AbortCurrentTransaction
@ PostgresMain
@ BackendStartup
@ ServerLoop
@ PostmasterMain
@ main
@ Unknown
@ Unknown",,,,,,,,"SysLoggerMain","syslogger.c",518,
2017-01-04 03:04:08.420272 JST,"carund","dwhrun",p574222,th1862944896,"192.168.10.12","47550",2017-01-04 03:03:08 JST,40678725,con198952,cmd4,seg25,,,x40678725,sx1,"WARNING","58030","could not close file 7 : (hdfs://ffd
lakehd.ffwin.fujifilm.co.jp:8020/hawq_default/16385/16563/802748/26) errno 5","Unexpected error: status: STATUS_FILE_NOT_AVAILABLE = 0xC0000467 Path: hawq_default/16385/16563/802748/26 with path=""/hawq_default/16385/16
563/802748/26"", clientname=libhdfs3_client_random_866998528_count_1_pid_574222_tid_140507423066240",,,,,,0,,"fd.c",2762,
{code}
Segment 26 going into Crash recovery - from seg26 log file
{code}
2017-01-04 03:04:08.420314 JST,"carund","dwhrun",p574222,th1862944896,"192.168.10.12","47550",2017-01-04 03:03:08 JST,40678725,con198952,cmd4,seg25,,,x40678725,sx1,"LOG","08006","could not send data to client: 接続が相
手からリセットされました",,,,,,,0,,"pqcomm.c",1292,
2017-01-04 03:04:08.420358 JST,"carund","dwhrun",p574222,th1862944896,"192.168.10.12","47550",2017-01-04 03:03:08 JST,0,con198952,,seg25,,,,,"LOG","08006","could not send data to client: パイプが切断されました",,,,,,,0,
,"pqcomm.c",1292,
2017-01-04 03:04:08.420375 JST,"carund","dwhrun",p574222,th1862944896,"192.168.10.12","47550",2017-01-04 03:03:08 JST,0,con198952,,seg25,,,,,"FATAL","08006","connection to client lost",,,,,,,0,,"postgres.c",3518,
2017-01-04 03:04:08.950354 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","server process (PID 574240) was terminated by signal 11: Segmentation fault",,,,,,,0,,"postmaster.c",4748,
2017-01-04 03:04:08.950403 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","terminating any other active server processes",,,,,,,0,,"postmaster.c",4486,
2017-01-04 03:04:08.954044 JST,,,p41605,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","Segment RM exits.",,,,,,,0,,"resourcemanager.c",340,
2017-01-04 03:04:08.954078 JST,,,p41605,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","Clean up handler in message server is called.",,,,,,,0,,"rmcomm_MessageServer.c",105,
2017-01-04 03:04:08.972706 JST,,,p574711,th1862944896,"192.168.10.12","48121",2017-01-04 03:04:08 JST,0,,,seg-10000,,,,,"LOG","00000","PID 574308 in cancel request did not match any process",,,,,,,0,,"postmaster.c",3166
,
2017-01-04 03:04:08.976211 JST,,,p574712,th1862944896,"192.168.10.12","48127",2017-01-04 03:04:08 JST,0,,,seg-10000,,,,,"LOG","00000","PID 574320 in cancel request did not match any process",,,,,,,0,,"postmaster.c",3166,
2017-01-04 03:04:08.979680 JST,,,p574713,th1862944896,"192.168.10.12","48133",2017-01-04 03:04:08 JST,0,,,seg-10000,,,,,"LOG","00000","PID 574326 in cancel request did not match any process",,,,,,,0,,"postmaster.c",3166,
2017-01-04 03:04:08.998035 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","BeginResetOfPostmasterAfterChildrenAreShutDown: counter 1",,,,,,,0,,"postmaster.c",1902,
2017-01-04 03:04:08.998062 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","gp_session_id high-water mark is 565729",,,,,,,0,,"postmaster.c",1926,
2017-01-04 03:04:09.076801 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","resetting shared memory",,,,,,,0,,"postmaster.c",3334,
2017-01-04 03:04:09.076835 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PersistentFilespace_ShmemSize: 69192 = gp_max_filespaces: 8 sizeof(FilespaceDirEntryData): 1048 + PersistentFilespace_SharedDataSize(): 80",,,,,,,0,,"cdbpersistentfilespace.c",1144,
2017-01-04 03:04:09.076844 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PersistentTablespace_ShmemSize: 6304 = gp_max_tablespaces: 16 sizeof(TablespaceDirEntryData): 32 + PersistentTablespace_SharedDataSize(): 80",,,,,,,0,,"cdbpersistenttablespace.c",1192,
2017-01-04 03:04:09.076851 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PersistentDatabase_ShmemSize: 15984 = PersistentDatabase_SharedDataSize(): 15984 = PersistentDatabaseSharedData: 80 + MaxPersistentDatabaseDirectories: 256 (db: 16 ts: 16) sizeof(DatabaseDirEntryData): 56",,,,,,,0,,"cdbpersistentdatabase.c",1477,
2017-01-04 03:04:09.076868 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PersistentRelation_ShmemSize: 3673504 = gp_max_relations: 65536 * sizeof(RelationDirEntryData): 32 + PersistentRelation_SharedDataSize(): 80",,,,,,,0,,"cdbpersistentrelation.c",454,
2017-01-04 03:04:09.076876 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","Metadata Cache Share Memory Size : 155720180",,,,,,,0,,"ipci.c",175,
2017-01-04 03:04:09.077068 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","temporary files using default location",,,,,,,0,,"primary_mirror_mode.c",282,
2017-01-04 03:04:09.077081 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","transaction files using default pg_system filespace",,,,,,,0,,"primary_mirror_mode.c",1133,
2017-01-04 03:04:09.255557 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","[MetadataCache] Metadata cache initialize successfully. block_capacity:2097152",,,,,,,0,,"cdbmetadatacache.c",248,
2017-01-04 03:04:09.266262 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PrimaryMirrorMode: Processing postmaster reset with recent mode of 3",,,,,,,0,,"primary_mirror_mode.c",877,
2017-01-04 03:04:09.266286 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","PrimaryMirrorMode: Processing postmaster reset to non-fault state",,,,,,,0,,"primary_mirror_mode.c",885,
2017-01-04 03:04:09.266293 JST,,,p755773,th1862944896,,,,0,,,seg-10000,,,,,"LOG","00000","removing all temporary files",,,,,,,0,,"fd.c",2194,
{code}
There was a core file generated at the time of the Crash recovery. Below is output from the core file
{code}
---from the core file on segment for PID 574240
#0 0x0000003e7780f867 in ?? () from /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libgcc_s.so.1
#1 0x0000003e77810119 in _Unwind_Backtrace () from /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libgcc_s.so.1
#2 0x0000003e75cfebf6 in backtrace () from /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libc.so.6
#3 0x000000000086e01e in gp_backtrace (processName=<value optimized out>, postgres_signal_arg=11) at elog.c:4374
#4 SafeHandlerForSegvBusIll (processName=<value optimized out>, postgres_signal_arg=11) at elog.c:4510
#5 <signal handler called>
#6 0x0000003e7780f867 in ?? () from /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libgcc_s.so.1
#7 0x0000003e77810119 in _Unwind_Backtrace () from /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libgcc_s.so.1
#8 0x0000003e75cfebf6 in backtrace () from /data/logs/46429/packcore-core.f418p206.574240.postgres.1483466648/lib64/libc.so.6
#9 0x000000000087212f in errstart (elevel=15, filename=0xb29fa8 "postgres.c", lineno=3523, funcname=0xb25d00 "ProcessInterrupts", domain=<value optimized out>) at elog.c:492
#10 0x0000000000872819 in elog_finish (elevel=15, fmt=0xb28710 "Process interrupt for 'query cancel pending'.") at elog.c:1443
#11 0x00000000007abfed in ProcessInterrupts () at postgres.c:3523
#12 0x00000000007ac3c6 in StatementCancelHandler (postgres_signal_arg=<value optimized out>) at postgres.c:3376
#13 <signal handler called>
#14 0x00000000004cbfab in varattrib_untoast_ptr_len (d=33352720, datastart=<value optimized out>, len=<value optimized out>, tofree=<value optimized out>) at tuptoaster.c:322
#15 0x0000000000000002 in ?? () at /opt/gcc-4.4.2/lib/gcc/x86_64-unknown-linux-gnu/4.4.2/../../../../include/c++/4.4.2/bits/vector.tcc:372
#16 0x00007fff0ff8c2b0 in ?? ()
#17 0x000000000065f65d in ExecMakeFunctionResultNoSets (fcache=0x50ff8c5c2, econtext=0x1fcec14, isNull=<value optimized out>, isDone=<value optimized out>) at execQual.c:1831
#18 0x0000000000664d4a in ExecTargetList (projInfo=<value optimized out>, isDone=<value optimized out>) at execQual.c:5467
#19 ExecProject (projInfo=<value optimized out>, isDone=<value optimized out>) at execQual.c:5653
#20 0x0000000000685641 in ExecResult (node=0x1f95ae0) at nodeResult.c:222
#21 0x000000000065e0b4 in ExecProcNode (node=0x1f95ae0) at execProcnode.c:891
#22 0x00000000006855c5 in NextInputSlot (node=0x1f671e0) at nodeResult.c:95
#23 ExecResult (node=0x1f671e0) at nodeResult.c:194
#24 0x000000000065e0b4 in ExecProcNode (node=0x1f671e0) at execProcnode.c:891
#25 0x000000000069d4a2 in ExecDML (node=0x1f62f08) at nodeDML.c:65
#26 0x000000000065e295 in ExecProcNode (node=0x1f62f08) at execProcnode.c:1014
#27 0x000000000065a05a in ExecutePlan (estate=0x1f62bb0, planstate=<value optimized out>, operation=<value optimized out>, numberTuples=<value optimized out>, direction=<value optimized out>,
dest=<value optimized out>) at execMain.c:3231
#28 0x000000000065a9e3 in ExecutorRun (queryDesc=<value optimized out>, direction=<value optimized out>, count=<value optimized out>) at execMain.c:1213
#29 0x00000000007b576a in ProcessQuery (portal=<value optimized out>, stmt=0x1f31110, params=<value optimized out>, dest=0x7fca48141ec8, completionTag=0x7fff0ff8ced0 "") at pquery.c:324
#30 0x00000000007b5e0f in PortalRunMulti (portal=0x1f09610, isTopLevel=1 '\001', dest=0x7fca48141ec8, altdest=<value optimized out>, completionTag=0x7fff0ff8ced0 "") at pquery.c:2051
#31 0x00000000007b63a2 in PortalRun (portal=<value optimized out>, count=<value optimized out>, isTopLevel=16 '\020', dest=<value optimized out>, altdest=<value optimized out>, completionTag=<value optimized out>)
at pquery.c:1596
#32 0x00000000007b2e61 in exec_mpp_query (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:1414
#33 PostgresMain (argc=<value optimized out>, argv=<value optimized out>, username=<value optimized out>) at postgres.c:4945
#34 0x0000000000763f23 in BackendRun (port=0x1df4210) at postmaster.c:5889
#35 BackendStartup (port=0x1df4210) at postmaster.c:5484
#36 0x000000000076467d in ServerLoop () at postmaster.c:2163
#37 0x00000000007664ae in PostmasterMain (argc=9, argv=0x1deb5b0) at postmaster.c:1454
#38 0x00000000006c072a in main (argc=9, argv=0x1deb570) at main.c:226
{code}
From theMaster log files showing connection issues with server f418p206.f418.local where seg26 is located
{code}
Dispatcher thinks f418p206.f418.local is down.",,,,,,"INSERT INTO
carun_db.CATS_CUSTM_SER1_GROUP_CR / CATS_CUSTM_SER1_GROUP_CR /
SELECT
A.cd_custm,
A.cd_itm_ser_1_group,
A.name_itm_ser_1_group,
MAX(A.ctg_sex) AS ctg_sex,
MAX(CASE
WHEN A.dt_frst_ord_cmprd_ser_1_g IS NULL
THEN '9'
WHEN A.dt_frst_ord_trial_ser_1_g IS NOT NULL AND A.dt_frst_ord_cmprd_ser_1_g IS NOT NULL
AND A.dt_frst_ord_trial_ser_1_g < A.dt_frst_ord_cmprd_ser_1_g
THEN '2'
WHEN A.dt_frst_ord_smpl_ser_1_g IS NOT NULL AND A.dt_frst_ord_cmprd_ser_1_g IS NOT NULL
AND A.dt_frst_ord_smpl_ser_1_g < A.dt_frst_ord_cmprd_ser_1_g
THEN '1'
ELSE '6'
END) AS ctg_trial_drct,
MIN(A.flg_dm_allow_now) AS flg_dm_allow_now,
MIN(A.flg_em_allow_now) AS flg_em_allow_now,
MIN(A.flg_tel_allow_now) AS flg_tel_allow_now,
MIN(A.flg_ctel_allow_now) AS flg_ctel_allow_now,
MAX(A.ctg_prdc_tr_now_ser_1_g) AS ctg_prdc_tr_now_ser_1_g,
MAX(CASE
WHEN A.ctg_prdc_tr_now_ser_1_g IN ('0','1') THEN '02'
ELSE '01'
2017-01-04 03:04:09.475256 JST,"carund","dwhrun",p551568,th-1936484224,"192.168.10.14","3885",2017-01-04 03:03:05 JST,968689,con198950,cmd4,seg-1,,,x968689,sx1,"LOG","00000","Dispatcher sends 3 failed host(s) to resource manager.",,,,,,"INSERT INTO
carun_db.CATS_CUSTM_SER1_GROUP_CR / CATS_CUSTM_SER1_GROUP_CR /
2017-01-04 03:04:09.475828 JST,,,p214748,th-1936484224,,,,0,con152207,,seg10000,,,,,"LOG","00000","Resource manager does not probe the status of host f418p206.f418.local because it is in RUAlive pending status already.",,,,,,,0,,"requesthandler.c",1059,
2017-01-04 03:04:09.475835 JST,,,p214748,th-1936484224,,,,0,con152207,,seg-10000,,,,,"LOG","00000","Resource manager does not probe the status of host f418p206.f418.local because it is in RUAlive pending status already.",,,,,,,0,,"requesthandler.c",1059,
2017-01-04 03:04:09.475851 JST,"carund","dwhrun",p551568,th-1936484224,"192.168.10.14","3885",2017-01-04 03:03:05 JST,968689,con198950,cmd4,seg-1,,,x968689,sx1,"LOG","00000","Succeed in sending failed host to resource manager.",,,,,,"INSERT INTO
carun_db.CATS_CUSTM_SER1_GROUP_CR / CATS_CUSTM_SER1_GROUP_CR /
SELECT
A.cd_custm,
A.cd_itm_ser_1_group,
2017-01-04 03:04:09.479952 JST,,,p214748,th-1936484224,,,,0,con152207,,seg-10000,,,,,"LOG","00000","Master RM finds segment:f418p206.f418.local 's RM process has restarted. old status:1",,,,,,,0,,"resourcepool.c",1122,
2017-01-04 03:04:09.479971 JST,,,p214748,th-1936484224,,,,0,con152207,,seg-10000,,,,,"LOG","00000","Host f418p206.f418.local is set availability 2. Cluster currently has 11 available nodes.",,,,,,,0,,"resourcepool.c",1833,
2017-01-04 03:04:09.480841 JST,"gpadmin","template1",p551911,th-1936484224,"[local]",,2017-01-04 03:04:09 JST,968702,,cmd1,seg-10000,,,x968702,sx1,"LOG","00000","statement: BEGIN",,,,,,"BEGIN",0,,"postgres.c",1575,
2017-01-04 03:04:09.480943 JST,"gpadmin","template1",p551911,th-1936484224,"[local]",,2017-01-04 03:04:09 JST,968702,,cmd1,seg-10000,,,x968702,sx1,"LOG","00000","duration: 0.190 ms",,,,,,"BEGIN",0,,"postgres.c",1834,
2017-01-04 03:04:09.481069 JST,"gpadmin","template1",p551911,th-1936484224,"[local]",,2017-01-04 03:04:09 JST,968702,,cmd2,seg-10000,,,x968702,sx1,"LOG","00000","statement: UPDATE gp_segment_configuration SET status='d', description='resource manager process was reset' WHERE registration_order=2",,,,,,"UPDATE gp_segment_configuration SET status='d', description='resource manager process was reset' WHERE registration_order=2",0,,"postgres.c",1575,
{code}
All the log files and core file are located on the scripts server:
gpadmin@10.65.109.130
passwd: changeme
cdsr 46429
Go to HDB_log_files_2017-01-04_134818/HDB_info to for HAWQ settings.
Go to HDB_log_files_2017-01-04_134818/logs/<hostname> for segment log files
f418p101 is HAWQ master.
f418p102 is HAWQ standby master.
f418p206 is HAWQ server hosting seg26 that had the crash recovery



--
This message was sent by Atlassian JIRA
(v6.3.15#6346)