You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@trafficserver.apache.org by "Bryan Call (JIRA)" <ji...@apache.org> on 2016/08/15 20:29:20 UTC

[jira] [Updated] (TS-1545) Possible crash in records stat snap

     [ https://issues.apache.org/jira/browse/TS-1545?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Bryan Call updated TS-1545:
---------------------------
    Summary: Possible crash in records stat snap  (was: possible crash in records stat snap)

> Possible crash in records stat snap
> -----------------------------------
>
>                 Key: TS-1545
>                 URL: https://issues.apache.org/jira/browse/TS-1545
>             Project: Traffic Server
>          Issue Type: Bug
>          Components: Core, Metrics
>    Affects Versions: 3.3.0
>         Environment: debug build, long time regression testing
>            Reporter: Zhao Yongming
>              Labels: Crash
>             Fix For: sometime
>
>         Attachments: records.snap, stats.snap
>
>
> when I running regression testing in the loop script, we come to fail to start the server. well, the codes may handle the issue when we build without --enable-debug, but that looks like we have something may break records snap. open this issue in case someone need it.
> {code}
> [TrafficServer] using root directory '/opt/ats'
> FATAL: RecMessage.cc:426: failed assert `eh->magic == REC_MESSAGE_ELE_MAGIC`
> /opt/ats/bin/traffic_server - STACK TRACE: 
> /opt/ats/lib/libtsutil.so.3(ink_fatal_die+0x0)[0x7ffff7baeca1]
> /opt/ats/lib/libtsutil.so.3(_Z12ink_get_randv+0x0)[0x7ffff7badbb8]
> /opt/ats/bin/traffic_server(_Z23RecMessageUnmarshalNextP13RecMessageHdrP13RecMessageItrPP9RecRecord+0xbf)[0x6ed860]
> /opt/ats/bin/traffic_server(_Z16RecReadStatsFilev+0xc1)[0x6e5e36]
> /opt/ats/bin/traffic_server(_Z11RecCoreInit8RecModeTP5Diags+0xec)[0x6e254b]
> /opt/ats/bin/traffic_server(_Z14RecProcessInit8RecModeTP5Diags+0x3b)[0x6e7769]
> /opt/ats/bin/traffic_server[0x51d4a5]
> /opt/ats/bin/traffic_server(main+0x1df)[0x51ee39]
> /lib64/libc.so.6(__libc_start_main+0xed)[0x7ffff515b60d]
> /opt/ats/bin/traffic_server[0x4d8f99]
> Program received signal SIGABRT, Aborted.
> 0x00007ffff516ec15 in raise () from /lib64/libc.so.6
> (gdb) bt
> #0  0x00007ffff516ec15 in raise () from /lib64/libc.so.6
> #1  0x00007ffff517008b in abort () from /lib64/libc.so.6
> #2  0x00007ffff7baeb2c in ink_die_die_die (retval=1) at ink_error.cc:43
> #3  0x00007ffff7baebfe in ink_fatal_va(int, const char *, typedef __va_list_tag __va_list_tag *) (return_code=1, 
>     message_format=0x7ffff7bca3e0 "%s:%d: failed assert `%s`", ap=0x7fffffffc8b8) at ink_error.cc:65
> #4  0x00007ffff7baeca1 in ink_fatal (return_code=1, message_format=0x7ffff7bca3e0 "%s:%d: failed assert `%s`") at ink_error.cc:73
> #5  0x00007ffff7badbb8 in _ink_assert (expression=0x76ffa0 "eh->magic == REC_MESSAGE_ELE_MAGIC", file=0x76fe40 "RecMessage.cc", line=426)
>     at ink_assert.cc:38
> #6  0x00000000006ed860 in RecMessageUnmarshalNext (msg=0xfe6110, itr=0x7fffffffca00, record=0x7fffffffca10) at RecMessage.cc:426
> #7  0x00000000006e5e36 in RecReadStatsFile () at P_RecCore.i:569
> #8  0x00000000006e254b in RecCoreInit (mode_type=RECM_STAND_ALONE, _diags=0xfe5f70) at RecCore.cc:209
> #9  0x00000000006e7769 in RecProcessInit (mode_type=RECM_STAND_ALONE, _diags=0xfe5f70) at RecProcess.cc:313
> #10 0x000000000051d4a5 in initialize_process_manager () at Main.cc:413
> #11 0x000000000051ee39 in main (argc=1, argv=0x7fffffffdd08) at Main.cc:1409
> (gdb) f 6
> #6  0x00000000006ed860 in RecMessageUnmarshalNext (msg=0xfe6110, itr=0x7fffffffca00, record=0x7fffffffca10) at RecMessage.cc:426
> 426       ink_debug_assert(eh->magic == REC_MESSAGE_ELE_MAGIC);
> (gdb) l
> 421         itr->ele_hdr = (RecMessageEleHdr *) ((char *) (msg) + itr->ele_hdr->o_next);
> 422         itr->next += 1;
> 423         eh = itr->ele_hdr;
> 424       }
> 425
> 426       ink_debug_assert(eh->magic == REC_MESSAGE_ELE_MAGIC);
> 427
> 428       // If the file is corrupt, ignore the the rest of the file.
> 429       if (eh->magic != REC_MESSAGE_ELE_MAGIC) {
> 430         Warning("Persistent statistics file records.stat is corrupted. Ignoring the rest of the file\n");
> (gdb) p eh->magic
> $1 = 0
> (gdb) p REC_MESSAGE_ELE_MAGIC
> $2 = 4027445261
> (gdb) p eh
> $3 = (RecMessageEleHdr *) 0xff4138
> (gdb) p *eh
> $4 = {magic = 0, o_next = 0}
> (gdb) f 7
> #7  0x00000000006e5e36 in RecReadStatsFile () at P_RecCore.i:569
> 569           } while (RecMessageUnmarshalNext(m, &itr, &r) != REC_ERR_FAIL);
> (gdb) l
> 564         if (RecMessageUnmarshalFirst(m, &itr, &r) != REC_ERR_FAIL) {
> 565           do {
> 566             if ((r->name == NULL) || (!strlen(r->name)))
> 567               continue;
> 568             RecSetRecord(r->rec_type, r->name, r->data_type, &(r->data), &(r->stat_meta.data_raw), false);
> 569           } while (RecMessageUnmarshalNext(m, &itr, &r) != REC_ERR_FAIL);
> 570         }
> 571       }
> 572
> 573       ink_rwlock_unlock(&g_records_rwlock);
> (gdb) p r
> $5 = (RecRecord *) 0xff4070
> (gdb) p *r
> $6 = {rec_type = RECT_PROCESS, name = 0xff4118 "", data_type = RECD_INT, data = {rec_int = 0, rec_float = 0, rec_string = 0x0, 
>     rec_counter = 0}, data_default = {rec_int = 0, rec_float = 0, rec_string = 0x0, rec_counter = 0}, lock = {nthread_holding = 1, 
>     thread_holding = 140598138681088, the_mutex = {__data = {__lock = 1, __count = 0, __owner = 19059, __nusers = 1, __kind = 0, 
>         __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, 
>       __size = "\001\000\000\000\000\000\000\000sJ\000\000\001", '\000' <repeats 26 times>, __align = 1}}, sync_required = 1 '\001', 
>   registered = true, {stat_meta = {data_raw = {sum = 0, count = 0, last_sum = 0, last_count = 0}, 
>       sync_cb = 0x6e8059 <RecRawStatSyncHrTimeAvg(char const*, RecDataT, RecData*, RecRawStatBlock*, int)>, sync_rsb = 0x107f640, 
>       sync_id = 5, persist_type = RECP_NULL}, config_meta = {update_required = 0 '\000', update_cb_list = 0x0, update_cookie = 0x0, 
>       update_type = RECU_NULL, check_type = RECC_NULL, 
>       check_expr = 0x6e8059 <RecRawStatSyncHrTimeAvg(char const*, RecDataT, RecData*, RecRawStatBlock*, int)> "UH\211\345H\203\354`H\211}u\304H\211U\270H\211M\260D\211E\300H\213\005\275>0", access_type = (unknown: 17299008)}}, order = 0, rsb_id = 0}
> (gdb) p *itr
> No symbol "operator*" in current context.
> (gdb) p m
> $7 = (RecMessage *) 0xfe6110
> (gdb) p *m
> $8 = {msg_type = RECG_NULL, o_start = 24, o_write = 59760, o_end = 59760, entries = 265, alignment = 0}
> (gdb) 
> {code}



--
This message was sent by Atlassian JIRA
(v6.3.4#6332)