You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@trafficserver.apache.org by "Zhao Yongming (JIRA)" <ji...@apache.org> on 2012/10/22 15:16:12 UTC

[jira] [Created] (TS-1545) possible crash in records stat snap

Zhao Yongming created TS-1545:
---------------------------------

             Summary: possible crash in records stat snap
                 Key: TS-1545
                 URL: https://issues.apache.org/jira/browse/TS-1545
             Project: Traffic Server
          Issue Type: Bug
          Components: Core, Stats
    Affects Versions: 3.3.0
         Environment: debug build, long time regression testing
            Reporter: Zhao Yongming
             Fix For: 3.3.2


when I running regression testing in the loop script, we come to fail to start the server. well, the codes may handle the issue when we build without --enable-debug, but that looks like we have something may break records snap. open this issue in case someone need it.

{code}
[TrafficServer] using root directory '/opt/ats'
FATAL: RecMessage.cc:426: failed assert `eh->magic == REC_MESSAGE_ELE_MAGIC`
/opt/ats/bin/traffic_server - STACK TRACE: 
/opt/ats/lib/libtsutil.so.3(ink_fatal_die+0x0)[0x7ffff7baeca1]
/opt/ats/lib/libtsutil.so.3(_Z12ink_get_randv+0x0)[0x7ffff7badbb8]
/opt/ats/bin/traffic_server(_Z23RecMessageUnmarshalNextP13RecMessageHdrP13RecMessageItrPP9RecRecord+0xbf)[0x6ed860]
/opt/ats/bin/traffic_server(_Z16RecReadStatsFilev+0xc1)[0x6e5e36]
/opt/ats/bin/traffic_server(_Z11RecCoreInit8RecModeTP5Diags+0xec)[0x6e254b]
/opt/ats/bin/traffic_server(_Z14RecProcessInit8RecModeTP5Diags+0x3b)[0x6e7769]
/opt/ats/bin/traffic_server[0x51d4a5]
/opt/ats/bin/traffic_server(main+0x1df)[0x51ee39]
/lib64/libc.so.6(__libc_start_main+0xed)[0x7ffff515b60d]
/opt/ats/bin/traffic_server[0x4d8f99]

Program received signal SIGABRT, Aborted.
0x00007ffff516ec15 in raise () from /lib64/libc.so.6
(gdb) bt
#0  0x00007ffff516ec15 in raise () from /lib64/libc.so.6
#1  0x00007ffff517008b in abort () from /lib64/libc.so.6
#2  0x00007ffff7baeb2c in ink_die_die_die (retval=1) at ink_error.cc:43
#3  0x00007ffff7baebfe in ink_fatal_va(int, const char *, typedef __va_list_tag __va_list_tag *) (return_code=1, 
    message_format=0x7ffff7bca3e0 "%s:%d: failed assert `%s`", ap=0x7fffffffc8b8) at ink_error.cc:65
#4  0x00007ffff7baeca1 in ink_fatal (return_code=1, message_format=0x7ffff7bca3e0 "%s:%d: failed assert `%s`") at ink_error.cc:73
#5  0x00007ffff7badbb8 in _ink_assert (expression=0x76ffa0 "eh->magic == REC_MESSAGE_ELE_MAGIC", file=0x76fe40 "RecMessage.cc", line=426)
    at ink_assert.cc:38
#6  0x00000000006ed860 in RecMessageUnmarshalNext (msg=0xfe6110, itr=0x7fffffffca00, record=0x7fffffffca10) at RecMessage.cc:426
#7  0x00000000006e5e36 in RecReadStatsFile () at P_RecCore.i:569
#8  0x00000000006e254b in RecCoreInit (mode_type=RECM_STAND_ALONE, _diags=0xfe5f70) at RecCore.cc:209
#9  0x00000000006e7769 in RecProcessInit (mode_type=RECM_STAND_ALONE, _diags=0xfe5f70) at RecProcess.cc:313
#10 0x000000000051d4a5 in initialize_process_manager () at Main.cc:413
#11 0x000000000051ee39 in main (argc=1, argv=0x7fffffffdd08) at Main.cc:1409
(gdb) f 6
#6  0x00000000006ed860 in RecMessageUnmarshalNext (msg=0xfe6110, itr=0x7fffffffca00, record=0x7fffffffca10) at RecMessage.cc:426
426       ink_debug_assert(eh->magic == REC_MESSAGE_ELE_MAGIC);
(gdb) l
421         itr->ele_hdr = (RecMessageEleHdr *) ((char *) (msg) + itr->ele_hdr->o_next);
422         itr->next += 1;
423         eh = itr->ele_hdr;
424       }
425
426       ink_debug_assert(eh->magic == REC_MESSAGE_ELE_MAGIC);
427
428       // If the file is corrupt, ignore the the rest of the file.
429       if (eh->magic != REC_MESSAGE_ELE_MAGIC) {
430         Warning("Persistent statistics file records.stat is corrupted. Ignoring the rest of the file\n");
(gdb) p eh->magic
$1 = 0
(gdb) p REC_MESSAGE_ELE_MAGIC
$2 = 4027445261
(gdb) p eh
$3 = (RecMessageEleHdr *) 0xff4138
(gdb) p *eh
$4 = {magic = 0, o_next = 0}
(gdb) f 7
#7  0x00000000006e5e36 in RecReadStatsFile () at P_RecCore.i:569
569           } while (RecMessageUnmarshalNext(m, &itr, &r) != REC_ERR_FAIL);
(gdb) l
564         if (RecMessageUnmarshalFirst(m, &itr, &r) != REC_ERR_FAIL) {
565           do {
566             if ((r->name == NULL) || (!strlen(r->name)))
567               continue;
568             RecSetRecord(r->rec_type, r->name, r->data_type, &(r->data), &(r->stat_meta.data_raw), false);
569           } while (RecMessageUnmarshalNext(m, &itr, &r) != REC_ERR_FAIL);
570         }
571       }
572
573       ink_rwlock_unlock(&g_records_rwlock);
(gdb) p r
$5 = (RecRecord *) 0xff4070
(gdb) p *r
$6 = {rec_type = RECT_PROCESS, name = 0xff4118 "", data_type = RECD_INT, data = {rec_int = 0, rec_float = 0, rec_string = 0x0, 
    rec_counter = 0}, data_default = {rec_int = 0, rec_float = 0, rec_string = 0x0, rec_counter = 0}, lock = {nthread_holding = 1, 
    thread_holding = 140598138681088, the_mutex = {__data = {__lock = 1, __count = 0, __owner = 19059, __nusers = 1, __kind = 0, 
        __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, 
      __size = "\001\000\000\000\000\000\000\000sJ\000\000\001", '\000' <repeats 26 times>, __align = 1}}, sync_required = 1 '\001', 
  registered = true, {stat_meta = {data_raw = {sum = 0, count = 0, last_sum = 0, last_count = 0}, 
      sync_cb = 0x6e8059 <RecRawStatSyncHrTimeAvg(char const*, RecDataT, RecData*, RecRawStatBlock*, int)>, sync_rsb = 0x107f640, 
      sync_id = 5, persist_type = RECP_NULL}, config_meta = {update_required = 0 '\000', update_cb_list = 0x0, update_cookie = 0x0, 
      update_type = RECU_NULL, check_type = RECC_NULL, 
      check_expr = 0x6e8059 <RecRawStatSyncHrTimeAvg(char const*, RecDataT, RecData*, RecRawStatBlock*, int)> "UH\211\345H\203\354`H\211}u\304H\211U\270H\211M\260D\211E\300H\213\005\275>0", access_type = (unknown: 17299008)}}, order = 0, rsb_id = 0}
(gdb) p *itr
No symbol "operator*" in current context.
(gdb) p m
$7 = (RecMessage *) 0xfe6110
(gdb) p *m
$8 = {msg_type = RECG_NULL, o_start = 24, o_write = 59760, o_end = 59760, entries = 265, alignment = 0}
(gdb) 
{code}

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira

[jira] [Updated] (TS-1545) possible crash in records stat snap

Posted by "Zhao Yongming (JIRA)" <ji...@apache.org>.
     [ https://issues.apache.org/jira/browse/TS-1545?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Zhao Yongming updated TS-1545:
------------------------------

    Attachment: stats.snap
                records.snap

the snap file which is crashing the server
                
> possible crash in records stat snap
> -----------------------------------
>
>                 Key: TS-1545
>                 URL: https://issues.apache.org/jira/browse/TS-1545
>             Project: Traffic Server
>          Issue Type: Bug
>          Components: Core, Stats
>    Affects Versions: 3.3.0
>         Environment: debug build, long time regression testing
>            Reporter: Zhao Yongming
>             Fix For: 3.3.2
>
>         Attachments: records.snap, stats.snap
>
>
> when I running regression testing in the loop script, we come to fail to start the server. well, the codes may handle the issue when we build without --enable-debug, but that looks like we have something may break records snap. open this issue in case someone need it.
> {code}
> [TrafficServer] using root directory '/opt/ats'
> FATAL: RecMessage.cc:426: failed assert `eh->magic == REC_MESSAGE_ELE_MAGIC`
> /opt/ats/bin/traffic_server - STACK TRACE: 
> /opt/ats/lib/libtsutil.so.3(ink_fatal_die+0x0)[0x7ffff7baeca1]
> /opt/ats/lib/libtsutil.so.3(_Z12ink_get_randv+0x0)[0x7ffff7badbb8]
> /opt/ats/bin/traffic_server(_Z23RecMessageUnmarshalNextP13RecMessageHdrP13RecMessageItrPP9RecRecord+0xbf)[0x6ed860]
> /opt/ats/bin/traffic_server(_Z16RecReadStatsFilev+0xc1)[0x6e5e36]
> /opt/ats/bin/traffic_server(_Z11RecCoreInit8RecModeTP5Diags+0xec)[0x6e254b]
> /opt/ats/bin/traffic_server(_Z14RecProcessInit8RecModeTP5Diags+0x3b)[0x6e7769]
> /opt/ats/bin/traffic_server[0x51d4a5]
> /opt/ats/bin/traffic_server(main+0x1df)[0x51ee39]
> /lib64/libc.so.6(__libc_start_main+0xed)[0x7ffff515b60d]
> /opt/ats/bin/traffic_server[0x4d8f99]
> Program received signal SIGABRT, Aborted.
> 0x00007ffff516ec15 in raise () from /lib64/libc.so.6
> (gdb) bt
> #0  0x00007ffff516ec15 in raise () from /lib64/libc.so.6
> #1  0x00007ffff517008b in abort () from /lib64/libc.so.6
> #2  0x00007ffff7baeb2c in ink_die_die_die (retval=1) at ink_error.cc:43
> #3  0x00007ffff7baebfe in ink_fatal_va(int, const char *, typedef __va_list_tag __va_list_tag *) (return_code=1, 
>     message_format=0x7ffff7bca3e0 "%s:%d: failed assert `%s`", ap=0x7fffffffc8b8) at ink_error.cc:65
> #4  0x00007ffff7baeca1 in ink_fatal (return_code=1, message_format=0x7ffff7bca3e0 "%s:%d: failed assert `%s`") at ink_error.cc:73
> #5  0x00007ffff7badbb8 in _ink_assert (expression=0x76ffa0 "eh->magic == REC_MESSAGE_ELE_MAGIC", file=0x76fe40 "RecMessage.cc", line=426)
>     at ink_assert.cc:38
> #6  0x00000000006ed860 in RecMessageUnmarshalNext (msg=0xfe6110, itr=0x7fffffffca00, record=0x7fffffffca10) at RecMessage.cc:426
> #7  0x00000000006e5e36 in RecReadStatsFile () at P_RecCore.i:569
> #8  0x00000000006e254b in RecCoreInit (mode_type=RECM_STAND_ALONE, _diags=0xfe5f70) at RecCore.cc:209
> #9  0x00000000006e7769 in RecProcessInit (mode_type=RECM_STAND_ALONE, _diags=0xfe5f70) at RecProcess.cc:313
> #10 0x000000000051d4a5 in initialize_process_manager () at Main.cc:413
> #11 0x000000000051ee39 in main (argc=1, argv=0x7fffffffdd08) at Main.cc:1409
> (gdb) f 6
> #6  0x00000000006ed860 in RecMessageUnmarshalNext (msg=0xfe6110, itr=0x7fffffffca00, record=0x7fffffffca10) at RecMessage.cc:426
> 426       ink_debug_assert(eh->magic == REC_MESSAGE_ELE_MAGIC);
> (gdb) l
> 421         itr->ele_hdr = (RecMessageEleHdr *) ((char *) (msg) + itr->ele_hdr->o_next);
> 422         itr->next += 1;
> 423         eh = itr->ele_hdr;
> 424       }
> 425
> 426       ink_debug_assert(eh->magic == REC_MESSAGE_ELE_MAGIC);
> 427
> 428       // If the file is corrupt, ignore the the rest of the file.
> 429       if (eh->magic != REC_MESSAGE_ELE_MAGIC) {
> 430         Warning("Persistent statistics file records.stat is corrupted. Ignoring the rest of the file\n");
> (gdb) p eh->magic
> $1 = 0
> (gdb) p REC_MESSAGE_ELE_MAGIC
> $2 = 4027445261
> (gdb) p eh
> $3 = (RecMessageEleHdr *) 0xff4138
> (gdb) p *eh
> $4 = {magic = 0, o_next = 0}
> (gdb) f 7
> #7  0x00000000006e5e36 in RecReadStatsFile () at P_RecCore.i:569
> 569           } while (RecMessageUnmarshalNext(m, &itr, &r) != REC_ERR_FAIL);
> (gdb) l
> 564         if (RecMessageUnmarshalFirst(m, &itr, &r) != REC_ERR_FAIL) {
> 565           do {
> 566             if ((r->name == NULL) || (!strlen(r->name)))
> 567               continue;
> 568             RecSetRecord(r->rec_type, r->name, r->data_type, &(r->data), &(r->stat_meta.data_raw), false);
> 569           } while (RecMessageUnmarshalNext(m, &itr, &r) != REC_ERR_FAIL);
> 570         }
> 571       }
> 572
> 573       ink_rwlock_unlock(&g_records_rwlock);
> (gdb) p r
> $5 = (RecRecord *) 0xff4070
> (gdb) p *r
> $6 = {rec_type = RECT_PROCESS, name = 0xff4118 "", data_type = RECD_INT, data = {rec_int = 0, rec_float = 0, rec_string = 0x0, 
>     rec_counter = 0}, data_default = {rec_int = 0, rec_float = 0, rec_string = 0x0, rec_counter = 0}, lock = {nthread_holding = 1, 
>     thread_holding = 140598138681088, the_mutex = {__data = {__lock = 1, __count = 0, __owner = 19059, __nusers = 1, __kind = 0, 
>         __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, 
>       __size = "\001\000\000\000\000\000\000\000sJ\000\000\001", '\000' <repeats 26 times>, __align = 1}}, sync_required = 1 '\001', 
>   registered = true, {stat_meta = {data_raw = {sum = 0, count = 0, last_sum = 0, last_count = 0}, 
>       sync_cb = 0x6e8059 <RecRawStatSyncHrTimeAvg(char const*, RecDataT, RecData*, RecRawStatBlock*, int)>, sync_rsb = 0x107f640, 
>       sync_id = 5, persist_type = RECP_NULL}, config_meta = {update_required = 0 '\000', update_cb_list = 0x0, update_cookie = 0x0, 
>       update_type = RECU_NULL, check_type = RECC_NULL, 
>       check_expr = 0x6e8059 <RecRawStatSyncHrTimeAvg(char const*, RecDataT, RecData*, RecRawStatBlock*, int)> "UH\211\345H\203\354`H\211}u\304H\211U\270H\211M\260D\211E\300H\213\005\275>0", access_type = (unknown: 17299008)}}, order = 0, rsb_id = 0}
> (gdb) p *itr
> No symbol "operator*" in current context.
> (gdb) p m
> $7 = (RecMessage *) 0xfe6110
> (gdb) p *m
> $8 = {msg_type = RECG_NULL, o_start = 24, o_write = 59760, o_end = 59760, entries = 265, alignment = 0}
> (gdb) 
> {code}

--
This message is automatically generated by JIRA.
If you think it was sent incorrectly, please contact your JIRA administrators
For more information on JIRA, see: http://www.atlassian.com/software/jira