You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@trafficserver.apache.org by "xiongzongtao (JIRA)" <ji...@apache.org> on 2013/11/20 08:43:24 UTC

[jira] [Created] (TS-2373) the cache is not initialized properly

xiongzongtao created TS-2373:
--------------------------------

             Summary: the cache is not initialized properly
                 Key: TS-2373
                 URL: https://issues.apache.org/jira/browse/TS-2373
             Project: Traffic Server
          Issue Type: Bug
          Components: Cache
            Reporter: xiongzongtao


in recent days, there are several ATS servers on line failed to initialize cache system when restart, so, all requests are simply forwarded to origins.
   Note:  the storage is configured to use raw devices, like this:
/dev/raw/raw1 146163105792
/dev/raw/raw2 146163105792
/dev/raw/raw3 146163105792
/dev/raw/raw4 146163105792
/dev/raw/raw5 146163105792
volume.config is not configured. so, one vol per raw device

1.  checking the traffic.out, found that:  "cache enabled" line that 
    should follow the "traffic server running" line
2.  echo "show:cache-stats"|traffic_shell shows that all stats are ZEROs
3.  dump the memory of traffic_server by "gdb gcore" for convenience of debuging
4.  use gdb to debug:
     a)  all vols are set in global gvol except /dev/raw/raw4.
     b)  one  vol (/dev/raw/raw4) hunged in function handle_recover_from_data
     c) check aio_req, found that, one io event are queued in 
	    aio_req[4]'s aio_temp_list
     d) check io threads,  all are in pthread_cond_wait status.
here, we can see that, handle_recover_from_data issued an io request 
and queued it in aio_req[4]'s aio_temp_list, but all io threads are 
in pthread_cond_wait status, no one to process this request, so that, 
handle_recover_from_data waiting endless. And cache will not enabled 
until all the vols are inited successfully.

inspecting the code, found that, 
code snippet1  from aio_thread_main
1      if (!INK_ATOMICLIST_EMPTY(my_aio_req->aio_temp_list))
2      aio_move(my_aio_req);
3      if (!(op = my_aio_req->aio_todo.pop()) && !(op = my_aio_req->http_aio_todo.pop()))
4       break;

code snippet2 from aio_queue_req
1.  if (!ink_mutex_try_acquire(&req->aio_mutex)) {
    ink_atomiclist_push(&req->aio_temp_list, op);
2. } else {
    /* check if any pending requests on the atomic list */
    if (!INK_ATOMICLIST_EMPTY(req->aio_temp_list))
      aio_move(req);
    /* now put the new request */
    aio_insert(op, req);
    ink_cond_signal(&req->aio_cond);
    ink_mutex_release(&req->aio_mutex);
  }

now suppose aio_temp_list is empty, thread A running code snippet1, thread B 
running code snippet2, and just when thread A is between 1 and  2 in snippet1, 
thread B is executing 1 in snippet2. here, thread B insert io request to 
aio_temp_list, but no notification, and thread A continue to 3, and then 4(break), 
finally fall in pthread_cond_wait. thus,no io thread will process this request



something about my debuging
(gdb) p theCache
$1 = (Cache *) 0xd828cb0
(gdb) p *theCache
$2 = {cache_read_done = 1, total_good_nvol = 4, total_nvol = 5, ready = 0, cache_size = 89169900, 
  hosttable = 0x0, total_initialized_vol = 4, scheme = 1}
(gdb) set $i=0
(gdb) p gvol[$i++]->path
$3 = 0xd826ca0 "/dev/raw/raw2"
(gdb) p gvol[$i++]->path
$4 = 0xd8267a0 "/dev/raw/raw1"
(gdb) p gvol[$i++]->path
$5 = 0xd826a00 "/dev/raw/raw5"
(gdb) p gvol[$i++]->path
$6 = 0xd8269c0 "/dev/raw/raw3"
(gdb) p gvol[$i++]->path
Cannot access memory at address 0x0
(gdb) p cp_list->head->vols[0]
$7 = (Vol *) 0xd828d10
(gdb) set $i=0
(gdb) p *cp_list->head->vols[4]  
$8 = {<Continuation> = {<force_VFPT_to_top> = {_vptr.force_VFPT_to_top = 0x713010}, 
    handler = 0x672070 <Vol::aggWrite(int, void*)>, mutex = {m_ptr = 0xd806ac0}, 
    link = {<SLink<Continuation>> = {next = 0x0}, prev = 0x0}}, path = 0xd826a00 "/dev/raw/raw5", 
  hash_id = 0xd825db0 "/dev/raw/raw5 40960:17833980", hash_id_md5 = {b = {16306325817679019090, 
      17491422742868856741}}, fd = 38, raw_dir = 0x2aaaf0efb000 "\r025", dir = 0x2aaaf0efd000, 
  header = 0x2aaaf0efb000, footer = 0x2aaafbcb9000, segments = 278, buckets = 16382, 
  recover_pos = 68007976448, prev_recover_pos = 67994133504, scan_pos = 76968569856, skip = 40960, 
  start = 364421120, len = 146095964160, data_blocks = 17789500, hit_evacuate_window = 0, 
  io = {<AIOCallback> = {<Continuation> = {<force_VFPT_to_top> = {_vptr.force_VFPT_to_top = 0x712d70}, 
        handler = 0x647640 <AIOCallbackInternal::io_complete(int, void*)>, mutex = {m_ptr = 0xd806ac0}, 
        link = {<SLink<Continuation>> = {next = 0x0}, prev = 0x0}}, aiocb = {aio_fildes = 0, 
        aio_buf = 0x2aaabc47e000, aio_nbytes = 8388608, aio_offset = 67994133504, aio_reqprio = 0, 
        aio_lio_opcode = 1, aio_state = 0, aio__pad = {0}}, action = {_vptr.Action = 0x0, 
        continuation = 0xd8bc010, mutex = {m_ptr = 0xd806ac0}, cancelled = 0}, thread = 0x0, then = 0x0, 
      aio_result = 8388608}, first = 0x0, aio_req = 0xc6532f0, sleep_time = 0}, 
  agg = {<DLL<CacheVC,Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, 
  stat_cache_vcs = {<DLL<CacheVC,Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, 
  sync = {<DLL<CacheVC,Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, agg_buffer = 0x2aaaf07be000 "", 
  agg_todo_size = 0, agg_buf_pos = 0, trigger = 0x0, open_dir = {<Continuation> = {<force_VFPT_to_top> = {
        _vptr.force_VFPT_to_top = 0x713050}, handler = 0x64c580 <OpenDir::signal_readers(int, Event*)>, 
      mutex = {m_ptr = 0xd806ac0}, link = {<SLink<Continuation>> = {next = 0x0}, prev = 0x0}}, 
    delayed_readers = {<DLL<CacheVC,Link_CacheVC_opendir_link>> = {head = 0x0}, tail = 0x0}, bucket = {{
        head = 0x0} <repeats 256 times>}}, ram_cache = 0x0, evacuate_size = 8709, evacuate = 0xd8bd290, 
  lookaside = {{head = 0x0} <repeats 256 times>}, doc_evacuator = 0x0, init_info = 0x0, disk = 0xc650f90, 
---Type <return> to continue, or q <return> to quit---q
cache Quit
(gdb) p *cp_list->head->vols[3]
warning: can't find linker symbol for virtual table for `Vol' value
warning:   found `CacheProcessor::start_internal(int)::__func__' instead
$9 = {<Continuation> = {<force_VFPT_to_top> = {_vptr.force_VFPT_to_top = 0x713010}, 
    handler = 0x6460e0 <Vol::handle_recover_from_data(int, void*)>, mutex = {m_ptr = 0xd806b10}, 
    link = {<SLink<Continuation>> = {next = 0x0}, prev = 0x0}}, path = 0xd8269e0 "/dev/raw/raw4", 
  hash_id = 0xd8266a0 "/dev/raw/raw4 40960:17833980", hash_id_md5 = {b = {9625335342759085551, 
      15230879294086164331}}, fd = 37, raw_dir = 0x2aaae59fc000 "\r025", dir = 0x2aaae59fe000, 
  header = 0x2aaae59fc000, footer = 0x2aaaf07ba000, segments = 278, buckets = 16382, 
  recover_pos = 89786622464, prev_recover_pos = 89786622464, scan_pos = 0, skip = 40960, start = 364421120, 
  len = 146095964160, data_blocks = 17789500, hit_evacuate_window = 0, 
  io = {<AIOCallback> = {<Continuation> = {<force_VFPT_to_top> = {_vptr.force_VFPT_to_top = 0x712d70}, 
        handler = 0x647640 <AIOCallbackInternal::io_complete(int, void*)>, mutex = {m_ptr = 0xd806b10}, 
        link = {<SLink<Continuation>> = {next = 0x0}, prev = 0x0}}, aiocb = {aio_fildes = 37, 
        aio_buf = 0x2aab02ca9000, aio_nbytes = 8388608, aio_offset = 89786622464, aio_reqprio = 0, 
        aio_lio_opcode = 1, aio_state = 0, aio__pad = {0}}, action = {_vptr.Action = 0x0, 
        continuation = 0xd87a010, mutex = {m_ptr = 0xd806b10}, cancelled = 0}, thread = 0x0, then = 0x0, 
      aio_result = 8388608}, first = 0x0, aio_req = 0xc6458b0, sleep_time = 0}, 
  agg = {<DLL<CacheVC,Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, 
  stat_cache_vcs = {<DLL<CacheVC,Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, 
  sync = {<DLL<CacheVC,Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, agg_buffer = 0x2aaae5382000 "", 
  agg_todo_size = 0, agg_buf_pos = 0, trigger = 0x0, open_dir = {<Continuation> = {<force_VFPT_to_top> = {
        _vptr.force_VFPT_to_top = 0x713050}, handler = 0x64c580 <OpenDir::signal_readers(int, Event*)>, 
      mutex = {m_ptr = 0xd806b10}, link = {<SLink<Continuation>> = {next = 0x0}, prev = 0x0}}, 
    delayed_readers = {<DLL<CacheVC,Link_CacheVC_opendir_link>> = {head = 0x0}, tail = 0x0}, bucket = {{
        head = 0x0} <repeats 256 times>}}, ram_cache = 0x0, evacuate_size = 8709, evacuate = 0xd8a2aa0, 
  lookaside = {{head = 0x0} <repeats 256 times>}, doc_evacuator = 0x0, init_info = 0xd83b860, 
---Type <return> to continue, or q <return> to quit---q
disk = 0xc643550, Quit
(gdb) p aio_reqs
$10 = {0x0, 0xc5d75f0, 0xc62f4b0, 0xc63a670, 0xc6458b0, 0xc6532f0, 0x0 <repeats 94 times>}
(gdb) p aio_reqs[5]
$11 = (AIO_Reqs *) 0xc6532f0
(gdb) p *aio_reqs[5]
$12 = {aio_todo = {<DLL<AIOCallback, Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, 
  http_aio_todo = {<DLL<AIOCallback, Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, aio_temp_list = {
    head = {data = 0}, name = 0x717564 "temp_list", offset = 32}, aio_mutex = {__data = {__lock = 0, 
      __count = 0, __owner = 0, __nusers = 8, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, 
    __size = '\0' <repeats 12 times>, "\b", '\0' <repeats 26 times>, __align = 0}, aio_cond = {__data = {
      __lock = 0, __futex = 58, __total_seq = 33, __wakeup_seq = 25, __woken_seq = 25, __mutex = 0xc653328, 
      __nwaiters = 16, __broadcast_seq = 0}, 
    __size = "\000\000\000\000:\000\000\000!\000\000\000\000\000\000\000\031\000\000\000\000\000\000\000\031\000\000\000\000\000\000\000(3e\f\000\000\000\000\020\000\000\000\000\000\000", __align = 249108103168}, 
  index = 5, pending = 0, queued = 0, filedes = 38, requests_queued = 0}
(gdb) p *aio_reqs[4]
$13 = {aio_todo = {<DLL<AIOCallback, Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, 
  http_aio_todo = {<DLL<AIOCallback, Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, aio_temp_list = {
    head = {data = 226992344}, name = 0x717564 "temp_list", offset = 32}, aio_mutex = {__data = {__lock = 0, 
      __count = 0, __owner = 0, __nusers = 8, __kind = 0, __spins = 0, __list = {__prev = 0x0, __next = 0x0}}, 
    __size = '\0' <repeats 12 times>, "\b", '\0' <repeats 26 times>, __align = 0}, aio_cond = {__data = {
      __lock = 0, __futex = 50, __total_seq = 29, __wakeup_seq = 21, __woken_seq = 21, __mutex = 0xc6458e8, 
      __nwaiters = 16, __broadcast_seq = 0}, 
    __size = "\000\000\000\0002\000\000\000\035\000\000\000\000\000\000\000\025\000\000\000\000\000\000\000\025\000\000\000\000\000\000\000f\000\000\000\000\020\000\000\000\000\000\000", __align = 214748364800}, 
  index = 4, pending = 0, queued = 0, filedes = 37, requests_queued = 1}
(gdb) p/x *aio_reqs[4]
$14 = {aio_todo = {<DLL<AIOCallback, Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, 
  http_aio_todo = {<DLL<AIOCallback, Continuation::Link_link>> = {head = 0x0}, tail = 0x0}, aio_temp_list = {
    head = {data = 0xd87a0d8}, name = 0x717564, offset = 0x20}, aio_mutex = {__data = {__lock = 0x0, 
      __count = 0x0, __owner = 0x0, __nusers = 0x8, __kind = 0x0, __spins = 0x0, __list = {__prev = 0x0, 
        __next = 0x0}}, __size = {0x0 <repeats 12 times>, 0x8, 0x0 <repeats 27 times>}, __align = 0x0}, 
  aio_cond = {__data = {__lock = 0x0, __futex = 0x32, __total_seq = 0x1d, __wakeup_seq = 0x15, 
      __woken_seq = 0x15, __mutex = 0xc6458e8, __nwaiters = 0x10, __broadcast_seq = 0x0}, __size = {0x0, 0x0, 
      0x0, 0x0, 0x32, 0x0, 0x0, 0x0, 0x1d, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x15, 0x0, 0x0, 0x0, 0x0, 0x0, 
      0x0, 0x0, 0x15, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0xe8, 0x58, 0x64, 0xc, 0x0, 0x0, 0x0, 0x0, 0x10, 0x0, 
      0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, __align = 0x3200000000}, index = 0x4, pending = 0x0, queued = 0x0, 
  filedes = 0x25, requests_queued = 0x1}
(gdb) p &cp_list->head->vols[3]->io
$15 = (AIOCallbackInternal *) 0xd87a0d8



--
This message was sent by Atlassian JIRA
(v6.1#6144)