You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues-all@impala.apache.org by "carolinchen (Jira)" <ji...@apache.org> on 2021/08/11 04:07:00 UTC

[jira] [Updated] (IMPALA-10855) Hang in PartitionedHashJoinNode::Close() for cancel the query when disk spilling

     [ https://issues.apache.org/jira/browse/IMPALA-10855?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

carolinchen updated IMPALA-10855:
---------------------------------
    Description: 
The query was cancelled due to a timeout, but two fragment instances still have not been released. Running pstack on the two remaining fragment instances shows where the two threads are blocked.

These are the stacks of the blocked fragment instances (FIs).
 * thread A

{code:java}
// code placeholder
Thread 1 (process 1947185):
#0  0x00007fbaa033a945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1  0x0000000001529834 in Wait (lock=..., this=0x706c44a8) at /opt/Impala/be/src/util/condition-variable.h:49
#2  impala::JoinBuilder::HandoffToProbesAndWait (this=this@entry=0x706c43c0, build_side_state=build_side_state@entry=0xdcbb4e00) at /opt/Impala/be/src/exec/join-builder.cc:103
#3  0x00000000014889d5 in impala::PhjBuilder::FlushFinal (this=0x706c43c0, state=0xdcbb4e00) at /opt/Impala/be/src/exec/partitioned-hash-join-builder.cc:367
#4  0x00000000010eb2d6 in impala::FragmentInstanceState::ExecInternal (this=this@entry=0x1961a8700) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:403
#5  0x00000000010ed51a in impala::FragmentInstanceState::Exec (this=this@entry=0x1961a8700) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:98
#6  0x00000000010cc7b7 in impala::QueryState::ExecFInstance (this=0x21e8278000, fis=0x1961a8700) at /opt/Impala/be/src/runtime/query-state.cc:719
#7  0x00000000013807bb in operator() (this=0x7fb131b7fc00) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/function/function_template.hpp:771
#8  impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*) (name=..., category=..., functor=..., parent_thread_info=<optimized out>, thread_started=0x7fb12fb7ae10) at /opt/Impala/be/src/util/thread.cc:360
#9  0x000000000138168a in operator()<void (*)(const std::basic_string<char>&, const std::basic_string<char>&, boost::function<void()>, const impala::ThreadDebugInfo*, impala::Promise<long int>*), boost::_bi::list0> (f=@0x5ecd8fb8: 0x13804b0 <impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*)>, a=<synthetic pointer>, this=0x5ecd8fc0) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:531
#10 operator() (this=0x5ecd8fb8) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:1222
#11 boost::detail::thread_data<boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*), boost::_bi::list5<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::ThreadDebugInfo*>, boost::_bi::value<impala::Promise<long, (impala::PromiseMode)0>*> > > >::run() (this=0x5ecd8e00) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/thread/detail/thread.hpp:116
#12 0x0000000001b55f4a in thread_proxy ()
#13 0x00007fbaa0336e25 in start_thread () from /lib64/libpthread.so.0
#14 0x00007fba9cf2e35d in clone () from /lib64/libc.so.6


{code}
thread B
{code:java}
Thread 1 (process 1947174):
#0  0x00007fbaa033a945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1  0x00000000013ab45d in Wait (lock=..., this=0x21b9e3b50) at /opt/Impala/be/src/util/condition-variable.h:49
#2  impala::BufferPool::Client::CleanPages (this=this@entry=0x21b9e3a40, client_lock=client_lock@entry=0x7fb139b8f530, len=len@entry=524288, lazy_flush=lazy_flush@entry=true) at /opt/Impala/be/src/runtime/bufferpool/buffer-pool.cc:684
#3  0x00000000013ac487 in impala::BufferPool::Client::TransferReservationTo (this=0x21b9e3a40, dst=0x455c3498, bytes=524288, transferred=transferred@entry=0x7fb139b8f5cf) at /opt/Impala/be/src/runtime/bufferpool/buffer-pool.cc:639
#4  0x00000000013ac554 in impala::BufferPool::ClientHandle::TransferReservationTo (this=this@entry=0xcd27f7b0, dst=<optimized out>, bytes=<optimized out>, transferred=transferred@entry=0x7fb139b8f5cf) at /opt/Impala/be/src/runtime/bufferpool/buffer-pool.cc:343
#5  0x00000000013ac578 in impala::BufferPool::ClientHandle::TransferReservationTo (this=this@entry=0xcd27f7b0, dst=<optimized out>, bytes=<optimized out>, transferred=transferred@entry=0x7fb139b8f5cf) at /opt/Impala/be/src/runtime/bufferpool/buffer-pool.cc:349
#6  0x0000000001481b96 in impala::PhjBuilder::ReturnReservation (this=<optimized out>, probe_client=probe_client@entry=0xcd27f7b0, bytes=<optimized out>) at /opt/Impala/be/src/exec/partitioned-hash-join-builder.cc:978
#7  0x0000000001496cf6 in impala::PartitionedHashJoinNode::Close (this=0xcd27f680, state=0xa7491a00) at /opt/Impala/be/src/exec/partitioned-hash-join-node.cc:316
#8  0x00000000013f2a91 in impala::ExecNode::Close (this=0x612b861c0, state=0xa7491a00) at /opt/Impala/be/src/exec/exec-node.cc:305
#9  0x00000000010ea410 in impala::FragmentInstanceState::Close (this=this@entry=0x7cccc820) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:431
#10 0x00000000010ed327 in impala::FragmentInstanceState::Exec (this=this@entry=0x7cccc820) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:104
#11 0x00000000010cc7b7 in impala::QueryState::ExecFInstance (this=0x21e8278000, fis=0x7cccc820) at /opt/Impala/be/src/runtime/query-state.cc:719
#12 0x00000000013807bb in operator() (this=0x7fb139b8fc00) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/function/function_template.hpp:771
#13 impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*) (name=..., category=..., functor=..., parent_thread_info=<optimized out>, thread_started=0x7fb12fb7ae10) at /opt/Impala/be/src/util/thread.cc:360
#14 0x000000000138168a in operator()<void (*)(const std::basic_string<char>&, const std::basic_string<char>&, boost::function<void()>, const impala::ThreadDebugInfo*, impala::Promise<long int>*), boost::_bi::list0> (f=@0x1133db5b8: 0x13804b0 <impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*)>, a=<synthetic pointer>, this=0x1133db5c0) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:531
#15 operator() (this=0x1133db5b8) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:1222
#16 boost::detail::thread_data<boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*), boost::_bi::list5<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::ThreadDebugInfo*>, boost::_bi::value<impala::Promise<long, (impala::PromiseMode)0>*> > > >::run() (this=0x1133db400) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/thread/detail/thread.hpp:116
#17 0x0000000001b55f4a in thread_proxy ()
#18 0x00007fbaa0336e25 in start_thread () from /lib64/libpthread.so.0
#19 0x00007fba9cf2e35d in clone () from /lib64/libc.so.6
{code}
 # Thread A hangs in HandoffToProbesAndWait(), waiting on the condition variable build_wakeup_cv to be notified.
 # Under normal circumstances, thread B notifies thread A, which then reacquires the lock and stops waiting.
 # In this case, however, thread B never notifies thread A: thread B is itself hung in CleanPages(), waiting on the condition variable write_complete_cv_ to be notified, so it is blocked and cannot wake thread A.

As far as I know, reproducing the issue requires the following factors (it reproduces only probabilistically):
 # join
 # spill
 # cancel (due to overtime or voluntarily cancel)
 # mt

 

  was:
The query was cancelled due to a timeout, but two fragment instances still have not been released. Running pstack on the two remaining fragment instances shows where the two threads are blocked.

These are the stacks of the blocked fragment instances (FIs).
 * thread A

{code:java}
// code placeholder
Thread 1 (process 1947185):
#0  0x00007fbaa033a945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
#1  0x0000000001529834 in Wait (lock=..., this=0x706c44a8) at /opt/Impala/be/src/util/condition-variable.h:49
#2  impala::JoinBuilder::HandoffToProbesAndWait (this=this@entry=0x706c43c0, build_side_state=build_side_state@entry=0xdcbb4e00) at /opt/Impala/be/src/exec/join-builder.cc:103
#3  0x00000000014889d5 in impala::PhjBuilder::FlushFinal (this=0x706c43c0, state=0xdcbb4e00) at /opt/Impala/be/src/exec/partitioned-hash-join-builder.cc:367
#4  0x00000000010eb2d6 in impala::FragmentInstanceState::ExecInternal (this=this@entry=0x1961a8700) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:403
#5  0x00000000010ed51a in impala::FragmentInstanceState::Exec (this=this@entry=0x1961a8700) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:98
#6  0x00000000010cc7b7 in impala::QueryState::ExecFInstance (this=0x21e8278000, fis=0x1961a8700) at /opt/Impala/be/src/runtime/query-state.cc:719
#7  0x00000000013807bb in operator() (this=0x7fb131b7fc00) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/function/function_template.hpp:771
#8  impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*) (name=..., category=..., functor=..., parent_thread_info=<optimized out>, thread_started=0x7fb12fb7ae10) at /opt/Impala/be/src/util/thread.cc:360
#9  0x000000000138168a in operator()<void (*)(const std::basic_string<char>&, const std::basic_string<char>&, boost::function<void()>, const impala::ThreadDebugInfo*, impala::Promise<long int>*), boost::_bi::list0> (f=@0x5ecd8fb8: 0x13804b0 <impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*)>, a=<synthetic pointer>, this=0x5ecd8fc0) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:531
#10 operator() (this=0x5ecd8fb8) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:1222
#11 boost::detail::thread_data<boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*), boost::_bi::list5<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::ThreadDebugInfo*>, boost::_bi::value<impala::Promise<long, (impala::PromiseMode)0>*> > > >::run() (this=0x5ecd8e00) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/thread/detail/thread.hpp:116
#12 0x0000000001b55f4a in thread_proxy ()
#13 0x00007fbaa0336e25 in start_thread () from /lib64/libpthread.so.0
#14 0x00007fba9cf2e35d in clone () from /lib64/libc.so.6
{code}


> Hang in PartitionedHashJoinNode::Close() for cancel the query when disk spilling
> --------------------------------------------------------------------------------
>
>                 Key: IMPALA-10855
>                 URL: https://issues.apache.org/jira/browse/IMPALA-10855
>             Project: IMPALA
>          Issue Type: Bug
>          Components: Backend
>    Affects Versions: Impala 3.4.0
>            Reporter: carolinchen
>            Priority: Major
>
> The query was cancelled due to a timeout, but two fragment instances still have not been released. Running pstack on the two remaining fragment instances shows where the two threads are blocked.
> These are the stacks of the blocked fragment instances (FIs).
>  * thread A
> {code:java}
> // code placeholder
> Thread 1 (process 1947185):
> #0  0x00007fbaa033a945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
> #1  0x0000000001529834 in Wait (lock=..., this=0x706c44a8) at /opt/Impala/be/src/util/condition-variable.h:49
> #2  impala::JoinBuilder::HandoffToProbesAndWait (this=this@entry=0x706c43c0, build_side_state=build_side_state@entry=0xdcbb4e00) at /opt/Impala/be/src/exec/join-builder.cc:103
> #3  0x00000000014889d5 in impala::PhjBuilder::FlushFinal (this=0x706c43c0, state=0xdcbb4e00) at /opt/Impala/be/src/exec/partitioned-hash-join-builder.cc:367
> #4  0x00000000010eb2d6 in impala::FragmentInstanceState::ExecInternal (this=this@entry=0x1961a8700) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:403
> #5  0x00000000010ed51a in impala::FragmentInstanceState::Exec (this=this@entry=0x1961a8700) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:98
> #6  0x00000000010cc7b7 in impala::QueryState::ExecFInstance (this=0x21e8278000, fis=0x1961a8700) at /opt/Impala/be/src/runtime/query-state.cc:719
> #7  0x00000000013807bb in operator() (this=0x7fb131b7fc00) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/function/function_template.hpp:771
> #8  impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*) (name=..., category=..., functor=..., parent_thread_info=<optimized out>, thread_started=0x7fb12fb7ae10) at /opt/Impala/be/src/util/thread.cc:360
> #9  0x000000000138168a in operator()<void (*)(const std::basic_string<char>&, const std::basic_string<char>&, boost::function<void()>, const impala::ThreadDebugInfo*, impala::Promise<long int>*), boost::_bi::list0> (f=@0x5ecd8fb8: 0x13804b0 <impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*)>, a=<synthetic pointer>, this=0x5ecd8fc0) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:531
> #10 operator() (this=0x5ecd8fb8) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:1222
> #11 boost::detail::thread_data<boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*), boost::_bi::list5<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::ThreadDebugInfo*>, boost::_bi::value<impala::Promise<long, (impala::PromiseMode)0>*> > > >::run() (this=0x5ecd8e00) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/thread/detail/thread.hpp:116
> #12 0x0000000001b55f4a in thread_proxy ()
> #13 0x00007fbaa0336e25 in start_thread () from /lib64/libpthread.so.0
> #14 0x00007fba9cf2e35d in clone () from /lib64/libc.so.6
> {code}
> thread B
> {code:java}
> Thread 1 (process 1947174):
> #0  0x00007fbaa033a945 in pthread_cond_wait@@GLIBC_2.3.2 () from /lib64/libpthread.so.0
> #1  0x00000000013ab45d in Wait (lock=..., this=0x21b9e3b50) at /opt/Impala/be/src/util/condition-variable.h:49
> #2  impala::BufferPool::Client::CleanPages (this=this@entry=0x21b9e3a40, client_lock=client_lock@entry=0x7fb139b8f530, len=len@entry=524288, lazy_flush=lazy_flush@entry=true) at /opt/Impala/be/src/runtime/bufferpool/buffer-pool.cc:684
> #3  0x00000000013ac487 in impala::BufferPool::Client::TransferReservationTo (this=0x21b9e3a40, dst=0x455c3498, bytes=524288, transferred=transferred@entry=0x7fb139b8f5cf) at /opt/Impala/be/src/runtime/bufferpool/buffer-pool.cc:639
> #4  0x00000000013ac554 in impala::BufferPool::ClientHandle::TransferReservationTo (this=this@entry=0xcd27f7b0, dst=<optimized out>, bytes=<optimized out>, transferred=transferred@entry=0x7fb139b8f5cf) at /opt/Impala/be/src/runtime/bufferpool/buffer-pool.cc:343
> #5  0x00000000013ac578 in impala::BufferPool::ClientHandle::TransferReservationTo (this=this@entry=0xcd27f7b0, dst=<optimized out>, bytes=<optimized out>, transferred=transferred@entry=0x7fb139b8f5cf) at /opt/Impala/be/src/runtime/bufferpool/buffer-pool.cc:349
> #6  0x0000000001481b96 in impala::PhjBuilder::ReturnReservation (this=<optimized out>, probe_client=probe_client@entry=0xcd27f7b0, bytes=<optimized out>) at /opt/Impala/be/src/exec/partitioned-hash-join-builder.cc:978
> #7  0x0000000001496cf6 in impala::PartitionedHashJoinNode::Close (this=0xcd27f680, state=0xa7491a00) at /opt/Impala/be/src/exec/partitioned-hash-join-node.cc:316
> #8  0x00000000013f2a91 in impala::ExecNode::Close (this=0x612b861c0, state=0xa7491a00) at /opt/Impala/be/src/exec/exec-node.cc:305
> #9  0x00000000010ea410 in impala::FragmentInstanceState::Close (this=this@entry=0x7cccc820) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:431
> #10 0x00000000010ed327 in impala::FragmentInstanceState::Exec (this=this@entry=0x7cccc820) at /opt/Impala/be/src/runtime/fragment-instance-state.cc:104
> #11 0x00000000010cc7b7 in impala::QueryState::ExecFInstance (this=0x21e8278000, fis=0x7cccc820) at /opt/Impala/be/src/runtime/query-state.cc:719
> #12 0x00000000013807bb in operator() (this=0x7fb139b8fc00) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/function/function_template.hpp:771
> #13 impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*) (name=..., category=..., functor=..., parent_thread_info=<optimized out>, thread_started=0x7fb12fb7ae10) at /opt/Impala/be/src/util/thread.cc:360
> #14 0x000000000138168a in operator()<void (*)(const std::basic_string<char>&, const std::basic_string<char>&, boost::function<void()>, const impala::ThreadDebugInfo*, impala::Promise<long int>*), boost::_bi::list0> (f=@0x1133db5b8: 0x13804b0 <impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*)>, a=<synthetic pointer>, this=0x1133db5c0) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:531
> #15 operator() (this=0x1133db5b8) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/bind/bind.hpp:1222
> #16 boost::detail::thread_data<boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long, (impala::PromiseMode)0>*), boost::_bi::list5<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::ThreadDebugInfo*>, boost::_bi::value<impala::Promise<long, (impala::PromiseMode)0>*> > > >::run() (this=0x1133db400) at /opt/Impala/toolchain/boost-1.61.0-p2/include/boost/thread/detail/thread.hpp:116
> #17 0x0000000001b55f4a in thread_proxy ()
> #18 0x00007fbaa0336e25 in start_thread () from /lib64/libpthread.so.0
> #19 0x00007fba9cf2e35d in clone () from /lib64/libc.so.6
> {code}
>  # Thread A hangs in HandoffToProbesAndWait(), waiting on the condition variable build_wakeup_cv to be notified.
>  # Under normal circumstances, thread B notifies thread A, which then reacquires the lock and stops waiting.
>  # In this case, however, thread B never notifies thread A: thread B is itself hung in CleanPages(), waiting on the condition variable write_complete_cv_ to be notified, so it is blocked and cannot wake thread A.
> As far as I know, reproducing the issue requires the following factors (it reproduces only probabilistically):
>  # join
>  # spill
>  # cancel (due to overtime or voluntarily cancel)
>  # mt
>  



--
This message was sent by Atlassian Jira
(v8.3.4#803005)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-all-unsubscribe@impala.apache.org
For additional commands, e-mail: issues-all-help@impala.apache.org