You are viewing a plain text version of this content. The canonical link for it is here.
Posted to issues@impala.apache.org by "Tim Armstrong (Jira)" <ji...@apache.org> on 2020/12/23 18:10:00 UTC

[jira] [Resolved] (IMPALA-6787) On large secure clusters the connection setup thread becomes bottleneck at warmup and cause occasional timeout failures

     [ https://issues.apache.org/jira/browse/IMPALA-6787?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

Tim Armstrong resolved IMPALA-6787.
-----------------------------------
    Resolution: Won't Do

I think this is unlikely to be necessary now that we've converted all of the data and control plane RPCs to KRPC (the statestore and catalog still use Thrift, but they are less likely to be an issue).

> On large secure clusters the connection setup thread becomes bottleneck at warmup and cause occasional timeout failures
> -----------------------------------------------------------------------------------------------------------------------
>
>                 Key: IMPALA-6787
>                 URL: https://issues.apache.org/jira/browse/IMPALA-6787
>             Project: IMPALA
>          Issue Type: Sub-task
>          Components: Distributed Exec
>    Affects Versions: Impala 2.12.0
>            Reporter: Mostafa Mokhtar
>            Priority: Major
>              Labels: rpc
>
> On clusters with 200+ nodes, a single connection setup thread is not sufficient and ends up being a bottleneck for a while, which appears to cause queries to fail with:
> {code}
> I0401 20:20:55.032140 1806361 thrift-util.cc:123] TSocket::open() connect() <Host: va1007.foo.com Port: 22000>Connection timed out
> I0401 20:20:55.032346 1806361 thrift-client.cc:78] Couldn't open transport for va1007.foo.com:22000 (connect() failed: Connection timed out)
> I0401 20:20:55.032364 1806361 thrift-client.cc:94] Unable to connect to va1007.foo.com:22000
> {code}
> {code}
> // Only using one thread here is sufficient for performance, and it avoids potential
>   // thread safety issues with the thrift code called in SetupConnection.
>   constexpr int CONNECTION_SETUP_POOL_SIZE = 1;
>   // New - this is the thread pool used to process the internal accept queue.
>   ThreadPool<shared_ptr<TTransport>> connection_setup_pool("setup-server", "setup-worker",
>       CONNECTION_SETUP_POOL_SIZE, FLAGS_accepted_cnxn_queue_depth,
>       [this](int tid, const shared_ptr<TTransport>& item) {
>         this->SetupConnection(item);
>       });
> {code}
> {code}
> #0  0x00007fd927de8e20 in krb5int_MD5Update () from /lib64/libk5crypto.so.3
> #1  0x00007fd927de7bca in k5_md5_hash () from /lib64/libk5crypto.so.3
> #2  0x00007fd927e01e32 in krb5int_hmac_keyblock () from /lib64/libk5crypto.so.3
> #3  0x00007fd927dfc448 in usage_key.isra.2 () from /lib64/libk5crypto.so.3
> #4  0x00007fd927dfc9fc in krb5int_arcfour_decrypt () from /lib64/libk5crypto.so.3
> #5  0x00007fd927df97e4 in krb5_k_decrypt () from /lib64/libk5crypto.so.3
> #6  0x00007fd927df98bd in krb5_c_decrypt () from /lib64/libk5crypto.so.3
> #7  0x00007fd9297191fb in rd_req_decoded_opt () from /lib64/libkrb5.so.3
> #8  0x00007fd92971a1da in krb5_rd_req_decoded () from /lib64/libkrb5.so.3
> #9  0x00007fd9282371df in kg_accept_krb5 () from /lib64/libgssapi_krb5.so.2
> #10 0x00007fd9282388ca in krb5_gss_accept_sec_context_ext () from /lib64/libgssapi_krb5.so.2
> #11 0x00007fd928238a29 in krb5_gss_accept_sec_context () from /lib64/libgssapi_krb5.so.2
> #12 0x00007fd92822607a in gss_accept_sec_context () from /lib64/libgssapi_krb5.so.2
> #13 0x00007fd92653aedc in gssapi_server_mech_step () from /usr/lib64/sasl2/libgssapiv2.so
> #14 0x00007fd92bc27b9b in sasl_server_step () from /lib64/libsasl2.so.3
> #15 0x0000000000caf3b1 in sasl::TSaslServer::evaluateChallengeOrResponse(unsigned char const*, unsigned int, unsigned int*) ()
> #16 0x0000000000cb3040 in apache::thrift::transport::TSaslTransport::doSaslNegotiation() ()
> #17 0x0000000000cb1488 in apache::thrift::transport::TSaslServerTransport::Factory::getTransport(boost::shared_ptr<apache::thrift::transport::TTransport>) ()
> #18 0x0000000000b143c7 in apache::thrift::server::TAcceptQueueServer::SetupConnection(boost::shared_ptr<apache::thrift::transport::TTransport>) ()
> #19 0x0000000000b14eb2 in boost::detail::function::void_function_obj_invoker2<apache::thrift::server::TAcceptQueueServer::serve()::{lambda(int, boost::shared_ptr<apache::thrift::transport::TTransport> const&)#1}, void, int, boost::shared_ptr<apache::thrift::transport::TTransport> const&>::invoke(boost::detail::function::function_buffer&, int, boost::shared_ptr<apache::thrift::transport::TTransport> const&) ()
> #20 0x0000000000b17d79 in impala::ThreadPool<boost::shared_ptr<apache::thrift::transport::TTransport> >::WorkerThread(int) ()
> #21 0x0000000000d6049f in impala::Thread::SuperviseThread(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long>*) ()
> #22 0x0000000000d60c9a in boost::detail::thread_data<boost::_bi::bind_t<void, void (*)(std::string const&, std::string const&, boost::function<void ()>, impala::ThreadDebugInfo const*, impala::Promise<long>*), boost::_bi::list5<boost::_bi::value<std::string>, boost::_bi::value<std::string>, boost::_bi::value<boost::function<void ()> >, boost::_bi::value<impala::ThreadDebugInfo*>, boost::_bi::value<impala::Promise<long>*> > > >::run() ()
> #23 0x00000000012d794a in thread_proxy ()
> #24 0x00007fd928c7ddc5 in start_thread () from /lib64/libpthread.so.0
> #25 0x00007fd9289aaced in clone () from /lib64/libc.so.6
> {code}



--
This message was sent by Atlassian Jira
(v8.3.4#803005)