Posted to issues@trafficserver.apache.org by "Sudheer Vinukonda (JIRA)" <ji...@apache.org> on 2015/07/23 19:01:04 UTC

[jira] [Commented] (TS-3794) Enhance post SSL handshake read race condition for SPDY and H2 scenarios.

    [ https://issues.apache.org/jira/browse/TS-3794?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=14639128#comment-14639128 ] 

Sudheer Vinukonda commented on TS-3794:
---------------------------------------

Below is the patch I've tested functionally and under light load. I'm also working on testing it in prod.

{code}
diff --git a/iocore/net/SSLNetVConnection.cc b/iocore/net/SSLNetVConnection.cc
index e06f749..b122f5f 100644
--- a/iocore/net/SSLNetVConnection.cc
+++ b/iocore/net/SSLNetVConnection.cc
@@ -491,7 +491,7 @@ SSLNetVConnection::net_read_io(NetHandler *nh, EThread *lthread)
     if (ret == EVENT_ERROR) {
       this->read.triggered = 0;
       readSignalError(nh, err);
-    } else if (ret == SSL_HANDSHAKE_WANT_READ || ret == SSL_HANDSHAKE_WANT_ACCEPT || ret == EVENT_CONT) {
+    } else if (ret == SSL_HANDSHAKE_WANT_READ || ret == SSL_HANDSHAKE_WANT_ACCEPT) {
       if (SSLConfigParams::ssl_handshake_timeout_in > 0) {
         double handshake_time = ((double)(Thread::get_hrtime() - sslHandshakeBeginTime) / 1000000000);
         Debug("ssl", "ssl handshake for vc %p, took %.3f seconds, configured handshake_timer: %d", this, handshake_time,
diff --git a/proxy/http/HttpSM.cc b/proxy/http/HttpSM.cc
index 3f4dd72..4ac8d4a 100644
--- a/proxy/http/HttpSM.cc
+++ b/proxy/http/HttpSM.cc
@@ -614,7 +614,7 @@ HttpSM::state_read_client_request_header(int event, void *data)
   //   time we've been called.  The timeout had been set to
   //   the accept timeout by the HttpClientSession
   //
-  if (client_request_hdr_bytes == 0) {
+  if ((ua_buffer_reader->read_avail() > 0) && (client_request_hdr_bytes == 0)) {
     milestones[TS_MILESTONE_UA_FIRST_READ] = Thread::get_hrtime();
     ua_session->get_netvc()->set_inactivity_timeout(HRTIME_SECONDS(t_state.txn_conf->transaction_no_activity_timeout_in));
   }
diff --git a/proxy/http2/Http2ClientSession.cc b/proxy/http2/Http2ClientSession.cc
index 4e3b85e..0ea6fca 100644
--- a/proxy/http2/Http2ClientSession.cc
+++ b/proxy/http2/Http2ClientSession.cc
@@ -195,6 +195,13 @@ Http2ClientSession::do_io_close(int alerrno)
   DebugHttp2Ssn0("session closed");
 
   ink_assert(this->mutex->thread_holding == this_ethread());
+  if (client_vc) {
+    // clean up ssl's first byte iobuf
+    SSLNetVConnection *ssl_vc = dynamic_cast<SSLNetVConnection *>(client_vc);
+    if (ssl_vc) {
+      ssl_vc->set_ssl_iobuf(NULL);
+    }
+  }
   HTTP2_DECREMENT_THREAD_DYN_STAT(HTTP2_STAT_CURRENT_CLIENT_SESSION_COUNT, this->mutex->thread_holding);
   send_connection_event(&this->connection_state, HTTP2_SESSION_EVENT_FINI, this);
   do_api_callout(TS_HTTP_SSN_CLOSE_HOOK);
diff --git a/proxy/http2/Http2SessionAccept.cc b/proxy/http2/Http2SessionAccept.cc
index fbb25db..b917555 100644
--- a/proxy/http2/Http2SessionAccept.cc
+++ b/proxy/http2/Http2SessionAccept.cc
@@ -65,7 +65,15 @@ Http2SessionAccept::mainEvent(int event, void *data)
   ink_release_assert((event == NET_EVENT_ACCEPT) ? (data != 0) : (1));
 
   if (event == NET_EVENT_ACCEPT) {
-    this->accept(static_cast<NetVConnection *>(data), NULL, NULL);
+    NetVConnection *netvc = static_cast<NetVConnection *>(data);
+    SSLNetVConnection *ssl_vc = dynamic_cast<SSLNetVConnection *>(netvc);
+    MIOBuffer *iobuf = NULL;
+    IOBufferReader *reader = NULL;
+    if (ssl_vc) {
+      iobuf = ssl_vc->get_ssl_iobuf();
+      reader = ssl_vc->get_ssl_reader();
+    }
+    this->accept(netvc, iobuf, reader);
     return EVENT_CONT;
   }
 
diff --git a/proxy/spdy/SpdyClientSession.cc b/proxy/spdy/SpdyClientSession.cc
index 2f8720e..d60e7a5 100644
--- a/proxy/spdy/SpdyClientSession.cc
+++ b/proxy/spdy/SpdyClientSession.cc
@@ -143,6 +143,11 @@ SpdyClientSession::clear()
   this->mutex = NULL;
 
   if (vc) {
+    // clean up ssl's first byte iobuf
+    SSLNetVConnection *ssl_vc = dynamic_cast<SSLNetVConnection *>(vc);
+    if (ssl_vc) {
+      ssl_vc->set_ssl_iobuf(NULL);
+    }
     TSVConnClose(reinterpret_cast<TSVConn>(vc));
     vc = NULL;
   }
@@ -203,7 +208,7 @@ SpdyClientSession::start()
     {SPDYLAY_SETTINGS_INITIAL_WINDOW_SIZE, SPDYLAY_ID_FLAG_SETTINGS_NONE, spdy_initial_window_size}};
   int r;
 
-  if (TSIOBufferReaderAvail(this->req_reader) > 0) {
+  if (this->write_vio && (TSIOBufferReaderAvail(this->req_reader) > 0)) {
     spdy_process_read(TS_EVENT_VCONN_WRITE_READY, this);
   }
 
diff --git a/proxy/spdy/SpdySessionAccept.cc b/proxy/spdy/SpdySessionAccept.cc
index 98ff9ab..531ff8f 100644
--- a/proxy/spdy/SpdySessionAccept.cc
+++ b/proxy/spdy/SpdySessionAccept.cc
@@ -45,7 +45,15 @@ SpdySessionAccept::mainEvent(int event, void *edata)
 #if TS_HAS_SPDY
     SpdyClientSession *sm = SpdyClientSession::alloc();
     sm->version = this->version;
-    sm->new_connection(netvc, NULL, NULL, false);
+
+    SSLNetVConnection *ssl_vc = dynamic_cast<SSLNetVConnection *>(netvc);
+    MIOBuffer *iobuf = NULL;
+    IOBufferReader *reader = NULL;
+    if (ssl_vc) {
+      iobuf = ssl_vc->get_ssl_iobuf();
+      reader = ssl_vc->get_ssl_reader();
+    }
+    sm->new_connection(netvc, iobuf, reader, false);
 #else
     Error("accepted a SPDY session, but SPDY support is not available");
     netvc->do_io_close();

{code}
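For context, the heart of the patch is a handoff: the bytes the TLS layer buffered around handshake completion are passed into the new protocol session, and released again when the session closes. Below is a hypothetical, self-contained sketch of that pattern; FirstByteBuffer, SslVC and ProtocolSession are stand-ins invented for illustration, not the real ATS classes.

{code}
// Hypothetical sketch (not ATS code) of the handoff pattern the patch
// implements: the bytes the TLS layer happened to read right after the
// handshake are handed to the protocol session, so they get consumed even
// though epoll will never signal them again. FirstByteBuffer, SslVC and
// ProtocolSession stand in for MIOBuffer/IOBufferReader, SSLNetVConnection
// and the SPDY/H2 session classes.
#include <cstdio>
#include <string>

struct FirstByteBuffer {
  std::string data;
  bool empty() const { return data.empty(); }
};

struct SslVC {
  FirstByteBuffer *iobuf = nullptr;
  FirstByteBuffer *get_ssl_iobuf() { return iobuf; }
  void set_ssl_iobuf(FirstByteBuffer *buf) { iobuf = buf; }
};

struct ProtocolSession {
  void new_connection(SslVC * /* vc */, FirstByteBuffer *buf) {
    // Drain the handshake-time bytes first; only then wait for new read events.
    if (buf && !buf->empty()) {
      printf("processing %zu buffered byte(s)\n", buf->data.size());
    } else {
      printf("no buffered bytes; waiting for the next read event\n");
    }
  }
  void do_io_close(SslVC *vc) {
    vc->set_ssl_iobuf(nullptr); // mirror the cleanup added in do_io_close()/clear()
  }
};

int main() {
  FirstByteBuffer early{"PRI * HTTP/2.0\r\n"}; // first bytes read around handshake completion
  SslVC vc;
  vc.set_ssl_iobuf(&early);

  ProtocolSession session;
  session.new_connection(&vc, vc.get_ssl_iobuf()); // what the accept hooks above now do
  session.do_io_close(&vc);                        // release the buffer on close
  return 0;
}
{code}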

> Enhance post SSL handshake read race condition for SPDY and H2 scenarios.
> -------------------------------------------------------------------------
>
>                 Key: TS-3794
>                 URL: https://issues.apache.org/jira/browse/TS-3794
>             Project: Traffic Server
>          Issue Type: Bug
>          Components: Core
>    Affects Versions: 5.3.0
>            Reporter: Sudheer Vinukonda
>            Assignee: Sudheer Vinukonda
>             Fix For: 6.1.0
>
>
> This is a follow-up to TS-3714 to extend the SSL post-handshake read to SPDY and H2 scenarios.
> An example slow log showing very high *ua_first_read*.
> {code}
> ERROR: [8624075] Slow Request: client_ip: xx.xx.xx.xxx url: http://xxxxxxxxxxxxxxxxxx.... status: 200 unique id: bytes: 86 fd: 0
> client state: 0 server state: 9 ua_begin: 0.000 ua_first_read: 42.224 ua_read_header_done: 42.224
> cache_open_read_begin: 42.224 cache_open_read_end: 42.224 dns_lookup_begin: 42.224 dns_lookup_end: 42.224
> server_connect: 42.224 server_first_read: 42.229 server_read_header_done: 42.229 server_close: 42.229
> ua_begin_write: 42.229 ua_close: 42.229 sm_finish: 42.229
> {code}
> Initially, I suspected that it might be caused by browsers connecting early, before sending any bytes to TS. However, this seems to be happening too frequently and with an unrealistically high delay between *ua_begin* and *ua_first_read*.
> I suspect it's caused by the code that disables the read temporarily before calling the *TXN_START* hook and re-enables it after the API callout. The read disable is done via a 0-byte *do_io_read* on the client vc, but the problem is that a valid *mbuf* is still passed. Based on what I am seeing, it's possible this actually enables the *read_vio* all the way up to *ssl_read_from_net* (if there's a race condition and bytes had already arrived from the client, producing an epoll read event), which then finally disables the read because *ntodo* (nbytes) is 0. However, this may cause the epoll event to be lost until new i/o arrives from the client. I'm running further experiments to confirm the theory. In most cases, the read buffer already has some bytes by the time the http session and http sm are created, which makes it just work. But if there's a slight delay in receiving bytes after the connection is made, the epoll mechanism should pick them up, yet because of the way the temporary read disable is done the event may be lost (this is because ATS uses epoll in edge-triggered mode).
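> To make the edge-triggered point concrete, below is a minimal standalone sketch (plain Linux epoll on a pipe; not ATS code) showing that once readiness has been reported and the handler consumes nothing, epoll does not report that fd again until new bytes arrive.
> {code}
> #include <sys/epoll.h>
> #include <unistd.h>
> #include <cstdio>
>
> int main()
> {
>   int fds[2];
>   if (pipe(fds) != 0)
>     return 1;
>
>   int ep = epoll_create1(0);
>   epoll_event ev = {};
>   ev.events = EPOLLIN | EPOLLET; // edge-triggered, as ATS uses for net i/o
>   ev.data.fd = fds[0];
>   epoll_ctl(ep, EPOLL_CTL_ADD, fds[0], &ev);
>
>   write(fds[1], "GET ", 4); // the client's first bytes arrive
>
>   epoll_event out;
>   printf("first wait:  %d event(s)\n", epoll_wait(ep, &out, 1, 100)); // 1
>
>   // The handler ran but read nothing (the 0-byte do_io_read case); no new
>   // edge has occurred, so the readiness notification is effectively lost.
>   printf("second wait: %d event(s)\n", epoll_wait(ep, &out, 1, 100)); // 0
>
>   write(fds[1], "/ HTTP/1.1\r\n", 12); // only new i/o from the client re-triggers it
>   printf("third wait:  %d event(s)\n", epoll_wait(ep, &out, 1, 100)); // 1
>
>   close(fds[0]);
>   close(fds[1]);
>   close(ep);
>   return 0;
> }
> {code}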
> Some history on this issue - 
> This issue has been a problem for a long time and affects both http and https requests. When it was first reported, our router operations team eventually closed it, indicating that disabling *accept* threads resolved it ([~yzlai] also reported similar observations and conclusions). It's possible that disabling accept threads slightly narrows the race condition window, but accept threads are very important to overall performance and throughput. I now notice that the issue still exists regardless of whether accept threads are enabled, and am testing further to confirm.


