You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafficserver.apache.org by we...@apache.org on 2013/12/04 04:39:03 UTC

[1/6] refine the codes of cluster

Updated Branches:
  refs/heads/refine_cluster 7ffc10a9c -> 62504a9f8 (forced update)


http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/session.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/session.cc b/iocore/cluster/session.cc
new file mode 100644
index 0000000..7adead6
--- /dev/null
+++ b/iocore/cluster/session.cc
@@ -0,0 +1,1267 @@
+/** @file
+
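+  Cluster session management: per-machine session tables, session
+  create/bind/close, and queuing of incoming messages per session.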
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/epoll.h>
+#include "Diags.h"
+#include "machine.h"
+#include "global.h"
+#include "connection.h"
+#include "clusterinterface.h"
+#include "nio.h"
+#ifndef TS_INLINE
+#define TS_INLINE inline
+#endif
+#include "I_IOBuffer.h"
+#include "P_Cluster.h"
+#include "P_RecCore.h"
+#include "session.h"
+
+#ifndef USE_MULTI_ALLOCATOR
+// Shared allocator for InMessage nodes, used when per-socket allocators
+// (USE_MULTI_ALLOCATOR) are not compiled in.
+static Allocator in_message_allocator("InMessage", sizeof(InMessage), 1024);
+#endif
+
+// Allocator for SessionEntry nodes that are chained behind a head slot
+// when several sessions map to the same table position.
+static Allocator session_allocator("SessionEntry", sizeof(SessionEntry), 1024);
+
+// Table of MAX_MACHINE_COUNT per-machine slots. A machine's home slot is
+// ip % MAX_MACHINE_COUNT; collisions are resolved by probing forward.
+static MachineSessions *all_sessions;  //[src ip % MAX_MACHINE_COUNT]
+// Held by init_machine_sessions() while a slot is looked up and initialized.
+static ink_mutex session_lock;
+// Slot index of the local machine in all_sessions, set by session_init().
+static int my_machine_id = 0;
+
+// Handles to the registered statistic records, one per SessionStat counter.
+// Filled in by init_session_stat() and written by set_session_stat().
+struct SessionRecords {
+  RecRecord * create_total_count;   //create session total count
+  RecRecord * create_success_count; //create session success count
+  RecRecord * create_retry_times;   //create session retry times
+  RecRecord * close_total_count;    //close session count
+  RecRecord * close_success_count;  //close session success count
+  RecRecord * session_miss_count;     //session miss count
+  RecRecord * session_occupied_count; //session occupied count
+};
+
+// Record handles registered under "proxy.process.cluster.server_session".
+static SessionRecords server_session_records = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+// Record handles registered under "proxy.process.cluster.client_session".
+static SessionRecords client_session_records = {NULL, NULL, NULL, NULL, NULL, NULL, NULL};
+
+// Defined near the end of the file; declared here because session_init() calls it.
+static void init_session_stat(SessionRecords *pSessionRecords, const char *prefix);
+
+/**
+ * Find the slot of all_sessions that holds the given machine ip.
+ *
+ * Probing starts at the home slot (ip % MAX_MACHINE_COUNT) and walks
+ * forward with wrap-around until every slot has been inspected.
+ *
+ * @param ip  machine ip to look up
+ * @return the slot index, or -1 when no slot holds this ip
+ */
+inline static int get_session_machine_index(const unsigned int ip)
+{
+  const int home = ip % MAX_MACHINE_COUNT;
+  int step;
+  int slot;
+
+  // step 0 is the home slot; the final step wraps back onto it
+  for (step = 0; step <= MAX_MACHINE_COUNT; step++) {
+    slot = (home + step) % MAX_MACHINE_COUNT;
+    if (all_sessions[slot].ip == ip) {
+      return slot;
+    }
+  }
+
+  return -1;
+}
+
+/**
+ * Find a free slot (ip == 0) of all_sessions for the given machine ip.
+ *
+ * The search starts at the home slot (ip % MAX_MACHINE_COUNT) and probes
+ * forward with wrap-around. The slot is only located, not marked as used;
+ * the caller stores the ip into it.
+ *
+ * @param ip  machine ip that needs a slot
+ * @return index of a free slot, or -1 when the table is full
+ */
+static int alloc_session_machine_index(const unsigned int ip)
+{
+  const int home = ip % MAX_MACHINE_COUNT;
+  int step;
+  int slot;
+
+  // step 0 is the home slot; the final step wraps back onto it
+  for (step = 0; step <= MAX_MACHINE_COUNT; step++) {
+    slot = (home + step) % MAX_MACHINE_COUNT;
+    if (all_sessions[slot].ip == 0) {
+      return slot;
+    }
+  }
+
+  return -1;
+}
+
+/**
+ * Account for a dequeued incoming message and return it to its allocator.
+ *
+ * @param pSockContext  socket context whose thread stats are updated and,
+ *                      with USE_MULTI_ALLOCATOR, whose allocator owns the message
+ * @param pMessage      message to release; must not be used afterwards
+ */
+inline static void release_in_message(SocketContext *pSockContext,
+    InMessage *pMessage)
+{
+  // dequeue counters are updated atomically
+  ink_atomic_increment(&pSockContext->thread_context->stats.
+      dequeue_in_msg_count, 1);
+  ink_atomic_increment(&pSockContext->thread_context->stats.
+      dequeue_in_msg_bytes, MSG_HEADER_LENGTH + pMessage->data_len);
+
+  // drop the block reference before the raw free below
+  // (blocks is presumably a ref-counted Ptr<> -- confirm in session.h)
+  pMessage->blocks = NULL;  //free pointer
+#ifdef USE_MULTI_ALLOCATOR
+  pSockContext->in_msg_allocator->free_void(pMessage);
+#else
+  (void)pSockContext;
+  in_message_allocator.free_void(pMessage);
+#endif
+}
+
+/**
+ * Set up the per-machine session table for the given machine.
+ *
+ * Looks up (or allocates) the machine's slot in all_sessions, then
+ * allocates the zeroed SessionEntry array and the array of session locks.
+ * Calling it again for a machine that is already initialized is a no-op.
+ * On any failure the partially built arrays are released, so a later
+ * retry starts from clean pointers instead of leaking the old buffers.
+ *
+ * @param machine  machine whose ip identifies the slot
+ * @param bMyself  true when the machine is the local one
+ * @return 0 on success, ENOSPC when no slot is free, the errno of a failed
+ *         allocation (ENOMEM if errno was 0), or the error returned by
+ *         ink_mutex_init()
+ */
+int init_machine_sessions(ClusterMachine *machine, const bool bMyself)
+{
+  int result;
+  int sessions_bytes;
+  int locks_bytes;
+  int machine_id;
+  MachineSessions *pMachineSessions;
+  ink_mutex *pLock;
+  ink_mutex *pLockEnd;
+
+  ink_mutex_acquire(&session_lock);
+  if ((machine_id=get_session_machine_index(machine->ip)) < 0) {
+    if ((machine_id=alloc_session_machine_index(machine->ip)) < 0) {
+      ink_mutex_release(&session_lock);
+      return ENOSPC;
+    }
+  }
+
+  pMachineSessions = all_sessions + machine_id;
+  if (pMachineSessions->init_done) {  //already init
+    ink_mutex_release(&session_lock);
+    return 0;
+  }
+
+  pMachineSessions->is_myself = bMyself;
+  pMachineSessions->ip = machine->ip;
+
+  sessions_bytes = sizeof(SessionEntry) * max_session_count_per_machine;
+  pMachineSessions->sessions = (SessionEntry *)malloc(sessions_bytes);
+  if (pMachineSessions->sessions == NULL) {
+    //capture errno first, logging may overwrite it
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, sessions_bytes, result, strerror(result));
+    ink_mutex_release(&session_lock);
+    return result;
+  }
+  memset(pMachineSessions->sessions, 0, sessions_bytes);
+
+  locks_bytes = sizeof(ink_mutex) * session_lock_count_per_machine;
+  pMachineSessions->locks = (ink_mutex *)malloc(locks_bytes);
+  if (pMachineSessions->locks == NULL) {
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, locks_bytes, result, strerror(result));
+
+    //do not leave a half built slot behind
+    free(pMachineSessions->sessions);
+    pMachineSessions->sessions = NULL;
+    ink_mutex_release(&session_lock);
+    return result;
+  }
+
+  pLockEnd = pMachineSessions->locks + session_lock_count_per_machine;
+  for (pLock=pMachineSessions->locks; pLock<pLockEnd; pLock++) {
+    if ((result=ink_mutex_init(pLock, "session_locks")) != 0) {
+      //NOTE(review): locks initialized before the failure are not
+      //destroyed here, only their memory is released
+      free(pMachineSessions->locks);
+      pMachineSessions->locks = NULL;
+      free(pMachineSessions->sessions);
+      pMachineSessions->sessions = NULL;
+      ink_mutex_release(&session_lock);
+      return result;
+    }
+  }
+
+  pMachineSessions->init_done = true;
+  ink_mutex_release(&session_lock);
+  return 0;
+}
+
+/**
+ * One-time initialization of the session module.
+ *
+ * Allocates the zeroed machine table, initializes session_lock, sets up
+ * the session table of the local machine (cluster_machines[0]) and
+ * registers the server/client session statistics.
+ *
+ * session_lock is initialized before init_machine_sessions() is called
+ * because that function acquires the lock.
+ *
+ * @return 0 on success, otherwise an errno style error code
+ */
+int session_init()
+{
+  int bytes;
+  int result;
+  ClusterMachine *myMachine;
+
+  bytes = sizeof(MachineSessions) * MAX_MACHINE_COUNT;
+  all_sessions = (MachineSessions *)malloc(bytes);
+  if (all_sessions == NULL) {
+    //capture errno first, logging may overwrite it
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, bytes, result, strerror(result));
+    return result;
+  }
+  memset(all_sessions, 0, bytes);
+
+  //must come first: init_machine_sessions() locks session_lock
+  if ((result=ink_mutex_init(&session_lock, "session_lock")) != 0) {
+    return result;
+  }
+
+  myMachine = cluster_machines + 0;
+  if ((result=init_machine_sessions(myMachine, true)) != 0) {
+    return result;
+  }
+
+  my_machine_id = get_session_machine_index(myMachine->ip);
+  Debug(CLUSTER_DEBUG_TAG, "my_machine_id: %d", my_machine_id);
+
+  init_session_stat(&server_session_records, "proxy.process.cluster.server_session");
+  init_session_stat(&client_session_records, "proxy.process.cluster.client_session");
+
+  return 0;
+}
+
+/**
+ * Create a session in the local machine's session table.
+ *
+ * Makes up to 128 attempts. Each attempt takes the next sequence number,
+ * maps it onto a table slot and claims the slot when its session id is
+ * empty.
+ *
+ * @param session  output, receives the new session id on success
+ * @param machine  peer machine, used to select a socket context
+ * @param arg      stored as the session's user_data
+ * @param events   stored as the session's response_events
+ * @return 0 on success, ENOENT when get_socket_context() returns NULL,
+ *         ENOSPC when every attempt hit an occupied slot
+ */
+int cluster_create_session(ClusterSession *session,
+    const ClusterMachine *machine, void *arg, const int events)
+{
+  MachineSessions *pMachineSessions;
+  SessionEntry *pSessionEntry;
+  SocketContext *pSockContext;
+  int i;
+  int session_index;
+  int version;
+  SequenceType seq;
+
+  pMachineSessions = all_sessions + my_machine_id;
+
+  ink_atomic_increment(&pMachineSessions->session_stat.create_total_count, 1);
+
+  if ((pSockContext=get_socket_context(machine)) == NULL) {
+    return ENOENT;
+  }
+  version = pSockContext->version;
+
+  for (i=0; i<128; i++) {
+    seq = ink_atomic_increment(&pMachineSessions->current_seq, 1);
+    session_index = seq % max_session_count_per_machine;
+    pSessionEntry = pMachineSessions->sessions + session_index;
+    // unlocked peek first, then re-check under the slot lock before claiming
+    if (IS_SESSION_EMPTY(pSessionEntry->session_id)) {
+      SESSION_LOCK(pMachineSessions, session_index);
+      if (IS_SESSION_EMPTY(pSessionEntry->session_id)) {
+        pSessionEntry->session_id.fields.ip = my_machine_ip;
+        pSessionEntry->session_id.fields.timestamp = CURRENT_TIME();
+        pSessionEntry->session_id.fields.seq = seq;
+        pSessionEntry->sock_context = pSockContext;
+        pSessionEntry->user_data = arg;
+        pSessionEntry->response_events = events;
+        pSessionEntry->current_msg_seq = 0;
+        pSessionEntry->version = version;
+
+        *session = pSessionEntry->session_id;
+
+#ifdef TRIGGER_STAT_FLAG
+        if (pSessionEntry->response_events & RESPONSE_EVENT_NOTIFY_DEALER) {
+          pSessionEntry->stat_start_time = CURRENT_NS();
+        }
+#endif
+        SESSION_UNLOCK(pMachineSessions, session_index);
+
+        ink_atomic_increment(&pMachineSessions->session_stat.
+            create_success_count, 1);
+        // i is zero based, so i + 1 slots were tried
+        ink_atomic_increment(&pMachineSessions->session_stat.
+            create_retry_times, i + 1);
+        return 0;
+      }
+      SESSION_UNLOCK(pMachineSessions, session_index);
+    }
+  }
+
+  // all attempts failed; i equals the number of attempts here
+  ink_atomic_increment(&pMachineSessions->session_stat.
+      create_retry_times, i);
+
+  return ENOSPC;
+}
+
+// Resolve ip to its slot in all_sessions.
+// On success machine_id holds the slot index and pMachineSessions points
+// to the slot. When the ip is unknown or its slot is not initialized, a
+// debug line is written and the ENCLOSING FUNCTION returns return_value.
+// Note: ip is evaluated more than once.
+#define GET_MACHINE_INDEX(machine_id, ip, pMachineSessions, return_value) \
+  do { \
+    if ((machine_id=get_session_machine_index(ip)) < 0) { \
+      Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, " \
+          "ip: %u not exist!", __LINE__, ip); \
+      return return_value; \
+    } \
+    pMachineSessions = all_sessions + machine_id; \
+    if (!(pMachineSessions)->init_done) { \
+      Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, " \
+          "ip: %u not init!", __LINE__, ip); \
+      return return_value; \
+    } \
+  } while (0)
+
+
+/**
+ * Search a slot's entry chain for the entry with the given session id.
+ *
+ * @param session_id  id to look for
+ * @param pSession    head entry of the chain; it is always inspected,
+ *                    so it must not be NULL
+ * @return the matching entry, or NULL when the chain has no such id
+ */
+inline static SessionEntry *get_session(
+    const ClusterSession *session_id, SessionEntry *pSession)
+{
+  SessionEntry *pEntry = pSession;
+
+  for (;;) {
+    if (IS_SESSION_EQUAL(pEntry->session_id, *session_id)) {
+      return pEntry;
+    }
+
+    pEntry = pEntry->next;
+    if (pEntry == NULL) {
+      return NULL;
+    }
+  }
+}
+
+/**
+ * Attach user data to an existing session.
+ *
+ * @param session  id of the session to update
+ * @param arg      new user_data value
+ * @return 0 when the session was found and updated, ENOENT when the
+ *         machine slot or the session does not exist
+ */
+int cluster_bind_session(ClusterSession session, void *arg)
+{
+  MachineSessions *pMachineSessions;
+  SessionEntry *pFound;
+  int machine_id;
+  int slot;
+  int result = ENOENT;
+
+  GET_MACHINE_INDEX(machine_id, session.fields.ip, pMachineSessions, ENOENT);
+
+  slot = session.fields.seq % max_session_count_per_machine;
+  SESSION_LOCK(pMachineSessions, slot);
+  pFound = get_session(&session, pMachineSessions->sessions + slot);
+  if (pFound != NULL) {
+    pFound->user_data = arg;
+    result = 0;
+  }
+  SESSION_UNLOCK(pMachineSessions, slot);
+
+  return result;
+}
+
+/**
+ * Set the response events of a session.
+ *
+ * When events contains RESPONSE_EVENT_NOTIFY_DEALER and a message is
+ * already queued on the session, one message is dequeued and handed to
+ * cluster_msg_deal_func() after the slot lock has been released; in that
+ * case response_events is left untouched. Otherwise events is stored in
+ * response_events.
+ *
+ * @param session  id of the session
+ * @param events   RESPONSE_EVENT_* flags
+ * @return 0 on success, ENOENT when the machine slot or session is
+ *         missing or the session has no socket context
+ */
+int cluster_set_events(ClusterSession session, const int events)
+{
+  SessionEntry *pSessionEntry;
+  MachineSessions *pMachineSessions;
+  SocketContext *pSockContext;
+  InMessage *pMessage;
+  void *user_data;
+  int result;
+  int machine_id;
+  int session_index;
+
+  GET_MACHINE_INDEX(machine_id, session.fields.ip, pMachineSessions, ENOENT);
+
+  session_index = session.fields.seq % max_session_count_per_machine;
+  pSessionEntry = pMachineSessions->sessions + session_index;
+  SESSION_LOCK(pMachineSessions, session_index);
+
+  if ((pSessionEntry=get_session(&session, pSessionEntry)) != NULL) {
+    pSockContext = pSessionEntry->sock_context;
+    if (pSockContext != NULL) {
+      if (events & RESPONSE_EVENT_NOTIFY_DEALER) {
+
+        //assert((pSessionEntry->response_events & RESPONSE_EVENT_NOTIFY_DEALER) == 0);
+
+#ifdef TRIGGER_STAT_FLAG
+        //for stat
+        if (pMachineSessions->is_myself) {  //client
+          pSessionEntry->stat_start_time = CURRENT_NS();
+        }
+        else { //server
+          if (pSessionEntry->stat_start_time != 0) {
+            ink_atomic_increment(&pMachineSessions->trigger_stat.count, 1);
+            ink_atomic_increment(&pMachineSessions->trigger_stat.time_used,
+                CURRENT_NS() - pSessionEntry->stat_start_time);
+            pSessionEntry->stat_start_time = 0;
+          }
+        }
+#endif
+
+        pMessage = pSessionEntry->messages;
+        if (pMessage == NULL) {
+          pSessionEntry->response_events = events;  //waiting for message to notify
+        }
+        else {
+          pSessionEntry->messages = pSessionEntry->messages->next; //consume one
+        }
+      }
+      else {
+        pMessage = NULL;
+        pSessionEntry->response_events = events;
+      }
+
+      user_data = pSessionEntry->user_data;
+      result = 0;
+    }
+    else {
+      pMessage = NULL;
+      user_data = NULL;
+      result = ENOENT;
+    }
+  }
+  else {
+    pSockContext = NULL;
+    pMessage = NULL;
+    user_data = NULL;
+    result = ENOENT;
+  }
+
+#ifdef TRIGGER_STAT_FLAG
+  // pMessage != NULL implies pSessionEntry is a valid entry here
+  if (pMessage != NULL) {
+    if (!pMachineSessions->is_myself) {  //server
+      pSessionEntry->stat_start_time = CURRENT_NS();
+    }
+  }
+#endif
+  SESSION_UNLOCK(pMachineSessions, session_index);
+
+  // deliver the dequeued message outside of the slot lock
+  if (pMessage != NULL) {
+    cluster_msg_deal_func(session, user_data, pMessage->func_id,
+        pMessage->blocks, pMessage->data_len);
+    release_in_message(pSockContext, pMessage);
+  }
+
+  return result;
+}
+
+/**
+ * Close a session and release everything queued on it.
+ *
+ * Pending messages are released, the entry is cleared and unlinked from
+ * its slot chain. The head entry lives inside the table array, so when
+ * the head is removed and has a successor, the successor is copied into
+ * the head and the successor's node is freed instead.
+ *
+ * @param session  id of the session to close
+ * @return the user_data that was bound to the session, or NULL when the
+ *         machine slot or the session was not found
+ */
+void *cluster_close_session(ClusterSession session)
+{
+  void *old_data;
+  SessionEntry *previous;
+  SessionEntry *pSessionEntry;
+  MachineSessions *pMachineSessions;
+  InMessage *pMessage;
+  int machine_id;
+  int session_index;
+
+  GET_MACHINE_INDEX(machine_id, session.fields.ip, pMachineSessions, NULL);
+
+  ink_atomic_increment(&pMachineSessions->session_stat.close_total_count, 1);
+
+  session_index = session.fields.seq % max_session_count_per_machine;
+  pSessionEntry = pMachineSessions->sessions + session_index;
+  SESSION_LOCK(pMachineSessions, session_index);
+
+  // walk the chain, remembering the predecessor for unlinking;
+  // only entries that still have a socket context can match
+  previous = NULL;
+  do {
+    if (pSessionEntry->sock_context != NULL && IS_SESSION_EQUAL(
+          session, pSessionEntry->session_id))
+    {
+      break;
+    }
+
+    previous = pSessionEntry;
+    pSessionEntry = pSessionEntry->next;
+  } while (pSessionEntry != NULL);
+
+  if (pSessionEntry != NULL) {  //found
+    old_data = pSessionEntry->user_data;
+    // drop all messages that were never consumed
+    while (pSessionEntry->messages != NULL) {
+      pMessage = pSessionEntry->messages;
+      pSessionEntry->messages = pSessionEntry->messages->next;
+
+      release_in_message(pSessionEntry->sock_context, pMessage);
+    }
+    pSessionEntry->sock_context = NULL;
+    pSessionEntry->response_events = 0;
+    pSessionEntry->user_data = NULL;
+    CLEAR_SESSION(pSessionEntry->session_id);
+
+#ifdef TRIGGER_STAT_FLAG
+    if (pSessionEntry->stat_start_time != 0) {
+      ink_atomic_increment(&pMachineSessions->trigger_stat.count, 1);
+      ink_atomic_increment(&pMachineSessions->trigger_stat.time_used,
+          CURRENT_NS() - pSessionEntry->stat_start_time);
+      pSessionEntry->stat_start_time = 0;
+    }
+#endif
+
+    ink_atomic_increment(&pMachineSessions->session_stat.
+        close_success_count, 1);
+
+#ifdef MSG_TIME_STAT_FLAG
+    if (pMachineSessions->is_myself)
+    {//request by me
+      if (pSessionEntry->client_start_time != 0) {
+        ink_atomic_increment(&pMachineSessions->msg_stat.count, 1);
+        ink_atomic_increment(&pMachineSessions->msg_stat.time_used,
+            CURRENT_NS() - pSessionEntry->client_start_time);
+        pSessionEntry->client_start_time = 0;
+      }
+    }
+    else { //request by other
+      if (pSessionEntry->server_start_time != 0) {
+        ink_atomic_increment(&pMachineSessions->msg_stat.count, 1);
+        ink_atomic_increment(&pMachineSessions->msg_stat.time_used,
+            CURRENT_NS() - pSessionEntry->server_start_time);
+        pSessionEntry->server_start_time = 0;
+      }
+    }
+
+    if (pSessionEntry->send_start_time != 0) {
+      ink_atomic_increment(&pMachineSessions->msg_send.count, 1);
+      ink_atomic_increment(&pMachineSessions->msg_send.time_used,
+          CURRENT_NS() - pSessionEntry->send_start_time);
+      pSessionEntry->send_start_time = 0;
+    }
+#endif
+
+    if (previous == NULL) {  //remove the head session
+      // the head is part of the table array and cannot be freed:
+      // move the successor into it and free the successor's node
+      SessionEntry *pNextSession;
+      pNextSession = pSessionEntry->next;
+      if (pNextSession != NULL) {
+        memcpy(pSessionEntry, pNextSession, sizeof(SessionEntry));
+        session_allocator.free_void(pNextSession);
+      }
+    }
+    else {
+      previous->next = pSessionEntry->next;
+      session_allocator.free_void(pSessionEntry);
+    }
+  }
+  else {
+    old_data = NULL;
+  }
+  SESSION_UNLOCK(pMachineSessions, session_index);
+  return old_data;
+}
+
+/**
+ * Look up the session entry a message is about to be sent on.
+ *
+ * @param session            id of the session
+ * @param ppMachineSessions  output, the machine slot owning the session
+ * @param sessionEntry       output, the entry on success, NULL when the
+ *                           session is missing or still has unread messages
+ * @return 0 on success, ENOENT when the machine slot or session does not
+ *         exist, EBUSY when received messages have not been consumed yet
+ */
+int get_session_for_send(const SessionId *session,
+    MachineSessions **ppMachineSessions, SessionEntry **sessionEntry)
+{
+  SessionEntry *pFound;
+  int machine_id;
+  int slot;
+  int result;
+
+  GET_MACHINE_INDEX(machine_id, session->fields.ip, *ppMachineSessions, ENOENT);
+
+  slot = session->fields.seq % max_session_count_per_machine;
+  SESSION_LOCK(*ppMachineSessions, slot);
+
+  pFound = get_session(session, (*ppMachineSessions)->sessions + slot);
+  if (pFound == NULL) {
+    result = ENOENT;
+  }
+  else if (pFound->messages != NULL) {
+    //received messages must be consumed before sending
+    pFound = NULL;
+    result = EBUSY;
+  }
+  else {
+    result = 0;
+  }
+  *sessionEntry = pFound;
+
+  SESSION_UNLOCK(*ppMachineSessions, slot);
+  return result;
+}
+
+#ifdef MSG_TIME_STAT_FLAG
+/**
+ * Find the session entry a received message belongs to, without creating
+ * or modifying any entry. Only compiled with MSG_TIME_STAT_FLAG.
+ *
+ * @param pHeader            header of the received message
+ * @param ppMachineSessions  output, the machine slot owning the session
+ * @param sessionEntry       output, the entry or NULL
+ * @return 0 when an entry was found, or when the head slot is empty and
+ *         this is the first message (msg_seq == 1) of a session requested
+ *         by another machine; ENOENT when there is no such session;
+ *         EEXIST when the head slot is occupied by a different session
+ */
+int get_response_session_internal(const MsgHeader *pHeader,
+    MachineSessions **ppMachineSessions, SessionEntry **sessionEntry)
+{
+  SessionEntry *pSession;
+  SessionEntry *pCurrent;
+  int result;
+  int machine_id;
+  int session_index;
+
+  GET_MACHINE_INDEX(machine_id, pHeader->session_id.fields.ip,
+      *ppMachineSessions, ENOENT);
+
+  session_index = pHeader->session_id.fields.seq % max_session_count_per_machine;
+  pSession = (*ppMachineSessions)->sessions + session_index;
+  SESSION_LOCK(*ppMachineSessions, session_index);
+  // search the whole chain of this slot for the session id
+  pCurrent = pSession;
+  do {
+    if (IS_SESSION_EQUAL(pCurrent->session_id, pHeader->session_id)) {
+      *sessionEntry = pCurrent;
+      result = 0;
+      break;
+    }
+
+    pCurrent = pCurrent->next;
+  } while (pCurrent != NULL);
+
+  if (pCurrent == NULL) {  // not found in the chain
+    if ((*ppMachineSessions)->is_myself) { //request by me
+      *sessionEntry = NULL;
+      result = ENOENT;
+    }
+    else {
+      if (IS_SESSION_EMPTY(pSession->session_id)) {
+        if (pHeader->msg_seq == 1) {  //first time, should create
+          *sessionEntry = pSession;
+          result = 0;
+        }
+        else {
+          *sessionEntry = NULL;
+          result = ENOENT;
+        }
+      }
+      else {
+        *sessionEntry = NULL;
+        result = EEXIST;
+      }
+    }
+  }
+
+  SESSION_UNLOCK(*ppMachineSessions, session_index);
+  return result;
+}
+#endif
+
+/**
+ * Find, or for a remote request create, the session entry that a
+ * received message belongs to.
+ *
+ * For sessions created by this machine the entry must already exist.
+ * For sessions created by another machine the entry is created when the
+ * first message (msg_seq <= 1) arrives: the head slot is used when it is
+ * empty, otherwise a new node is appended to the slot's chain.
+ *
+ * @param pHeader            header of the received message
+ * @param ppMachineSessions  output, the machine slot owning the session
+ * @param sessionEntry       output, the entry or NULL
+ * @param pSocketContext     socket the message arrived on; stored in a
+ *                           newly created entry
+ * @param call_func          output, true when the dealer should be called
+ *                           directly instead of queuing the message
+ * @param user_data          output, user data bound to the session
+ * @return 0 on success, ENOENT when the session does not exist,
+ *         EEXIST when a local session's slot is held by another session
+ */
+int get_response_session(const MsgHeader *pHeader,
+    MachineSessions **ppMachineSessions, SessionEntry **sessionEntry,
+    SocketContext *pSocketContext, bool *call_func, void **user_data)
+{
+  SessionEntry *pSession;
+  SessionEntry *pTail;
+  SessionEntry *pCurrent;
+  int result;
+  int machine_id;
+  int session_index;
+  int chain_count;
+
+  GET_MACHINE_INDEX(machine_id, pHeader->session_id.fields.ip,
+      *ppMachineSessions, ENOENT);
+
+  session_index = pHeader->session_id.fields.seq % max_session_count_per_machine;
+  pSession = (*ppMachineSessions)->sessions + session_index;
+  SESSION_LOCK(*ppMachineSessions, session_index);
+  // do { ... } while (0): lets every outcome break to the common unlock path
+  do {
+    pCurrent = pSession;
+    do {
+      if (IS_SESSION_EQUAL(pCurrent->session_id, pHeader->session_id)) {
+        *sessionEntry = pCurrent;
+        *user_data = pCurrent->user_data;
+        result = 0;
+
+        // a pending notify request is consumed by this message
+        if (pCurrent->response_events & RESPONSE_EVENT_NOTIFY_DEALER) {
+          pCurrent->response_events = 0;
+          *call_func = true;
+        }
+        else {
+          *call_func = false;
+        }
+
+        break;
+      }
+
+      pCurrent = pCurrent->next;
+    } while (pCurrent != NULL);
+
+    if (pCurrent != NULL) {  //found
+      pSession = pCurrent;
+      break;
+    }
+
+    if ((*ppMachineSessions)->is_myself) { //request by me
+      if (IS_SESSION_EMPTY(pSession->session_id)) {
+        Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+            "client sessionEntry: %16lX:%lX not exist, func_id: %d",
+            __LINE__, pHeader->session_id.ids[0],
+            pHeader->session_id.ids[1], pHeader->func_id);
+        *sessionEntry = NULL;
+        *call_func = false;
+        *user_data = NULL;
+        result = ENOENT;
+
+        ink_atomic_increment(&(*ppMachineSessions)->session_stat.
+            session_miss_count, 1);
+        break;
+      }
+      // head slot holds a different session: falls through to the
+      // "position occupied" handling at the end of the outer block
+    }
+    else {  //request by other
+      if (pHeader->msg_seq > 1) {   //should discard the message
+        *sessionEntry = NULL;
+        *user_data = NULL;
+        *call_func = false;
+        result = ENOENT;
+
+        Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+            "server sessionEntry: %08X:%u:%"PRId64" not exist, msg seq: %u, "
+            "func_id: %d, data_len: %d",
+            __LINE__, pHeader->session_id.fields.ip,
+            pHeader->session_id.fields.timestamp,
+            pHeader->session_id.ids[1], pHeader->msg_seq,
+            pHeader->func_id, pHeader->data_len);
+
+        ink_atomic_increment(&(*ppMachineSessions)->session_stat.
+            session_miss_count, 1);
+        break;
+      }
+
+      if (IS_SESSION_EMPTY(pSession->session_id)) {
+        // head slot is free: reuse it in place, nothing to link
+        pTail = NULL;
+        chain_count = 0;
+      }
+      else {
+        // head slot is taken: find the chain tail and count the entries
+        chain_count = 1;
+        pTail = pSession;
+        if (pSession->next != NULL) {
+          ++chain_count;
+          pTail = pSession->next;
+          pSession = pTail->next;
+          while (pSession != NULL) {
+            pTail = pSession;
+            pSession = pSession->next;
+            ++chain_count;
+          }
+        }
+
+        // NOTE(review): the result of alloc_void() is used without a NULL check
+        pSession = (SessionEntry *)session_allocator.alloc_void();
+        pSession->messages = NULL;
+        pSession->user_data = NULL;
+        pSession->next = NULL;
+
+#ifdef TRIGGER_STAT_FLAG
+        pSession->stat_start_time = 0;
+#endif
+#ifdef MSG_TIME_STAT_FLAG
+        pSession->client_start_time = 0;
+        pSession->server_start_time = 0;
+        pSession->send_start_time = 0;
+#endif
+      }
+
+      //first time, should create
+      pSession->session_id = pHeader->session_id;  //set sessionEntry id
+      pSession->sock_context = pSocketContext;
+      pSession->version = pSocketContext->version;
+      pSession->response_events = 0;
+      pSession->current_msg_seq = 0;
+      if (pTail != NULL) {
+        pTail->next = pSession;
+
+        Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+            "sessionEntry: %08X:%u:%"PRId64", chain count: %d",
+            __LINE__, pHeader->session_id.fields.ip,
+            pHeader->session_id.fields.timestamp,
+            pHeader->session_id.ids[1], chain_count + 1);
+      }
+
+      *sessionEntry = pSession;
+      *user_data = NULL;
+      *call_func = true;
+      result = 0;
+
+      ink_atomic_increment(&(*ppMachineSessions)->session_stat.
+          create_total_count, 1);
+      break;
+    }
+
+    // only reached for a local session whose head slot holds another session
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "sessionEntry: %08X:%u:%"PRId64", position occupied by %08X:%u:%"PRId64", "
+        "quest by me: %d, time distance: %u, func_id: %d",
+        __LINE__, pHeader->session_id.fields.ip,
+        pHeader->session_id.fields.timestamp, pHeader->session_id.ids[1],
+        pSession->session_id.fields.ip, pSession->session_id.fields.timestamp,
+        pSession->session_id.ids[1], machine_id == my_machine_id,
+        pHeader->session_id.fields.timestamp -
+        pSession->session_id.fields.timestamp, pHeader->func_id);
+    *sessionEntry = NULL;
+    *user_data = NULL;
+    *call_func = false;
+    result = EEXIST;
+
+    ink_atomic_increment(&(*ppMachineSessions)->session_stat.
+        session_occupied_count, 1);
+  } while (0);
+
+#ifdef TRIGGER_STAT_FLAG
+  // *call_func is only true on paths where pSession is the found/created entry
+  if (*call_func) {
+    //stat
+    if ((*ppMachineSessions)->is_myself) { //request by me
+      if (pSession->stat_start_time != 0) {
+        ink_atomic_increment(&(*ppMachineSessions)->trigger_stat.count, 1);
+        ink_atomic_increment(&(*ppMachineSessions)->trigger_stat.time_used,
+            CURRENT_NS() - pSession->stat_start_time);
+        pSession->stat_start_time = 0;
+      }
+    }
+    else {
+      pSession->stat_start_time = CURRENT_NS();
+    }
+  }
+#endif
+
+  SESSION_UNLOCK(*ppMachineSessions, session_index);
+  return result;
+}
+
+/**
+ * Notify every session of one machine slot that uses the given socket
+ * context that its connection has been closed.
+ *
+ * A session that is waiting for a notification and has no queued message
+ * gets cluster_msg_deal_func() called directly with
+ * FUNC_ID_CONNECTION_CLOSED_NOTIFY; otherwise the notification is queued
+ * through push_in_message().
+ *
+ * @param src_machine_id  index of the machine slot in all_sessions
+ * @param pSockContext    the closed socket context
+ * @return number of sessions that were notified
+ */
+static int do_notify_connection_closed(const int src_machine_id,
+    SocketContext *pSockContext)
+{
+  int count;
+  int session_index;
+  SessionEntry *pcurrent;
+  SessionEntry *pSessionEntry;
+  SessionEntry *pSessionEnd;
+  void *user_data;
+  bool call_func;
+  SessionId session_id;
+
+  count = 0;
+  pSessionEnd = all_sessions[src_machine_id].sessions +
+    max_session_count_per_machine;
+  for (pSessionEntry=all_sessions[src_machine_id].sessions;
+      pSessionEntry<pSessionEnd; pSessionEntry++)
+  {
+    // NOTE(review): the chain is walked and sock_context is compared
+    // without holding the slot lock; only the snapshot below is locked
+    pcurrent = pSessionEntry;
+    do {
+      if (pcurrent->sock_context == pSockContext) {
+        session_index = pSessionEntry - all_sessions[src_machine_id].sessions;
+        SESSION_LOCK(all_sessions + src_machine_id, session_index);
+        call_func = (pcurrent->response_events &
+            RESPONSE_EVENT_NOTIFY_DEALER) && (pcurrent->messages == NULL);
+        session_id = pcurrent->session_id;
+        user_data = pcurrent->user_data;
+        SESSION_UNLOCK(all_sessions + src_machine_id, session_index);
+
+        if (call_func) {
+          cluster_msg_deal_func(session_id, user_data,
+              FUNC_ID_CONNECTION_CLOSED_NOTIFY, NULL, 0);
+        }
+        else {
+          // return value of push_in_message() is ignored here
+          push_in_message(session_id, all_sessions + src_machine_id,
+              pcurrent, FUNC_ID_CONNECTION_CLOSED_NOTIFY, NULL, 0);
+        }
+
+        count++;
+      }
+
+      pcurrent = pcurrent->next;
+    } while (pcurrent != NULL);
+  }
+
+  return count;
+}
+
+/**
+ * Tell all sessions bound to a closed socket context that the connection
+ * is gone.
+ *
+ * Sessions in the local machine's table are handled first, then those in
+ * the table of the peer machine the socket belongs to (when that table
+ * exists and is initialized).
+ *
+ * @param pSockContext  the closed socket context
+ * @return total number of sessions that were notified
+ */
+int notify_connection_closed(SocketContext *pSockContext)
+{
+  int local_count;
+  int peer_count;
+  int peer_id;
+
+  local_count = do_notify_connection_closed(my_machine_id, pSockContext);
+  if (local_count > 0) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "notify my session close count: %d", __LINE__, local_count);
+  }
+
+  peer_id = get_session_machine_index(pSockContext->machine->ip);
+  if (peer_id < 0 || !all_sessions[peer_id].init_done) {
+    //no session table for the peer, nothing more to notify
+    return local_count;
+  }
+
+  peer_count = do_notify_connection_closed(peer_id, pSockContext);
+  if (peer_count > 0) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "notify %s session close count: %d", __LINE__,
+        pSockContext->machine->hostname, peer_count);
+  }
+
+  return local_count + peer_count;
+}
+
+/**
+ * Append an incoming message to a session's queue and, when the session
+ * is waiting for a notification, deliver the oldest queued message.
+ *
+ * @param session           id the entry is expected to hold
+ * @param pMachineSessions  machine slot owning the entry
+ * @param pSessionEntry     entry to queue the message on
+ * @param func_id           function id of the message
+ * @param blocks            message payload, may be NULL
+ * @param data_len          payload length in bytes
+ * @return 0 on success, ENOENT when the entry has no socket context or
+ *         holds a different session id, or an errno value when the
+ *         message node cannot be allocated
+ */
+int push_in_message(const SessionId session,
+    MachineSessions *pMachineSessions, SessionEntry *pSessionEntry,
+    const int func_id, IOBufferBlock *blocks, const int data_len)
+{
+  SocketContext *pSockContext;
+  InMessage *pMessage;
+  void *user_data;
+  int session_index;
+  bool call_func;
+
+  session_index = session.fields.seq % max_session_count_per_machine;
+  SESSION_LOCK(pMachineSessions, session_index);
+  // the entry may have been closed or reused since the caller looked it up
+  pSockContext = pSessionEntry->sock_context;
+  if (!(pSockContext != NULL && IS_SESSION_EQUAL(pSessionEntry->session_id,
+          session)))
+  {
+    SESSION_UNLOCK(pMachineSessions, session_index);
+    return ENOENT;
+  }
+
+#ifdef USE_MULTI_ALLOCATOR
+  pMessage = (InMessage *)pSockContext->in_msg_allocator->alloc_void();
+#else
+  pMessage = (InMessage *)in_message_allocator.alloc_void();
+#endif
+
+  if (pMessage == NULL) {
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, (int)sizeof(InMessage), errno, strerror(errno));
+    SESSION_UNLOCK(pMachineSessions, session_index);
+    return errno != 0 ? errno : ENOMEM;
+  }
+
+  // the node comes from a raw allocator, so blocks holds garbage until cleared
+  pMessage->blocks.m_ptr = NULL;  //must set to NULL before set value
+  pMessage->func_id = func_id;
+  pMessage->blocks = blocks;
+  pMessage->data_len = data_len;
+  pMessage->next = NULL;
+
+  // append at the tail of the session's message list
+  if (pSessionEntry->messages == NULL) {
+    pSessionEntry->messages = pMessage;
+  }
+  else if (pSessionEntry->messages->next == NULL) {
+    pSessionEntry->messages->next = pMessage;
+  }
+  else {
+    InMessage *pTail;
+    pTail = pSessionEntry->messages->next;
+    while (pTail->next != NULL) {
+      pTail = pTail->next;
+    }
+    pTail->next = pMessage;
+  }
+
+  //check if notify dealer
+  if (pSessionEntry->response_events & RESPONSE_EVENT_NOTIFY_DEALER) {
+    pSessionEntry->response_events = 0;
+    // deliver the head of the queue, which is not necessarily the new message
+    pMessage = pSessionEntry->messages;
+    pSessionEntry->messages = pSessionEntry->messages->next; //consume one
+    user_data = pSessionEntry->user_data;
+    call_func = true;
+  }
+  else {
+    user_data = NULL;
+    call_func = false;
+  }
+
+#ifdef TRIGGER_STAT_FLAG
+  if (call_func) {
+    if (!pMachineSessions->is_myself) {  //server
+      pSessionEntry->stat_start_time = CURRENT_NS();
+    }
+  }
+#endif
+  SESSION_UNLOCK(pMachineSessions, session_index);
+
+  // NOTE(review): enqueue counters use plain increments while the dequeue
+  // counters in release_in_message() are atomic -- confirm single-threaded use
+  pSockContext->thread_context->stats.enqueue_in_msg_count++;
+  pSockContext->thread_context->stats.enqueue_in_msg_bytes +=
+    MSG_HEADER_LENGTH + data_len;
+
+  // the dealer is invoked outside of the slot lock
+  if (call_func) {
+    cluster_msg_deal_func(session, user_data, pMessage->func_id,
+        pMessage->blocks, pMessage->data_len);
+
+    release_in_message(pSockContext, pMessage);
+  }
+
+  return 0;
+}
+
+/**
+ * Refresh one counter: when the live value differs from the cached copy,
+ * store it into the cache and write it to the stat record's data slot.
+ */
+static void sync_session_counter(const volatile int64_t *pLive,
+    volatile int64_t *pCached, RecData *pRecData)
+{
+  if (*pLive == *pCached) {
+    return;
+  }
+
+  *pCached = *pLive;
+  RecDataSetFromInk64(RECD_INT, pRecData, *pLive);
+}
+
+/**
+ * Publish every SessionStat counter that changed since the previous call.
+ *
+ * @param pSessionRecords  registered stat records receiving the new values
+ * @param pNewtat          freshly collected counters
+ * @param pOldStat         counters published last time; updated in place for
+ *                         each counter that is written
+ */
+static void set_session_stat(SessionRecords *pSessionRecords,
+    const SessionStat *pNewtat, SessionStat *pOldStat)
+{
+  sync_session_counter(&pNewtat->create_total_count,
+      &pOldStat->create_total_count,
+      &pSessionRecords->create_total_count->data);
+  sync_session_counter(&pNewtat->create_success_count,
+      &pOldStat->create_success_count,
+      &pSessionRecords->create_success_count->data);
+  sync_session_counter(&pNewtat->create_retry_times,
+      &pOldStat->create_retry_times,
+      &pSessionRecords->create_retry_times->data);
+  sync_session_counter(&pNewtat->close_total_count,
+      &pOldStat->close_total_count,
+      &pSessionRecords->close_total_count->data);
+  sync_session_counter(&pNewtat->close_success_count,
+      &pOldStat->close_success_count,
+      &pSessionRecords->close_success_count->data);
+  sync_session_counter(&pNewtat->session_miss_count,
+      &pOldStat->session_miss_count,
+      &pSessionRecords->session_miss_count->data);
+  sync_session_counter(&pNewtat->session_occupied_count,
+      &pOldStat->session_occupied_count,
+      &pSessionRecords->session_occupied_count->data);
+}
+
+/**
+ * Register the seven session statistics records under the given name
+ * prefix and store the handles returned by RecRegisterStat.
+ *
+ * Every record is named "<prefix>.<counter>", registered as RECT_PROCESS /
+ * RECD_INT / RECP_NON_PERSISTENT, and starts from a zeroed default value.
+ *
+ * @param pSessionRecords  receives one handle per counter
+ * @param prefix           record name prefix; a name that does not fit into
+ *                         the local buffer is truncated rather than written
+ *                         past its end
+ */
+static void init_session_stat(SessionRecords *pSessionRecords, const char *prefix)
+{
+  char name[256];
+  RecData data_default;
+  memset(&data_default, 0, sizeof(RecData));
+
+  //snprintf bounds every write to the size of name
+  snprintf(name, sizeof(name), "%s.create_total_count", prefix);
+  pSessionRecords->create_total_count = RecRegisterStat(RECT_PROCESS,
+      name, RECD_INT, data_default, RECP_NON_PERSISTENT);
+
+  snprintf(name, sizeof(name), "%s.create_success_count", prefix);
+  pSessionRecords->create_success_count = RecRegisterStat(RECT_PROCESS,
+      name, RECD_INT, data_default, RECP_NON_PERSISTENT);
+
+  snprintf(name, sizeof(name), "%s.create_retry_times", prefix);
+  pSessionRecords->create_retry_times = RecRegisterStat(RECT_PROCESS,
+      name, RECD_INT, data_default, RECP_NON_PERSISTENT);
+
+  snprintf(name, sizeof(name), "%s.close_total_count", prefix);
+  pSessionRecords->close_total_count = RecRegisterStat(RECT_PROCESS,
+      name, RECD_INT, data_default, RECP_NON_PERSISTENT);
+
+  snprintf(name, sizeof(name), "%s.close_success_count", prefix);
+  pSessionRecords->close_success_count = RecRegisterStat(RECT_PROCESS,
+      name, RECD_INT, data_default, RECP_NON_PERSISTENT);
+
+  //the record names for the last two counters omit the "session_" part
+  //of the field names
+  snprintf(name, sizeof(name), "%s.miss_count", prefix);
+  pSessionRecords->session_miss_count = RecRegisterStat(RECT_PROCESS,
+      name, RECD_INT, data_default, RECP_NON_PERSISTENT);
+
+  snprintf(name, sizeof(name), "%s.occupied_count", prefix);
+  pSessionRecords->session_occupied_count = RecRegisterStat(RECT_PROCESS,
+      name, RECD_INT, data_default, RECP_NON_PERSISTENT);
+}
+
+
+/**
+ * Collect the session counters and publish them to the stat records.
+ *
+ * Server side: the counters of every machine that has a session index, is
+ * not marked dead and is not the local machine are summed. The per-machine
+ * create_success_count and create_retry_times are not read; both totals are
+ * set equal to the summed create_total_count.
+ *
+ * Client side: the counters stored under the local machine index are
+ * published as they are.
+ *
+ * The previously published values are kept in function-local statics so
+ * that set_session_stat only writes counters that changed.
+ */
+void log_session_stat()
+{
+  static SessionStat prevServerStat = {0, 0, 0, 0, 0, 0, 0};
+  static SessionStat prevClientStat = {0, 0, 0, 0, 0, 0, 0};
+  SessionStat serverTotal = {0, 0, 0, 0, 0, 0, 0};
+  ClusterMachine *pPeer;
+  ClusterMachine *pPeerEnd = cluster_machines + cluster_machine_count;
+  MachineSessions *pPeerSessions;
+  MachineSessions *pLocalSessions;
+  int peer_id;
+
+  for (pPeer = cluster_machines; pPeer < pPeerEnd; pPeer++) {
+    peer_id = get_session_machine_index(pPeer->ip);
+    if (peer_id < 0 || pPeer->dead || peer_id == my_machine_id) {
+      continue;
+    }
+
+    pPeerSessions = all_sessions + peer_id;
+    serverTotal.create_total_count +=
+      pPeerSessions->session_stat.create_total_count;
+    serverTotal.close_total_count +=
+      pPeerSessions->session_stat.close_total_count;
+    serverTotal.close_success_count +=
+      pPeerSessions->session_stat.close_success_count;
+    serverTotal.session_miss_count +=
+      pPeerSessions->session_stat.session_miss_count;
+    serverTotal.session_occupied_count +=
+      pPeerSessions->session_stat.session_occupied_count;
+  }
+
+  serverTotal.create_success_count = serverTotal.create_total_count;
+  serverTotal.create_retry_times = serverTotal.create_total_count;
+
+  pLocalSessions = all_sessions + my_machine_id;
+
+  set_session_stat(&server_session_records, &serverTotal, &prevServerStat);
+  set_session_stat(&client_session_records,
+      (const SessionStat *)&pLocalSessions->session_stat, &prevClientStat);
+}
+
+#ifdef TRIGGER_STAT_FLAG
+/**
+ * Log the trigger statistics and reset them.
+ *
+ * One line is written for every machine that has a session index, is not
+ * marked dead and is not the local machine, then one "SERVER" line with the
+ * totals over those machines, then one "CLIENT" line for the counters kept
+ * under the local machine index. Each average is time_used / count (0 when
+ * count is 0) and is divided by 1000 before being printed. Every counter
+ * that was logged is reset to 0.
+ */
+void log_trigger_stat()
+{
+  ClusterMachine *pPeer;
+  ClusterMachine *pPeerEnd;
+  MachineSessions *pPeerSessions;
+  MachineSessions *pLocalSessions;
+  MsgTimeUsed serverTotal = {0, 0};
+  int peer_id;
+  int avg_server;
+  int avg_client;
+
+  pPeerEnd = cluster_machines + cluster_machine_count;
+  for (pPeer = cluster_machines; pPeer < pPeerEnd; pPeer++) {
+    peer_id = get_session_machine_index(pPeer->ip);
+    if (peer_id < 0 || pPeer->dead || peer_id == my_machine_id) {
+      continue;
+    }
+
+    pPeerSessions = all_sessions + peer_id;
+
+    serverTotal.count += pPeerSessions->trigger_stat.count;
+    serverTotal.time_used += pPeerSessions->trigger_stat.time_used;
+    avg_server = (pPeerSessions->trigger_stat.count > 0) ?
+      pPeerSessions->trigger_stat.time_used /
+      pPeerSessions->trigger_stat.count : 0;
+
+    Note("%s:%d trigger msg => %"PRId64", avg time used => %d us",
+        pPeer->hostname, pPeer->cluster_port,
+        pPeerSessions->trigger_stat.count,
+        avg_server / 1000);
+
+    pPeerSessions->trigger_stat.count = 0;
+    pPeerSessions->trigger_stat.time_used = 0;
+  }
+
+  avg_server = (serverTotal.count > 0) ?
+    serverTotal.time_used / serverTotal.count : 0;
+  Note("SERVER: trigger msg => %"PRId64", avg time used => %d us",
+      serverTotal.count, avg_server / 1000);
+
+  pLocalSessions = all_sessions + my_machine_id;
+  avg_client = (pLocalSessions->trigger_stat.count > 0) ?
+    pLocalSessions->trigger_stat.time_used /
+    pLocalSessions->trigger_stat.count : 0;
+  Note("CLIENT: trigger msg => %"PRId64", avg time used => %d us\n",
+      pLocalSessions->trigger_stat.count, avg_client / 1000);
+
+  pLocalSessions->trigger_stat.count = 0;
+  pLocalSessions->trigger_stat.time_used = 0;
+}
+#endif
+
+#ifdef MSG_TIME_STAT_FLAG
+/**
+ * Log the message timing statistics and reset them.
+ *
+ * For every machine that has a session index, is not marked dead and is
+ * not the local machine, one line reports the msg_stat and msg_send
+ * counters. A "SERVER" line then reports the totals over those machines
+ * and a "CLIENT" line reports the counters kept under the local machine
+ * index. Each average is time_used / count (0 when count is 0) and is
+ * divided by 1000 before being printed. Every counter that was logged is
+ * reset to 0.
+ */
+void log_msg_time_stat()
+{
+  ClusterMachine *pPeer;
+  ClusterMachine *pPeerEnd;
+  MachineSessions *pPeerSessions;
+  MachineSessions *pLocalSessions;
+  MsgTimeUsed serverTotal = {0, 0};
+  MsgTimeUsed sendTotal = {0, 0};
+  int peer_id;
+  int avg_server;
+  int avg_client;
+  int avg_send;
+
+  pPeerEnd = cluster_machines + cluster_machine_count;
+  for (pPeer = cluster_machines; pPeer < pPeerEnd; pPeer++) {
+    peer_id = get_session_machine_index(pPeer->ip);
+    if (peer_id < 0 || pPeer->dead || peer_id == my_machine_id) {
+      continue;
+    }
+
+    pPeerSessions = all_sessions + peer_id;
+    serverTotal.count += pPeerSessions->msg_stat.count;
+    serverTotal.time_used += pPeerSessions->msg_stat.time_used;
+    avg_server = (pPeerSessions->msg_stat.count > 0) ?
+      pPeerSessions->msg_stat.time_used /
+      pPeerSessions->msg_stat.count : 0;
+
+    sendTotal.count += pPeerSessions->msg_send.count;
+    sendTotal.time_used += pPeerSessions->msg_send.time_used;
+    avg_send = (pPeerSessions->msg_send.count > 0) ?
+      pPeerSessions->msg_send.time_used /
+      pPeerSessions->msg_send.count : 0;
+
+    Note("%s:%d msg count: %"PRId64", avg time used (recv start to send done): %d us, "
+        "send msg count: %"PRId64", send avg time: %d us",
+        pPeer->hostname, pPeer->cluster_port,
+        pPeerSessions->msg_stat.count, avg_server / 1000,
+        pPeerSessions->msg_send.count, avg_send / 1000);
+
+    pPeerSessions->msg_stat.count = 0;
+    pPeerSessions->msg_stat.time_used = 0;
+    pPeerSessions->msg_send.count = 0;
+    pPeerSessions->msg_send.time_used = 0;
+  }
+
+  avg_server = (serverTotal.count > 0) ?
+    serverTotal.time_used / serverTotal.count : 0;
+  avg_send = (sendTotal.count > 0) ?
+    sendTotal.time_used / sendTotal.count : 0;
+  Note("SERVER: msg count: %"PRId64", avg time used (recv start to send done): %d us, "
+      "send msg count: %"PRId64", send avg time: %d us",
+      serverTotal.count, avg_server / 1000,
+      sendTotal.count, avg_send / 1000);
+
+  pLocalSessions = all_sessions + my_machine_id;
+  avg_client = (pLocalSessions->msg_stat.count > 0) ?
+    pLocalSessions->msg_stat.time_used /
+    pLocalSessions->msg_stat.count : 0;
+  avg_send = (pLocalSessions->msg_send.count > 0) ?
+    pLocalSessions->msg_send.time_used /
+    pLocalSessions->msg_send.count : 0;
+  Note("CLIENT: msg count: %"PRId64", avg time used (send start to recv done): %d us, "
+      "send msg count: %"PRId64", send avg time: %d us\n",
+      pLocalSessions->msg_stat.count, avg_client / 1000,
+      pLocalSessions->msg_send.count, avg_send / 1000);
+
+  pLocalSessions->msg_stat.count = 0;
+  pLocalSessions->msg_stat.time_used = 0;
+  pLocalSessions->msg_send.count = 0;
+  pLocalSessions->msg_send.time_used = 0;
+}
+#endif
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/session.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/session.h b/iocore/cluster/session.h
new file mode 100644
index 0000000..9dd2559
--- /dev/null
+++ b/iocore/cluster/session.h
@@ -0,0 +1,97 @@
+/** @file
+
+  A brief file description
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#ifndef _SESSION_H_
+#define _SESSION_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+#include "clusterinterface.h"
+
+//session bookkeeping kept for one machine; the stat loggers in session.cc
+//address these records as all_sessions + machine index
+typedef struct {
+  unsigned int ip;
+  bool init_done;
+  bool is_myself;   //myself, the local host
+  SessionEntry *sessions;
+  ink_mutex *locks;   //mutex array indexed by SESSION_LOCK / SESSION_UNLOCK
+  volatile SequenceType current_seq;
+  volatile SessionStat session_stat;  //counters published by log_session_stat
+
+#ifdef TRIGGER_STAT_FLAG
+  volatile MsgTimeUsed trigger_stat;  //logged and reset by log_trigger_stat
+#endif
+
+#ifdef MSG_TIME_STAT_FLAG
+  volatile MsgTimeUsed msg_stat;  //logged and reset by log_msg_time_stat
+  volatile MsgTimeUsed msg_send;  //logged and reset by log_msg_time_stat
+#endif
+
+} MachineSessions;
+
+//Acquire / release the mutex that guards a session slot. The mutex is
+//locks[session_index % session_lock_count_per_machine]. session_index is
+//parenthesized so that an expression argument such as (a + b) is reduced
+//modulo the lock count as a whole.
+#define SESSION_LOCK(pMachineSessions, session_index) \
+  ink_mutex_acquire((pMachineSessions)->locks + (session_index) % \
+      session_lock_count_per_machine)
+
+#define SESSION_UNLOCK(pMachineSessions, session_index) \
+  ink_mutex_release((pMachineSessions)->locks + (session_index) % \
+      session_lock_count_per_machine)
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int session_init();
+int init_machine_sessions(ClusterMachine *machine, const bool bMyself);
+
+int get_session_for_send(const SessionId *session,
+    MachineSessions **ppMachineSessions, SessionEntry **sessionEntry);
+int get_response_session(const MsgHeader *pHeader,
+    MachineSessions **ppMachineSessions, SessionEntry **sessionEntry,
+    SocketContext *pSocketContext, bool *call_func, void **user_data);
+
+int notify_connection_closed(SocketContext *pSockContext);
+
+int push_in_message(const SessionId session,
+    MachineSessions *pMachineSessions, SessionEntry *pSessionEntry,
+    const int func_id, IOBufferBlock *blocks, const int data_len);
+
+void log_session_stat();
+
+#ifdef TRIGGER_STAT_FLAG
+void log_trigger_stat();
+#endif
+
+#ifdef MSG_TIME_STAT_FLAG
+int get_response_session_internal(const MsgHeader *pHeader,
+    MachineSessions **ppMachineSessions, SessionEntry **sessionEntry);
+void log_msg_time_stat();
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/types.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/types.h b/iocore/cluster/types.h
new file mode 100644
index 0000000..e11b00c
--- /dev/null
+++ b/iocore/cluster/types.h
@@ -0,0 +1,235 @@
+/** @file
+
+  A brief file description
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#ifndef _CLUSTER_TYPES_H_
+#define _CLUSTER_TYPES_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "clusterinterface.h"
+#include "libts.h"
+
+#define IP_ADDRESS_SIZE 16
+
+//#define USE_MULTI_ALLOCATOR  1
+#define CHECK_MAGIC_NUMBER  1
+
+#define PRIORITY_COUNT      3   //priority queue count
+
+//statistic macro defines
+//#define TRIGGER_STAT_FLAG  1  //trigger statistic flag
+//#define MSG_TIME_STAT_FLAG 1  //data statistic flag
+
+#define MSG_HEADER_LENGTH   ((int)sizeof(MsgHeader))
+#define MAGIC_NUMBER        0x3308
+#define MAX_MSG_LENGTH      (4 * 1024 * 1024)
+
+#define MAX_MACHINE_COUNT        255   //IMPORTANT: can't be 256!!
+
+//combine multi msg to call writev
+#define WRITEV_ARRAY_SIZE   128
+#define WRITEV_ITEM_ONCE    (WRITEV_ARRAY_SIZE / 2)
+#define WRITE_MAX_COMBINE_BYTES  (64 * 1024)
+
+#define CONNECT_TYPE_CLIENT  'C'  //connect by me, client
+#define CONNECT_TYPE_SERVER  'S'  //connect by peer, server
+
+#define DATA_TYPE_BUFFER     'B'  //char buffer
+#define DATA_TYPE_OBJECT     'O'  //IOBufferBlock pointer
+
+#define ALIGN_BYTES  8
+#define BYTE_ALIGN(x,l)  (((x) + ((l) - 1)) & ~((l) - 1))
+#define BYTE_ALIGN8(x)  BYTE_ALIGN(x, ALIGN_BYTES)
+
+#define IS_SESSION_EMPTY(session_id) \
+  ((session_id).ids[0] == 0 && (session_id).ids[1] == 0)
+
+#define IS_SESSION_EQUAL(session_id1, session_id2) \
+  ((session_id1).ids[0] == (session_id2).ids[0] && \
+   (session_id1).ids[1] == (session_id2).ids[1])
+
+//accumulator pairing a message count with the total time those messages used
+//NOTE(review): the loggers divide the average by 1000 and print "us", so
+//time_used is presumably in nanoseconds -- confirm
+typedef struct msg_timeused {
+  volatile int64_t count;     //message count
+  volatile int64_t time_used; //time used
+} MsgTimeUsed;
+
+//session counters; one instance is embedded in every MachineSessions
+typedef struct session_stat {
+  volatile int64_t create_total_count;   //create session total count
+  volatile int64_t create_success_count; //create session success count
+  volatile int64_t create_retry_times;   //create session retry times
+  volatile int64_t close_total_count;    //close session count
+  volatile int64_t close_success_count;  //close session success count
+  volatile int64_t session_miss_count;     //session miss count
+  volatile int64_t session_occupied_count; //session occupied count
+} SessionStat;
+
+//message header; MSG_HEADER_LENGTH is sizeof this struct
+typedef struct msg_header {
+#ifdef CHECK_MAGIC_NUMBER
+  short magic;            //magic number
+  unsigned short msg_seq; //message sequence no base 1
+#else
+  uint32_t msg_seq; //message sequence no base 1
+#endif
+
+  int func_id; //function id, must be signed int
+  int data_len; //message body length
+  int aligned_data_len;  //aligned body length
+  SessionId session_id; //session id
+} MsgHeader;   //must be aligned to 8 bytes
+
+//one received message, linked into a session's incoming message queue
+typedef struct in_msg_entry {
+  int func_id;  //function id
+  int data_len; //message body length
+  Ptr<IOBufferBlock> blocks;  //message body blocks
+  struct in_msg_entry *next; //for income message queue
+} InMessage;
+
+struct worker_thread_context;
+struct socket_context;
+
+//one session slot; MachineSessions.sessions points at entries of this type
+typedef struct session_entry {
+  SessionId session_id;
+  void *user_data;  //user data for callback
+  struct socket_context *sock_context;
+  InMessage *messages;  //income messages
+  int16_t response_events;  //response events
+  uint16_t current_msg_seq;  //current message sequence no
+  uint32_t version;    //avoid CAS ABA
+  struct session_entry *next;  //session chain, only for server session
+
+#ifdef TRIGGER_STAT_FLAG
+  volatile int64_t stat_start_time;   //for message time used stat
+#endif
+
+#ifdef MSG_TIME_STAT_FLAG
+  volatile int64_t client_start_time;  //send start time for client
+  volatile int64_t server_start_time;  //recv done time for server
+  volatile int64_t send_start_time; //send start time for stat send time
+#endif
+
+} SessionEntry;
+
+//outgoing message waiting in a send queue: header plus either the inline
+//mini buffer or caller-supplied blocks (see data_type)
+typedef struct out_msg_entry {
+  MsgHeader header;
+  char mini_buff[MINI_MESSAGE_SIZE];  //for mini message
+  Ptr<IOBufferBlock> blocks;  //block data passed by caller
+
+	struct out_msg_entry *next; //for send queue
+	int bytes_sent;    //important: including msg header
+  int data_type;     //DATA_TYPE_BUFFER or DATA_TYPE_OBJECT
+  int64_t in_queue_time; //the time when push to send queue
+} OutMessage;
+
+//out message queue: singly linked list of OutMessage with its own mutex
+typedef struct message_queue {
+  OutMessage *head;  //first queued message
+  OutMessage *tail;  //last queued message
+  ink_mutex lock;
+} MessageQueue;
+
+//receive-side state for one socket: the recv buffer and parse positions
+typedef struct reader_manager {
+  Ptr<IOBufferData> buffer;   //recv buffer
+  Ptr<IOBufferBlock> blocks;  //recv blocks
+  char *msg_header; //current message start
+  char *current;    //current pointer
+  char *buff_end;   //buffer end
+  int recv_body_bytes;  //received body bytes
+} ReaderManager;
+
+//per-connection state: socket fd, receive state, one send queue per
+//priority, ping bookkeeping and the owning worker thread
+typedef struct socket_context {
+  int sock;  //socket fd
+  char padding[ALIGN_BYTES];     //padding buffer
+  struct reader_manager reader;  //recv buffer
+  struct ClusterMachine *machine;     //peer machine, point to global machine
+  struct worker_thread_context *thread_context; //the thread belong to
+  MessageQueue send_queues[PRIORITY_COUNT];  //queue for send
+
+  int queue_index;  //current deal queue index base 0
+  int connect_type;       //CONNECT_TYPE_CLIENT or CONNECT_TYPE_SERVER
+  time_t connected_time;  //connection established timestamp
+  uint32_t version;    //avoid CAS ABA
+
+  int64_t next_write_time; //next time to send message
+
+  int ping_fail_count;     //cluster ping fail counter
+  int64_t next_ping_time;  //next time to send ping message
+  int64_t ping_start_time; //the start time of ping
+
+#ifdef USE_MULTI_ALLOCATOR
+  Allocator *out_msg_allocator;  //for send
+  Allocator *in_msg_allocator;   //for notify dealer
+#endif
+  struct socket_context *next;  //for freelist
+} SocketContext;
+
+//I/O counters; one instance is embedded in every worker_thread_context
+typedef struct socket_stats {
+  int64_t send_msg_count;  //send msg count
+  int64_t drop_msg_count;  //dropped msg count when close socket
+  int64_t send_bytes;
+  int64_t drop_bytes;
+  int64_t call_writev_count;
+  int64_t send_retry_count;
+  int64_t send_delayed_time;
+
+  volatile int64_t push_msg_count; //push to send queue msg count
+  volatile int64_t push_msg_bytes; //push to send queue msg bytes
+
+  volatile int64_t fail_msg_count; //push to send queue fail msg count
+  volatile int64_t fail_msg_bytes; //push to send queue fail msg bytes
+
+  int64_t recv_msg_count;     //recv msg count
+  int64_t enqueue_in_msg_count;  //push into in msg queue
+  int64_t dequeue_in_msg_count;  //pop from in msg queue
+  int64_t recv_bytes;
+  int64_t enqueue_in_msg_bytes; //push into in msg queue
+  int64_t dequeue_in_msg_bytes; //pop from in msg queue
+
+  int64_t call_read_count;
+  int64_t epoll_wait_count;
+  int64_t epoll_wait_time_used;
+  int64_t loop_usleep_count;
+  int64_t loop_usleep_time;
+
+  int64_t ping_total_count;
+  int64_t ping_success_count;
+  int64_t ping_time_used;
+} SocketStats;
+
+class EventPoll;
+
+//state owned by one worker thread: its event poller, its I/O counters and
+//the sockets it currently serves
+typedef struct worker_thread_context
+{
+  EventPoll *ev_poll;
+  int alloc_size;         //max count of epoll events
+  int thread_index;       //my thread index
+  int active_sock_count;  //NOTE(review): presumably the number of entries in active_sockets -- confirm
+  SocketStats stats;
+  ink_mutex lock;
+  SocketContext **active_sockets;
+} WorkerThreadContext;
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/eventsystem/I_Event.h
----------------------------------------------------------------------
diff --git a/iocore/eventsystem/I_Event.h b/iocore/eventsystem/I_Event.h
index 7a37ea0..2659131 100644
--- a/iocore/eventsystem/I_Event.h
+++ b/iocore/eventsystem/I_Event.h
@@ -85,6 +85,7 @@
 #define BLOCK_CACHE_EVENT_EVENTS_START            4000
 #define UTILS_EVENT_EVENTS_START                  5000
 #define CONGESTION_EVENT_EVENTS_START             5100
+#define CLUSTER_MSG_START                         6000
 #define INK_API_EVENT_EVENTS_START                60000
 #define SRV_EVENT_EVENTS_START	                  62000
 #define REMAP_EVENT_EVENTS_START                  63000

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/eventsystem/P_IOBuffer.h
----------------------------------------------------------------------
diff --git a/iocore/eventsystem/P_IOBuffer.h b/iocore/eventsystem/P_IOBuffer.h
index 0842aff..261aa1f 100644
--- a/iocore/eventsystem/P_IOBuffer.h
+++ b/iocore/eventsystem/P_IOBuffer.h
@@ -203,7 +203,7 @@ new_IOBufferData_internal(
                            void *b, int64_t size, int64_t asize_index)
 {
   (void) size;
-  IOBufferData *d = THREAD_ALLOC(ioDataAllocator, this_ethread());
+  IOBufferData *d = ioDataAllocator.alloc();
   d->_size_index = asize_index;
   ink_assert(BUFFER_SIZE_INDEX_IS_CONSTANT(asize_index)
              || size <= d->block_size());
@@ -263,7 +263,7 @@ new_IOBufferData_internal(
 #endif
                            int64_t size_index, AllocType type)
 {
-  IOBufferData *d = THREAD_ALLOC(ioDataAllocator, this_ethread());
+  IOBufferData *d = ioDataAllocator.alloc();
 #ifdef TRACK_BUFFER_USER
   d->_location = loc;
 #endif
@@ -336,7 +336,7 @@ TS_INLINE void
 IOBufferData::free()
 {
   dealloc();
-  THREAD_FREE(this, ioDataAllocator, this_ethread());
+  ioDataAllocator.free(this);
 }
 
 //////////////////////////////////////////////////////////////////
@@ -352,7 +352,7 @@ new_IOBufferBlock_internal(
 #endif
   )
 {
-  IOBufferBlock *b = THREAD_ALLOC(ioBlockAllocator, this_ethread());
+  IOBufferBlock *b = ioBlockAllocator.alloc();
 #ifdef TRACK_BUFFER_USER
   b->_location = location;
 #endif
@@ -366,7 +366,7 @@ new_IOBufferBlock_internal(
 #endif
                             IOBufferData * d, int64_t len, int64_t offset)
 {
-  IOBufferBlock *b = THREAD_ALLOC(ioBlockAllocator, this_ethread());
+  IOBufferBlock *b = ioBlockAllocator.alloc();
 #ifdef TRACK_BUFFER_USER
   b->_location = location;
 #endif
@@ -468,7 +468,7 @@ TS_INLINE void
 IOBufferBlock::free()
 {
   dealloc();
-  THREAD_FREE(this, ioBlockAllocator, this_ethread());
+  ioBlockAllocator.free(this);
 }
 
 TS_INLINE void
@@ -777,7 +777,7 @@ TS_INLINE MIOBuffer * new_MIOBuffer_internal(
 #endif
                                                int64_t size_index)
 {
-  MIOBuffer *b = THREAD_ALLOC(ioAllocator, this_ethread());
+  MIOBuffer *b = ioAllocator.alloc();
 #ifdef TRACK_BUFFER_USER
   b->_location = location;
 #endif
@@ -790,7 +790,7 @@ free_MIOBuffer(MIOBuffer * mio)
 {
   mio->_writer = NULL;
   mio->dealloc_all_readers();
-  THREAD_FREE(mio, ioAllocator, this_ethread());
+  ioAllocator.free(mio);
 }
 
 TS_INLINE MIOBuffer * new_empty_MIOBuffer_internal(
@@ -799,7 +799,7 @@ TS_INLINE MIOBuffer * new_empty_MIOBuffer_internal(
 #endif
                                                      int64_t size_index)
 {
-  MIOBuffer *b = THREAD_ALLOC(ioAllocator, this_ethread());
+  MIOBuffer *b = ioAllocator.alloc();
   b->size_index = size_index;
 #ifdef TRACK_BUFFER_USER
   b->_location = location;
@@ -810,7 +810,7 @@ TS_INLINE MIOBuffer * new_empty_MIOBuffer_internal(
 TS_INLINE void
 free_empty_MIOBuffer(MIOBuffer * mio)
 {
-  THREAD_FREE(mio, ioAllocator, this_ethread());
+  ioAllocator.free(mio);
 }
 
 TS_INLINE IOBufferReader *

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/mgmt/RecordsConfig.cc
----------------------------------------------------------------------
diff --git a/mgmt/RecordsConfig.cc b/mgmt/RecordsConfig.cc
index 4a73f19..7677471 100644
--- a/mgmt/RecordsConfig.cc
+++ b/mgmt/RecordsConfig.cc
@@ -814,6 +814,24 @@ RecordElement RecordsConfig[] = {
   ,
   {RECT_CONFIG, "proxy.config.cluster.cluster_port", RECD_INT, "8086", RECU_RESTART_TS, RR_REQUIRED, RECC_NULL, NULL, RECA_NULL}
   ,
+  {RECT_CONFIG, "proxy.config.cluster.flow_ctrl.min_bps", RECD_INT, "804857600", RECU_RESTART_TS, RR_REQUIRED, RECC_NULL, NULL, RECA_NULL}
+  ,
+  {RECT_CONFIG, "proxy.config.cluster.flow_ctrl.max_bps", RECD_INT, "4194304000", RECU_RESTART_TS, RR_REQUIRED, RECC_NULL, NULL, RECA_NULL}
+  ,
+  {RECT_CONFIG, "proxy.config.cluster.flow_ctrl.min_send_wait_time", RECD_INT, "1000", RECU_RESTART_TS, RR_REQUIRED, RECC_NULL, NULL, RECA_NULL}
+  ,
+  {RECT_CONFIG, "proxy.config.cluster.flow_ctrl.max_send_wait_time", RECD_INT, "5000", RECU_RESTART_TS, RR_REQUIRED, RECC_NULL, NULL, RECA_NULL}
+  ,
+  {RECT_CONFIG, "proxy.config.cluster.flow_ctrl.min_loop_interval", RECD_INT, "0", RECU_RESTART_TS, RR_REQUIRED, RECC_NULL, NULL, RECA_NULL}
+  ,
+  {RECT_CONFIG, "proxy.config.cluster.flow_ctrl.max_loop_interval", RECD_INT, "1000", RECU_RESTART_TS, RR_REQUIRED, RECC_NULL, NULL, RECA_NULL}
+  ,
+  {RECT_CONFIG, "proxy.config.cluster.max_sessions_per_machine", RECD_INT, "1000000", RECU_RESTART_TS, RR_NULL, RECC_INT, "[1000-4000000]", RECA_NULL}
+  ,
+  {RECT_CONFIG, "proxy.config.cluster.session_locks_per_machine", RECD_INT, "10949", RECU_RESTART_TS, RR_NULL, RECC_INT, "[1-100000]", RECA_NULL}
+  ,
+  {RECT_CONFIG, "proxy.config.cluster.read_buffer_size", RECD_INT, "2097152", RECU_RESTART_TS, RR_NULL, RECC_INT, "[65536-2097152]", RECA_NULL}
+  ,
   {RECT_CONFIG, "proxy.config.cluster.cluster_configuration", RECD_STRING, "cluster.config", RECU_NULL, RR_NULL, RECC_NULL, NULL, RECA_NULL}
   ,
   {RECT_CONFIG, "proxy.config.cluster.ethernet_interface", RECD_STRING, TS_BUILD_DEFAULT_LOOPBACK_IFACE, RECU_RESTART_TS, RR_REQUIRED, RECC_STR, "^[^[:space:]]*$", RECA_NULL}


[3/6] refine the codes of cluster

Posted by we...@apache.org.
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/P_ClusterInline.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/P_ClusterInline.h b/iocore/cluster/P_ClusterInline.h
index c653956..1d26b3a 100644
--- a/iocore/cluster/P_ClusterInline.h
+++ b/iocore/cluster/P_ClusterInline.h
@@ -36,25 +36,30 @@ inline Action *
 Cluster_lookup(Continuation * cont, CacheKey * key, CacheFragType frag_type, char *hostname, int host_len)
 {
   // Try to send remote, if not possible, handle locally
-  Action *retAct;
-  ClusterMachine *m = cluster_machine_at_depth(cache_hash(*key));
-  if (m && !clusterProcessor.disable_remote_cluster_ops(m)) {
-    CacheContinuation *cc = CacheContinuation::cacheContAllocator_alloc();
-    cc->action = cont;
-    cc->mutex = cont->mutex;
-    retAct = CacheContinuation::do_remote_lookup(cont, key, cc, frag_type, hostname, host_len);
-    if (retAct) {
-      return retAct;
-    } else {
-      // not remote, do local lookup
-      CacheContinuation::cacheContAllocator_free(cc);
-      return (Action *) NULL;
-    }
-  } else {
-    Action a;
-    a = cont;
-    return CacheContinuation::callback_failure(&a, CACHE_EVENT_LOOKUP_FAILED, 0);
-  }
+//  Action *retAct;
+//  ClusterMachine *m = cluster_machine_at_depth(cache_hash(*key));
+//  if (m && !clusterProcessor.disable_remote_cluster_ops(m)) {
+//    CacheContinuation *cc = CacheContinuation::cacheContAllocator_alloc();
+//    cc->action = cont;
+//    cc->mutex = cont->mutex;
+//    retAct = CacheContinuation::do_remote_lookup(cont, key, cc, frag_type, hostname, host_len);
+//    if (retAct) {
+//      return retAct;
+//    } else {
+//      // not remote, do local lookup
+//      CacheContinuation::cacheContAllocator_free(cc);
+//      return (Action *) NULL;
+//    }
+//  } else {
+//    Action a;
+//    a = cont;
+//    return CacheContinuation::callback_failure(&a, CACHE_EVENT_LOOKUP_FAILED, 0);
+//  }
+  (void) cont;
+  (void) key;
+  (void) frag_type;
+  (void) hostname;
+  (void) host_len;
   return (Action *) NULL;
 }
 
@@ -66,18 +71,24 @@ Cluster_read(ClusterMachine * owner_machine, int opcode,
              time_t pin_in_cache, CacheFragType frag_type, char *hostname, int host_len)
 {
   (void) params;
-  if (clusterProcessor.disable_remote_cluster_ops(owner_machine)) {
-    Action a;
-    a = cont;
-    return CacheContinuation::callback_failure(&a, CACHE_EVENT_OPEN_READ_FAILED, 0);
+  ink_assert(cont);
+  ClusterSession session;
+  if (cluster_create_session(&session, owner_machine, NULL, 0)) {
+    cont->handleEvent(CACHE_EVENT_OPEN_READ_FAILED, NULL);
+    return ACTION_RESULT_DONE;
   }
+
   int vers = CacheOpMsg_long::protoToVersion(owner_machine->msg_proto_major);
+  CacheOpArgs_General readArgs;
+  Ptr<IOBufferData> d;
+
   int flen;
   int len = 0;
   int cur_len;
   int res = 0;
-  char *msg;
+  char *msg = 0;
   char *data;
+  Action *action = NULL;
 
   if (vers == CacheOpMsg_long::CACHE_OP_LONG_MESSAGE_VERSION) {
     if ((opcode == CACHE_OPEN_READ_LONG)
@@ -87,20 +98,21 @@ Cluster_read(ClusterMachine * owner_machine, int opcode,
 
       const char *url_hostname;
       int url_hlen;
-      INK_MD5 url_only_md5;
+      INK_MD5 url_md5;
 
-      Cache::generate_key(&url_only_md5, url, 0);
+      Cache::generate_key(&url_md5, url);
       url_hostname = url->host_get(&url_hlen);
 
       len += request->m_heap->marshal_length();
-      len += params->marshal_length();
+      len += sizeof(CacheLookupHttpConfig) + params->marshal_length();
       len += url_hlen;
 
       if ((flen + len) > DEFAULT_MAX_BUFFER_SIZE)       // Bound marshalled data
         goto err_exit;
 
       // Perform data Marshal operation
-      msg = (char *) ALLOCA_DOUBLE(flen + len);
+      d = new_IOBufferData(iobuffer_size_to_index(flen + len));
+      msg = (char *) d->data();
       data = msg + flen;
 
       cur_len = len;
@@ -110,6 +122,13 @@ Cluster_read(ClusterMachine * owner_machine, int opcode,
       }
       data += res;
       cur_len -= res;
+
+      if (cur_len < (int) sizeof(CacheLookupHttpConfig))
+        goto err_exit;
+      memcpy(data, params, sizeof(CacheLookupHttpConfig));
+      data += sizeof(CacheLookupHttpConfig);
+      cur_len -= sizeof(CacheLookupHttpConfig);
+
       if ((res = params->marshal(data, cur_len)) < 0)
         goto err_exit;
       data += res;
@@ -117,37 +136,33 @@ Cluster_read(ClusterMachine * owner_machine, int opcode,
       memcpy(data, url_hostname, url_hlen);
 
       CacheOpArgs_General readArgs;
-      readArgs.url_md5 = &url_only_md5;
+      readArgs.url_md5 = &url_md5;
       readArgs.pin_in_cache = pin_in_cache;
       readArgs.frag_type = frag_type;
-      return CacheContinuation::do_op(cont, owner_machine, (void *) &readArgs,
-                                      opcode, (char *) msg, (flen + len), -1, buf);
+
+      action = CacheContinuation::do_op(cont, session, (void *) &readArgs,
+                                            opcode, d, (flen + len), -1, buf);
     } else {
       // Build message if we have host data.
+      flen = op_to_sizeof_fixedlen_msg(opcode);
+      len = host_len;
 
-      if (host_len) {
-        // Determine length of data to Marshal
-        flen = op_to_sizeof_fixedlen_msg(opcode);
-        len = host_len;
-
-        if ((flen + len) > DEFAULT_MAX_BUFFER_SIZE)     // Bound marshalled data
-          goto err_exit;
+      if ((flen + len) > DEFAULT_MAX_BUFFER_SIZE)     // Bound marshalled data
+        goto err_exit;
 
-        msg = (char *) ALLOCA_DOUBLE(flen + len);
-        data = msg + flen;
+      d = new_IOBufferData(iobuffer_size_to_index(flen + len));
+      msg = (char *) d->data();
+      data = msg + flen;
+      if (host_len)
         memcpy(data, hostname, host_len);
 
-      } else {
-        msg = 0;
-        flen = 0;
-        len = 0;
-      }
-      CacheOpArgs_General readArgs;
       readArgs.url_md5 = key;
       readArgs.frag_type = frag_type;
-      return CacheContinuation::do_op(cont, owner_machine, (void *) &readArgs,
-                                      opcode, (char *) msg, (flen + len), -1, buf);
+
+      action = CacheContinuation::do_op(cont, session, (void *) &readArgs,
+                                            opcode, d, (flen + len), -1, buf);
     }
+    ink_assert(msg);
 
   } else {
     //////////////////////////////////////////////////////////////
@@ -155,10 +170,12 @@ Cluster_read(ClusterMachine * owner_machine, int opcode,
     //////////////////////////////////////////////////////////////
     ink_release_assert(!"CacheOpMsg_long [read] bad msg version");
   }
+
+  if (action)
+    return action;
 err_exit:
-  Action a;
-  a = cont;
-  return CacheContinuation::callback_failure(&a, CACHE_EVENT_OPEN_READ_FAILED, 0);
+  cont->handleEvent(CACHE_EVENT_OPEN_READ_FAILED, NULL);
+  return ACTION_RESULT_DONE;
 }
 
 inline Action *
@@ -171,10 +188,11 @@ Cluster_write(Continuation * cont, int expected_size,
 {
   (void) key;
   (void) request;
-  if (clusterProcessor.disable_remote_cluster_ops(m)) {
-    Action a;
-    a = cont;
-    return CacheContinuation::callback_failure(&a, CACHE_EVENT_OPEN_WRITE_FAILED, 0);
+  ClusterSession session;
+  ink_assert(cont);
+  if (cluster_create_session(&session, m, NULL, 0)) {
+     cont->handleEvent(CACHE_EVENT_OPEN_WRITE_FAILED, NULL);
+     return ACTION_RESULT_DONE;
   }
   char *msg = 0;
   char *data = 0;
@@ -182,24 +200,22 @@ Cluster_write(Continuation * cont, int expected_size,
   int len = 0;
   int flen = 0;
   int vers = CacheOpMsg_long::protoToVersion(m->msg_proto_major);
+  Ptr<IOBufferData> d;
 
   switch (opcode) {
   case CACHE_OPEN_WRITE:
     {
       // Build message if we have host data
-      if (host_len) {
-        // Determine length of data to Marshal
-        flen = op_to_sizeof_fixedlen_msg(CACHE_OPEN_WRITE);
-        len = host_len;
-
-        if ((flen + len) > DEFAULT_MAX_BUFFER_SIZE)     // Bound marshalled data
-          goto err_exit;
-
-        msg = (char *) ALLOCA_DOUBLE(flen + len);
-        data = msg + flen;
+      len = host_len;
+      flen = op_to_sizeof_fixedlen_msg(CACHE_OPEN_WRITE);
+      if ((flen + len) > DEFAULT_MAX_BUFFER_SIZE)     // Bound marshalled data
+        goto err_exit;
 
+      d = new_IOBufferData(iobuffer_size_to_index(flen + len));
+      msg = (char *) d->data();
+      data = msg + flen;
+      if (host_len)
         memcpy(data, hostname, host_len);
-      }
       break;
     }
   case CACHE_OPEN_WRITE_LONG:
@@ -223,8 +239,9 @@ Cluster_write(Continuation * cont, int expected_size,
       if ((flen + len) > DEFAULT_MAX_BUFFER_SIZE)       // Bound marshalled data
         goto err_exit;
 
+      d = new_IOBufferData(iobuffer_size_to_index(flen + len));
+      msg = (char *) d->data();
       // Perform data Marshal operation
-      msg = (char *) ALLOCA_DOUBLE(flen + len);
       data = msg + flen;
       int res = 0;
 
@@ -257,7 +274,9 @@ Cluster_write(Continuation * cont, int expected_size,
     writeArgs.cfl_flags |= (old_info ? CFL_LOPENWRITE_HAVE_OLDINFO : 0);
     writeArgs.cfl_flags |= (allow_multiple_writes ? CFL_ALLOW_MULTIPLE_WRITES : 0);
 
-    return CacheContinuation::do_op(cont, m, (void *) &writeArgs, opcode, msg, flen + len, expected_size, buf);
+    Action *action = CacheContinuation::do_op(cont, session, (void *) &writeArgs, opcode, d, flen + len, expected_size, buf);
+    if (action)
+      return action;
   } else {
     //////////////////////////////////////////////////////////////
     // Create the specified down rev version of this message
@@ -267,19 +286,21 @@ Cluster_write(Continuation * cont, int expected_size,
   }
 
 err_exit:
-  Action a;
-  a = cont;
-  return CacheContinuation::callback_failure(&a, CACHE_EVENT_OPEN_WRITE_FAILED, 0);
+  cont->handleEvent(CACHE_EVENT_OPEN_WRITE_FAILED, NULL);
+  return ACTION_RESULT_DONE;
 }
 
 inline Action *
 Cluster_link(ClusterMachine * m, Continuation * cont, CacheKey * from, CacheKey * to,
              CacheFragType type, char *hostname, int host_len)
 {
-  if (clusterProcessor.disable_remote_cluster_ops(m)) {
-    Action a;
-    a = cont;
-    return CacheContinuation::callback_failure(&a, CACHE_EVENT_LINK_FAILED, 0);
+  ClusterSession session;
+  Ptr<IOBufferData> d;
+  char *msg = NULL;
+
+  if (cluster_create_session(&session, m, NULL, 0)) {
+    cont->handleEvent(CACHE_EVENT_LINK_FAILED, NULL);
+    return ACTION_RESULT_DONE;
   }
 
   int vers = CacheOpMsg_short_2::protoToVersion(m->msg_proto_major);
@@ -293,7 +314,8 @@ Cluster_link(ClusterMachine * m, Continuation * cont, CacheKey * from, CacheKey
     if ((flen + len) > DEFAULT_MAX_BUFFER_SIZE) // Bound marshalled data
       goto err_exit;
 
-    char *msg = (char *) ALLOCA_DOUBLE(flen + len);
+    d = new_IOBufferData(iobuffer_size_to_index(flen + len));
+    msg = (char *) d->data();
     memcpy((msg + flen), hostname, host_len);
 
     // Setup args for remote link
@@ -301,7 +323,9 @@ Cluster_link(ClusterMachine * m, Continuation * cont, CacheKey * from, CacheKey
     linkArgs.from = from;
     linkArgs.to = to;
     linkArgs.frag_type = type;
-    return CacheContinuation::do_op(cont, m, (void *) &linkArgs, CACHE_LINK, msg, (flen + len));
+    Action *action = CacheContinuation::do_op(cont, session, (void *) &linkArgs, CACHE_LINK, d, (flen + len));
+    if (action)
+      return action;
   } else {
     //////////////////////////////////////////////////////////////
     // Create the specified down rev version of this message
@@ -311,18 +335,20 @@ Cluster_link(ClusterMachine * m, Continuation * cont, CacheKey * from, CacheKey
   }
 
 err_exit:
-  Action a;
-  a = cont;
-  return CacheContinuation::callback_failure(&a, CACHE_EVENT_LINK_FAILED, 0);
+  cont->handleEvent(CACHE_EVENT_LINK_FAILED, NULL);
+  return ACTION_RESULT_DONE;
 }
 
 inline Action *
 Cluster_deref(ClusterMachine * m, Continuation * cont, CacheKey * key, CacheFragType type, char *hostname, int host_len)
 {
-  if (clusterProcessor.disable_remote_cluster_ops(m)) {
-    Action a;
-    a = cont;
-    return CacheContinuation::callback_failure(&a, CACHE_EVENT_DEREF_FAILED, 0);
+  ClusterSession session;
+  Ptr<IOBufferData> d;
+  char *msg = NULL;
+
+  if (cluster_create_session(&session, m, NULL, 0)) {
+    cont->handleEvent(CACHE_EVENT_DEREF_FAILED, NULL);
+    return ACTION_RESULT_DONE ;
   }
 
   int vers = CacheOpMsg_short::protoToVersion(m->msg_proto_major);
@@ -336,14 +362,17 @@ Cluster_deref(ClusterMachine * m, Continuation * cont, CacheKey * key, CacheFrag
     if ((flen + len) > DEFAULT_MAX_BUFFER_SIZE) // Bound marshalled data
       goto err_exit;
 
-    char *msg = (char *) ALLOCA_DOUBLE(flen + len);
+    d = new_IOBufferData(iobuffer_size_to_index(flen + len));
+    msg = (char *) d->data();
     memcpy((msg + flen), hostname, host_len);
 
     // Setup args for remote deref
     CacheOpArgs_Deref drefArgs;
     drefArgs.md5 = key;
     drefArgs.frag_type = type;
-    return CacheContinuation::do_op(cont, m, (void *) &drefArgs, CACHE_DEREF, msg, (flen + len));
+    Action *action = CacheContinuation::do_op(cont, session, (void *) &drefArgs, CACHE_DEREF, d, (flen + len));
+    if (action)
+      return action;
   } else {
     //////////////////////////////////////////////////////////////
     // Create the specified down rev version of this message
@@ -353,19 +382,22 @@ Cluster_deref(ClusterMachine * m, Continuation * cont, CacheKey * key, CacheFrag
   }
 
 err_exit:
-  Action a;
-  a = cont;
-  return CacheContinuation::callback_failure(&a, CACHE_EVENT_DEREF_FAILED, 0);
+  cont->handleEvent(CACHE_EVENT_DEREF_FAILED, NULL);
+  return ACTION_RESULT_DONE ;
 }
 
 inline Action *
 Cluster_remove(ClusterMachine * m, Continuation * cont, CacheKey * key,
                bool rm_user_agents, bool rm_link, CacheFragType frag_type, char *hostname, int host_len)
 {
-  if (clusterProcessor.disable_remote_cluster_ops(m)) {
-    Action a;
-    a = cont;
-    return CacheContinuation::callback_failure(&a, CACHE_EVENT_REMOVE_FAILED, 0);
+  ClusterSession session;
+  Ptr<IOBufferData> d;
+  char *msg = NULL;
+
+  if (cluster_create_session(&session, m, NULL, 0)) {
+    if (cont)
+      cont->handleEvent(CACHE_EVENT_REMOVE_FAILED, NULL);
+    return ACTION_RESULT_DONE;
   }
 
   int vers = CacheOpMsg_short::protoToVersion(m->msg_proto_major);
@@ -379,7 +411,8 @@ Cluster_remove(ClusterMachine * m, Continuation * cont, CacheKey * key,
     if ((flen + len) > DEFAULT_MAX_BUFFER_SIZE) // Bound marshalled data
       goto err_exit;
 
-    char *msg = (char *) ALLOCA_DOUBLE(flen + len);
+    d = new_IOBufferData(iobuffer_size_to_index(flen + len));
+    msg = (char *) d->data();
     memcpy((msg + flen), hostname, host_len);
 
     // Setup args for remote update
@@ -388,7 +421,9 @@ Cluster_remove(ClusterMachine * m, Continuation * cont, CacheKey * key,
     updateArgs.cfl_flags |= (rm_user_agents ? CFL_REMOVE_USER_AGENTS : 0);
     updateArgs.cfl_flags |= (rm_link ? CFL_REMOVE_LINK : 0);
     updateArgs.frag_type = frag_type;
-    return CacheContinuation::do_op(cont, m, (void *) &updateArgs, CACHE_REMOVE, msg, (flen + len));
+    Action *action = CacheContinuation::do_op(cont, session, (void *) &updateArgs, CACHE_REMOVE, d, (flen + len));
+    if (action)
+      return action;
   } else {
     //////////////////////////////////////////////////////////////
     // Create the specified down rev version of this message
@@ -398,9 +433,8 @@ Cluster_remove(ClusterMachine * m, Continuation * cont, CacheKey * key,
   }
 
 err_exit:
-  Action a;
-  a = cont;
-  return CacheContinuation::callback_failure(&a, CACHE_EVENT_REMOVE_FAILED, 0);
+  if (cont)
+    cont->handleEvent(CACHE_EVENT_REMOVE_FAILED, NULL);
+  return ACTION_RESULT_DONE;
 }
-
 #endif /* __CLUSTERINLINE_H__ */

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/clusterinterface.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/clusterinterface.h b/iocore/cluster/clusterinterface.h
new file mode 100644
index 0000000..0f8e510
--- /dev/null
+++ b/iocore/cluster/clusterinterface.h
@@ -0,0 +1,104 @@
+/** @file
+
+  A brief file description
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#ifndef _CLUSTER_INTERFACE_H
+#define _CLUSTER_INTERFACE_H
+
+struct ClusterMachine;
+class IOBufferData;
+class IOBufferBlock;
+
+#define CLUSTER_DEBUG_TAG "cluster_io"
+
+#define new_RecvBuffer(len) \
+  new_IOBufferData(iobuffer_size_to_index(len, MAX_BUFFER_SIZE_INDEX))
+
+#define CURRENT_TIME() (ink_get_hrtime() / HRTIME_SECOND)
+#define CURRENT_MS() (ink_get_hrtime() / HRTIME_MSECOND)
+#define CURRENT_NS() (ink_get_hrtime() / HRTIME_NSECOND)
+
+#define MINI_MESSAGE_SIZE     64  //use internal buffer to store the mini message
+
+#define FUNC_ID_CONNECTION_CLOSED_NOTIFY 6100   //connection closed
+#define FUNC_ID_CLUSTER_PING_REQUEST     6201
+#define FUNC_ID_CLUSTER_PING_RESPONSE    6202
+#define FUNC_ID_CLUSTER_HELLO_REQUEST    6203
+#define FUNC_ID_CLUSTER_HELLO_RESPONSE   6204
+
+#define RESPONSE_EVENT_NOTIFY_DEALER 1
+
+typedef int64_t SequenceType;
+
+/*
+ * Identifier of a cluster session.
+ *
+ * The same 16 bytes can be read either as the individual fields or as two
+ * 64-bit words (ids), which is what CLEAR_SESSION() zeroes.
+ */
+typedef union {
+	struct {
+    uint32_t ip;    //src ip addr
+		uint32_t timestamp;  //session create time
+		SequenceType seq;    //session sequence number
+	} fields;
+
+	uint64_t ids[2]; //session id, 0 for free entry
+} SessionId;
+
+typedef SessionId ClusterSession;
+
+/* Priority passed to cluster_send_message(); PRIORITY_HIGH has the lowest numeric value (0). */
+typedef enum {
+  PRIORITY_HIGH = 0,
+  PRIORITY_MID,
+  PRIORITY_LOW,
+} MessagePriority;
+
+
+#define CLEAR_SESSION(session_id) \
+  (session_id).ids[0] = (session_id).ids[1] = 0
+
+typedef int (*machine_change_notify_func)(ClusterMachine * m);
+
+typedef void (*message_deal_func)(ClusterSession session, void *arg,
+    const int func_id, IOBufferBlock *blocks, const int data_len);
+
+/*
+typedef void (*message_deal_func)(ClusterSession session, void *arg,
+    const int func_id, void *data, int data_len);
+*/
+
+int cluster_global_init(message_deal_func deal_func,
+    machine_change_notify_func machine_change_notify);
+
+int cluster_create_session(ClusterSession *session,
+    const struct ClusterMachine *machine, void *arg, const int events);
+
+int cluster_bind_session(ClusterSession session, void *arg);
+
+int cluster_set_events(ClusterSession session, const int events);
+
+void *cluster_close_session(ClusterSession session);
+
+/*
+ * The type behind the data pointer is selected by data_len:
+ *    data_len == -1: data is an IOBufferBlock *
+ *    data_len >= 0:  data is a char buffer
+ **/
+int cluster_send_message(ClusterSession session, const int func_id,
+	void *data, const int data_len, const MessagePriority priority);
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/connection.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/connection.cc b/iocore/cluster/connection.cc
new file mode 100644
index 0000000..856483c
--- /dev/null
+++ b/iocore/cluster/connection.cc
@@ -0,0 +1,1726 @@
+/** @file
+
+  A brief file description
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <assert.h>
+#if defined(linux)
+#include <sys/prctl.h>
+#endif
+#include "Diags.h"
+#include "machine.h"
+#include "global.h"
+#include "nio.h"
+#include "message.h"
+#include "session.h"
+#include "P_Cluster.h"
+#include "ink_config.h"
+#include "EventPoll.h"
+#include "connection.h"
+
+/*
+ * Handshake state of a ConnectContext, as driven by connection_handler().
+ * STATE_NOT_CONNECT has no case in connection_handler() and is therefore
+ * rejected there with EINVAL.
+ */
+typedef enum {
+  STATE_NOT_CONNECT = 0,
+  STATE_CONNECTING,
+  STATE_CONNECTED,
+  STATE_SEND_DATA,
+  STATE_RECV_DATA
+} ConnectState;
+
+/*
+ * Per-connection bookkeeping for the hello handshake performed by the
+ * connect thread before a socket is handed over to its machine.
+ */
+typedef struct connect_context {
+  SocketContext *pSockContext;  //socket being handshaken; set to NULL by remove_connection()
+  int64_t connect_start_time; //connect start time in ms
+  int64_t server_start_time;  //recv data start time in ms
+  int reconnect_interval;     //reconnect interval in ms
+  int connect_count; //number of connect attempts made so far
+  int send_bytes;    //bytes of buff already written by do_send_data()
+  int recv_bytes;    //bytes of buff already filled by do_recv_data()
+  int total_bytes;   //byte count at which a send or recv is considered complete
+  ConnectState state;
+  char buff[sizeof(MsgHeader) + sizeof(HelloMessage)];  //holds one hello message: header + body
+  bool is_accept;    //true means server socket to accept
+  bool need_reconnect;
+  bool used;         //cleared by remove_connection() when the entry is released
+  bool need_check_timeout;
+} ConnectContext;
+
+/*
+ * State owned by the connect thread: the event poller it uses for handshake
+ * sockets and the list of connections currently being handshaken.
+ */
+struct connection_thread_context
+{
+  EventPoll *ev_poll;  //poller used to attach/modify/detach handshake sockets
+  int alloc_size;      //NOTE(review): presumably the capacity of the arrays below - confirm
+  ink_mutex lock;      //taken around updates to connections[] and machine_sockets free lists
+
+  ConnectContext *connections_buffer;  //memory pool for malloc
+  ConnectContext **connections;  //existing connections
+  int connection_count;   //current connection count
+};
+
+static struct connection_thread_context connect_thread_context;
+static SocketContext *socket_contexts_pool = NULL;  //first element for accept
+
+SocketContextsByMachine *machine_sockets = NULL;  //sockets by peer machine, [dest ip % MAX_MACHINE_COUNT]
+
+void *connect_worker_entrance(void *arg);
+
+/**
+ * Remove the handshake entry that tracks pSockContext from the connect
+ * thread's list of existing connections.
+ *
+ * The matching ConnectContext is marked unused and unlinked from its socket
+ * context, then the remaining pointers are shifted down by one slot.
+ *
+ * @param pSockContext socket context whose entry should be removed
+ * @param needLock     true to acquire connect_thread_context.lock in this
+ *                     call; false to leave the lock untouched (presumably
+ *                     because the caller already holds it)
+ * @return 0 on success, ENOENT if no entry references pSockContext
+ */
+static int remove_connection(SocketContext *pSockContext, const bool needLock)
+{
+  ConnectContext **ppConnection;
+  ConnectContext **ppConnectionEnd;
+  ConnectContext **ppNext;
+
+  if (needLock) {
+    ink_mutex_acquire(&connect_thread_context.lock);
+  }
+
+  ppConnectionEnd = connect_thread_context.connections +
+    connect_thread_context.connection_count;
+  for (ppConnection=connect_thread_context.connections; ppConnection<ppConnectionEnd;
+      ppConnection++)
+  {
+    if ((*ppConnection)->pSockContext == pSockContext) {
+      (*ppConnection)->used = false;
+      (*ppConnection)->pSockContext = NULL;
+      break;
+    }
+  }
+
+  if (ppConnection == ppConnectionEnd) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "Can't found connection to release!", __LINE__);
+    // Release only what this call acquired: with needLock == false the
+    // mutex was not taken here and must not be dropped on the error path.
+    if (needLock) {
+      ink_mutex_release(&connect_thread_context.lock);
+    }
+    return ENOENT;
+  }
+
+  // Close the gap left by the removed entry.
+  ppNext = ppConnection + 1;
+  while (ppNext < ppConnectionEnd) {
+    *(ppNext - 1) = *ppNext;
+    ppNext++;
+  }
+  connect_thread_context.connection_count--;
+
+  if (needLock) {
+    ink_mutex_release(&connect_thread_context.lock);
+  }
+
+  return 0;
+}
+
+/* Close the socket held by pSockContext, if one is open, and mark it as closed (-1). */
+static void close_connection(SocketContext *pSockContext)
+{
+  if (pSockContext->sock < 0) {
+    return;  // no open descriptor
+  }
+
+  Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+      "close connection #%d %s:%d",
+      __LINE__, pSockContext->sock,
+      pSockContext->machine->hostname,
+      pSockContext->machine->cluster_port);
+
+  close(pSockContext->sock);
+  pSockContext->sock = -1;
+}
+
+/*
+ * Close the socket and, for server-side (accepted) connections only, drop
+ * the handshake entry and give the socket context back to the accept pool.
+ */
+static void release_connection(SocketContext *pSockContext,
+    const bool needLock)
+{
+  close_connection(pSockContext);
+
+  if (pSockContext->connect_type != CONNECT_TYPE_SERVER) {
+    return;
+  }
+
+  remove_connection(pSockContext, needLock);
+  free_accept_sock_context(pSockContext);
+}
+
+/**
+ * Find the machine_sockets slot that belongs to ip.
+ *
+ * Slots are probed linearly, starting at ip % MAX_MACHINE_COUNT and wrapping
+ * around. Every slot is visited at most once.
+ *
+ * @param ip peer address to look up
+ * @return slot index, or -1 if no slot holds this ip
+ */
+inline static int get_machine_index(const unsigned int ip)
+{
+  const int home = ip % MAX_MACHINE_COUNT;
+  int offset;
+  int index;
+
+  for (offset = 0; offset < MAX_MACHINE_COUNT; offset++) {
+    index = (home + offset) % MAX_MACHINE_COUNT;
+    if (machine_sockets[index].ip == ip) {
+      return index;
+    }
+  }
+
+  return -1;
+}
+
+/**
+ * Find a free machine_sockets slot (ip == 0) for a new peer.
+ *
+ * Slots are probed linearly, starting at ip % MAX_MACHINE_COUNT and wrapping
+ * around. Every slot is visited at most once. The slot is not claimed here;
+ * the caller stores the ip into it.
+ *
+ * @param ip peer address that needs a slot
+ * @return index of a free slot, or -1 (after logging a warning) if the table
+ *         is full
+ */
+static int alloc_machine_index(const unsigned int ip)
+{
+  const int home = ip % MAX_MACHINE_COUNT;
+  int offset;
+  int index;
+
+  for (offset = 0; offset < MAX_MACHINE_COUNT; offset++) {
+    index = (home + offset) % MAX_MACHINE_COUNT;
+    if (machine_sockets[index].ip == 0) {
+      return index;
+    }
+  }
+
+  Warning("file: "__FILE__", line: %d, "
+      "can't malloc slot for ip: %u.%u.%u.%u",
+      __LINE__, DOT_SEPARATED(ip));
+
+  return -1;
+}
+
+/*
+ * Build a hello message (MsgHeader followed by HelloMessage) in the
+ * connection's buffer and reset the send progress counter.
+ *
+ * func_id is stored in the header; callers in this file pass
+ * FUNC_ID_CLUSTER_HELLO_REQUEST or FUNC_ID_CLUSTER_HELLO_RESPONSE.
+ */
+static void fill_send_buffer(ConnectContext *pConnectContext,
+    const int func_id)
+{
+  char *const base = pConnectContext->buff;
+  MsgHeader *const header = (MsgHeader *)base;
+  HelloMessage *const hello = (HelloMessage *)(base + sizeof(MsgHeader));
+
+#ifdef CHECK_MAGIC_NUMBER
+  header->magic = MAGIC_NUMBER;
+#endif
+
+  header->func_id = func_id;
+  header->data_len = sizeof(HelloMessage);
+  header->aligned_data_len = BYTE_ALIGN8(sizeof(HelloMessage));
+
+  header->session_id.fields.ip = my_machine_ip;
+  header->session_id.fields.timestamp = CURRENT_TIME();
+  header->session_id.fields.seq = 0;
+  header->msg_seq = 11111;   //do not create session
+
+  hello->major = CLUSTER_MAJOR_VERSION;
+  hello->minor = CLUSTER_MINOR_VERSION;
+  hello->min_major = MIN_CLUSTER_MAJOR_VERSION;
+  hello->min_minor = MIN_CLUSTER_MINOR_VERSION;
+
+  pConnectContext->send_bytes = 0;
+}
+
+/**
+ * Validate a received hello message and record the peer's protocol version.
+ *
+ * Checks, in order: the magic number (only when CHECK_MAGIC_NUMBER is
+ * defined), the payload length, the function id expected for this side of
+ * the connection, and that the peer's major version range overlaps ours.
+ *
+ * @param pSockContext socket context the message arrived on; its machine's
+ *                     msg_proto_major / msg_proto_minor are updated on success
+ * @param data         buffer holding a MsgHeader followed by a HelloMessage
+ * @return 0 on success, EINVAL if any check fails
+ */
+static int deal_hello_message(SocketContext *pSockContext, char *data)
+{
+  int proto_major = -1;
+  int proto_minor = -1;
+  uint32_t low_major;
+  uint32_t high_major;
+  int expect_func_id;
+  MsgHeader *pHeader;
+  HelloMessage *pHelloMessage;
+
+  pHeader = (MsgHeader *)data;
+#ifdef CHECK_MAGIC_NUMBER
+  if (pHeader->magic != MAGIC_NUMBER) {
+    Error("file: "__FILE__", line: %d, "
+        "magic number: %08x != %08x",
+        __LINE__, pHeader->magic, MAGIC_NUMBER);
+    return EINVAL;
+  }
+#endif
+
+  if (pHeader->data_len != sizeof(HelloMessage)) {
+    Error("file: "__FILE__", line: %d, "
+        "message length: %d != %d!", __LINE__,
+        pHeader->data_len, (int)sizeof(HelloMessage));
+    return EINVAL;
+  }
+
+  // A client expects the response to its request; a server expects the request.
+  if (pSockContext->connect_type == CONNECT_TYPE_CLIENT) {
+    expect_func_id = FUNC_ID_CLUSTER_HELLO_RESPONSE;
+  }
+  else {
+    expect_func_id = FUNC_ID_CLUSTER_HELLO_REQUEST;
+  }
+  if (pHeader->func_id != expect_func_id) {
+    Error("file: "__FILE__", line: %d, "
+        "invalid function id: %d != %d!", __LINE__,
+        pHeader->func_id, expect_func_id);
+    return EINVAL;
+  }
+  pHelloMessage = (HelloMessage *)(data + sizeof(MsgHeader));
+
+  /**
+   * Determine the message protocol major version by intersecting the peer's
+   * range [min_major, major] with the local range
+   * [MIN_CLUSTER_MAJOR_VERSION, CLUSTER_MAJOR_VERSION].
+   *
+   * This is computed directly instead of counting down with an unsigned
+   * loop variable: the values come from the network, and a peer sending
+   * min_major == 0 would make "major >= min_major" always true, so the
+   * countdown would wrap around and never terminate.
+   *
+   * The value chosen is the lowest major common to both ranges, which is
+   * what the step-down loop produced for well-formed input.
+   * NOTE(review): if the highest common major was intended, use high_major.
+   **/
+  low_major = pHelloMessage->min_major;
+  if (low_major < (uint32_t)MIN_CLUSTER_MAJOR_VERSION) {
+    low_major = (uint32_t)MIN_CLUSTER_MAJOR_VERSION;
+  }
+  high_major = pHelloMessage->major;
+  if (high_major > (uint32_t)CLUSTER_MAJOR_VERSION) {
+    high_major = (uint32_t)CLUSTER_MAJOR_VERSION;
+  }
+  if (low_major <= high_major) {
+    proto_major = (int)low_major;
+  }
+
+  if (proto_major > 0) {
+    /*
+     * Compute minor version: the peer's minor is used only when the selected
+     * major equals the peer's major, otherwise it is zero.
+     */
+    if (proto_major == (int)pHelloMessage->major) {
+      proto_minor = pHelloMessage->minor;
+      if (proto_minor != CLUSTER_MINOR_VERSION) {
+        Warning("file: "__FILE__", line: %d, "
+            "Different clustering minor versions (%d,%d) for "
+            "node %u.%u.%u.%u, continuing", __LINE__,
+            proto_minor, CLUSTER_MINOR_VERSION,
+            DOT_SEPARATED(pSockContext->machine->ip));
+      }
+    } else {
+      proto_minor = 0;
+    }
+  }
+  else {
+    Error("file: "__FILE__", line: %d, "
+        "Bad cluster major version range (%d-%d) for "
+        "node %u.%u.%u.%u, close connection", __LINE__,
+        pHelloMessage->min_major, pHelloMessage->major,
+        DOT_SEPARATED(pSockContext->machine->ip));
+    return EINVAL;
+  }
+
+  // NOTE(review): the peer's advertised version is stored here, not the
+  // negotiated proto_major / proto_minor computed above - confirm intent.
+  pSockContext->machine->msg_proto_major = pHelloMessage->major;
+  pSockContext->machine->msg_proto_minor = pHelloMessage->minor;
+  return 0;
+}
+
+
+/*
+ * Write the not-yet-sent part of the hello buffer to the socket.
+ *
+ * Returns 0 once total_bytes have been sent, EAGAIN if more remains,
+ * ECONNRESET if write() returned 0, or the errno of a failed write()
+ * (EAGAIN when errno was not set).
+ */
+static int do_send_data(ConnectContext *pConnectContext)
+{
+  int bytes;
+  int result;
+
+  bytes = write(pConnectContext->pSockContext->sock,
+      pConnectContext->buff + pConnectContext->send_bytes,
+      pConnectContext->total_bytes - pConnectContext->send_bytes);
+
+  if (bytes > 0) {
+    pConnectContext->send_bytes += bytes;
+    if (pConnectContext->send_bytes == pConnectContext->total_bytes) {
+      return 0;
+    }
+    return EAGAIN;
+  }
+
+  if (bytes == 0) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "%s:%d connection closed", __LINE__,
+        pConnectContext->pSockContext->machine->hostname,
+        pConnectContext->pSockContext->machine->cluster_port);
+    return ECONNRESET;
+  }
+
+  // write() failed: capture errno before anything else can overwrite it.
+  result = errno != 0 ? errno : EAGAIN;
+  if (result == EINTR) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "write to %s:%d fail, errno: %d, error info: %s",
+        __LINE__, pConnectContext->pSockContext->machine->hostname,
+        pConnectContext->pSockContext->machine->cluster_port,
+        result, strerror(result));
+  }
+  else if (result != EAGAIN) {
+    Error("file: "__FILE__", line: %d, "
+        "write to %s:%d fail, errno: %d, error info: %s",
+        __LINE__, pConnectContext->pSockContext->machine->hostname,
+        pConnectContext->pSockContext->machine->cluster_port,
+        result, strerror(result));
+  }
+
+  return result;
+}
+
+/*
+ * Read more of the peer's hello message into the connection's buffer.
+ *
+ * Returns 0 once total_bytes have been received, EAGAIN if more is needed,
+ * ECONNRESET if read() returned 0, or the errno of a failed read()
+ * (EAGAIN when errno was not set).
+ */
+static int do_recv_data(ConnectContext *pConnectContext)
+{
+  int bytes;
+  int result;
+
+  bytes = read(pConnectContext->pSockContext->sock,
+      pConnectContext->buff + pConnectContext->recv_bytes,
+      pConnectContext->total_bytes - pConnectContext->recv_bytes);
+
+  if (bytes > 0) {
+    pConnectContext->recv_bytes += bytes;
+    if (pConnectContext->recv_bytes == pConnectContext->total_bytes) {
+      return 0;
+    }
+    return EAGAIN;
+  }
+
+  if (bytes == 0) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "%s:%d connection closed", __LINE__,
+        pConnectContext->pSockContext->machine->hostname,
+        pConnectContext->pSockContext->machine->cluster_port);
+    return ECONNRESET;
+  }
+
+  // read() failed: capture errno before anything else can overwrite it.
+  result = errno != 0 ? errno : EAGAIN;
+  if (result == EINTR) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "read from %s:%d fail, errno: %d, error info: %s",
+        __LINE__, pConnectContext->pSockContext->machine->hostname,
+        pConnectContext->pSockContext->machine->cluster_port,
+        result, strerror(result));
+  }
+  else if (result != EAGAIN) {
+    Error("file: "__FILE__", line: %d, "
+        "read from %s:%d fail, errno: %d, error info: %s",
+        __LINE__, pConnectContext->pSockContext->machine->hostname,
+        pConnectContext->pSockContext->machine->cluster_port,
+        result, strerror(result));
+  }
+
+  return result;
+}
+
+/*
+ * Fetch the pending error of a socket via SO_ERROR.
+ *
+ * Returns the SO_ERROR value (0 when no error is pending). If getsockopt()
+ * itself fails, returns its errno, or EACCES when errno was not set.
+ */
+static int check_socket_status(int sock)
+{
+  int so_error;
+  socklen_t optlen = sizeof(so_error);
+
+  if (getsockopt(sock, SOL_SOCKET, SO_ERROR, &so_error, &optlen) >= 0) {
+    return so_error;
+  }
+
+  return errno != 0 ? errno : EACCES;
+}
+
+/**
+ * Advance the hello handshake of one connection by one step.
+ *
+ * Depending on the current state this checks the connect result, sends the
+ * local hello message, or receives and validates the peer's hello message.
+ * If the step needs to wait for more socket events, the socket is
+ * (re-)registered with the event poller and 0 is returned. Otherwise the
+ * handshake is over: the socket is detached from the poller and the entry is
+ * removed from the connect thread's list. On success the socket is passed to
+ * machine_add_connection() and machine_up_notify() is called; on failure
+ * the socket is closed and, for accepted sockets, its context is freed.
+ *
+ * @param pConnectContext handshake entry to drive
+ * @param needLock        forwarded to remove_connection()
+ * @return 0 if the handshake is still in progress or completed successfully,
+ *         otherwise an errno-style error code
+ */
+static int connection_handler(ConnectContext *pConnectContext, const bool needLock)
+{
+  int result;
+  SocketContext *pSockContext;
+  int events;
+  bool bNew;
+
+  pSockContext = pConnectContext->pSockContext;
+  bNew = false;
+  events = 0;
+  result = 0;
+  switch (pConnectContext->state) {
+    case STATE_CONNECTING:
+      result = check_socket_status(pSockContext->sock);
+      if (result != 0) {
+        break;
+      }
+      pConnectContext->state = STATE_CONNECTED;
+      // no break: a successful connect continues straight into STATE_CONNECTED
+    case STATE_CONNECTED:
+      if (pSockContext->connect_type == CONNECT_TYPE_CLIENT) {
+        // the client speaks first: queue the hello request
+        events = EVENTIO_WRITE;
+        pConnectContext->state = STATE_SEND_DATA;
+        fill_send_buffer(pConnectContext, FUNC_ID_CLUSTER_HELLO_REQUEST);
+      }
+      else {  //server
+        events = EVENTIO_READ;
+        bNew = true;  // socket must be attached to the poller, not modified
+        pConnectContext->state = STATE_RECV_DATA;
+        pConnectContext->recv_bytes = 0;
+        pConnectContext->server_start_time = CURRENT_MS(); 
+      }
+
+      break;
+    case STATE_SEND_DATA:
+      // retry immediately when the write was interrupted by a signal
+      while ((result=do_send_data(pConnectContext)) == EINTR) {
+      }
+
+      if (result == EAGAIN) {
+        events = EVENTIO_WRITE;
+        break;
+      }
+      else if (result != 0) {
+        break;
+      }
+
+      //send data done
+      if (pSockContext->connect_type == CONNECT_TYPE_CLIENT) {
+        // request sent, now wait for the server's hello response
+        events = EVENTIO_READ;
+        pConnectContext->state = STATE_RECV_DATA;
+        pConnectContext->recv_bytes = 0;
+        pConnectContext->server_start_time = CURRENT_MS(); 
+      }
+      else {  //server deal done
+        // events stays 0, so the handshake is finalized below
+      }
+      break;
+    case STATE_RECV_DATA:
+      // retry immediately when the read was interrupted by a signal
+      while ((result=do_recv_data(pConnectContext)) == EINTR) {
+      }
+
+      if (result == EAGAIN) {
+        events = EVENTIO_READ;
+        break;
+      }
+      else if (result != 0) {
+        break;
+      }
+
+      //recv data done
+      result = deal_hello_message(pSockContext, pConnectContext->buff);
+      if (pSockContext->connect_type == CONNECT_TYPE_CLIENT) {
+        // client is done after the response; events stays 0 and the
+        // handshake is finalized below with the validation result
+      }
+      else if (result == 0) {
+        // server accepted the request: answer with the hello response
+        events = EVENTIO_WRITE;
+        pConnectContext->state = STATE_SEND_DATA;
+        fill_send_buffer(pConnectContext, FUNC_ID_CLUSTER_HELLO_RESPONSE);
+      }
+      break;
+    default:
+      result = EINVAL;
+      break;
+  }
+
+  // The step wants to wait for more I/O: (re-)arm the poller and return.
+  if (events != 0) {
+    int ret;
+    if (bNew) {
+      ret = connect_thread_context.ev_poll->attach(pSockContext->sock,
+          events, pConnectContext);
+    }
+    else {
+      ret = connect_thread_context.ev_poll->modify(pSockContext->sock,
+          events, pConnectContext);
+    }
+    if (ret >= 0) {
+      return 0;
+    }
+
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: " __FILE__ ", line: %d, "
+        "event poll control fail, errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+  }
+
+  // Handshake finished (successfully or not): stop polling this socket.
+  // NOTE(review): a detach failure overwrites result, so a handshake that
+  // completed successfully is then handled as a failure below.
+   if (connect_thread_context.ev_poll->detach(pSockContext->sock) < 0) {
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: " __FILE__ ", line: %d, "
+        "event poll detach #%d fail, errno: %d, error info: %s",
+        __LINE__, pSockContext->sock,
+        result, strerror(result));
+  }
+
+  remove_connection(pSockContext, needLock);
+  if (result == 0) {
+    // hand the established socket over to its machine
+    result = machine_add_connection(pSockContext);
+    if (result == 0) {
+      machine_up_notify(pSockContext->machine);
+    }
+  }
+
+  if (result != 0) {
+    close_connection(pSockContext);
+    if (pSockContext->connect_type == CONNECT_TYPE_SERVER) {
+      free_accept_sock_context(pSockContext);
+    }
+  }
+
+  return result;
+}
+
+#ifdef USE_MULTI_ALLOCATOR
+/*
+ * Lazily create the per-socket message allocators.
+ *
+ * Does nothing when out_msg_allocator is already set. Otherwise creates both
+ * allocators, named after the socket context's index in
+ * socket_contexts_pool.
+ */
+static void check_init_allocator(SocketContext *pSockContext)
+{
+  char name[64];
+  int index;
+
+  if (pSockContext->out_msg_allocator == NULL) {
+    index = pSockContext - socket_contexts_pool;
+    // snprintf keeps the write bounded to the fixed-size name buffer
+    snprintf(name, sizeof(name), "OutMessage_%d", index);
+    pSockContext->out_msg_allocator = new Allocator(name,
+        sizeof(OutMessage), 512);
+
+    snprintf(name, sizeof(name), "InMessage_%d", index);
+    pSockContext->in_msg_allocator = new Allocator(name,
+        sizeof(InMessage), 128);
+  }
+}
+#endif
+
+/*
+ * Take one SocketContext from the outgoing (connect) free list of the
+ * machine identified by machine_ip.
+ *
+ * A machine slot is allocated on demand when the ip has none yet.
+ * Returns NULL when no slot can be allocated or the free list is empty.
+ * The whole lookup runs under connect_thread_context.lock.
+ */
+static SocketContext *alloc_connect_sock_context(const unsigned int machine_ip)
+{
+  SocketContext *context = NULL;
+  int slot;
+
+  ink_mutex_acquire(&connect_thread_context.lock);
+  slot = get_machine_index(machine_ip);
+  if (slot < 0) {
+    slot = alloc_machine_index(machine_ip);
+    if (slot >= 0) {
+      machine_sockets[slot].ip = machine_ip;
+    }
+  }
+
+  if (slot >= 0) {
+    context = machine_sockets[slot].connect_free_list;
+    if (context != NULL) {
+      //unlink the head of the free list
+      machine_sockets[slot].connect_free_list = context->next;
+
+#ifdef USE_MULTI_ALLOCATOR
+      check_init_allocator(context);
+#endif
+    }
+  }
+  ink_mutex_release(&connect_thread_context.lock);
+
+  return context;
+}
+
+/*
+ * Push pSockContext back onto the connect free list of its machine.
+ *
+ * needLock: when true the function takes connect_thread_context.lock
+ * around the list update; pass false when the caller already holds it.
+ * A warning is logged and nothing is changed when the machine ip has
+ * no slot.
+ */
+static void free_connect_sock_context(SocketContext *pSockContext,
+    const bool needLock)
+{
+  const int slot = get_machine_index(pSockContext->machine->ip);
+  if (slot < 0) {
+    Warning("file: "__FILE__", line: %d, "
+        "can't get slot for ip: %u.%u.%u.%u",
+        __LINE__, DOT_SEPARATED(pSockContext->machine->ip));
+    return;
+  }
+
+  if (needLock) {
+    ink_mutex_acquire(&connect_thread_context.lock);
+  }
+
+  SocketContextsByMachine *entry = machine_sockets + slot;
+  pSockContext->next = entry->connect_free_list;
+  entry->connect_free_list = pSockContext;
+
+  if (needLock) {
+    ink_mutex_release(&connect_thread_context.lock);
+  }
+}
+
+/*
+ * Take one SocketContext from the incoming (accept) free list of the
+ * machine identified by machine_ip.
+ *
+ * A machine slot is allocated on demand when the ip has none yet.
+ * Returns NULL when no slot can be allocated or the free list is empty.
+ * The whole lookup runs under connect_thread_context.lock.
+ */
+static SocketContext *alloc_accept_sock_context(const unsigned int machine_ip)
+{
+  SocketContext *context = NULL;
+  int slot;
+
+  ink_mutex_acquire(&connect_thread_context.lock);
+  slot = get_machine_index(machine_ip);
+  if (slot < 0) {
+    slot = alloc_machine_index(machine_ip);
+    if (slot >= 0) {
+      machine_sockets[slot].ip = machine_ip;
+    }
+  }
+
+  if (slot >= 0) {
+    context = machine_sockets[slot].accept_free_list;
+    if (context != NULL) {
+      //unlink the head of the free list
+      machine_sockets[slot].accept_free_list = context->next;
+
+#ifdef USE_MULTI_ALLOCATOR
+      check_init_allocator(context);
+#endif
+    }
+  }
+  ink_mutex_release(&connect_thread_context.lock);
+
+  return context;
+}
+
+/*
+ * Push pSockContext back onto the accept free list of its machine.
+ *
+ * The list update is always done under connect_thread_context.lock.
+ * A warning is logged and nothing is changed when the machine ip has
+ * no slot.
+ */
+void free_accept_sock_context(SocketContext *pSockContext)
+{
+  const int slot = get_machine_index(pSockContext->machine->ip);
+  if (slot < 0) {
+    Warning("file: "__FILE__", line: %d, "
+        "can't get slot for ip: %u.%u.%u.%u",
+        __LINE__, DOT_SEPARATED(pSockContext->machine->ip));
+    return;
+  }
+
+  ink_mutex_acquire(&connect_thread_context.lock);
+  SocketContextsByMachine *entry = machine_sockets + slot;
+  pSockContext->next = entry->accept_free_list;
+  entry->accept_free_list = pSockContext;
+  ink_mutex_release(&connect_thread_context.lock);
+}
+
+/*
+ * Allocate and zero the global pool of SocketContext objects and
+ * initialize the send-queue mutex of every priority in every context.
+ *
+ * The pool holds connections_per_machine * MAX_MACHINE_COUNT entries
+ * plus one extra entry.
+ *
+ * connections_per_machine: number of contexts reserved per machine
+ * pool: receives the address of the allocated array
+ * return: 0 on success, otherwise an errno-style code
+ */
+static int alloc_socket_contexts(const int connections_per_machine,
+    SocketContext **pool)
+{
+  const int total_connections = connections_per_machine * MAX_MACHINE_COUNT + 1;
+  const int bytes = sizeof(SocketContext) * total_connections;
+
+  *pool = (SocketContext *)malloc(bytes);
+  if (*pool == NULL) {
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, bytes, errno, strerror(errno));
+    return errno != 0 ? errno : ENOMEM;
+  }
+  memset(*pool, 0, bytes);
+
+  for (int n = 0; n < total_connections; n++) {
+    SocketContext *context = *pool + n;
+    for (int priority = 0; priority < PRIORITY_COUNT; priority++) {
+      const int result = ink_mutex_init(
+          &context->send_queues[priority].lock, "send_lock");
+      if (result != 0) {
+        return result;
+      }
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * Build the socket context pool and distribute it over the machines.
+ *
+ * Pool entry 0 is left for the listening (server accept) socket. For
+ * every machine slot, half of num_of_cluster_connections contexts go to
+ * the accept free list and the same number to the connect free list.
+ * Worker thread contexts are assigned round-robin over
+ * num_of_cluster_threads.
+ *
+ * return: 0 on success, otherwise the error from alloc_socket_contexts()
+ */
+static int init_socket_contexts()
+{
+  const int result = alloc_socket_contexts(num_of_cluster_connections,
+      &socket_contexts_pool);
+  if (result != 0) {
+    return result;
+  }
+
+  const int per_direction = num_of_cluster_connections / 2;
+  SocketContext *context = socket_contexts_pool + 1;   //0 for server accept
+  int thread_index = 0;
+
+  for (int machine_index = 0; machine_index < MAX_MACHINE_COUNT; machine_index++) {
+    SocketContextsByMachine *entry = machine_sockets + machine_index;
+    int k;
+
+    //contexts used for connections accepted from the peer
+    for (k = 0; k < per_direction; k++, context++) {
+      context->connect_type = CONNECT_TYPE_SERVER;
+      context->next = entry->accept_free_list;
+      entry->accept_free_list = context;
+      context->thread_context = cluster_worker_thread_contexts +
+        thread_index % num_of_cluster_threads;
+      thread_index++;
+    }
+
+    //contexts used for connections initiated towards the peer
+    for (k = 0; k < per_direction; k++, context++) {
+      context->connect_type = CONNECT_TYPE_CLIENT;
+      context->next = entry->connect_free_list;
+      entry->connect_free_list = context;
+      context->thread_context = cluster_worker_thread_contexts +
+        thread_index % num_of_cluster_threads;
+      thread_index++;
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * Allocate the global connection bookkeeping structures:
+ * the per-machine socket table, the ConnectContext buffer and its
+ * pointer list, the event poller, the connection lock and finally the
+ * socket context pool.
+ *
+ * return: 0 on success, otherwise an errno-style code. Memory acquired
+ * before a failing step is not released here.
+ */
+int connection_init()
+{
+  int rc;
+  int bytes;
+
+  //per-machine socket lists
+  bytes = sizeof(SocketContextsByMachine) * MAX_MACHINE_COUNT;
+  machine_sockets = (SocketContextsByMachine *)malloc(bytes);
+  if (machine_sockets == NULL) {
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, bytes, errno, strerror(errno));
+    return errno != 0 ? errno : ENOMEM;
+  }
+  memset(machine_sockets, 0, bytes);
+
+  //one slot per possible connection, plus one extra
+  connect_thread_context.alloc_size = MAX_MACHINE_COUNT *
+    num_of_cluster_connections + 1;
+
+  //storage for the connect contexts themselves
+  bytes = sizeof(ConnectContext) * connect_thread_context.alloc_size;
+  connect_thread_context.connections_buffer = (ConnectContext *)malloc(bytes);
+  if (connect_thread_context.connections_buffer == NULL) {
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, bytes, errno, strerror(errno));
+    return errno != 0 ? errno : ENOMEM;
+  }
+  memset(connect_thread_context.connections_buffer, 0, bytes);
+
+  //list of pointers to the contexts currently in use
+  bytes = sizeof(ConnectContext *) * connect_thread_context.alloc_size;
+  connect_thread_context.connections = (ConnectContext **)malloc(bytes);
+  if (connect_thread_context.connections == NULL) {
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, bytes, errno, strerror(errno));
+    return errno != 0 ? errno : ENOMEM;
+  }
+  memset(connect_thread_context.connections, 0, bytes);
+  connect_thread_context.connection_count = 0;
+
+  connect_thread_context.ev_poll = new EventPoll(
+      connect_thread_context.alloc_size, 1000);
+  if (connect_thread_context.ev_poll == NULL) {
+    Error("file: " __FILE__ ", line: %d, "
+        "new EventPoll fail, errno: %d, error info: %s",
+        __LINE__, errno, strerror(errno));
+    return errno != 0 ? errno : ENOMEM;
+  }
+
+  rc = ink_mutex_init(&connect_thread_context.lock, "connection_lock");
+  if (rc != 0) {
+    return rc;
+  }
+
+  return init_socket_contexts();
+}
+
+/*
+ * Teardown counterpart of connection_init().
+ * Currently an empty stub: nothing allocated by connection_init() is
+ * released here.
+ */
+void connection_destroy()
+{
+}
+
+/*
+ * Look up the ConnectContext that currently owns pSockContext.
+ *
+ * The active connection list is scanned under
+ * connect_thread_context.lock, and the matching entry is captured while
+ * the lock is still held: the list can be modified by other threads as
+ * soon as the lock is dropped, so the slot must not be read afterwards.
+ * Empty (NULL) slots are skipped.
+ *
+ * return: the matching context, or NULL when none is found
+ */
+static ConnectContext *find_connection(SocketContext *pSockContext)
+{
+  ConnectContext *found;
+  ConnectContext **ppConnection;
+  ConnectContext **ppConnectionEnd;
+
+  found = NULL;
+  ink_mutex_acquire(&connect_thread_context.lock);
+  ppConnectionEnd = connect_thread_context.connections +
+    connect_thread_context.connection_count;
+  for (ppConnection=connect_thread_context.connections; ppConnection<ppConnectionEnd;
+      ppConnection++)
+  {
+    if (*ppConnection != NULL &&
+        (*ppConnection)->pSockContext == pSockContext)
+    {
+      found = *ppConnection;
+      break;
+    }
+  }
+  ink_mutex_release(&connect_thread_context.lock);
+
+  return found;
+}
+
+/*
+ * Start a non-blocking TCP connect for pConnectContext.
+ *
+ * Creates the socket, makes it non-blocking, enables TCP_NODELAY and
+ * calls connect(). When connect() completes immediately the handshake
+ * is continued through connection_handler(); when it is in progress the
+ * socket is attached to the event poller for EVENTIO_WRITE.
+ *
+ * Every failure after the socket was created goes through
+ * close_connection(), so the descriptor is not left open (this includes
+ * the setsockopt failure path). errno is captured before any logging
+ * call, since logging may overwrite it.
+ *
+ * pConnectContext: the connect context; its pSockContext must be set
+ * needLock: forwarded to remove_connection() / connection_handler()
+ * return: 0 when the handshake already advanced successfully,
+ *         EINPROGRESS when the connect is pending, otherwise an
+ *         errno-style error code
+ */
+static int do_connect(ConnectContext *pConnectContext, const bool needLock)
+{
+  int result;
+  struct sockaddr_in addr;
+  SocketContext *pSockContext;
+
+  pSockContext = pConnectContext->pSockContext;
+  pSockContext->sock = socket(AF_INET, SOCK_STREAM, 0);
+  pConnectContext->connect_count++;
+  pConnectContext->state = STATE_CONNECTING;
+  if (pSockContext->sock < 0) {
+    result = errno != 0 ? errno : EMFILE;
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "socket create failed, errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+    return result;
+  }
+
+  if ((result=safe_nonblocking(pSockContext->sock)) != 0) {
+    close_connection(pSockContext);
+    return result;
+  }
+
+  if (safe_setsockopt(pSockContext->sock, IPPROTO_TCP, TCP_NODELAY,
+        SOCKOPT_ON, sizeof(int)) < 0)
+  {
+    result = errno != 0 ? errno : EINVAL;
+    Error("file: "__FILE__", line: %d, "
+        "setsockopt failed, errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+    //release the socket like the other failure paths do
+    close_connection(pSockContext);
+    return result;
+  }
+
+  memset(&addr, 0, sizeof(addr));
+  addr.sin_family = AF_INET;
+  addr.sin_port = htons(pSockContext->machine->cluster_port);
+  result = inet_aton(pSockContext->machine->hostname, &addr.sin_addr);
+  if (result == 0) {
+    //invalid address: drop the connection from the list as well
+    close_connection(pSockContext);
+    remove_connection(pSockContext, needLock);
+    return EINVAL;
+  }
+
+  pConnectContext->connect_start_time = CURRENT_MS();   //connect start time
+  if (connect(pSockContext->sock, (const struct sockaddr*)&addr,
+        sizeof(addr)) == 0)  //success
+  {
+    pConnectContext->state = STATE_CONNECTED;
+    pConnectContext->need_check_timeout = true;
+    return connection_handler(pConnectContext, needLock);
+  }
+
+  result = errno != 0 ? errno : EINPROGRESS;
+  if (result != EINPROGRESS) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "connect to %s:%d failed, errno: %d, error info: %s",
+        __LINE__, pSockContext->machine->hostname,
+        pSockContext->machine->cluster_port, result, strerror(result));
+    close_connection(pSockContext);
+    return result;
+  }
+
+  //connect is pending: wait until the socket becomes writable
+  if (connect_thread_context.ev_poll->attach(pSockContext->sock,
+        EVENTIO_WRITE, pConnectContext) < 0)
+  {
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: " __FILE__ ", line: %d, "
+        "event poll attach fail, errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+    close_connection(pSockContext);
+    return result;
+  }
+
+  pConnectContext->need_check_timeout = true;
+  return result;
+}
+
+/*
+ * Reserve an unused ConnectContext from connections_buffer and append
+ * it to the active connection list.
+ *
+ * The context is reset to its initial state (not connected, no
+ * reconnect, not an accept context, zero byte counters) BEFORE it is
+ * published in the shared list and while the lock is still held, so
+ * other threads walking the list never see half-initialized or stale
+ * flags from a previous use. pSockContext is left for the caller to
+ * assign.
+ *
+ * return: the reserved context, or NULL when the list is full or no
+ * free buffer entry exists
+ */
+static ConnectContext *alloc_connect_context()
+{
+  ConnectContext *pConnectContext;
+  ConnectContext *pConnectEnd;
+
+  ink_mutex_acquire(&connect_thread_context.lock);
+  if (connect_thread_context.connection_count >=
+      connect_thread_context.alloc_size)
+  {
+    ink_mutex_release(&connect_thread_context.lock);
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "exceeds max connection: %d",
+        __LINE__, connect_thread_context.alloc_size);
+    return NULL;
+  }
+
+  pConnectEnd = connect_thread_context.connections_buffer +
+    connect_thread_context.alloc_size;
+  for (pConnectContext=connect_thread_context.connections_buffer;
+      pConnectContext<pConnectEnd; pConnectContext++)
+  {
+    if (!pConnectContext->used) {
+      break;
+    }
+  }
+  if (pConnectContext == pConnectEnd) {
+    ink_mutex_release(&connect_thread_context.lock);
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "alloc connection from buffer fail", __LINE__);
+    return NULL;
+  }
+
+  pConnectContext->used = true;
+  pConnectContext->is_accept = false;   //may be stale from a previous use
+  pConnectContext->need_reconnect = false;
+  pConnectContext->need_check_timeout = false;
+  pConnectContext->reconnect_interval = 100;
+  pConnectContext->connect_count = 0;
+  pConnectContext->state = STATE_NOT_CONNECT;
+  pConnectContext->send_bytes = 0;
+  pConnectContext->recv_bytes = 0;
+  pConnectContext->total_bytes = sizeof(MsgHeader) + sizeof(HelloMessage);
+
+  //publish only after the context is fully reset
+  connect_thread_context.connections[connect_thread_context.
+    connection_count++] = pConnectContext;
+  ink_mutex_release(&connect_thread_context.lock);
+
+  return pConnectContext;
+}
+
+/*
+ * Disable automatic reconnect for every active connection that belongs
+ * to machine m.
+ *
+ * The active list is walked under connect_thread_context.lock. Entries
+ * that are NULL or have no socket context yet are skipped instead of
+ * being dereferenced (do_reconnect() guards against the same
+ * condition).
+ *
+ * return: 0 when at least one connection matched, ENOENT otherwise
+ */
+int machine_stop_reconnect(ClusterMachine *m)
+{
+  int count;
+  ConnectContext **ppConnection;
+  ConnectContext **ppConnectionEnd;
+
+  count = 0;
+  ink_mutex_acquire(&connect_thread_context.lock);
+  ppConnectionEnd = connect_thread_context.connections +
+    connect_thread_context.connection_count;
+  for (ppConnection=connect_thread_context.connections; ppConnection<ppConnectionEnd;
+      ppConnection++)
+  {
+    if (*ppConnection == NULL || (*ppConnection)->pSockContext == NULL) {
+      continue;   //context not fully set up yet
+    }
+
+    if ((*ppConnection)->pSockContext->machine == m) {
+      count++;
+      (*ppConnection)->need_reconnect = false;
+    }
+  }
+  ink_mutex_release(&connect_thread_context.lock);
+
+  return count > 0 ? 0 : ENOENT;
+}
+
+/*
+ * Open the outgoing connections to machine m.
+ *
+ * After init_machine_sessions() succeeds, half of
+ * num_of_cluster_connections socket contexts are taken from the
+ * machine's connect free list and a connect is started for each.
+ *
+ * return: 0 on success, the init_machine_sessions() error, or ENOSPC
+ * when a socket context cannot be allocated
+ *
+ * NOTE(review): the result of make_connection() is ignored; confirm
+ * whether a context should be returned to the free list when
+ * make_connection() itself refuses it (EEXIST / ENOSPC).
+ */
+int machine_make_connections(ClusterMachine *m)
+{
+  const int result = init_machine_sessions(m, false);
+  if (result != 0) {
+    return result;
+  }
+
+  const int wanted = num_of_cluster_connections / 2;
+  for (int made = 0; made < wanted; made++) {
+    SocketContext *context = alloc_connect_sock_context(m->ip);
+    if (context == NULL) {
+      return ENOSPC;
+    }
+
+    context->machine = m;
+    make_connection(context);
+  }
+
+  return 0;
+}
+
+/*
+ * Register pSockContext as an outgoing connection and start connecting.
+ *
+ * return: EEXIST when the socket context already has a connect context,
+ * ENOSPC when no connect context is available, otherwise the result of
+ * do_connect() (which is EINPROGRESS for a pending connect)
+ */
+int make_connection(SocketContext *pSockContext)
+{
+  if (find_connection(pSockContext) != NULL) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "connection: %p already exist!", __LINE__, pSockContext);
+    return EEXIST;
+  }
+
+  ConnectContext *context = alloc_connect_context();
+  if (context == NULL) {
+    return ENOSPC;
+  }
+
+  //outgoing connections are retried automatically, starting at 100
+  context->need_reconnect = true;
+  context->reconnect_interval = 100;
+  context->pSockContext = pSockContext;
+
+  return do_connect(context, true);
+}
+
+/*
+ * Bind sock to bind_ipaddr:port.
+ *
+ * sock: the socket to bind
+ * bind_ipaddr: dotted IPv4 address; NULL or "" binds to any address
+ * port: port number in host byte order
+ * return: 0 on success, EINVAL for an unparsable address, otherwise
+ * the errno of the failed bind()
+ *
+ * The address structure is zeroed before use so no uninitialized
+ * padding is handed to bind(), and errno is captured before logging.
+ */
+static int socket_bind(int sock, const char *bind_ipaddr, const int port)
+{
+  struct sockaddr_in bindaddr;
+  int result;
+
+  memset(&bindaddr, 0, sizeof(bindaddr));
+  bindaddr.sin_family = AF_INET;
+  bindaddr.sin_port = htons(port);
+  if (bind_ipaddr == NULL || *bind_ipaddr == '\0') {
+    bindaddr.sin_addr.s_addr = htonl(INADDR_ANY);
+  }
+  else {
+    if (inet_aton(bind_ipaddr, &bindaddr.sin_addr) == 0) {
+      Error("file: "__FILE__", line: %d, "
+          "invalid ip address: %s", __LINE__, bind_ipaddr);
+      return EINVAL;
+    }
+  }
+
+  if (bind(sock, (struct sockaddr*)&bindaddr, sizeof(bindaddr)) < 0) {
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: "__FILE__", line: %d, "
+        "bind port %d failed, errno: %d, error info: %s",
+        __LINE__, port, result, strerror(result));
+    return result;
+  }
+
+  return 0;
+}
+
+/*
+ * Create a listening TCP socket.
+ *
+ * The socket gets SO_REUSEADDR, is bound via socket_bind() and put
+ * into listen state with a backlog of 1024.
+ *
+ * bind_ipaddr: address to bind, NULL or "" for any
+ * port: port to listen on
+ * err_no: receives 0 on success or the error code on failure
+ * return: the socket descriptor, or -1 on failure (the socket is
+ * closed before returning)
+ */
+static int socket_server(const char *bind_ipaddr, const int port, int *err_no)
+{
+  int reuse_addr = 1;
+  const int listen_fd = socket(AF_INET, SOCK_STREAM, 0);
+
+  if (listen_fd < 0) {
+    *err_no = errno != 0 ? errno : EMFILE;
+    Error("file: "__FILE__", line: %d, "
+        "socket create failed, errno: %d, error info: %s",
+        __LINE__, errno, strerror(errno));
+    return -1;
+  }
+
+  if (setsockopt(listen_fd, SOL_SOCKET, SO_REUSEADDR,
+        &reuse_addr, sizeof(int)) < 0)
+  {
+    *err_no = errno != 0 ? errno : ENOMEM;
+    Error("file: "__FILE__", line: %d, "
+        "setsockopt failed, errno: %d, error info: %s",
+        __LINE__, errno, strerror(errno));
+    close(listen_fd);
+    return -1;
+  }
+
+  *err_no = socket_bind(listen_fd, bind_ipaddr, port);
+  if (*err_no != 0) {
+    close(listen_fd);
+    return -1;
+  }
+
+  if (listen(listen_fd, 1024) < 0) {
+    *err_no = errno != 0 ? errno : EINVAL;
+    Error("file: "__FILE__", line: %d, "
+        "listen port %d failed, errno: %d, error info: %s",
+        __LINE__, port, errno, strerror(errno));
+    close(listen_fd);
+    return -1;
+  }
+
+  *err_no = 0;
+  return listen_fd;
+}
+
+/*
+ * Apply the standard options to a server-side socket.
+ *
+ * SO_LINGER is switched off and TCP_NODELAY is enabled; both are
+ * mandatory and their failure is returned. When timeout > 0 the send
+ * and receive timeouts are set to that many seconds; a failure there
+ * is only logged as a warning.
+ *
+ * fd: the socket
+ * timeout: send/receive timeout in seconds, <= 0 to leave unchanged
+ * return: 0 on success, otherwise an errno-style code
+ */
+static int set_server_sock_opt(int fd, const int timeout)
+{
+  struct linger no_linger;
+  int nodelay_on;
+
+  no_linger.l_onoff = 0;
+  no_linger.l_linger = 0;
+  if (setsockopt(fd, SOL_SOCKET, SO_LINGER,
+        &no_linger, (socklen_t)sizeof(struct linger)) < 0)
+  {
+    Error("file: "__FILE__", line: %d, "
+        "setsockopt failed, errno: %d, error info: %s",
+        __LINE__, errno, strerror(errno));
+    return errno != 0 ? errno : ENOMEM;
+  }
+
+  if (timeout > 0) {
+    //send timeout first, then receive timeout; both are best effort
+    const int timeout_options[2] = {SO_SNDTIMEO, SO_RCVTIMEO};
+    struct timeval waittime;
+
+    waittime.tv_sec = timeout;
+    waittime.tv_usec = 0;
+    for (int n = 0; n < 2; n++) {
+      if (setsockopt(fd, SOL_SOCKET, timeout_options[n],
+            &waittime, (socklen_t)sizeof(struct timeval)) < 0)
+      {
+        Warning("file: "__FILE__", line: %d, "
+            "setsockopt failed, errno: %d, error info: %s",
+            __LINE__, errno, strerror(errno));
+      }
+    }
+  }
+
+  nodelay_on = 1;
+  if (setsockopt(fd, IPPROTO_TCP, TCP_NODELAY,
+        (char *)&nodelay_on, sizeof(nodelay_on)) < 0)
+  {
+    Error("file: "__FILE__", line: %d, "
+        "setsockopt failed, errno: %d, error info: %s",
+        __LINE__, errno, strerror(errno));
+    return errno != 0 ? errno : EINVAL;
+  }
+
+  return 0;
+}
+
+/*
+ * Initialize the connection manager.
+ *
+ * Creates the non-blocking listening socket on cluster_port,
+ * initializes the machine table, nio, connection and session layers,
+ * then registers the listening socket (pool entry 0) with the event
+ * poller as an accept context.
+ *
+ * my_ip: ip of this node; when non-zero it is recorded in
+ * my_machine_ip and added to the machine list
+ * return: 0 on success, otherwise an errno-style code
+ *
+ * A socket_server() failure is reported with the code that function
+ * returned through its err_no argument, because errno may have been
+ * overwritten by its logging and close() calls. The listening socket is
+ * closed on failures that occur before it is stored in a socket
+ * context.
+ */
+int connection_manager_init(const unsigned int my_ip)
+{
+  ConnectContext *pConnectContext;
+  char bind_addr[IP_ADDRESS_SIZE];
+  int result;
+  int server_sock;
+
+  assert(MSG_HEADER_LENGTH % 16 == 0);
+  *bind_addr = '\0';
+  server_sock = socket_server(bind_addr, cluster_port, &result);
+  if (server_sock < 0) {
+    return result != 0 ? result : EIO;
+  }
+
+  if ((result=set_server_sock_opt(server_sock, 0)) != 0) {
+    close(server_sock);
+    return result;
+  }
+
+  if ((result=safe_nonblocking(server_sock)) != 0) {
+    close(server_sock);
+    return result;
+  }
+
+  if ((result=init_machines()) != 0) {
+    close(server_sock);
+    return result;
+  }
+
+  if (my_ip > 0) {
+    my_machine_ip = my_ip;
+    add_machine(my_ip, cluster_port);
+  }
+
+  if ((result=nio_init()) != 0 || (result=connection_init()) != 0
+      || (result=session_init()) != 0)
+  {
+    close(server_sock);
+    return result;
+  }
+
+  pConnectContext = alloc_connect_context();
+  if (pConnectContext == NULL) {
+    close(server_sock);
+    return ENOSPC;
+  }
+
+  //pool entry 0 is reserved for the listening socket
+  pConnectContext->pSockContext = socket_contexts_pool + 0;
+  pConnectContext->is_accept = true;
+  pConnectContext->pSockContext->sock = server_sock;
+  if (connect_thread_context.ev_poll->attach(server_sock, EVENTIO_READ,
+        pConnectContext) < 0)
+  {
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: " __FILE__ ", line: %d, "
+        "event poll attach fail, errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+    return result;
+  }
+
+  return 0;
+}
+
+/*
+ * Teardown counterpart of connection_manager_init().
+ * Currently an empty stub: the listening socket and the structures set
+ * up by connection_manager_init() are not released here.
+ */
+void connection_manager_destroy()
+{
+}
+
+/*
+ * Start the connect worker thread (connect_worker_entrance).
+ *
+ * A zero return from ink_thread_create() is treated as failure.
+ *
+ * return: 0 on success, otherwise errno (ENOMEM when errno is unset)
+ */
+int connection_manager_start()
+{
+  if (ink_thread_create(connect_worker_entrance, NULL) != 0) {
+    return 0;
+  }
+
+  const int result = errno != 0 ? errno : ENOMEM;
+  Error("file: "__FILE__", line: %d, "
+      "create thread failed, errno: %d, error info: %s",
+      __LINE__, result, strerror(result));
+  return result;
+}
+
+/*
+ * Close connections whose connect or hello-receive phase timed out.
+ *
+ * Under connect_thread_context.lock the active list is scanned for
+ * contexts flagged need_check_timeout with an open socket. A context
+ * in STATE_RECV_DATA expires 1000 ms after server_start_time; a context
+ * in STATE_CONNECTING expires cluster_connect_timeout seconds after
+ * connect_start_time. At most MAX_TIMEOUT_SOCKET_COUNT contexts are
+ * collected per call; each one is detached from the event poller and
+ * released.
+ *
+ * return: always 0
+ */
+static int close_timeout_connections()
+{
+#define MAX_TIMEOUT_SOCKET_COUNT  64
+  ConnectContext *expired[MAX_TIMEOUT_SOCKET_COUNT];
+  ConnectContext **ppConnection;
+  ConnectContext **ppConnectionEnd;
+  int expired_count = 0;
+
+  ink_mutex_acquire(&connect_thread_context.lock);
+
+  //pass 1: collect the expired contexts
+  ppConnectionEnd = connect_thread_context.connections +
+    connect_thread_context.connection_count;
+  for (ppConnection = connect_thread_context.connections;
+      ppConnection < ppConnectionEnd; ppConnection++)
+  {
+    ConnectContext *context = *ppConnection;
+    bool is_expired;
+
+    if (!context->need_check_timeout || context->pSockContext->sock < 0) {
+      continue;
+    }
+
+    if (context->state == STATE_RECV_DATA) {
+      is_expired = (CURRENT_MS() - context->server_start_time >= 1000);
+    }
+    else {
+      is_expired = (context->state == STATE_CONNECTING &&
+          CURRENT_MS() - context->connect_start_time >=
+          cluster_connect_timeout * 1000);
+    }
+
+    if (is_expired) {
+      expired[expired_count++] = context;
+      if (expired_count == MAX_TIMEOUT_SOCKET_COUNT) {
+        break;   //the rest is handled by a later call
+      }
+    }
+  }
+
+  //pass 2: detach and release them (the lock is already held)
+  for (int n = 0; n < expired_count; n++) {
+    SocketContext *pSockContext = expired[n]->pSockContext;
+
+    if (connect_thread_context.ev_poll->detach(pSockContext->sock) < 0) {
+      Error("file: " __FILE__ ", line: %d, "
+          "event poll detach #%d fail, errno: %d, error info: %s",
+          __LINE__, pSockContext->sock, errno, strerror(errno));
+    }
+
+    Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+        "close timeout %s connection #%d %s:%d, type: %c",
+        __LINE__, expired[n]->state == STATE_RECV_DATA ?
+        "recv" : "connect", pSockContext->sock,
+        pSockContext->machine->hostname,
+        pSockContext->machine->cluster_port,
+        pSockContext->connect_type);
+
+    release_connection(pSockContext, false);
+  }
+
+  ink_mutex_release(&connect_thread_context.lock);
+  return 0;
+}
+
+/*
+ * Walk the active connection list and either retry or retire every
+ * connection whose socket is closed (sock < 0).
+ *
+ * - need_reconnect set and at least one attempt made: once
+ *   reconnect_interval ms have passed since connect_start_time, the
+ *   interval is doubled (capped at 1000 ms for a machine marked dead,
+ *   30000 ms otherwise) and do_connect() is called again.
+ * - need_reconnect set but no attempt made yet (connect_count == 0):
+ *   the entry is skipped. Without advancing the cursor here the loop
+ *   would spin on the same entry while holding the lock.
+ * - need_reconnect clear: the connection is removed from the list and
+ *   its socket context is handed back to the connect free list.
+ *
+ * The whole walk runs under connect_thread_context.lock, so helpers are
+ * called with needLock == false. The end pointer is recomputed whenever
+ * a callee may have changed connection_count.
+ *
+ * return: always 0
+ */
+static int do_reconnect()
+{
+  ConnectContext **ppConnection;
+  ConnectContext **ppConnectionEnd;
+  ConnectContext *pConnection;
+  SocketContext *pSockContext;
+  int max_reconnect_interval;
+
+  ink_mutex_acquire(&connect_thread_context.lock);
+  ppConnectionEnd = connect_thread_context.connections +
+    connect_thread_context.connection_count;
+  ppConnection = connect_thread_context.connections;
+  while (ppConnection < ppConnectionEnd) {
+    pConnection = *ppConnection;
+    if (pConnection == NULL || pConnection->pSockContext == NULL) {
+      Warning("file: " __FILE__ ", line: %d, "
+          "pConnection: %p, pSockContext: %p", __LINE__,
+          pConnection, pConnection != NULL ? pConnection->pSockContext : NULL);
+      ppConnection++;
+      continue;
+    }
+
+    if (pConnection->pSockContext->sock >= 0) {  //already in progress or connected
+      ppConnection++;
+      continue;
+    }
+
+    if (pConnection->need_reconnect) {
+      if (pConnection->connect_count == 0) {  //first connect not started yet
+        ppConnection++;
+        continue;
+      }
+
+      //should reconnect, but not before the back-off interval elapsed
+      if (CURRENT_MS() - pConnection->connect_start_time <
+          pConnection->reconnect_interval)
+      {
+        ppConnection++;
+        continue;
+      }
+
+      pConnection->reconnect_interval *= 2;
+      if (pConnection->pSockContext->machine->dead) {
+        max_reconnect_interval = 1000;
+      }
+      else {
+        max_reconnect_interval = 30000;
+      }
+      if (pConnection->reconnect_interval > max_reconnect_interval) {
+        pConnection->reconnect_interval = max_reconnect_interval;
+      }
+      pConnection->need_check_timeout = false;
+      do_connect(pConnection, false);
+      ppConnection++;
+      //do_connect may have removed entries from the list
+      ppConnectionEnd = connect_thread_context.connections +
+        connect_thread_context.connection_count;
+    }
+    else {   //should release
+      pSockContext = pConnection->pSockContext;
+      if (remove_connection(pSockContext, false) == 0) {  //removed
+        //the list shrank: stay on this slot, only refresh the end
+        ppConnectionEnd = connect_thread_context.connections +
+          connect_thread_context.connection_count;
+      }
+      else {
+        ppConnection++;
+      }
+
+      free_connect_sock_context(pSockContext, false);
+    }
+  }
+  ink_mutex_release(&connect_thread_context.lock);
+
+  return 0;
+}
+
+/*
+ * Fetch the IPv4 address of the peer connected to sock.
+ *
+ * sock: a connected socket
+ * buff: receives the dotted-decimal text, or "" when the address
+ *       cannot be obtained or formatted
+ * bufferSize: size of buff in bytes
+ * return: the peer address as stored in sin_addr.s_addr, or
+ * INADDR_NONE when getpeername() fails
+ */
+static in_addr_t get_peer_ip(int sock, char *buff, const int bufferSize)
+{
+  struct sockaddr_in peer;
+  socklen_t peer_len = sizeof(peer);
+
+  memset(&peer, 0, sizeof(peer));
+  if (getpeername(sock, (struct sockaddr *)&peer, &peer_len) != 0) {
+    *buff = '\0';
+    return INADDR_NONE;
+  }
+
+  //an empty address or a formatting failure both yield an empty string
+  if (peer_len == 0 ||
+      inet_ntop(AF_INET, &peer.sin_addr, buff, bufferSize) == NULL)
+  {
+    *buff = '\0';
+  }
+
+  return peer.sin_addr.s_addr;
+}
+
+/*
+ * Set up a freshly accepted socket.
+ *
+ * The socket is made non-blocking with TCP_NODELAY, its peer must be a
+ * known cluster machine, and it needs both a free accept socket context
+ * and a free connect context. On success the handshake is driven
+ * through connection_handler() starting in STATE_CONNECTED.
+ *
+ * incomesock: the accepted socket; it is NOT closed here on failure
+ * (deal_accept_event() closes it when a non-zero code is returned)
+ * return: 0 once the connection was handed to connection_handler(),
+ * otherwise an errno-style code
+ */
+static int deal_income_connection(const int incomesock)
+{
+  char client_ip[IP_ADDRESS_SIZE];
+
+  const int result = safe_nonblocking(incomesock);
+  if (result != 0) {
+    return result;
+  }
+  if (safe_setsockopt(incomesock, IPPROTO_TCP, TCP_NODELAY, SOCKOPT_ON, sizeof(int)) < 0) {
+    Error("file: "__FILE__", line: %d, "
+        "setsockopt failed, errno: %d, error info: %s",
+        __LINE__, errno, strerror(errno));
+    return errno != 0 ? errno : EINVAL;
+  }
+
+  //only peers from the machine list are accepted
+  const in_addr_t peer_ip = get_peer_ip(incomesock, client_ip, sizeof(client_ip));
+  ClusterMachine *peer = get_machine(peer_ip, cluster_port);
+  if (peer == NULL) {
+    Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+        "client: %s not in my machine list",
+        __LINE__, client_ip);
+    return ENOENT;
+  }
+
+  SocketContext *sock_context = alloc_accept_sock_context(peer->ip);
+  if (sock_context == NULL) {
+    Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+        "client: %s, too many income connections, exceeds %d",
+        __LINE__, client_ip, num_of_cluster_connections / 2);
+    return ENOSPC;
+  }
+  sock_context->sock = incomesock;
+  sock_context->machine = peer;
+
+  ConnectContext *connect_context = alloc_connect_context();
+  if (connect_context == NULL) {
+    free_accept_sock_context(sock_context);
+    return ENOSPC;
+  }
+
+  connect_context->pSockContext = sock_context;
+  connect_context->state = STATE_CONNECTED;
+  connect_context->need_check_timeout = true;
+  connection_handler(connect_context, true);
+  return 0;
+}
+
+/*
+ * Accept one pending connection on the listening socket.
+ *
+ * pSockContext: socket context of the listening socket
+ * return: 0 when the caller should call again (a connection was
+ * accepted, or accept() was interrupted), otherwise the errno-style
+ * code of the failed accept(); EAGAIN / EWOULDBLOCK mean the backlog
+ * is drained.
+ *
+ * The error is normalized into `result` once and that value is used
+ * for every decision and log line, so a later change of errno cannot
+ * make the "no more connections" case be logged as an error.
+ */
+static int deal_accept_event(SocketContext *pSockContext)
+{
+  int incomesock;
+  int result;
+  struct sockaddr_in inaddr;
+  socklen_t sockaddr_len;
+
+  sockaddr_len = sizeof(inaddr);
+  incomesock = accept(pSockContext->sock, (struct sockaddr*)&inaddr,
+      &sockaddr_len);
+  if (incomesock < 0) {  //error
+    result = errno != 0 ? errno : EAGAIN;
+    if (result == EINTR) {
+      Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+          "accept failed, errno: %d, error info: %s",
+          __LINE__, result, strerror(result));
+      return 0; //should try again
+    }
+    else if (!(result == EAGAIN || result == EWOULDBLOCK)) {
+      Error("file: "__FILE__", line: %d, "
+          "accept failed, errno: %d, error info: %s",
+          __LINE__, result, strerror(result));
+    }
+
+    return result;
+  }
+
+  //a rejected connection is closed here; the accept loop goes on
+  result = deal_income_connection(incomesock);
+  if (result != 0)
+  {
+    close(incomesock);
+  }
+
+  return 0;
+}
+
+/*
+ * Dispatch the events returned by the last event poll.
+ *
+ * For the listening context, connections are accepted until
+ * deal_accept_event() reports a non-zero code. For any other context an
+ * error event releases the connection, while a read or write event is
+ * passed to connection_handler().
+ *
+ * count: number of ready events
+ * return: always 0
+ */
+static int deal_connect_events(const int count)
+{
+  for (int i = 0; i < count; i++) {
+    const int events = connect_thread_context.ev_poll->getEvents(i);
+    ConnectContext *context =
+      (ConnectContext *)connect_thread_context.ev_poll->getData(i);
+    SocketContext *sock_context = context->pSockContext;
+
+    if (context->is_accept) {
+      //drain the accept backlog
+      while (deal_accept_event(sock_context) == 0) {
+      }
+      continue;
+    }
+
+    if ((events & EVENTIO_ERROR) != 0) {
+      Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+          "connect %s %s:%d fail, connection closed",
+          __LINE__, sock_context->connect_type == CONNECT_TYPE_SERVER ?
+          "from" : "to", sock_context->machine->hostname,
+          sock_context->machine->cluster_port);
+      release_connection(sock_context, true);
+      continue;
+    }
+
+    if ((events & (EVENTIO_READ | EVENTIO_WRITE)) != 0) {
+      connection_handler(context, true);
+    }
+  }
+
+  return 0;
+}
+
+/*
+ * Thread entry of the connect worker.
+ *
+ * Loops forever: logs session / nio statistics when more than one
+ * second passed since the last report (and, when compiled in, trigger /
+ * message time statistics every 60 seconds), retries closed connections,
+ * then polls for events. A poll timeout triggers the timeout check; a
+ * poll error other than EINTR is fatal.
+ *
+ * The argument is unused; the function does not return in practice.
+ */
+void *connect_worker_entrance(void * /* arg */)
+{
+#if defined(HAVE_SYS_PRCTL_H) && defined(PR_SET_NAME)
+  prctl(PR_SET_NAME, "[ET_CLUSTER 0]", 0, 0, 0);
+#endif
+
+  time_t last_cluster_stat_time = CURRENT_TIME();
+#if defined(TRIGGER_STAT_FLAG) || defined(MSG_TIME_STAT_FLAG)
+  time_t last_msg_stat_time = CURRENT_TIME();
+#endif
+
+  for (;;) {
+    if (CURRENT_TIME() - last_cluster_stat_time > 1) {
+      log_session_stat();
+      log_nio_stats();
+      last_cluster_stat_time = CURRENT_TIME();
+    }
+
+#if defined(TRIGGER_STAT_FLAG) || defined(MSG_TIME_STAT_FLAG)
+    if (CURRENT_TIME() - last_msg_stat_time >= 60) {
+#ifdef TRIGGER_STAT_FLAG
+      log_trigger_stat();
+#endif
+
+#ifdef MSG_TIME_STAT_FLAG
+      log_msg_time_stat();
+#endif
+
+      last_msg_stat_time = CURRENT_TIME();
+    }
+#endif
+
+    //more than one entry: something besides the first context is listed
+    if (connect_thread_context.connection_count > 1) {
+      do_reconnect();
+    }
+
+    const int count = connect_thread_context.ev_poll->poll();
+    if (count > 0) {
+      deal_connect_events(count);
+    }
+    else if (count == 0) { //timeout
+      if (connect_thread_context.connection_count > 1) {
+        close_timeout_connections();
+      }
+    }
+    else if (errno != EINTR) {
+      ink_fatal(1, "file: "__FILE__", line: %d, "
+          "call event poll fail, errno: %d, error info: %s\n",
+          __LINE__, errno, strerror(errno));
+    }
+  }
+
+  return NULL;
+}
+
+/*
+ * Append pSockContext to the connected list of its machine.
+ *
+ * The list grows on demand: 64 entries at first, doubling afterwards.
+ * The new capacity is stored only after the larger array was allocated
+ * successfully, so a failed allocation leaves the list (and its
+ * recorded capacity) exactly as it was.
+ *
+ * return: 0 on success, ENOENT when the machine ip has no slot,
+ * otherwise the errno of the failed allocation
+ */
+int add_machine_sock_context(SocketContext *pSockContext)
+{
+  SocketContextArray *contextArray;
+  SocketContext **oldContexts;
+  SocketContext **newContexts;
+  unsigned int newAllocSize;
+  int bytes;
+  int result;
+  int machine_id;
+  if ((machine_id=get_machine_index(pSockContext->machine->ip)) < 0) {
+    return ENOENT;
+  }
+
+  ink_mutex_acquire(&connect_thread_context.lock);
+  contextArray = &machine_sockets[machine_id].connected_list;
+  if (contextArray->count >= contextArray->alloc_size) {
+    if (contextArray->alloc_size == 0) {
+      newAllocSize = 64;
+    }
+    else {
+      newAllocSize = contextArray->alloc_size * 2;
+    }
+
+    bytes = sizeof(SocketContext *) * newAllocSize;
+    newContexts = (SocketContext **)malloc(bytes);
+    if (newContexts == NULL) {
+      result = errno != 0 ? errno : ENOMEM;   //capture before logging
+      Error("file: "__FILE__", line: %d, "
+          "malloc %d bytes fail, errno: %d, error info: %s",
+          __LINE__, bytes, result, strerror(result));
+      ink_mutex_release(&connect_thread_context.lock);
+      return result;
+    }
+
+    memset(newContexts, 0, bytes);
+    if (contextArray->count > 0) {
+      memcpy(newContexts, contextArray->contexts,
+          sizeof(SocketContext *) * contextArray->count);
+    }
+
+    //commit the new array and capacity together
+    oldContexts = contextArray->contexts;
+    contextArray->contexts = newContexts;
+    contextArray->alloc_size = newAllocSize;
+    if (oldContexts != NULL) {
+      free(oldContexts);
+    }
+  }
+
+  contextArray->contexts[contextArray->count++] = pSockContext;
+  ink_mutex_release(&connect_thread_context.lock);
+
+  return 0;
+}
+
+/*
+ * Remove pSockContext from the connected list of its machine.
+ *
+ * The entries behind the removed one are shifted down by one position
+ * and the vacated last slot is set to NULL.
+ *
+ * return: 0 when removed, ENOENT when the machine ip has no slot or
+ * the context is not in the list
+ */
+int remove_machine_sock_context(SocketContext *pSockContext)
+{
+  const int machine_id = get_machine_index(pSockContext->machine->ip);
+  if (machine_id < 0) {
+    return ENOENT;
+  }
+
+  ink_mutex_acquire(&connect_thread_context.lock);
+  SocketContextArray *list = &machine_sockets[machine_id].connected_list;
+
+  //locate the entry; an empty list falls straight through to "not found"
+  unsigned int pos = 0;
+  while (pos < list->count && list->contexts[pos] != pSockContext) {
+    pos++;
+  }
+
+  if (pos == list->count) {
+    ink_mutex_release(&connect_thread_context.lock);
+    return ENOENT;
+  }
+
+  //close the gap
+  for (; pos + 1 < list->count; pos++) {
+    list->contexts[pos] = list->contexts[pos + 1];
+  }
+  list->contexts[--list->count] = NULL;
+  ink_mutex_release(&connect_thread_context.lock);
+
+  return 0;
+}
+
+/*
+ * Pick one connected socket context of the given machine.
+ *
+ * The list's index counter is atomically incremented on every call and
+ * taken modulo the current entry count, which rotates through the
+ * connected sockets. The list itself is read without taking
+ * connect_thread_context.lock.
+ *
+ * return: the selected context, or NULL when the machine has no slot
+ * or no connected socket
+ */
+SocketContext *get_socket_context(const ClusterMachine *machine)
+{
+  const int machine_id = get_machine_index(machine->ip);
+  if (machine_id < 0) {
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "the index of ip addr: %s not exist", __LINE__, machine->hostname);
+    return NULL;
+  }
+
+  SocketContextArray *connected = &machine_sockets[machine_id].connected_list;
+  const int context_count = connected->count;
+  if (context_count <= 0) {
+    return NULL;
+  }
+
+  const unsigned int context_index =
+    ink_atomic_increment(&connected->index, 1) % context_count;
+  return connected->contexts[context_index];
+}
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/connection.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/connection.h b/iocore/cluster/connection.h
new file mode 100644
index 0000000..ac53ff0
--- /dev/null
+++ b/iocore/cluster/connection.h
@@ -0,0 +1,75 @@
+/** @file
+
+  Socket context containers and the connection management interface of the cluster layer
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#ifndef _CONNECTION_H_
+#define _CONNECTION_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+
+typedef struct socket_context_array {  //array of socket contexts plus a rotating selection cursor
+  SocketContext **contexts;  //array of SocketContext pointers; the first `count` slots are in use
+  unsigned int alloc_size;   //alloc size (presumably the capacity of `contexts` in slots -- TODO confirm)
+  unsigned int count;        //item count
+  volatile unsigned int index;   //current select index; get_socket_context() increments it atomically and uses it modulo count
+} SocketContextArray;
+
+typedef struct socket_context_by_machine {  //socket bookkeeping kept per machine
+  unsigned int ip;  //machine ip (presumably the same representation as ClusterMachine ip -- TODO confirm)
+  socket_context_array connected_list;  //connected sockets
+  SocketContext *accept_free_list;  //socket malloc for accept
+  SocketContext *connect_free_list; //socket malloc for connect
+} SocketContextsByMachine;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int connection_init();
+void connection_destroy();
+
+int connection_manager_init(const unsigned int my_ip);
+void connection_manager_destroy();
+int connection_manager_start();
+
+int log_message_stat(void *arg);
+
+SocketContext *get_socket_context(const ClusterMachine *machine);  //returns a connected context of the machine, rotating among them; NULL if there is none
+
+void free_accept_sock_context(SocketContext *pSockContext);
+
+int machine_make_connections(ClusterMachine *m);
+int machine_stop_reconnect(ClusterMachine *m);
+int make_connection(SocketContext *pSockContext);
+
+int add_machine_sock_context(SocketContext *pSockContext);
+int remove_machine_sock_context(SocketContext *pSockContext);  //0 on success; ENOENT if the context is not in its machine's connected list
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/global.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/global.cc b/iocore/cluster/global.cc
new file mode 100644
index 0000000..0556763
--- /dev/null
+++ b/iocore/cluster/global.cc
@@ -0,0 +1,40 @@
+/** @file
+
+  Definitions and default values of the cluster-wide tunables declared in global.h
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include "global.h"
+
+int cluster_connect_timeout = 1;  //second
+
+//cluster flow control
+int64_t cluster_flow_ctrl_min_bps = 0; //bit
+int64_t cluster_flow_ctrl_max_bps = 0; //bit
+int cluster_send_min_wait_time = 1000; //us
+int cluster_send_max_wait_time = 5000; //us
+int cluster_min_loop_interval = 0;     //us
+int cluster_max_loop_interval = 1000;  //us
+int64_t cluster_ping_send_interval= 0;  //unit not stated here -- TODO confirm
+int64_t cluster_ping_latency_threshold = 0;  //unit not stated here -- TODO confirm
+int cluster_ping_retries = 3;
+int max_session_count_per_machine = 1000000;  //cluster_send_message() takes the session seq modulo this value as the session index
+int session_lock_count_per_machine =  10949;
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/global.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/global.h b/iocore/cluster/global.h
new file mode 100644
index 0000000..7fed404
--- /dev/null
+++ b/iocore/cluster/global.h
@@ -0,0 +1,61 @@
+/** @file
+
+  Declarations of the cluster-wide tunables (threads, buffers, flow control, ping, sessions)
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#ifndef _GLOBAL_H
+#define _GLOBAL_H
+
+#include <stdint.h>
+#include "types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int num_of_cluster_threads;  //defined outside global.cc
+extern int num_of_cluster_connections;   //must be an even number; defined outside global.cc
+extern int cluster_send_buffer_size;  //defined outside global.cc
+extern int cluster_receive_buffer_size;  //defined outside global.cc
+extern int cluster_connect_timeout;  //second
+
+//cluster flow control
+extern int64_t cluster_flow_ctrl_min_bps; //bit
+extern int64_t cluster_flow_ctrl_max_bps; //bit
+extern int cluster_send_min_wait_time; //us
+extern int cluster_send_max_wait_time; //us
+extern int cluster_min_loop_interval;  //us
+extern int cluster_max_loop_interval;  //us
+
+//cluster ping
+extern int64_t cluster_ping_send_interval;
+extern int64_t cluster_ping_latency_threshold;
+extern int cluster_ping_retries;
+
+extern int max_session_count_per_machine;
+extern int session_lock_count_per_machine;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/machine.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/machine.cc b/iocore/cluster/machine.cc
new file mode 100644
index 0000000..18ac1a9
--- /dev/null
+++ b/iocore/cluster/machine.cc
@@ -0,0 +1,269 @@
+/** @file
+
+  Registry of cluster machines: registration, lookup by ip/port and connection counting
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include <time.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <ctype.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <netinet/in.h>
+#include <fcntl.h>
+#include "Diags.h"
+#include "global.h"
+#include "nio.h"
+#include "connection.h"
+#include "message.h"
+#include "machine.h"
+#include "ink_config.h"
+#include "P_Cluster.h"
+
+unsigned int my_machine_ip = 0;
+int cluster_machine_count = 0; //total machine count of the cluster
+
+ClusterMachine *cluster_machines = NULL;  //backing array of MAX_MACHINE_COUNT entries allocated by init_machines(); filled in insertion order
+static ClusterMachine **sorted_machines = NULL;  //sort by ip and port; holds pointers into cluster_machines
+static ink_mutex machine_lock;  //held by do_add_machine() and by the machine_up_notify/add_connection/remove_connection functions
+
+static ClusterMachine *do_add_machine(ClusterMachine *m, int *result);
+
+/**
+ * Register the machine identified by ip/port, using the dotted-quad text
+ * form of the ip as its hostname.
+ *
+ * @param ip    IPv4 address in the representation stored in in_addr.s_addr
+ * @param port  cluster port of the machine
+ * @return the entry returned by do_add_machine() (the already registered
+ *         entry when the ip/port pair exists), or NULL on failure
+ */
+ClusterMachine *add_machine(const unsigned int ip, const int port)
+{
+  ClusterMachine machine;
+  struct in_addr in;
+  char ip_addr[INET_ADDRSTRLEN];
+  int result;
+
+  // Format into a stack buffer: inet_ntoa() returns a shared static buffer,
+  // and a strdup()'d copy would leak because do_add_machine() makes its own
+  // copy of the hostname.
+  in.s_addr = ip;
+  if (inet_ntop(AF_INET, &in, ip_addr, sizeof(ip_addr)) == NULL) {
+    return NULL;
+  }
+
+  memset(&machine, 0, sizeof(machine));
+  machine.hostname_len = strlen(ip_addr);
+  machine.hostname = ip_addr;
+  machine.cluster_port = port;
+  machine.ip = ip;
+
+  return do_add_machine(&machine, &result);
+}
+
+/**
+ * Initialise the machine registry: the lock, the zero-filled backing array
+ * of MAX_MACHINE_COUNT machines and the zero-filled sorted pointer array.
+ *
+ * @return 0 on success, the ink_mutex_init() error, or ENOMEM. On ENOMEM
+ *         nothing stays allocated and both array pointers are NULL.
+ */
+int init_machines()
+{
+  int result;
+  int bytes;
+
+  if ((result=ink_mutex_init(&machine_lock, "machine_lock")) != 0) {
+    return result;
+  }
+
+  cluster_machine_count = 0;
+  bytes = sizeof(ClusterMachine) * MAX_MACHINE_COUNT;
+  cluster_machines = (ClusterMachine *)calloc(MAX_MACHINE_COUNT,
+      sizeof(ClusterMachine));
+  if (cluster_machines == NULL) {
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail!", __LINE__, bytes);
+    return ENOMEM;
+  }
+
+  bytes = sizeof(ClusterMachine *) * MAX_MACHINE_COUNT;
+  sorted_machines = (ClusterMachine **)calloc(MAX_MACHINE_COUNT,
+      sizeof(ClusterMachine *));
+  if (sorted_machines == NULL) {
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail!", __LINE__, bytes);
+    // do not leak, or leave dangling, the array allocated above
+    free(cluster_machines);
+    cluster_machines = NULL;
+    return ENOMEM;
+  }
+
+  return 0;
+}
+
+/**
+ * Ordering used for sorted_machines: by ip first, then by cluster port.
+ * Both arguments point at ClusterMachine pointers (qsort/bsearch style).
+ */
+static int compare_machine(const void *p1, const void *p2)
+{
+  const ClusterMachine *lhs = *(const ClusterMachine * const *)p1;
+  const ClusterMachine *rhs = *(const ClusterMachine * const *)p2;
+
+  if (lhs->ip != rhs->ip) {
+    return lhs->ip < rhs->ip ? -1 : 1;
+  }
+  return lhs->cluster_port - rhs->cluster_port;
+}
+
+/**
+ * Insert a copy of *m into the registry, keeping sorted_machines ordered by
+ * compare_machine(). Runs under machine_lock.
+ *
+ * @param m       template holding ip, cluster_port, hostname and
+ *                hostname_len; the hostname is copied
+ * @param result  set to 0 on insertion, EEXIST when the ip/port pair is
+ *                already registered, ENOSPC when the registry is full and
+ *                ENOMEM when the hostname copy cannot be allocated
+ * @return the new entry, the existing entry on EEXIST, or NULL on ENOSPC
+ *         and ENOMEM
+ */
+static ClusterMachine *do_add_machine(ClusterMachine *m, int *result)
+{
+  ClusterMachine **ppMachine;
+  ClusterMachine **ppMachineEnd;
+  ClusterMachine **pp;
+  ClusterMachine *pMachine = NULL;
+  char *hostname = NULL;
+  int cr;
+
+  cr = -1;
+  ink_mutex_acquire(&machine_lock);
+
+  // find the first entry that is not smaller than m
+  ppMachineEnd = sorted_machines + cluster_machine_count;
+  for (ppMachine=sorted_machines; ppMachine<ppMachineEnd; ppMachine++) {
+    cr = compare_machine(&m, ppMachine);
+    if (cr <= 0) {
+      break;
+    }
+  }
+
+  do {
+    if (cr == 0) {  //found
+      pMachine = *ppMachine;
+      *result = EEXIST;
+      break;
+    }
+
+    if (cluster_machine_count >= MAX_MACHINE_COUNT) {
+      Error("file: "__FILE__", line: %d, "
+          "host: %s:%u, exceeds max machine: %d!", __LINE__, m->hostname,
+          m->cluster_port, MAX_MACHINE_COUNT);
+      *result = ENOSPC;
+      break;
+    }
+
+    // Copy the hostname before touching the registry, so that an allocation
+    // failure leaves sorted_machines and cluster_machines unchanged.
+    if (m->hostname_len != 0) {
+      hostname = (char *)malloc(m->hostname_len + 1);
+      if (hostname == NULL) {
+        Error("file: "__FILE__", line: %d, "
+            "malloc %d bytes fail!", __LINE__, m->hostname_len + 1);
+        *result = ENOMEM;
+        break;
+      }
+      memcpy(hostname, m->hostname, m->hostname_len + 1);
+    }
+
+    // open a gap at the insertion point
+    for (pp=ppMachineEnd; pp>ppMachine; pp--) {
+      *pp = *(pp - 1);
+    }
+
+    pMachine = cluster_machines + cluster_machine_count;  //the last element
+    *ppMachine = pMachine;
+
+    pMachine->dead = true;
+    pMachine->ip = m->ip;
+    pMachine->cluster_port = m->cluster_port;
+    pMachine->hostname_len = m->hostname_len;
+    pMachine->hostname = hostname;
+
+    cluster_machine_count++;
+    *result = 0;
+  } while (0);
+
+  ink_mutex_release(&machine_lock);
+  return pMachine;
+}
+
+/**
+ * Look up a registered machine by ip and cluster port with a binary search
+ * over sorted_machines. Returns NULL when no entry matches.
+ */
+ClusterMachine *get_machine(const unsigned int ip, const int port)
+{
+  ClusterMachine key;
+  memset(&key, 0, sizeof(key));
+  key.ip = ip;
+  key.cluster_port = port;
+
+  // the array holds pointers, so the search key is a pointer to the probe
+  ClusterMachine *key_ptr = &key;
+  void *hit = bsearch(&key_ptr, sorted_machines, cluster_machine_count,
+      sizeof(ClusterMachine *), compare_machine);
+
+  return hit != NULL ? *(ClusterMachine **)hit : NULL;
+}
+
+/**
+ * Mark a machine alive. If it was flagged dead, clear the flag and call
+ * cluster_machine_change_notify(), all under machine_lock.
+ *
+ * @return ENOENT when machine is NULL, otherwise 0
+ */
+int machine_up_notify(ClusterMachine *machine)
+{
+  if (!machine) {
+    return ENOENT;
+  }
+
+  ink_mutex_acquire(&machine_lock);
+  Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+      "machine_up_notify, %s connection count: %d, dead: %d",
+      __LINE__, machine->hostname, machine->now_connections, machine->dead);
+
+  const bool was_dead = machine->dead;
+  if (was_dead) {
+    machine->dead = false;
+    cluster_machine_change_notify(machine);
+  }
+  ink_mutex_release(&machine_lock);
+
+  return 0;
+}
+
+/**
+ * Add the socket to epoll via nio_add_to_epoll() and, on success, bump the
+ * owning machine's connection counter. Both steps run under machine_lock.
+ *
+ * @return 0 on success, otherwise the nio_add_to_epoll() error
+ */
+int machine_add_connection(SocketContext *pSockContext)
+{
+  int connections = 0;
+
+  ink_mutex_acquire(&machine_lock);
+  const int rc = nio_add_to_epoll(pSockContext);
+  if (rc == 0) {
+    connections = ++pSockContext->machine->now_connections;
+  }
+  ink_mutex_release(&machine_lock);
+
+  if (rc != 0) {
+    return rc;
+  }
+
+  Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+      "%s add %c connection count: %d, dead: %d", __LINE__,
+      pSockContext->machine->hostname, pSockContext->connect_type,
+      connections, pSockContext->machine->dead);
+  return 0;
+}
+
+/**
+ * Remove the socket context from its machine's connected list and decrease
+ * the machine's connection counter. When the counter reaches zero and the
+ * machine is not yet flagged dead, flag it and call
+ * cluster_machine_change_notify(). All of this runs under machine_lock.
+ *
+ * @return 0 on success, otherwise the remove_machine_sock_context() error
+ */
+int machine_remove_connection(SocketContext *pSockContext)
+{
+  ink_mutex_acquire(&machine_lock);
+
+  const int rc = remove_machine_sock_context(pSockContext);
+  if (rc != 0) {
+    ink_mutex_release(&machine_lock);
+    return rc;
+  }
+
+  const int remaining = --pSockContext->machine->now_connections;
+  const bool last_one_gone = (remaining == 0) && !pSockContext->machine->dead;
+  if (last_one_gone) { //should remove machine from config
+    pSockContext->machine->dead = true;
+    cluster_machine_change_notify(pSockContext->machine);
+  }
+
+  ink_mutex_release(&machine_lock);
+
+  Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+      "%s remove %c connection count: %d, dead: %d", __LINE__,
+      pSockContext->machine->hostname, pSockContext->connect_type,
+      remaining, pSockContext->machine->dead);
+  return 0;
+}
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/machine.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/machine.h b/iocore/cluster/machine.h
new file mode 100644
index 0000000..8ea6981
--- /dev/null
+++ b/iocore/cluster/machine.h
@@ -0,0 +1,51 @@
+/** @file
+
+  Interface of the cluster machine registry
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#ifndef _MACHINE_H
+#define _MACHINE_H
+
+#include "types.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern unsigned int my_machine_ip;
+extern int cluster_machine_count;  //total machine count of the cluster
+extern struct ClusterMachine *cluster_machines;  //array allocated by init_machines()
+
+int init_machines();  //returns 0 on success, the ink_mutex_init() error or ENOMEM otherwise
+ClusterMachine *add_machine(const unsigned int ip, const int port);  //returns the already registered entry when the ip/port pair exists
+
+ClusterMachine *get_machine(const unsigned int ip, const int port);  //NULL when the ip/port pair is not registered
+
+int machine_up_notify(ClusterMachine *machine);  //ENOENT when machine is NULL, otherwise 0
+int machine_add_connection(SocketContext *pSockContext);
+int machine_remove_connection(SocketContext *pSockContext);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+


[6/6] git commit: refine the codes of cluster

Posted by we...@apache.org.
refine the codes of cluster


Project: http://git-wip-us.apache.org/repos/asf/trafficserver/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafficserver/commit/62504a9f
Tree: http://git-wip-us.apache.org/repos/asf/trafficserver/tree/62504a9f
Diff: http://git-wip-us.apache.org/repos/asf/trafficserver/diff/62504a9f

Branch: refs/heads/refine_cluster
Commit: 62504a9f82e7e862015aeb2393cda981ff78789e
Parents: 27246a5
Author: weijin <we...@apache.org>
Authored: Fri Oct 11 16:03:18 2013 +0800
Committer: weijin <ta...@taobao.com>
Committed: Wed Dec 4 11:37:40 2013 +0800

----------------------------------------------------------------------
 iocore/cache/Cache.cc                   |    2 +-
 iocore/cache/CacheRead.cc               |    2 +-
 iocore/cache/I_Cache.h                  |    2 +-
 iocore/cache/P_Cache.h                  |    2 +
 iocore/cache/P_CacheInternal.h          |    7 +-
 iocore/cluster/ClusterCache.cc          | 4679 +++++++++++++++-----------
 iocore/cluster/ClusterConfig.cc         |   80 +-
 iocore/cluster/ClusterMachine.cc        |   14 +-
 iocore/cluster/ClusterProcessor.cc      |  623 ++--
 iocore/cluster/ClusterVConnection.cc    |  548 ++-
 iocore/cluster/EventPoll.cc             |  158 +
 iocore/cluster/EventPoll.h              |  105 +
 iocore/cluster/Makefile.am              |    9 +-
 iocore/cluster/P_Cluster.h              |    2 +
 iocore/cluster/P_ClusterCache.h         |  367 +-
 iocore/cluster/P_ClusterCacheInternal.h |  374 +-
 iocore/cluster/P_ClusterInline.h        |  232 +-
 iocore/cluster/clusterinterface.h       |  104 +
 iocore/cluster/connection.cc            | 1726 ++++++++++
 iocore/cluster/connection.h             |   75 +
 iocore/cluster/global.cc                |   40 +
 iocore/cluster/global.h                 |   61 +
 iocore/cluster/machine.cc               |  269 ++
 iocore/cluster/machine.h                |   51 +
 iocore/cluster/message.cc               |  229 ++
 iocore/cluster/message.h                |   75 +
 iocore/cluster/nio.cc                   | 1701 ++++++++++
 iocore/cluster/nio.h                    |   60 +
 iocore/cluster/session.cc               | 1267 +++++++
 iocore/cluster/session.h                |   97 +
 iocore/cluster/types.h                  |  235 ++
 iocore/eventsystem/I_Event.h            |    1 +
 iocore/eventsystem/P_IOBuffer.h         |   20 +-
 mgmt/RecordsConfig.cc                   |   18 +
 34 files changed, 10851 insertions(+), 2384 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cache/Cache.cc
----------------------------------------------------------------------
diff --git a/iocore/cache/Cache.cc b/iocore/cache/Cache.cc
index 66f2b70..8d4b1e5 100644
--- a/iocore/cache/Cache.cc
+++ b/iocore/cache/Cache.cc
@@ -82,7 +82,7 @@ int cache_config_read_while_writer = 0;
 char cache_system_config_directory[PATH_NAME_MAX + 1];
 int cache_config_mutex_retry_delay = 2;
 #ifdef HTTP_CACHE
-static int enable_cache_empty_http_doc = 0;
+int enable_cache_empty_http_doc = 0;
 #endif
 
 #if TS_USE_INTERIM_CACHE == 1

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cache/CacheRead.cc
----------------------------------------------------------------------
diff --git a/iocore/cache/CacheRead.cc b/iocore/cache/CacheRead.cc
index 3c97305..90a7bd5 100644
--- a/iocore/cache/CacheRead.cc
+++ b/iocore/cache/CacheRead.cc
@@ -722,7 +722,7 @@ CacheVC::openReadMain(int /* event ATS_UNUSED */, Event * /* e ATS_UNUSED */)
       return EVENT_DONE;
     // we have to keep reading until we give the user all the
     // bytes it wanted or we hit the watermark.
-    if (vio.ntodo() > 0 && !vio.buffer.writer()->high_water())
+    if (!f.cluster && vio.ntodo() > 0 && !vio.buffer.writer()->high_water())
       goto Lread;
     return EVENT_CONT;
   }

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cache/I_Cache.h
----------------------------------------------------------------------
diff --git a/iocore/cache/I_Cache.h b/iocore/cache/I_Cache.h
index 00e4791..cd8dda9 100644
--- a/iocore/cache/I_Cache.h
+++ b/iocore/cache/I_Cache.h
@@ -193,7 +193,7 @@ struct CacheVConnection:public VConnection
   virtual void set_http_info(CacheHTTPInfo *info) = 0;
   virtual void get_http_info(CacheHTTPInfo **info) = 0;
 #endif
-
+  virtual bool is_read_from_writer() = 0;
   virtual bool is_ram_cache_hit() const = 0;
   virtual bool set_disk_io_priority(int priority) = 0;
   virtual int get_disk_io_priority() = 0;

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cache/P_Cache.h
----------------------------------------------------------------------
diff --git a/iocore/cache/P_Cache.h b/iocore/cache/P_Cache.h
index ed738e3..e6bba36 100644
--- a/iocore/cache/P_Cache.h
+++ b/iocore/cache/P_Cache.h
@@ -44,4 +44,6 @@
 #include "P_CacheInternal.h"
 #include "P_CacheHosting.h"
 #include "P_CacheHttp.h"
+#include "clusterinterface.h"
+
 #endif /* _P_CACHE_H */

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cache/P_CacheInternal.h
----------------------------------------------------------------------
diff --git a/iocore/cache/P_CacheInternal.h b/iocore/cache/P_CacheInternal.h
index 4f33fdc..e5f0a11 100644
--- a/iocore/cache/P_CacheInternal.h
+++ b/iocore/cache/P_CacheInternal.h
@@ -368,7 +368,9 @@ struct CacheVC: public CacheVConnection
   virtual time_t get_pin_in_cache();
   virtual bool set_disk_io_priority(int priority);
   virtual int get_disk_io_priority();
-
+  virtual bool is_read_from_writer() {
+    return f.read_from_writer_called;
+  }
   /** Get the fragment table.
       @return The address of the start of the fragment table,
       or @c NULL if there is no fragment table.
@@ -505,6 +507,7 @@ struct CacheVC: public CacheVConnection
 #ifdef HTTP_CACHE
       unsigned int allow_empty_doc:1; // used for cache empty http document
 #endif
+      unsigned int cluster:1;
     } f;
   };
   // BTF optimization used to skip reading stuff in cache partition that doesn't contain any
@@ -1054,7 +1057,7 @@ struct Cache
   Action *open_write(Continuation *cont, URL *url, CacheHTTPHdr *request,
                      CacheHTTPInfo *old_info, time_t pin_in_cache = (time_t) 0,
                      CacheFragType type = CACHE_FRAG_TYPE_HTTP);
-  static void generate_key(INK_MD5 *md5, URL *url, CacheHTTPHdr *request);
+  static void generate_key(INK_MD5 *md5, URL *url, CacheHTTPHdr *request = 0);
 #endif
 
   Action *link(Continuation *cont, CacheKey *from, CacheKey *to, CacheFragType type, char *hostname, int host_len);


[2/6] refine the codes of cluster

Posted by we...@apache.org.
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/message.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/message.cc b/iocore/cluster/message.cc
new file mode 100644
index 0000000..aa364f2
--- /dev/null
+++ b/iocore/cluster/message.cc
@@ -0,0 +1,229 @@
+/** @file
+
+  Building outgoing cluster messages and queueing them on a socket context
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <sys/epoll.h>
+#include "Diags.h"
+#include "global.h"
+#include "machine.h"
+#include "nio.h"
+#include "clusterinterface.h"
+#include "session.h"
+#ifndef TS_INLINE
+#define TS_INLINE inline
+#endif
+#include "I_IOBuffer.h"
+#include "P_Cluster.h"
+#include "message.h"
+
+#ifndef USE_MULTI_ALLOCATOR
+Allocator out_message_allocator("OutMessage", sizeof(OutMessage), 1024);  //shared OutMessage allocator, used when the per-socket out_msg_allocator is not compiled in
+#endif
+
+/** Sum of read_avail() over every block of the chain starting at blocks. */
+inline int64_t get_total_size(IOBufferBlock *blocks) {
+  int64_t total = 0;
+  for (IOBufferBlock *blk = blocks; blk != NULL; blk = blk->next) {
+    total += blk->read_avail();
+  }
+  return total;
+}
+
+/**
+ * Queue one message on the socket context bound to a session.
+ *
+ * @param session   session the message belongs to
+ * @param func_id   stored in the message header
+ * @param data      when data_len < 0: head of an IOBufferBlock chain;
+ *                  otherwise a buffer of data_len bytes that is copied
+ * @param data_len  payload length (at most MINI_MESSAGE_SIZE), or negative
+ *                  to send an IOBufferBlock chain
+ * @param priority  passed on to push_to_send_queue()
+ * @return 0 on success; the get_session_for_send() error; ENOENT when the
+ *         session has no socket context; EINVAL when data_len exceeds
+ *         MINI_MESSAGE_SIZE; errno or ENOMEM when no message can be
+ *         allocated; or the push_to_send_queue() error
+ */
+int cluster_send_message(ClusterSession session, const int func_id,
+    void *data, const int data_len, const MessagePriority priority)
+{
+  MachineSessions *pMachineSessions;
+  SessionEntry *pSessionEntry;
+  SocketContext *pSockContext;
+  OutMessage *pMessage;
+  int result;
+
+  if ((result=get_session_for_send(&session, &pMachineSessions,
+          &pSessionEntry)) != 0)
+  {
+    return result;
+  }
+
+  pSockContext = pSessionEntry->sock_context;
+  if (pSockContext == NULL) {  //session closed
+    return ENOENT;
+  }
+
+  // Reject an oversized buffer before allocating a message and before a
+  // message sequence number of the session is consumed.
+  if (data_len > MINI_MESSAGE_SIZE) {
+    Error("file: "__FILE__", line: %d, "
+        "invalid data length: %d exceeds %d!",
+        __LINE__, data_len, MINI_MESSAGE_SIZE);
+    return EINVAL;
+  }
+
+#ifdef USE_MULTI_ALLOCATOR
+  pMessage = (OutMessage *)pSockContext->out_msg_allocator->alloc_void();
+#else
+  pMessage = (OutMessage *)out_message_allocator.alloc_void();
+#endif
+
+  if (pMessage == NULL) {
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, (int)sizeof(OutMessage), errno, strerror(errno));
+    return errno != 0 ? errno : ENOMEM;
+  }
+
+#ifdef MSG_TIME_STAT_FLAG
+  int session_index;
+  session_index = session.fields.seq % max_session_count_per_machine;
+  SESSION_LOCK(pMachineSessions, session_index);
+
+  if (session.fields.ip == my_machine_ip) {  //request by me
+    if (pSessionEntry->client_start_time == 0) {
+      pSessionEntry->client_start_time = CURRENT_NS();
+    }
+  }
+
+  if (pSessionEntry->send_start_time == 0) {
+    pSessionEntry->send_start_time = CURRENT_NS();
+  }
+
+  SESSION_UNLOCK(pMachineSessions, session_index);
+#endif
+
+#ifdef CHECK_MAGIC_NUMBER
+  pMessage->header.magic = MAGIC_NUMBER;
+#endif
+  pMessage->header.func_id = func_id;
+  pMessage->header.session_id = session;
+  pMessage->header.msg_seq = ink_atomic_increment(
+      &pSessionEntry->current_msg_seq, 1) + 1;
+  pMessage->in_queue_time = CURRENT_NS();
+  pMessage->bytes_sent = 0;
+  // NOTE(review): the raw m_ptr is cleared first, presumably because the
+  // allocator returns unconstructed memory -- confirm
+  pMessage->blocks.m_ptr = NULL;
+  pMessage->next = NULL;
+
+  if (data_len < 0) {  //object
+    pMessage->data_type = DATA_TYPE_OBJECT;
+    pMessage->blocks = (IOBufferBlock *)data;
+    pMessage->header.data_len = get_total_size(pMessage->blocks);
+  }
+  else {
+    pMessage->data_type = DATA_TYPE_BUFFER;
+    pMessage->blocks = NULL;
+    pMessage->header.data_len = data_len;
+    if (data_len > 0) {  //same guard as cluster_send_msg_internal_ex
+      memcpy(pMessage->mini_buff, data, data_len);
+    }
+  }
+
+  pMessage->header.aligned_data_len = BYTE_ALIGN8(
+      pMessage->header.data_len);
+  result = push_to_send_queue(pSockContext,
+      pMessage, priority, pSessionEntry->version);
+  if (result != 0) {
+    release_out_message(pSockContext, pMessage);
+  }
+
+  return result;
+}
+
+/**
+ * Build a message for the given session and hand it to push_to_queue_func.
+ * The message sequence number is always the fixed value 11111.
+ *
+ * @param session             copied into the message header
+ * @param pSockContext        socket context to queue on; NULL gives ENOENT
+ * @param func_id             stored in the message header
+ * @param data                when data_len < 0: head of an IOBufferBlock
+ *                            chain; otherwise a buffer of data_len bytes
+ *                            that is copied (not read when data_len is 0)
+ * @param data_len            payload length (at most MINI_MESSAGE_SIZE), or
+ *                            negative to send an IOBufferBlock chain
+ * @param priority            passed on to push_to_queue_func
+ * @param push_to_queue_func  callback that enqueues the message
+ * @return 0 on success; ENOENT; EINVAL when data_len exceeds
+ *         MINI_MESSAGE_SIZE; errno or ENOMEM when no message can be
+ *         allocated; or the push_to_queue_func error
+ */
+int cluster_send_msg_internal_ex(const ClusterSession *session,
+    SocketContext *pSockContext, const int func_id,
+    void *data, const int data_len, const MessagePriority priority,
+    push_to_send_queue_func push_to_queue_func)
+{
+  OutMessage *pMessage;
+  int result;
+
+  if (pSockContext == NULL) {  //session closed
+    return ENOENT;
+  }
+
+  // Reject an oversized buffer up front instead of allocating a message
+  // only to release it again.
+  if (data_len > MINI_MESSAGE_SIZE) {
+    Error("file: "__FILE__", line: %d, "
+        "invalid data length: %d exceeds %d!",
+        __LINE__, data_len, MINI_MESSAGE_SIZE);
+    return EINVAL;
+  }
+
+#ifdef USE_MULTI_ALLOCATOR
+  pMessage = (OutMessage *)pSockContext->out_msg_allocator->alloc_void();
+#else
+  pMessage = (OutMessage *)out_message_allocator.alloc_void();
+#endif
+
+  if (pMessage == NULL) {
+    Error("file: "__FILE__", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, (int)sizeof(OutMessage), errno, strerror(errno));
+    return errno != 0 ? errno : ENOMEM;
+  }
+
+#ifdef CHECK_MAGIC_NUMBER
+  pMessage->header.magic = MAGIC_NUMBER;
+#endif
+  pMessage->header.func_id = func_id;
+  pMessage->header.session_id = *session;
+  pMessage->header.msg_seq = 11111;
+  pMessage->in_queue_time = CURRENT_NS();
+  pMessage->bytes_sent = 0;
+  // NOTE(review): the raw m_ptr is cleared first, presumably because the
+  // allocator returns unconstructed memory -- confirm
+  pMessage->blocks.m_ptr = NULL;
+  pMessage->next = NULL;
+
+  if (data_len < 0) {  //object
+    pMessage->data_type = DATA_TYPE_OBJECT;
+    pMessage->blocks = (IOBufferBlock *)data;
+    pMessage->header.data_len = get_total_size(pMessage->blocks);
+  }
+  else {
+    pMessage->data_type = DATA_TYPE_BUFFER;
+    pMessage->blocks = NULL;
+    pMessage->header.data_len = data_len;
+    if (data_len > 0) {
+      memcpy(pMessage->mini_buff, data, data_len);
+    }
+  }
+
+  pMessage->header.aligned_data_len = BYTE_ALIGN8(
+      pMessage->header.data_len);
+  result = push_to_queue_func(pSockContext, pMessage, priority);
+  if (result != 0) {
+    release_out_message(pSockContext, pMessage);
+  }
+
+  return result;
+}
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/message.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/message.h b/iocore/cluster/message.h
new file mode 100644
index 0000000..d948927
--- /dev/null
+++ b/iocore/cluster/message.h
@@ -0,0 +1,75 @@
+/** @file
+
+  Outgoing cluster message helpers: HelloMessage, the send-queue callback type and message release
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#ifndef _MESSAGE_H_
+#define _MESSAGE_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+
+struct HelloMessage  //version information; presumably exchanged when a connection is set up -- TODO confirm
+{
+  uint32_t major;  //major version
+  uint32_t minor;  //minor version
+  uint32_t min_major;  //presumably the lowest major version accepted from a peer -- TODO confirm
+  uint32_t min_minor;  //presumably the lowest minor version accepted from a peer -- TODO confirm
+};
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef USE_MULTI_ALLOCATOR
+  extern Allocator out_message_allocator;
+#endif
+
+typedef int (*push_to_send_queue_func)(SocketContext *pSockContext, OutMessage *pMessage,
+    const MessagePriority priority);  //enqueue callback; cluster_send_msg_internal_ex() releases the message when it returns non-zero
+
+int cluster_send_msg_internal_ex(const ClusterSession *session,
+    SocketContext *pSockContext, const int func_id,
+	void *data, const int data_len, const MessagePriority priority,
+  push_to_send_queue_func push_to_queue_func);  //data_len < 0: data is an IOBufferBlock chain; otherwise data_len bytes (at most MINI_MESSAGE_SIZE) are copied
+
+/**
+ * Return an OutMessage to its allocator. For an object message that still
+ * references a block chain, the reference is reset to NULL first.
+ * pSockContext is only used when USE_MULTI_ALLOCATOR is defined.
+ */
+inline void release_out_message(SocketContext *pSockContext,
+    OutMessage *msg)
+{
+  const bool holds_blocks =
+    (msg->data_type == DATA_TYPE_OBJECT) && (msg->blocks != NULL);
+  if (holds_blocks) {
+    msg->blocks = NULL;
+  }
+
+#ifdef USE_MULTI_ALLOCATOR
+  pSockContext->out_msg_allocator->free_void(msg);
+#else
+  (void)pSockContext;
+  out_message_allocator.free_void(msg);
+#endif
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/nio.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/nio.cc b/iocore/cluster/nio.cc
new file mode 100644
index 0000000..b699d84
--- /dev/null
+++ b/iocore/cluster/nio.cc
@@ -0,0 +1,1701 @@
+/** @file
+
+  Network I/O for the cluster: worker thread setup, event poll registration,
+  send queue handling and I/O statistics.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/in.h>
+#include <arpa/inet.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#if defined(linux)
+#include <sys/prctl.h>
+#endif
+#include "Diags.h"
+#include "global.h"
+#include "machine.h"
+#include "session.h"
+#include "message.h"
+#include "connection.h"
+#ifndef TS_INLINE
+#define TS_INLINE inline
+#endif
+#include "I_IOBuffer.h"
+#include "I_EventSystem.h"
+#include "P_Cluster.h"
+#include "P_RecCore.h"
+#include "ink_config.h"
+#include "EventPoll.h"
+#include "nio.h"
+
+int cluster_worker_thread_count = 0;  //worker threads created so far, incremented in nio_init()
+WorkerThreadContext *cluster_worker_thread_contexts = NULL;  //one context per worker thread, allocated in nio_init()
+static int read_buffer_size = 2 * 1024 * 1024;  //default value, bound to proxy.config.cluster.read_buffer_size in nio_init()
+
+static ink_mutex worker_thread_lock;  //held by nio_init() while incrementing cluster_worker_thread_count
+
+static void *work_thread_entrance(void* arg);
+static void clear_send_queue(SocketContext * pSockContext, const bool warning);
+
+message_deal_func cluster_msg_deal_func = NULL;  //set by cluster_global_init()
+machine_change_notify_func cluster_machine_change_notify = NULL;  //set by cluster_global_init()
+
+/**
+ * Handles of the stat records returned by RecRegisterStat() in
+ * init_nio_stats(). log_nio_stats() writes new values through them.
+ * The field order must match the positional initializer of nio_records.
+ */
+struct NIORecords {
+  RecRecord * send_retry_count;
+
+  RecRecord * send_wait_time;
+  RecRecord * epoll_wait_count;
+  RecRecord * epoll_wait_time_used;
+  RecRecord * loop_usleep_count;
+  RecRecord * loop_usleep_time;
+  RecRecord * io_loop_interval;
+
+#ifdef DEBUG
+  //only available in DEBUG builds
+  RecRecord * max_write_loop_time_used;
+  RecRecord * max_read_loop_time_used;
+  RecRecord * max_epoll_time_used;
+  RecRecord * max_usleep_time_used;
+  RecRecord * max_callback_time_used;
+#endif
+};
+
+//all handles start as NULL and are filled in by init_nio_stats();
+//seven entries for the regular fields plus five for the DEBUG only fields
+static NIORecords nio_records = {NULL, NULL, NULL, NULL, NULL, NULL, NULL
+#ifdef DEBUG
+  , NULL, NULL, NULL, NULL, NULL
+#endif
+};
+
+//both values below are recalculated by log_nio_stats() from the current
+//send rate and the cluster flow control settings
+static int send_wait_time = 1 * HRTIME_MSECOND;   //write wait time calc by cluster IO
+static int io_loop_interval = 0;  //us
+
+#ifdef DEBUG
+//maximum values published by log_nio_stats() in DEBUG builds.
+//NOTE(review): the code updating them is not in this part of the file --
+//presumably the worker threads; volatile alone does not make updates atomic.
+static volatile int64_t max_write_loop_time_used = 0;
+static volatile int64_t max_read_loop_time_used = 0;
+static volatile int64_t max_epoll_time_used = 0;
+static volatile int64_t max_usleep_time_used = 0;
+static volatile int64_t max_callback_time_used = 0;
+#endif
+
+/**
+ * Fill an iovec array with the readable regions of a chain of buffer blocks.
+ * Blocks without readable data are skipped and do not use an entry.
+ * parameters:
+ *   blocks: the head of the block chain, can be NULL
+ *   iovec: the array to fill
+ *   size: the max number of entries to fill
+ * return: the number of entries filled
+ */
+inline int get_iovec(IOBufferBlock *blocks, IOVec *iovec, int size) {
+  int filled = 0;
+
+  for (IOBufferBlock *blk = blocks; blk != NULL && filled < size; blk = blk->next) {
+    const int64_t avail = blk->read_avail();
+    if (avail <= 0) {
+      continue;
+    }
+
+    iovec[filled].iov_base = blk->_start;
+    iovec[filled].iov_len = avail;
+    filled++;
+  }
+
+  return filled;
+}
+
+/**
+ * Advance the block chain of a message by l bytes.
+ * Blocks that are consumed entirely are unlinked from the head of the chain,
+ * the first block with data left is consumed partially.
+ * parameters:
+ *   pMessage: the message owning the block chain
+ *   l: the number of bytes to skip
+ */
+inline void consume(OutMessage *pMessage, int64_t l) {
+  int64_t left = l;
+
+  while (pMessage->blocks != NULL) {
+    const int64_t avail = pMessage->blocks->read_avail();
+    if (left >= avail) {
+      //head block is used up: drop it and go on with the next one
+      left -= avail;
+      pMessage->blocks = pMessage->blocks->next;
+      continue;
+    }
+
+    //the remaining bytes end inside this block
+    pMessage->blocks->consume(left);
+    return;
+  }
+}
+
+/**
+ * Register the stat records of the cluster I/O layer.
+ * Records registered with RecRegisterStatInt() are updated by name in
+ * log_nio_stats(); for records registered with RecRegisterStat() the
+ * returned handle is kept in nio_records and updated through it.
+ * All records are registered as RECP_NON_PERSISTENT with an initial value of 0.
+ */
+static void init_nio_stats()
+{
+  RecData data_default;
+  memset(&data_default, 0, sizeof(RecData));
+
+  //message and byte counters
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.send_msg_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.drop_msg_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.send_bytes", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.drop_bytes", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.recv_msg_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.recv_bytes", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.enqueue_in_msg_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.enqueue_in_msg_bytes", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.dequeue_in_msg_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.dequeue_in_msg_bytes", 0, RECP_NON_PERSISTENT);
+
+  //system call counters
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.call_writev_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.call_read_count", 0, RECP_NON_PERSISTENT);
+
+  //records whose handles are kept in nio_records
+  nio_records.send_retry_count = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.send_retry_count", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.epoll_wait_count = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.epoll_wait_count", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.epoll_wait_time_used = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.epoll_wait_time_used", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.loop_usleep_count = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.loop_usleep_count", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.loop_usleep_time = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.loop_usleep_time", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.send_wait_time = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.send_wait_time", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.io_loop_interval = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.loop_interval", RECD_INT, data_default, RECP_NON_PERSISTENT);
+
+  //ping, delay, push and failure counters
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.ping_total_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.ping_success_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.ping_time_used", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.send_delayed_time", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.push_msg_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.push_msg_bytes", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.fail_msg_count", 0, RECP_NON_PERSISTENT);
+  RecRegisterStatInt(RECT_PROCESS, "proxy.process.cluster.io.fail_msg_bytes", 0, RECP_NON_PERSISTENT);
+
+#ifdef DEBUG
+  //maximum time records, DEBUG builds only
+  nio_records.max_write_loop_time_used = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.max_write_loop_time_used", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.max_read_loop_time_used = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.max_read_loop_time_used", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.max_epoll_time_used = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.max_epoll_time_used", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.max_usleep_time_used = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.max_usleep_time_used", RECD_INT, data_default, RECP_NON_PERSISTENT);
+  nio_records.max_callback_time_used = RecRegisterStat(RECT_PROCESS,
+      "proxy.process.cluster.io.max_callback_time_used", RECD_INT, data_default, RECP_NON_PERSISTENT);
+#endif
+}
+
+/**
+ * Sum the stats of all worker thread contexts, publish the sums to the
+ * stat records and recalculate send_wait_time and io_loop_interval from
+ * the current send rate.
+ * The nio_records handles are dereferenced without a NULL check, so
+ * init_nio_stats() must have run before this function is called.
+ */
+void log_nio_stats()
+{
+  RecData data;
+  WorkerThreadContext *pThreadContext;
+  WorkerThreadContext *pContextEnd;
+  //one zero for each of the 25 counters summed below
+  SocketStats sum = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+  //state kept between calls for the send rate calculation
+  static time_t last_calc_bps_time = CURRENT_TIME();
+  static int64_t last_send_bytes = 0;
+
+  //the counters are read without taking any lock
+  pContextEnd = cluster_worker_thread_contexts + num_of_cluster_threads;
+  for (pThreadContext=cluster_worker_thread_contexts; pThreadContext<pContextEnd;
+      pThreadContext++)
+  {
+    sum.send_msg_count += pThreadContext->stats.send_msg_count;
+    sum.drop_msg_count += pThreadContext->stats.drop_msg_count;
+    sum.send_bytes += pThreadContext->stats.send_bytes;
+    sum.drop_bytes += pThreadContext->stats.drop_bytes;
+    sum.call_writev_count += pThreadContext->stats.call_writev_count;
+    sum.send_retry_count += pThreadContext->stats.send_retry_count;
+    sum.recv_msg_count += pThreadContext->stats.recv_msg_count;
+    sum.recv_bytes += pThreadContext->stats.recv_bytes;
+    sum.enqueue_in_msg_count += pThreadContext->stats.enqueue_in_msg_count;
+    sum.enqueue_in_msg_bytes += pThreadContext->stats.enqueue_in_msg_bytes;
+    sum.dequeue_in_msg_count += pThreadContext->stats.dequeue_in_msg_count;
+    sum.dequeue_in_msg_bytes += pThreadContext->stats.dequeue_in_msg_bytes;
+    sum.call_read_count += pThreadContext->stats.call_read_count;
+    sum.epoll_wait_count += pThreadContext->stats.epoll_wait_count;
+    sum.epoll_wait_time_used += pThreadContext->stats.epoll_wait_time_used;
+    sum.loop_usleep_count += pThreadContext->stats.loop_usleep_count;
+    sum.loop_usleep_time += pThreadContext->stats.loop_usleep_time;
+    sum.ping_total_count += pThreadContext->stats.ping_total_count;
+    sum.ping_success_count += pThreadContext->stats.ping_success_count;
+    sum.ping_time_used += pThreadContext->stats.ping_time_used;
+    sum.send_delayed_time += pThreadContext->stats.send_delayed_time;
+    sum.push_msg_count += pThreadContext->stats.push_msg_count;
+    sum.push_msg_bytes += pThreadContext->stats.push_msg_bytes;
+    sum.fail_msg_count += pThreadContext->stats.fail_msg_count;
+    sum.fail_msg_bytes += pThreadContext->stats.fail_msg_bytes;
+  }
+
+  //records registered with RecRegisterStatInt() are set by name
+  data.rec_int = sum.send_msg_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.send_msg_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.drop_msg_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.drop_msg_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.send_bytes;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.send_bytes", RECD_INT, &data, NULL);
+  data.rec_int = sum.drop_bytes;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.drop_bytes", RECD_INT, &data, NULL);
+  data.rec_int = sum.recv_msg_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.recv_msg_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.recv_bytes;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.recv_bytes", RECD_INT, &data, NULL);
+  data.rec_int = sum.enqueue_in_msg_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.enqueue_in_msg_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.enqueue_in_msg_bytes;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.enqueue_in_msg_bytes", RECD_INT, &data, NULL);
+  data.rec_int = sum.dequeue_in_msg_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.dequeue_in_msg_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.dequeue_in_msg_bytes;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.dequeue_in_msg_bytes", RECD_INT, &data, NULL);
+  data.rec_int = sum.ping_total_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.ping_total_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.ping_success_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.ping_success_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.ping_time_used;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.ping_time_used", RECD_INT, &data, NULL);
+  data.rec_int = sum.send_delayed_time;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.send_delayed_time", RECD_INT, &data, NULL);
+  data.rec_int = sum.push_msg_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.push_msg_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.push_msg_bytes;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.push_msg_bytes", RECD_INT, &data, NULL);
+  data.rec_int = sum.fail_msg_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.fail_msg_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.fail_msg_bytes;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.fail_msg_bytes", RECD_INT, &data, NULL);
+  data.rec_int = sum.call_writev_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.call_writev_count", RECD_INT, &data, NULL);
+  data.rec_int = sum.call_read_count;
+  RecSetRecord(RECT_PROCESS, "proxy.process.cluster.io.call_read_count", RECD_INT, &data, NULL);
+
+  //records registered with RecRegisterStat() are set through their handles
+  RecDataSetFromInk64(RECD_INT, &nio_records.send_retry_count->data,
+      sum.send_retry_count);
+  RecDataSetFromInk64(RECD_INT, &nio_records.epoll_wait_count->data,
+      sum.epoll_wait_count);
+  RecDataSetFromInk64(RECD_INT, &nio_records.epoll_wait_time_used->data,
+      sum.epoll_wait_time_used);
+  RecDataSetFromInk64(RECD_INT, &nio_records.loop_usleep_count->data,
+      sum.loop_usleep_count);
+  RecDataSetFromInk64(RECD_INT, &nio_records.loop_usleep_time->data,
+      sum.loop_usleep_time);
+
+#ifdef DEBUG
+  RecDataSetFromInk64(RECD_INT, &nio_records.max_write_loop_time_used->data,
+      max_write_loop_time_used);
+  RecDataSetFromInk64(RECD_INT, &nio_records.max_read_loop_time_used->data,
+      max_read_loop_time_used);
+  RecDataSetFromInk64(RECD_INT, &nio_records.max_epoll_time_used->data,
+      max_epoll_time_used);
+  RecDataSetFromInk64(RECD_INT, &nio_records.max_usleep_time_used->data,
+      max_usleep_time_used);
+  RecDataSetFromInk64(RECD_INT, &nio_records.max_callback_time_used->data,
+      max_callback_time_used);
+#endif
+
+  //recalculate the flow control values, skipped when no time has passed
+  //since the last calculation (this also avoids a division by zero)
+  int time_pass = CURRENT_TIME() - last_calc_bps_time;
+  if (time_pass > 0) {
+    double io_busy_ratio;
+    //bytes sent since the last calculation * 8 / elapsed time;
+    //assumes CURRENT_TIME() is in seconds, which makes this bits per second
+    int64_t nio_current_bps = 8 * (sum.send_bytes - last_send_bytes) / time_pass;
+    last_calc_bps_time = CURRENT_TIME();
+    last_send_bytes = sum.send_bytes;
+
+    if (cluster_flow_ctrl_max_bps <= 0) {
+      //no max rate configured: always use the minimum values
+      send_wait_time = cluster_send_min_wait_time * HRTIME_USECOND;
+      io_loop_interval = cluster_min_loop_interval;
+    }
+    else {
+      if (nio_current_bps < cluster_flow_ctrl_min_bps) {
+        //below the min rate: use the minimum values
+        send_wait_time = cluster_send_min_wait_time * HRTIME_USECOND;
+        io_loop_interval = cluster_min_loop_interval;
+      }
+      else {
+        //interpolate linearly between the min and max values by the
+        //ratio of the current rate to the max rate, capped at 1.0
+        io_busy_ratio = (double)nio_current_bps / (double)cluster_flow_ctrl_max_bps;
+        if (io_busy_ratio > 1.0) {
+          io_busy_ratio = 1.0;
+        }
+        send_wait_time = (int)((cluster_send_min_wait_time +
+              (cluster_send_max_wait_time - cluster_send_min_wait_time) *
+              io_busy_ratio)) * HRTIME_USECOND;
+        io_loop_interval = cluster_min_loop_interval + (int)((
+              cluster_max_loop_interval - cluster_min_loop_interval) * io_busy_ratio);
+      }
+      //NOTE(review): the two records below are only updated in this branch;
+      //when cluster_flow_ctrl_max_bps <= 0 they keep their previous value
+      //although send_wait_time and io_loop_interval are set -- confirm
+      //whether that is intended
+      RecDataSetFromInk64(RECD_INT, &nio_records.send_wait_time->data,
+          send_wait_time / HRTIME_USECOND);
+      RecDataSetFromInk64(RECD_INT, &nio_records.io_loop_interval->data,
+          io_loop_interval);
+    }
+  }
+}
+
+/**
+ * Initialize the cluster network I/O layer: bind the read buffer size to
+ * its config record, allocate one WorkerThreadContext per cluster thread
+ * (event poll, active socket array and lock), start the worker threads
+ * and register the stat records.
+ * return: 0 for success, an errno value when the thread count is invalid,
+ *         an allocation or a mutex init fails, or a worker thread can not
+ *         be created
+ * note: resources allocated before a failure are not released here, and
+ *       worker threads that were already started keep running
+ */
+int nio_init()
+{
+  int result;
+  int bytes;
+  int total_connections;
+  int max_connections_per_thread;
+  WorkerThreadContext *pThreadContext;
+  WorkerThreadContext *pContextEnd;
+
+  REC_EstablishStaticConfigInt32(read_buffer_size, "proxy.config.cluster.read_buffer_size");
+  Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+      "read_buffer_size: %d", __LINE__, read_buffer_size);
+
+  //the thread count is used as a divisor below
+  if (num_of_cluster_threads <= 0) {
+    Error("file: " __FILE__ ", line: %d, "
+        "invalid cluster thread count: %d",
+        __LINE__, (int)num_of_cluster_threads);
+    return EINVAL;
+  }
+
+  if ((result=ink_mutex_init(&worker_thread_lock, "worker_lock")) != 0) {
+    return result;
+  }
+
+  bytes = sizeof(WorkerThreadContext) * num_of_cluster_threads;
+  cluster_worker_thread_contexts = (WorkerThreadContext *)malloc(bytes);
+  if (cluster_worker_thread_contexts == NULL) {
+    //save errno first, the logging call may change it
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: " __FILE__ ", line: %d, "
+        "malloc %d bytes fail, errno: %d, error info: %s",
+        __LINE__, bytes, result, strerror(result));
+    return result;
+  }
+  memset(cluster_worker_thread_contexts, 0, bytes);
+
+  //spread the connections to all other machines over the threads, rounding up
+  total_connections = num_of_cluster_connections * (MAX_MACHINE_COUNT - 1);
+  max_connections_per_thread = total_connections / num_of_cluster_threads;
+  if (total_connections % num_of_cluster_threads != 0) {
+    max_connections_per_thread++;
+  }
+
+  cluster_worker_thread_count = 0;
+  pContextEnd = cluster_worker_thread_contexts + num_of_cluster_threads;
+  for (pThreadContext=cluster_worker_thread_contexts; pThreadContext<pContextEnd; pThreadContext++)
+  {
+    pThreadContext->thread_index = (int)(pThreadContext - cluster_worker_thread_contexts);
+    pThreadContext->alloc_size = max_connections_per_thread;
+
+    pThreadContext->ev_poll = new EventPoll(pThreadContext->alloc_size, 1);
+    if (pThreadContext->ev_poll == NULL)
+    {
+      result = errno != 0 ? errno : ENOMEM;
+      Error("file: " __FILE__ ", line: %d, "
+          "new EventPoll fail, errno: %d, error info: %s",
+          __LINE__, result, strerror(result));
+      return result;
+    }
+
+    bytes = sizeof(SocketContext *) * pThreadContext->alloc_size;
+    pThreadContext->active_sockets = (SocketContext **)malloc(bytes);
+    if (pThreadContext->active_sockets == NULL)
+    {
+      result = errno != 0 ? errno : ENOMEM;
+      Error("file: " __FILE__ ", line: %d, "
+          "malloc %d bytes fail, errno: %d, error info: %s",
+          __LINE__, bytes, result, strerror(result));
+      return result;
+    }
+
+    if ((result=ink_mutex_init(&pThreadContext->lock, "context_lock")) != 0)
+    {
+      return result;
+    }
+
+    if (ink_thread_create(work_thread_entrance, pThreadContext) == 0)
+    {
+      result = errno != 0 ? errno : ENOMEM;
+      Error("file: " __FILE__ ", line: %d, "
+          "create thread failed, startup threads: %d, "
+          "errno: %d, error info: %s",
+          __LINE__, cluster_worker_thread_count,
+          result, strerror(result));
+      //report the failure to the caller instead of returning success
+      return result;
+    }
+
+    //lock failures are only logged, the counter is updated anyway
+    if ((result=ink_mutex_acquire(&worker_thread_lock)) != 0) {
+      Error("file: " __FILE__ ", line: %d, "
+          "call ink_mutex_acquire fail, "
+          "errno: %d, error info: %s",
+          __LINE__, result, strerror(result));
+    }
+    cluster_worker_thread_count++;
+    if ((result=ink_mutex_release(&worker_thread_lock)) != 0) {
+      Error("file: " __FILE__ ", line: %d, "
+          "call ink_mutex_release fail, "
+          "errno: %d, error info: %s",
+          __LINE__, result, strerror(result));
+    }
+  }
+
+  init_nio_stats();
+
+  return 0;
+}
+
+/**
+ * Destroy the worker thread lock.
+ * The per thread contexts allocated by nio_init() are not released here.
+ * return: always 0
+ */
+int nio_destroy()
+{
+  ink_mutex_destroy(&worker_thread_lock);
+  return 0;
+}
+
+/**
+ * Store the two callbacks of the cluster user in the global hooks
+ * cluster_msg_deal_func and cluster_machine_change_notify.
+ * parameters:
+ *   deal_func: the message deal callback
+ *   machine_change_notify: the machine change notify callback
+ * return: always 0
+ */
+int cluster_global_init(message_deal_func deal_func,
+    machine_change_notify_func machine_change_notify)
+{
+  cluster_machine_change_notify = machine_change_notify;
+  cluster_msg_deal_func = deal_func;
+
+  return 0;
+}
+
+//allocate a new receive buffer of len bytes for the reader and set
+//current / buff_end to the start / end of it; msg_header is not changed.
+//both arguments are expanded more than once, so they must be free of
+//side effects
+#define ALLOC_READER_BUFFER(reader, len) \
+  do { \
+    (reader).buffer = new_RecvBuffer(len); \
+    (reader).current = (reader).buffer->_data; \
+    (reader).buff_end = (reader).buffer->_data + (len); \
+  } while (0)
+
+//allocate a new receive buffer of len bytes for the reader and set
+//msg_header and current to the start of it, buff_end to the end of it.
+//both arguments are expanded more than once, so they must be free of
+//side effects
+#define INIT_READER(reader, len) \
+  do { \
+    (reader).buffer = new_RecvBuffer(len); \
+    (reader).current = (reader).msg_header = (reader).buffer->_data; \
+    (reader).buff_end = (reader).msg_header + (len); \
+  } while (0)
+
+//switch the reader of the socket context to a new buffer of
+//read_buffer_size bytes and copy the first msg_bytes bytes of the current
+//message (starting at the old msg_header) to the start of the new buffer.
+//oldBuffer keeps a reference to the old buffer until the copy is done.
+//the caller must make sure that msg_bytes fits into the new buffer.
+//both arguments are expanded more than once, so they must be free of
+//side effects
+#define MOVE_TO_NEW_BUFFER(pSockContext, msg_bytes) \
+  do { \
+    Ptr<IOBufferData> oldBuffer; \
+    char *old_msg_header; \
+    oldBuffer = (pSockContext)->reader.buffer; \
+    old_msg_header = (pSockContext)->reader.msg_header; \
+    INIT_READER((pSockContext)->reader, read_buffer_size); \
+    memcpy((pSockContext)->reader.current, old_msg_header, (msg_bytes)); \
+    (pSockContext)->reader.current += (msg_bytes); \
+    oldBuffer = NULL; \
+  } while (0)
+
+
+/**
+ * Set one SOL_SOCKET buffer size option of a socket.
+ * parameters:
+ *   sock: the socket
+ *   optname: SO_SNDBUF or SO_RCVBUF
+ *   bytes: the buffer size to set
+ * return: 0 for success, an errno value on failure
+ */
+static int set_one_socket_buff_size(int sock, int optname, int bytes)
+{
+  int result;
+
+  if (setsockopt(sock, SOL_SOCKET, optname,
+        (char *)&bytes, sizeof(bytes)) < 0)
+  {
+    //save errno first, the logging call may change it
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: " __FILE__ ", line: %d, "
+        "setsockopt failed, errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+    return result;
+  }
+
+  return 0;
+}
+
+/**
+ * Apply the configured send and receive buffer sizes to a socket.
+ * A size <= 0 leaves the corresponding option untouched.
+ * parameters:
+ *   sock: the socket
+ * return: 0 for success, an errno value when setsockopt fails
+ */
+static int set_socket_rw_buff_size(int sock)
+{
+  int result;
+
+  if (cluster_send_buffer_size > 0) {
+    result = set_one_socket_buff_size(sock, SO_SNDBUF,
+        cluster_send_buffer_size);
+    if (result != 0) {
+      return result;
+    }
+  }
+
+  if (cluster_receive_buffer_size > 0) {
+    result = set_one_socket_buff_size(sock, SO_RCVBUF,
+        cluster_receive_buffer_size);
+    if (result != 0) {
+      return result;
+    }
+  }
+
+  return 0;
+}
+
+/**
+ * Append a socket context to the active socket array of its worker thread.
+ * The array has room for alloc_size entries (see nio_init()); the add is
+ * refused when it is full instead of writing past the end of the array.
+ * parameters:
+ *   pSockContext: the socket context to add
+ * return: 0 for success, ENOSPC when the array is full
+ */
+static int add_to_active_sockets(SocketContext *pSockContext)
+{
+  int result;
+
+  ink_mutex_acquire(&pSockContext->thread_context->lock);
+  if (pSockContext->thread_context->active_sock_count >=
+      pSockContext->thread_context->alloc_size)
+  {
+    Error("file: " __FILE__ ", line: %d, "
+        "active sockets of thread #%d exceed the max count: %d",
+        __LINE__, (int)pSockContext->thread_context->thread_index,
+        (int)pSockContext->thread_context->alloc_size);
+    result = ENOSPC;
+  }
+  else {
+    pSockContext->thread_context->active_sockets[
+      pSockContext->thread_context->active_sock_count] = pSockContext;
+    pSockContext->thread_context->active_sock_count++;
+    result = 0;
+  }
+  ink_mutex_release(&pSockContext->thread_context->lock);
+
+  return result;
+}
+
+/**
+ * Remove a socket context from the active socket array of its worker
+ * thread. The entries behind it are moved one slot forward, so the order
+ * of the remaining entries is kept.
+ * parameters:
+ *   pSockContext: the socket context to remove
+ * return: 0 for success, ENOENT when the socket context is not in the array
+ */
+static int remove_from_active_sockets(SocketContext *pSockContext)
+{
+  int result;
+  SocketContext **ppFound;
+  SocketContext **ppEnd;
+
+  ink_mutex_acquire(&pSockContext->thread_context->lock);
+
+  //linear search for the entry
+  ppFound = pSockContext->thread_context->active_sockets;
+  ppEnd = ppFound + pSockContext->thread_context->active_sock_count;
+  while (ppFound < ppEnd && *ppFound != pSockContext) {
+    ppFound++;
+  }
+
+  if (ppFound < ppEnd) {
+    //close the gap
+    for (; ppFound + 1 < ppEnd; ppFound++) {
+      *ppFound = *(ppFound + 1);
+    }
+    pSockContext->thread_context->active_sock_count--;
+    result = 0;
+  }
+  else {
+    Error("file: "__FILE__", line: %d, "
+        "socket context for %s not found!", __LINE__,
+        pSockContext->machine->hostname);
+    result = ENOENT;
+  }
+
+  ink_mutex_release(&pSockContext->thread_context->lock);
+
+  return result;
+}
+
+/**
+ * Prepare a connected socket context for I/O and hand it to its worker
+ * thread: reset the send queues and the ping / write timers, allocate the
+ * read buffer, apply the socket buffer sizes, register the socket context
+ * with its machine, attach the socket to the event poll for reading and
+ * add it to the active socket array.
+ * parameters:
+ *   pSockContext: the socket context of the connected socket
+ * return: 0 for success, an errno value on failure; on failure the
+ *         registration with the machine is rolled back
+ */
+int nio_add_to_epoll(SocketContext *pSockContext)
+{
+  int result;
+
+  /*
+     Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+     "%s:%d nio_add_to_epoll", __LINE__, pSockContext->machine->hostname,
+     pSockContext->machine->cluster_port);
+     */
+
+  pSockContext->connected_time = CURRENT_TIME();
+  //drop messages still queued, with a warning if there are any
+  clear_send_queue(pSockContext, true);
+
+  pSockContext->queue_index = 0;
+  pSockContext->ping_start_time = 0;
+  pSockContext->ping_fail_count = 0;
+  pSockContext->next_write_time = CURRENT_NS() + send_wait_time;
+  pSockContext->next_ping_time = CURRENT_NS() + cluster_ping_send_interval;
+
+  INIT_READER(pSockContext->reader, read_buffer_size);
+  pSockContext->reader.recv_body_bytes = 0;
+
+  //a failure to set the buffer sizes is logged by the callee and ignored here
+  set_socket_rw_buff_size(pSockContext->sock);
+  init_machine_sessions(pSockContext->machine, false);
+  add_machine_sock_context(pSockContext);
+
+  if (pSockContext->thread_context->ev_poll->attach(pSockContext->sock,
+        EVENTIO_READ, pSockContext) < 0)
+  {
+    //save errno first, the calls below may change it
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: " __FILE__ ", line: %d, "
+        "event poll attach fail, errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+    remove_machine_sock_context(pSockContext);  //rollback
+    return result;
+  }
+
+  if ((result=add_to_active_sockets(pSockContext)) != 0) {
+    //rollback: the socket must not stay in the event poll when it is not
+    //in the active socket array
+    pSockContext->thread_context->ev_poll->detach(pSockContext->sock);
+    remove_machine_sock_context(pSockContext);
+  }
+
+  return result;
+}
+
+/**
+ * Release all messages waiting in the send queues of a socket context and
+ * add them to the drop stats of its worker thread.
+ * parameters:
+ *   pSockContext: the socket context
+ *   warning: true to log released messages as a warning, false to log
+ *            them as debug output
+ */
+static void clear_send_queue(SocketContext * pSockContext, const bool warning)
+{
+  int i;
+  int count;
+  int64_t drop_bytes;
+  OutMessage *msg;
+  MessageQueue *send_queue;
+
+  count = 0;
+  drop_bytes = 0;
+  for (i=0; i<PRIORITY_COUNT; i++) {
+    send_queue = pSockContext->send_queues + i;
+    ink_mutex_acquire(&send_queue->lock);
+    //the version is increased once for each queue, under the queue lock
+    pSockContext->version++;
+    while (send_queue->head != NULL) {
+      msg = send_queue->head;
+      send_queue->head = send_queue->head->next;
+      drop_bytes += MSG_HEADER_LENGTH + msg->header.aligned_data_len;
+      release_out_message(pSockContext, msg);
+      count++;
+    }
+    send_queue->tail = NULL;
+    ink_mutex_release(&send_queue->lock);
+  }
+
+  if (count > 0) {
+    char buff[256];
+    //bounded: __FILE__ and the hostname can be longer than the buffer
+    snprintf(buff, sizeof(buff), "file: " __FILE__ ", line: %d, "
+        "release %s:%d message count: %d",
+        __LINE__, pSockContext->machine->hostname,
+        pSockContext->machine->cluster_port, count);
+    if (warning) {
+      Warning("%s", buff);
+    }
+    else {
+      Debug(CLUSTER_DEBUG_TAG, "%s", buff);
+    }
+
+    pSockContext->thread_context->stats.drop_msg_count += count;
+    pSockContext->thread_context->stats.drop_bytes += drop_bytes;
+  }
+}
+
+/**
+ * Close the socket of a socket context and clean up after it: detach it
+ * from the event poll, close it, remove the socket context from the active
+ * socket array and from its machine, release the read buffers and the
+ * queued messages and notify that the connection is closed. A client type
+ * socket context is then reconnected, an accepted one is freed.
+ * parameters:
+ *   pSockContext: the socket context to close
+ * return: 0 for success, an errno value when the detach fails
+ * NOTE(review): when the detach fails the socket is neither closed nor
+ * cleaned up -- confirm that the callers handle this case.
+ */
+static int close_socket(SocketContext * pSockContext)
+{
+  int result;
+
+  if (pSockContext->thread_context->ev_poll->detach(pSockContext->sock) < 0) {
+    //save errno first, the logging call may change it
+    result = errno != 0 ? errno : ENOMEM;
+    Error("file: " __FILE__ ", line: %d, "
+        "event poll detach fail, errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+    return result;
+  }
+  close(pSockContext->sock);
+  pSockContext->sock = -1;
+
+  remove_from_active_sockets(pSockContext);
+  machine_remove_connection(pSockContext);
+
+  pSockContext->reader.blocks = NULL;
+  pSockContext->reader.buffer = NULL;
+
+  clear_send_queue(pSockContext, false);
+  notify_connection_closed(pSockContext);
+
+  if (pSockContext->connect_type == CONNECT_TYPE_CLIENT) {
+    make_connection(pSockContext);
+  }
+  else {
+    free_accept_sock_context(pSockContext);
+  }
+
+  return 0;
+}
+
+/**
+ * Queue a ping request for the peer of a socket context.
+ * The request has no body, is sent with high priority and is enqueued
+ * with insert_into_send_queue_head.
+ * parameters:
+ *   pSockContext: the socket context to ping through
+ * return: the result of cluster_send_msg_internal_ex()
+ */
+inline static int send_ping_message(SocketContext *pSockContext)
+{
+  ClusterSession ping_session;
+
+  //ping message do NOT care session id
+  ping_session.fields.seq = 0;   //just use 0
+  ping_session.fields.timestamp = CURRENT_TIME();
+  ping_session.fields.ip = my_machine_ip;
+
+  return cluster_send_msg_internal_ex(&ping_session, pSockContext,
+      FUNC_ID_CLUSTER_PING_REQUEST, NULL, 0, PRIORITY_HIGH,
+      insert_into_send_queue_head);
+}
+
+static int deal_write_event(SocketContext * pSockContext)
+{
+#define BUFF_TYPE_HEADER    'H'
+#define BUFF_TYPE_DATA      'D'
+#define BUFF_TYPE_PADDING   'P'
+
+  MessageQueue *send_queue;
+  struct iovec write_vec[WRITEV_ARRAY_SIZE];
+  struct {
+    int priority;
+    int index;     //message index
+    int  buff_type;  //message data or header
+  } msg_indexes[WRITEV_ARRAY_SIZE];
+
+  struct {
+    OutMessage *send_msgs[WRITEV_ITEM_ONCE];
+    OutMessage *done_msgs[WRITEV_ITEM_ONCE];
+    OutMessage **pDoneMsgs;
+    int msg_count;
+    int done_count;
+  } msgs[PRIORITY_COUNT];
+
+  OutMessage *msg;
+  int write_bytes;
+  int remain_len;
+  int priority;
+  int start;
+  int total_msg_count;
+  int vec_count;
+  int total_bytes;
+  int total_done_count;
+  int result;
+  int i, k;
+  bool fetch_done;
+  bool last_msg_complete;
+
+  msgs[0].msg_count = msgs[1].msg_count = msgs[2].msg_count = 0;
+  total_msg_count = 0;
+  vec_count = 0;
+  total_bytes = 0;
+
+  priority = pSockContext->queue_index;
+  if (pSockContext->queue_index == 0) {
+    start = 1;  //only loop 3 times
+  }
+  else {
+    start = 0;  //need loop 4 times
+  }
+
+  last_msg_complete = false;
+  fetch_done = false;
+  for (i=start; i<=PRIORITY_COUNT; i++) {
+    send_queue = pSockContext->send_queues + priority;
+    ink_mutex_acquire(&send_queue->lock);
+    msg = send_queue->head;
+    if (pSockContext->queue_index > 0 &&
+        i == pSockContext->queue_index + 1)
+    {
+      if (msg != NULL) {
+        msg = msg->next;  //should skip to next for the first already consumed
+      }
+    }
+    while (msg != NULL) {
+      if (msg->bytes_sent < MSG_HEADER_LENGTH) {  //should send header
+        write_vec[vec_count].iov_base = ((char *)&msg->header) +
+          msg->bytes_sent;
+        write_vec[vec_count].iov_len = MSG_HEADER_LENGTH -
+          msg->bytes_sent;
+        total_bytes += write_vec[vec_count].iov_len;
+        msg_indexes[vec_count].priority = priority;
+        msg_indexes[vec_count].buff_type = BUFF_TYPE_HEADER;
+        msg_indexes[vec_count].index = msgs[priority].msg_count;
+        vec_count++;
+
+        remain_len = msg->header.aligned_data_len;
+      }
+      else {
+        remain_len = (msg->header.aligned_data_len + MSG_HEADER_LENGTH) -
+          msg->bytes_sent;
+      }
+
+      if (remain_len > 0) {
+        int pad_len;
+        int remain_data_len;
+        pad_len = msg->header.aligned_data_len - msg->header.data_len;
+        remain_data_len = remain_len - pad_len;
+        if (remain_data_len > 0) {
+          if (msg->data_type == DATA_TYPE_OBJECT) {
+            int read_count;
+            int64_t read_bytes;
+
+            read_count = get_iovec(msg->blocks, write_vec + vec_count,
+                WRITEV_ARRAY_SIZE - 1 -  vec_count);
+            read_bytes = 0;
+            for (k=0; k<read_count; k++) {
+              read_bytes += write_vec[vec_count].iov_len;
+              msg_indexes[vec_count].priority = priority;
+              msg_indexes[vec_count].buff_type = BUFF_TYPE_DATA;
+              msg_indexes[vec_count].index = msgs[priority].msg_count;
+              vec_count++;
+            }
+            //assert(read_bytes <= remain_data_len);
+
+            total_bytes += read_bytes;
+            last_msg_complete = read_bytes == remain_data_len;
+          }
+          else {
+            write_vec[vec_count].iov_base = msg->mini_buff +
+              (msg->header.data_len - remain_data_len);
+            write_vec[vec_count].iov_len = remain_data_len;
+            total_bytes += write_vec[vec_count].iov_len;
+            msg_indexes[vec_count].priority = priority;
+            msg_indexes[vec_count].buff_type = BUFF_TYPE_DATA;
+            msg_indexes[vec_count].index = msgs[priority].msg_count;
+            vec_count++;
+            last_msg_complete = true;
+          }
+        }
+        else {  //no more data
+          last_msg_complete = true;
+        }
+
+        if (pad_len > 0 && last_msg_complete) {
+          write_vec[vec_count].iov_base = pSockContext->padding;
+          write_vec[vec_count].iov_len = (remain_data_len > 0) ?
+            pad_len : remain_len;
+          total_bytes += write_vec[vec_count].iov_len;
+          msg_indexes[vec_count].priority = priority;
+          msg_indexes[vec_count].buff_type = BUFF_TYPE_PADDING;
+          msg_indexes[vec_count].index = msgs[priority].msg_count;
+          vec_count++;
+        }
+      }
+      else {
+        last_msg_complete = true;
+      }
+
+      msgs[priority].send_msgs[msgs[priority].msg_count++] = msg;
+      total_msg_count++;
+
+      /*
+         Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+         "%s:%d sending msg, data body: %d, msg send bytes: %d, total_bytes: %d",
+         __LINE__,
+         pSockContext->machine->hostname,
+         pSockContext->machine->cluster_port,
+         msg->header.data_len,
+         msg->bytes_sent, total_bytes);
+         */
+      if (total_msg_count == WRITEV_ITEM_ONCE ||
+          vec_count >= WRITEV_ARRAY_SIZE - 2 ||
+          total_bytes >= WRITE_MAX_COMBINE_BYTES)
+      {
+        fetch_done = true;
+        break;
+      }
+      if (i == 0) {  //fetch only one, the head message
+        break;
+      }
+      msg = msg->next;
+    }
+    ink_mutex_release(&send_queue->lock);
+
+    if (fetch_done) {
+      break;
+    }
+
+    if (i == 0) {
+      priority = 0;  //next should start from first priority
+    }
+    else {
+      priority++;
+    }
+  }
+
+  /*
+     Debug(CLUSTER_DEBUG_TAG, "==wwwwww==file: " __FILE__ ", line: %d, "
+     "%s:%d total_bytes: %d, vec_count: %d, total_msg_count: %d", __LINE__,
+     pSockContext->machine->hostname,
+     pSockContext->machine->cluster_port,
+     total_bytes, vec_count, total_msg_count);
+     */
+
+  if (vec_count == 0) {
+    return EAGAIN;
+  }
+
+  pSockContext->thread_context->stats.send_retry_count += total_msg_count;
+  pSockContext->thread_context->stats.call_writev_count++;
+  write_bytes = writev(pSockContext->sock, write_vec, vec_count);
+  if (write_bytes == 0) {   //connection closed
+    Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+        "write to %s fail, connection closed",
+        __LINE__, pSockContext->machine->hostname);
+    return ECONNRESET;
+  }
+  else if (write_bytes < 0) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK) {
+      return EAGAIN;
+    }
+    else if (errno == EINTR) {  //should try again
+      Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__ ", line: %d, "
+          "write to %s fail, errno: %d, error info: %s",
+          __LINE__, pSockContext->machine->hostname,
+          errno, strerror(errno));
+      return 0;
+    }
+    else {
+      result = errno != 0 ? errno : EIO;
+      Error("file: "__FILE__", line: %d, "
+          "write to %s fail, errno: %d, error info: %s",
+          __LINE__, pSockContext->machine->hostname,
+          result, strerror(result));
+      return result;
+    }
+  }
+
+  pSockContext->thread_context->stats.send_bytes += write_bytes;
+  if (write_bytes == total_bytes && fetch_done) {  //send done and have more message to send
+    result = 0;
+  }
+  else {
+    result = EAGAIN;
+  }
+
+  if (write_bytes == total_bytes && last_msg_complete) {  //all done
+    for (i=0; i<PRIORITY_COUNT; i++) {
+      msgs[i].pDoneMsgs = msgs[i].send_msgs;
+      msgs[i].done_count = msgs[i].msg_count;
+    }
+
+    total_done_count = total_msg_count;
+    pSockContext->queue_index = 0;
+  }
+  else {
+    int vi;
+    int remain_bytes;
+    int done_index;
+
+    for (i=0; i<PRIORITY_COUNT; i++) {
+      msgs[i].pDoneMsgs = msgs[i].done_msgs;
+      msgs[i].done_count = 0;
+    }
+    total_done_count = 0;
+
+    remain_bytes = write_bytes;
+    for (vi=0; vi<vec_count; vi++) {
+      remain_bytes -= write_vec[vi].iov_len;
+      msg = msgs[msg_indexes[vi].priority].send_msgs[msg_indexes[vi].index];
+
+      if (remain_bytes >= 0) {
+        if (msg->data_type == DATA_TYPE_OBJECT &&
+            msg_indexes[vi].buff_type == BUFF_TYPE_DATA)
+        {
+          consume(msg, write_vec[vi].iov_len);
+        }
+        msg->bytes_sent += write_vec[vi].iov_len;
+
+        if (msg->bytes_sent >= MSG_HEADER_LENGTH + msg->header.aligned_data_len) {
+          total_done_count++;
+          done_index = msgs[msg_indexes[vi].priority].done_count++;
+          msgs[msg_indexes[vi].priority].done_msgs[done_index] = msg;
+        }
+      }
+      else {
+        if (msg->data_type == DATA_TYPE_OBJECT &&
+            msg_indexes[vi].buff_type == BUFF_TYPE_DATA)
+        {
+          consume(msg, remain_bytes + write_vec[vi].iov_len);
+        }
+        msg->bytes_sent += remain_bytes + write_vec[vi].iov_len;
+
+        break;
+      }
+    }
+
+    if (vi < vec_count) {
+      pSockContext->queue_index = msg_indexes[vi].priority;  //the first not done msg
+    }
+    else {
+      pSockContext->queue_index = msg_indexes[vi - 1].priority;  //the first not done msg
+    }
+
+    if (total_done_count == 0) {
+      return result;
+    }
+  }
+  pSockContext->thread_context->stats.send_msg_count += total_done_count;
+
+  for (i=0; i<PRIORITY_COUNT; i++) {
+    if (msgs[i].done_count == 0) {
+      continue;
+    }
+
+    send_queue = pSockContext->send_queues + i;
+    ink_mutex_acquire(&send_queue->lock);
+    send_queue->head = msgs[i].pDoneMsgs[msgs[i].done_count - 1]->next;
+    if (send_queue->head == NULL) {
+      send_queue->tail = NULL;
+    }
+    ink_mutex_release(&send_queue->lock);
+  }
+
+  for (i=0; i<PRIORITY_COUNT; i++) {
+    for (k=0; k<msgs[i].done_count; k++) {
+      msg = msgs[i].pDoneMsgs[k];
+#ifdef MSG_TIME_STAT_FLAG
+      MachineSessions *pMachineSessions;
+      SessionEntry *pSessionEntry;
+      if (get_response_session_internal(&msg->header,
+            &pMachineSessions, &pSessionEntry) == 0)
+      {
+        int session_index = msg->header.session_id.fields.seq %
+          max_session_count_per_machine;
+        SESSION_LOCK(pMachineSessions, session_index);
+
+        if (!(msg->header.session_id.fields.ip == my_machine_ip))
+        {  //request by other
+          if (pSessionEntry->server_start_time != 0) {
+            ink_atomic_increment(&pMachineSessions->msg_stat.count, 1);
+            ink_atomic_increment(&pMachineSessions->msg_stat.time_used,
+                CURRENT_NS() - pSessionEntry->server_start_time);
+            pSessionEntry->server_start_time = 0;
+          }
+        }
+
+        if (pSessionEntry->send_start_time != 0) {
+          ink_atomic_increment(&pMachineSessions->msg_send.count, 1);
+          ink_atomic_increment(&pMachineSessions->msg_send.time_used,
+              (CURRENT_NS() - pSessionEntry->send_start_time));
+          pSessionEntry->send_start_time = 0;
+        }
+
+        SESSION_UNLOCK(pMachineSessions, session_index);
+      }
+#endif
+
+
+      /*
+         Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+         "%s:%d send msg done, data body: %d, send bytes: %d",
+         __LINE__,
+         pSockContext->machine->hostname,
+         pSockContext->machine->cluster_port,
+         msgs[i].pDoneMsgs[k]->header.data_len,
+         msgs[i].pDoneMsgs[k]->bytes_sent);
+         */
+
+      pSockContext->thread_context->stats.send_delayed_time +=
+        CURRENT_NS() - msg->in_queue_time;
+      release_out_message(pSockContext, msg);
+    }
+  }
+
+  return result;
+}
+/**
+ * Handle one completely received cluster message.
+ *
+ * Ping requests are answered at once with a PRIORITY_HIGH ping response and
+ * ping responses only update the socket's ping statistics; neither kind is
+ * passed on. Every other message is matched to its session through
+ * get_response_session() and is then either given to cluster_msg_deal_func
+ * or queued with push_in_message(), depending on the call_func flag that the
+ * lookup fills in.
+ *
+ * @param pHeader      header of the received message
+ * @param pSockContext socket the message arrived on
+ * @param blocks       block chain forwarded unchanged as the message body
+ * @return 0 on success; for a ping request the result of
+ *         cluster_send_msg_internal_ex(); otherwise the non-zero result of
+ *         get_response_session() when the session lookup fails
+ */
+static int deal_message(MsgHeader *pHeader, SocketContext *
+    pSockContext, IOBufferBlock *blocks)
+{
+  int result;
+  bool call_func;  //output of get_response_session()
+  MachineSessions *pMachineSessions;
+  SessionEntry *pSessionEntry;
+  void *user_data;  //output of get_response_session()
+  int64_t time_used;
+
+  /*
+     Debug(CLUSTER_DEBUG_TAG, "file: "__FILE__", line: %d, "
+     "func_id: %d, data length: %d, recv_msg_count: %"PRId64"", __LINE__,
+     pHeader->func_id, data_len, count + 1);
+     */
+
+  //deal internal ping message first
+  if (pHeader->func_id == FUNC_ID_CLUSTER_PING_REQUEST) {
+    time_used = CURRENT_TIME() - pHeader->session_id.fields.timestamp;
+    if (time_used > 1) {
+      Warning("cluster recv client %s ping, sock: #%d, time pass: %d s",
+          pSockContext->machine->hostname, pSockContext->sock,
+          (int)time_used);
+    }
+    return cluster_send_msg_internal_ex(&pHeader->session_id,
+        pSockContext, FUNC_ID_CLUSTER_PING_RESPONSE, NULL, 0,
+        PRIORITY_HIGH, insert_into_send_queue_head);
+  }
+  else if (pHeader->func_id == FUNC_ID_CLUSTER_PING_RESPONSE) {
+    if (pSockContext->ping_start_time > 0) {
+      time_used = CURRENT_NS() - pSockContext->ping_start_time;
+      pSockContext->thread_context->stats.ping_success_count++;
+      pSockContext->thread_context->stats.ping_time_used += time_used;
+      if (time_used > cluster_ping_latency_threshold) {
+        Warning("cluster server %s, sock: #%d ping response time: %d us > threshold: %d us",
+            pSockContext->machine->hostname, pSockContext->sock,
+            (int)(time_used / HRTIME_USECOND),
+            (int)(cluster_ping_latency_threshold / HRTIME_USECOND));
+      }
+      pSockContext->ping_start_time = 0;  //reset start time
+    }
+    else {
+      Warning("unexpect cluster server %s ping response, sock: #%d, time used: %d s",
+          pSockContext->machine->hostname, pSockContext->sock,
+          (int)(CURRENT_TIME() - pHeader->session_id.fields.timestamp));
+    }
+    //a response, expected or not, clears the accumulated ping failures
+    if (pSockContext->ping_fail_count > 0) {
+      pSockContext->ping_fail_count = 0;  //reset fail count
+    }
+
+    return 0;
+  }
+  //every other func_id belongs to a session: look it up
+  result = get_response_session(pHeader, &pMachineSessions,
+      &pSessionEntry, pSockContext, &call_func, &user_data);
+  if (result != 0) {
+    /*
+       if (pHeader->session_id.fields.ip != my_machine_ip) {  //request by other
+       cluster_send_msg_internal_ex(&pHeader->session_id, pSockContext,
+       FUNC_ID_CONNECTION_CLOSED_NOTIFY, NULL, 0, PRIORITY_HIGH,
+       push_to_send_queue);
+       }
+       */
+
+    return result;
+  }
+
+#ifdef MSG_TIME_STAT_FLAG
+  if ((pHeader->session_id.fields.ip == my_machine_ip)) {  //request by me
+    int session_index = pHeader->session_id.fields.seq %
+      max_session_count_per_machine;
+    SESSION_LOCK(pMachineSessions, session_index);
+    if (pSessionEntry->client_start_time != 0) {
+      ink_atomic_increment(&pMachineSessions->msg_stat.count, 1);
+      ink_atomic_increment(&pMachineSessions->msg_stat.time_used,
+          CURRENT_NS() - pSessionEntry->client_start_time);
+      pSessionEntry->client_start_time = 0;
+    }
+    SESSION_UNLOCK(pMachineSessions, session_index);
+  }
+#endif
+  //deliver: direct callback when call_func is set, otherwise queue it
+  if (call_func) {
+#ifdef DEBUG
+    int64_t deal_start_time = CURRENT_NS();
+#endif
+
+    cluster_msg_deal_func(pHeader->session_id, user_data,
+        pHeader->func_id, blocks, pHeader->data_len);
+
+#ifdef DEBUG
+    int64_t time_used = CURRENT_NS() - deal_start_time;  //shadows the outer time_used
+    if (time_used > max_callback_time_used) {
+      max_callback_time_used = time_used;
+    }
+#endif
+  }
+  else {
+    push_in_message(pHeader->session_id, pMachineSessions, pSessionEntry,
+        pHeader->func_id, blocks, pHeader->data_len);
+  }
+
+  return 0;
+}
+
+/**
+ * Wrap the next current_body_bytes of the reader's buffer in a new
+ * IOBufferBlock and attach it to the end of pReader->blocks.
+ *
+ * The first block of a message starts right behind the message header;
+ * every later block starts at offset 0 of the current buffer.
+ */
+inline static void append_to_blocks(ReaderManager *pReader,
+    const int current_body_bytes)
+{
+  IOBufferBlock *fresh;
+  IOBufferBlock *last;
+
+  if (pReader->blocks != NULL) {
+    //follow-up block: body data begins at the start of the buffer
+    fresh = new_IOBufferBlock(pReader->buffer, current_body_bytes, 0);
+    fresh->_buf_end = fresh->_end;
+
+    //walk to the final block of the chain and hook the new one behind it
+    for (last = pReader->blocks; last->next != NULL; last = last->next) {
+    }
+    last->next = fresh;
+    return;
+  }
+
+  //first block: skip over the message header that precedes the body
+  pReader->blocks = new_IOBufferBlock(
+      pReader->buffer, current_body_bytes,
+      (pReader->msg_header + MSG_HEADER_LENGTH)
+      - pReader->buffer->_data);
+  pReader->blocks->_buf_end = pReader->blocks->_end;
+}
+/**
+ * Read once from the socket into the reader buffer and dispatch every
+ * message that has become complete.
+ *
+ * After a successful read() the loop walks the buffered bytes: it waits
+ * until a whole header is present, rejects an aligned_data_len above
+ * MAX_MSG_LENGTH, and once the (aligned) body is complete passes the message
+ * to deal_message(). When a body cannot be finished inside the space left in
+ * the current buffer, the bytes received so far are either carried over with
+ * MOVE_TO_NEW_BUFFER or chained onto reader.blocks via append_to_blocks()
+ * followed by ALLOC_READER_BUFFER (both macros are defined elsewhere).
+ *
+ * @param pSockContext socket whose reader state is advanced
+ * @return 0 when this read filled the buffer up to buff_end, or when read()
+ *         was interrupted (EINTR) - the caller keeps calling while 0 is
+ *         returned; EAGAIN when there is nothing more to do for now;
+ *         ECONNRESET when read() returns 0; ENOSPC for an oversized message;
+ *         EINVAL for a bad magic number (CHECK_MAGIC_NUMBER builds) or a
+ *         negative func_id message that does not fit in one block; otherwise
+ *         the errno reported by read()
+ */
+static int deal_read_event(SocketContext *pSockContext)
+{
+  int result;
+  int read_bytes;
+  MsgHeader *pHeader;
+
+  pSockContext->thread_context->stats.call_read_count++;
+  read_bytes = read(pSockContext->sock, pSockContext->reader.current,
+      pSockContext->reader.buff_end - pSockContext->reader.current);
+  /*
+     Note("======file: " __FILE__ ", line: %d, "
+     "sock: #%d, %s:%d remain bytes: %"PRId64", recv bytes: %d, errno: %d", __LINE__,
+     pSockContext->sock, pSockContext->machine->hostname,
+     pSockContext->machine->cluster_port,
+     pSockContext->reader.buff_end - pSockContext->reader.current,
+     read_bytes, errno);
+     */
+  if (read_bytes == 0) {
+    Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+        "type: %c, read from %s fail, connection #%d closed", __LINE__,
+        pSockContext->connect_type, pSockContext->machine->hostname,
+        pSockContext->sock);
+    return ECONNRESET;
+  }
+  else if (read_bytes < 0) {
+    if (errno == EAGAIN || errno == EWOULDBLOCK) {
+      return EAGAIN;
+    }
+    else if (errno == EINTR) {  //should try again
+      Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+          "read from %s fail, errno: %d, error info: %s",
+          __LINE__, pSockContext->machine->hostname,
+          errno, strerror(errno));
+      return 0;
+    }
+    else {
+      result = errno != 0 ? errno : EIO;
+      Error("file: " __FILE__ ", line: %d, "
+          "read from %s fail, errno: %d, error info: %s",
+          __LINE__, pSockContext->machine->hostname,
+          result, strerror(result));
+      return result;
+    }
+  }
+
+  pSockContext->thread_context->stats.recv_bytes += read_bytes;
+  pSockContext->reader.current += read_bytes;
+  result = pSockContext->reader.buff_end - pSockContext->reader.current
+    == 0 ? 0 : EAGAIN;
+  //result is 0 only when the buffer is now full up to buff_end, else EAGAIN
+  //current is the fix buffer
+  while (1) {
+    int msg_bytes;
+    int recv_body_bytes;
+    int current_true_body_bytes;
+    int padding_body_bytes;
+    int padding_len;
+    bool bFirstBlock;
+
+    if (pSockContext->reader.blocks == NULL) { //first data block
+      msg_bytes = pSockContext->reader.current -
+        pSockContext->reader.msg_header;
+      if (msg_bytes < MSG_HEADER_LENGTH) //expect whole msg header
+      {
+        if ((pSockContext->reader.buff_end -
+              pSockContext->reader.current) < 4 * 1024)  //under 4KB of room left
+        {
+          if (msg_bytes > 0) {  //remain bytes should be copied
+            MOVE_TO_NEW_BUFFER(pSockContext, msg_bytes);
+          }
+          else {
+            INIT_READER(pSockContext->reader, read_buffer_size);
+          }
+        }
+
+        return result;
+      }
+
+      recv_body_bytes  = msg_bytes - MSG_HEADER_LENGTH;
+      bFirstBlock = true;
+    }
+    else {   //other data block, starting from buffer start
+      msg_bytes = pSockContext->reader.current -
+        pSockContext->reader.buffer->_data;
+      recv_body_bytes = pSockContext->reader.recv_body_bytes + msg_bytes;
+      bFirstBlock = false;
+    }
+
+    pHeader = (MsgHeader *)pSockContext->reader.msg_header;
+#ifdef CHECK_MAGIC_NUMBER
+    if (pHeader->magic != MAGIC_NUMBER) {
+      Error("file: "__FILE__", line: %d, "
+          "magic number: %08x != %08x",
+          __LINE__, pHeader->magic, MAGIC_NUMBER);
+      return EINVAL;
+    }
+#endif
+
+    if (pHeader->aligned_data_len > MAX_MSG_LENGTH) {
+      Error("file: "__FILE__", line: %d, "
+          "message length: %d is too large, exceeds: %d",
+          __LINE__, pHeader->aligned_data_len, MAX_MSG_LENGTH);
+      return ENOSPC;
+    }
+
+#ifdef MSG_TIME_STAT_FLAG
+    if (!(pHeader->session_id.fields.ip == my_machine_ip))
+    {  //request by other
+      MachineSessions *pMachineSessions;
+      SessionEntry *pSessionEntry;
+      if (get_response_session_internal(pHeader,
+            &pMachineSessions, &pSessionEntry) == 0)
+      {
+        int session_index = pHeader->session_id.fields.seq %
+          max_session_count_per_machine;
+        SESSION_LOCK(pMachineSessions, session_index);
+        if (pSessionEntry->server_start_time == 0) {
+          pSessionEntry->server_start_time = CURRENT_NS();
+        }
+        SESSION_UNLOCK(pMachineSessions, session_index);
+      }
+    }
+#endif
+
+    if (recv_body_bytes < pHeader->aligned_data_len) {  //msg not done
+      if (recv_body_bytes + (pSockContext->reader.buff_end - 
+            pSockContext->reader.current) >= pHeader->aligned_data_len)
+      {  //remain buffer is enough
+        return result;
+      }
+      //body bytes counted beyond what reader.recv_body_bytes already records
+      padding_body_bytes = recv_body_bytes - pSockContext->
+        reader.recv_body_bytes;
+      int recv_padding_len = recv_body_bytes - pHeader->data_len;
+      if (recv_padding_len > 0) {  //should remove padding bytes
+        current_true_body_bytes = padding_body_bytes - recv_padding_len;
+      }
+      else {
+        current_true_body_bytes = padding_body_bytes;
+      }
+
+      //must be only one block
+      if (pHeader->func_id < 0) {
+        if (!bFirstBlock) {
+          Error("file: "__FILE__", line: %d, "
+              "func_id: %d, data length: %d too large exceeds %d",
+              __LINE__, pHeader->func_id, pHeader->data_len,
+              (int)(read_buffer_size - MSG_HEADER_LENGTH));
+          return EINVAL;
+        }
+
+        MOVE_TO_NEW_BUFFER(pSockContext, msg_bytes);
+        return result;
+      }
+
+      if (pSockContext->reader.buff_end - pSockContext->reader.current >=
+          4 * 1024)
+      { //use remain data buffer
+        return result;
+      }
+
+      if (recv_body_bytes % ALIGN_BYTES != 0) { //must be aligned
+        Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+            "recv_body_bytes: %d (%X) should be aligned with %d", __LINE__,
+            recv_body_bytes, recv_body_bytes, ALIGN_BYTES);
+        ink_release_assert(pSockContext->reader.current < pSockContext->reader.buff_end);
+        return result;
+      }
+
+      if (current_true_body_bytes > 0) { //should alloc new buffer
+        append_to_blocks(&pSockContext->reader, current_true_body_bytes);
+      }
+      pSockContext->reader.recv_body_bytes = recv_body_bytes;
+
+      if (bFirstBlock) {
+        if (current_true_body_bytes > 0) {  //should keep the msg_header
+          ALLOC_READER_BUFFER(pSockContext->reader, read_buffer_size);
+        }
+        else { //no data yet!
+          MOVE_TO_NEW_BUFFER(pSockContext, msg_bytes);
+        }
+      }
+      else {  //should keep the msg_header
+        ALLOC_READER_BUFFER(pSockContext->reader, read_buffer_size);
+      }
+
+      return result;
+    }
+    //the whole aligned body is available: work out this buffer's share of it
+    if (bFirstBlock) {
+      padding_body_bytes = pHeader->aligned_data_len;
+    }
+    else {
+      padding_body_bytes = pHeader->aligned_data_len -
+        pSockContext->reader.recv_body_bytes;
+    }
+    padding_len = pHeader->aligned_data_len - pHeader->data_len;
+    if (padding_len > 0) {
+      if (padding_body_bytes > padding_len) {
+        current_true_body_bytes = padding_body_bytes - padding_len;
+      }
+      else {
+        current_true_body_bytes = 0;
+      }
+    }
+    else {  //no padding bytes
+      current_true_body_bytes = padding_body_bytes;
+    }
+
+    if (current_true_body_bytes > 0) {
+      append_to_blocks(&pSockContext->reader, current_true_body_bytes);
+    }
+
+    pSockContext->thread_context->stats.recv_msg_count++;
+    deal_message(pHeader, pSockContext, pSockContext->reader.blocks);  //return value is not checked
+
+    pSockContext->reader.blocks = NULL;  //free memory pointer
+    if (pSockContext->reader.recv_body_bytes > 0) {
+      pSockContext->reader.recv_body_bytes = 0;
+    }
+    //advance msg_header past the message just handled, then loop for the next
+    if (bFirstBlock) {
+      pSockContext->reader.msg_header += MSG_HEADER_LENGTH + padding_body_bytes;
+    }
+    else {  //other block, no msg header
+      pSockContext->reader.msg_header = pSockContext->reader.buffer->_data +
+        padding_body_bytes;
+    }
+  }
+
+  return result;  //not reached: the loop above only leaves via return
+}
+
+/**
+ * Service the first `count` ready entries reported by the thread's poller.
+ *
+ * A socket flagged with EVENTIO_ERROR is closed straight away. Any other
+ * socket is read with deal_read_event() for as long as that returns 0, and
+ * is closed when the final result is something other than EAGAIN.
+ */
+inline static void deal_epoll_events(WorkerThreadContext *
+    pThreadContext, const int count)
+{
+  for (int idx = 0; idx < count; idx++) {
+    const int ready = pThreadContext->ev_poll->getEvents(idx);
+    SocketContext *sock_ctx =
+      (SocketContext *)pThreadContext->ev_poll->getData(idx);
+
+    if (ready & EVENTIO_ERROR) {
+      Debug(CLUSTER_DEBUG_TAG, "file: " __FILE__ ", line: %d, "
+          "connection %s %s:%d closed", __LINE__,
+          sock_ctx->connect_type == CONNECT_TYPE_CLIENT ? "to" : "from",
+          sock_ctx->machine->hostname, sock_ctx->machine->cluster_port);
+      close_socket(sock_ctx);
+    }
+    else {
+      int rc;
+
+      //drain the socket: 0 means "call me again"
+      do {
+        rc = deal_read_event(sock_ctx);
+      } while (rc == 0);
+
+      if (rc != EAGAIN) {  //anything but "would block" is fatal for the socket
+        close_socket(sock_ctx);
+      }
+    }
+  }
+}
+/**
+ * Run one write pass over every active socket of this worker thread.
+ *
+ * Sockets whose next_write_time lies in the future are skipped. For the
+ * others, an outstanding ping older than cluster_ping_latency_threshold is
+ * counted as a failure; when no ping is outstanding a new one is sent once
+ * next_ping_time is reached (only if cluster_ping_send_interval > 0).
+ * deal_write_event() is then called until it returns non-zero: EAGAIN
+ * postpones the socket by send_wait_time, any other value marks the socket
+ * as failed. Failed sockets are collected and closed after the scan.
+ *
+ * At most MAX_SOCK_CONTEXT_COUNT failures are recorded per pass; sockets
+ * failing beyond that limit are not closed by this call.
+ *
+ * @param pThreadContext worker thread whose active_sockets are serviced
+ */
+inline static void schedule_sock_write(WorkerThreadContext * pThreadContext)
+{
+#define MAX_SOCK_CONTEXT_COUNT 32
+  int result;
+  int fail_count;
+  int64_t current_time;
+  SocketContext **ppSockContext;
+  SocketContext **ppContextEnd;
+  SocketContext *failSockContexts[MAX_SOCK_CONTEXT_COUNT];
+
+  fail_count = 0;
+  current_time = CURRENT_NS();
+  ppContextEnd = pThreadContext->active_sockets +
+    pThreadContext->active_sock_count;
+  for (ppSockContext = pThreadContext->active_sockets;
+      ppSockContext < ppContextEnd; ppSockContext++)
+  {
+    if (current_time < (*ppSockContext)->next_write_time) {
+      continue;
+    }
+    //ping bookkeeping: judge the outstanding ping, or send a new one
+    if ((*ppSockContext)->ping_start_time > 0) { //ping message already sent
+      if (current_time - (*ppSockContext)->ping_start_time > cluster_ping_latency_threshold) {
+        (*ppSockContext)->ping_start_time = 0;  //reset start time when done
+        (*ppSockContext)->ping_fail_count++;
+        if ((*ppSockContext)->ping_fail_count > cluster_ping_retries) {
+          if (fail_count < MAX_SOCK_CONTEXT_COUNT) {
+            Error("ping cluster server %s timeout more than %d times, close socket #%d",
+                (*ppSockContext)->machine->hostname, cluster_ping_retries,
+                (*ppSockContext)->sock);
+            failSockContexts[fail_count++] = *ppSockContext;
+          }
+          continue;  //no write attempt for a socket that exhausted its ping retries
+        }
+        else {
+          Warning("ping cluster server %s timeout, sock: #%d, fail count: %d",
+              (*ppSockContext)->machine->hostname, (*ppSockContext)->sock,
+              (*ppSockContext)->ping_fail_count);
+        }
+      }
+    }
+    else {
+      if (cluster_ping_send_interval > 0 && current_time >=
+          (*ppSockContext)->next_ping_time)
+      {
+        (*ppSockContext)->thread_context->stats.ping_total_count++;
+        (*ppSockContext)->ping_start_time = current_time;
+        (*ppSockContext)->next_ping_time = current_time + cluster_ping_send_interval;
+        send_ping_message(*ppSockContext);
+      }
+    }
+    //flush: deal_write_event() returns 0 while it wants to be called again
+    while ((result=deal_write_event(*ppSockContext)) == 0) {
+    }
+
+    if (result == EAGAIN) {
+      (*ppSockContext)->next_write_time = current_time + send_wait_time;
+    }
+    else {  //error
+      if (fail_count < MAX_SOCK_CONTEXT_COUNT) {
+        failSockContexts[fail_count++] = *ppSockContext;
+      }
+    }
+  }
+
+  if (fail_count == 0) {
+    return;
+  }
+  //NOTE(review): closing is deferred, presumably because close_socket() alters active_sockets - confirm
+  ppContextEnd = failSockContexts + fail_count;
+  for (ppSockContext = failSockContexts; ppSockContext < ppContextEnd;
+      ppSockContext++)
+  {
+    close_socket(*ppSockContext);
+  }
+}
+
+/**
+ * Sample the wall clock with gettimeofday(), store the value (scaled with
+ * HRTIME_SECOND / HRTIME_USECOND) in Thread::cur_time and return it.
+ */
+inline static int64_t get_current_time()
+{
+  struct timeval now;
+
+  gettimeofday(&now, NULL);
+  Thread::cur_time = now.tv_usec * HRTIME_USECOND +
+    now.tv_sec * HRTIME_SECOND;
+
+  return Thread::cur_time;
+}
+
+/*
+ * Raise the running maximum `v` to the time elapsed since deal_start_time,
+ * then restart the interval at the current time.
+ *
+ * The macro reads and writes the caller's locals deal_start_time,
+ * deal_end_time and time_used, so all three must be in scope at the call
+ * site. `v` is parenthesized so that any lvalue expression can be passed.
+ */
+#define GET_MAX_TIME_USED(v) \
+  do { \
+    deal_end_time = get_current_time(); \
+    time_used = deal_end_time - deal_start_time; \
+    if (time_used > (v)) { \
+      (v) = time_used; \
+    } \
+    deal_start_time = deal_end_time; \
+  } while (0)
+
+
+/**
+ * Entry point of a cluster worker thread; loops forever.
+ *
+ * Each iteration runs schedule_sock_write(), polls for socket events,
+ * dispatches them through deal_epoll_events(), and then sleeps for what is
+ * left of io_loop_interval (microseconds), provided the remainder is at
+ * least MIN_USLEEP_TIME. Under DEBUG the longest time spent in each phase is
+ * tracked in the max_*_time_used variables.
+ *
+ * A poll failure other than EINTR is fatal (ink_fatal).
+ *
+ * @param arg the WorkerThreadContext of this thread
+ * @return NULL; not reached in practice because the loop has no exit
+ */
+static void *work_thread_entrance(void* arg)
+{
+#define MIN_USLEEP_TIME 100
+
+  int result;
+  int count;
+  int poll_errno;
+  int remain_time;
+  int64_t loop_start_time;
+  int64_t deal_start_time;
+#ifdef DEBUG
+  int64_t deal_end_time;
+  int64_t time_used;
+#endif
+  WorkerThreadContext *pThreadContext;
+
+  pThreadContext = (WorkerThreadContext *)arg;
+
+#if defined(HAVE_SYS_PRCTL_H) && defined(PR_SET_NAME)
+  char name[32];
+  //bounded formatting: never write past the end of name
+  snprintf(name, sizeof(name), "[ET_CLUSTER %d]", (int)(pThreadContext -
+        cluster_worker_thread_contexts) + 1);
+  prctl(PR_SET_NAME, name, 0, 0, 0);
+#endif
+
+  while (1) {
+    loop_start_time = get_current_time();
+#ifdef DEBUG
+    deal_start_time = loop_start_time;
+#endif
+
+    schedule_sock_write(pThreadContext);
+
+#ifdef DEBUG
+    GET_MAX_TIME_USED(max_write_loop_time_used);
+#endif
+
+#ifndef DEBUG
+    deal_start_time = CURRENT_NS();
+#endif
+    pThreadContext->stats.epoll_wait_count++;
+    count = pThreadContext->ev_poll->poll();
+    //capture errno now: the timing calls below are allowed to overwrite it
+    poll_errno = errno;
+    pThreadContext->stats.epoll_wait_time_used += CURRENT_NS() - deal_start_time;
+#ifdef DEBUG
+    GET_MAX_TIME_USED(max_epoll_time_used);
+#endif
+
+    if (count == 0) { //timeout
+    }
+    else if (count < 0) {
+      if (poll_errno != EINTR) {
+        ink_fatal(1, "file: " __FILE__ ", line: %d, "
+            "call event poll fail, "
+            "errno: %d, error info: %s\n",
+            __LINE__, poll_errno, strerror(poll_errno));
+      }
+    }
+    else {
+      deal_epoll_events(pThreadContext, count);
+
+#ifdef DEBUG
+      GET_MAX_TIME_USED(max_read_loop_time_used);
+#endif
+    }
+
+    //pace the loop: sleep away the unused part of io_loop_interval
+    if (io_loop_interval > MIN_USLEEP_TIME) {
+      remain_time = io_loop_interval - (int)((CURRENT_NS() -
+            loop_start_time) / HRTIME_USECOND);
+      if (remain_time >= MIN_USLEEP_TIME && remain_time <= io_loop_interval) {
+        pThreadContext->stats.loop_usleep_count++;
+        pThreadContext->stats.loop_usleep_time += remain_time;
+        usleep(remain_time);
+
+#ifdef DEBUG
+        GET_MAX_TIME_USED(max_usleep_time_used);
+#endif
+      }
+    }
+  }
+
+  //not reached while the loop above has no exit; kept as the shutdown path
+  if ((result=ink_mutex_acquire(&worker_thread_lock)) != 0)
+  {
+    Error("file: " __FILE__ ", line: %d, "
+        "call ink_mutex_acquire fail, "
+        "errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+  }
+  cluster_worker_thread_count--;
+  if ((result=ink_mutex_release(&worker_thread_lock)) != 0)
+  {
+    Error("file: " __FILE__ ", line: %d, "
+        "call ink_mutex_release fail, "
+        "errno: %d, error info: %s",
+        __LINE__, result, strerror(result));
+  }
+
+  return NULL;
+}
+
+/**
+ * Append pMessage to the tail of the socket's send queue for `priority`.
+ *
+ * The message is refused with EINVAL when sessionVersion differs from the
+ * socket context's version or when the socket descriptor is negative; the
+ * fail_msg_* counters are bumped in that case, the push_msg_* counters
+ * otherwise.
+ *
+ * @return 0 when queued, EINVAL when refused
+ */
+int push_to_send_queue(SocketContext *pSockContext, OutMessage *pMessage,
+    const MessagePriority priority, const uint32_t sessionVersion)
+{
+  int rejected = 0;
+
+  ink_mutex_acquire(&pSockContext->send_queues[priority].lock);
+
+  //validate under the queue lock
+  if (pSockContext->version != sessionVersion) {
+    Debug(CLUSTER_DEBUG_TAG, "session version: %u != socket context version: %d!",
+        sessionVersion, pSockContext->version);
+    rejected = EINVAL;
+  }
+  else if (pSockContext->sock < 0) {
+    Debug(CLUSTER_DEBUG_TAG, "sock context is invalid");
+    rejected = EINVAL;
+  }
+
+  if (rejected == 0) {
+    //link behind the current tail, or start the list when it is empty
+    if (pSockContext->send_queues[priority].head != NULL) {
+      pSockContext->send_queues[priority].tail->next = pMessage;
+    }
+    else {
+      pSockContext->send_queues[priority].head = pMessage;
+    }
+    pSockContext->send_queues[priority].tail = pMessage;
+    ink_mutex_release(&pSockContext->send_queues[priority].lock);
+
+    ink_atomic_increment(&pSockContext->thread_context->stats.push_msg_count, 1);
+    ink_atomic_increment(&pSockContext->thread_context->stats.push_msg_bytes,
+        MSG_HEADER_LENGTH + pMessage->header.aligned_data_len);
+    return 0;
+  }
+
+  //refused: unlock first, then account for the failure
+  ink_mutex_release(&pSockContext->send_queues[priority].lock);
+
+  ink_atomic_increment(&pSockContext->thread_context->stats.fail_msg_count, 1);
+  ink_atomic_increment(&pSockContext->thread_context->stats.fail_msg_bytes,
+      MSG_HEADER_LENGTH + pMessage->header.aligned_data_len);
+  return rejected;
+}
+
+/**
+ * Queue pMessage as close to the front of the `priority` send queue as
+ * possible.
+ *
+ * The message becomes the new head unless the current head already has
+ * bytes_sent != 0; in that case it is linked directly behind that head.
+ * Always returns 0 and bumps the push_msg_* counters.
+ */
+int insert_into_send_queue_head(SocketContext *pSockContext, OutMessage *pMessage,
+    const MessagePriority priority)
+{
+  ink_mutex_acquire(&pSockContext->send_queues[priority].lock);
+
+  if (pSockContext->send_queues[priority].head == NULL) {
+    //empty queue: the message is both first and last
+    pSockContext->send_queues[priority].tail = pMessage;
+    pSockContext->send_queues[priority].head = pMessage;
+  }
+  else if (pSockContext->send_queues[priority].head->bytes_sent != 0) {
+    //head is partly sent: slot the message in right behind it
+    pMessage->next = pSockContext->send_queues[priority].head->next;
+    pSockContext->send_queues[priority].head->next = pMessage;
+    if (pMessage->next == NULL) {
+      pSockContext->send_queues[priority].tail = pMessage;
+    }
+  }
+  else {
+    //head message not send yet: the new message takes its place
+    pMessage->next = pSockContext->send_queues[priority].head;
+    pSockContext->send_queues[priority].head = pMessage;
+  }
+
+  ink_mutex_release(&pSockContext->send_queues[priority].lock);
+
+  ink_atomic_increment(&pSockContext->thread_context->stats.push_msg_count, 1);
+  ink_atomic_increment(&pSockContext->thread_context->stats.push_msg_bytes,
+      MSG_HEADER_LENGTH + pMessage->header.aligned_data_len);
+
+  return 0;
+}
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/nio.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/nio.h b/iocore/cluster/nio.h
new file mode 100644
index 0000000..94a3fdb
--- /dev/null
+++ b/iocore/cluster/nio.h
@@ -0,0 +1,60 @@
+/** @file
+
+  Interface of the cluster socket I/O layer: worker thread contexts,
+  epoll registration and the per-socket send queues.
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#ifndef _NIO_H_
+#define _NIO_H_
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include "types.h"
+#include "clusterinterface.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern WorkerThreadContext *cluster_worker_thread_contexts;
+extern int cluster_worker_thread_count;
+
+extern message_deal_func cluster_msg_deal_func;
+extern machine_change_notify_func cluster_machine_change_notify;
+
+int nio_init();
+int nio_destroy();
+
+int nio_add_to_epoll(SocketContext *pSockContext);
+int push_to_send_queue(SocketContext *pSockContext, OutMessage *pMessage,
+    const MessagePriority priority, const uint32_t sessionVersion);  //tail append; EINVAL when version differs or sock < 0
+/* Puts pMessage at the front of the `priority` queue, or directly behind the
+ * current head when that head has bytes_sent != 0. Unlike
+ * push_to_send_queue() it checks neither version nor socket. Returns 0. */
+int insert_into_send_queue_head(SocketContext *pSockContext, OutMessage *pMessage,
+    const MessagePriority priority);
+
+void log_nio_stats();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
+


[4/6] refine the codes of cluster

Posted by we...@apache.org.
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/ClusterConfig.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/ClusterConfig.cc b/iocore/cluster/ClusterConfig.cc
index a541364..b969f44 100644
--- a/iocore/cluster/ClusterConfig.cc
+++ b/iocore/cluster/ClusterConfig.cc
@@ -28,6 +28,9 @@
 ****************************************************************************/
 
 #include "P_Cluster.h"
+#include "machine.h"
+#include "connection.h"
+
 // updated from the cluster port configuration variable
 int cluster_port = DEFAULT_CLUSTER_PORT_NUMBER;
 
@@ -155,16 +158,30 @@ ClusterAccept::ClusterAcceptMachine(NetVConnection * NetVC)
 }
 
 static void
-make_cluster_connections(MachineList * l)
+make_cluster_connections(MachineList * l, MachineList * old)
 {
   //
   // Connect to all new machines.
   //
-  uint32_t ip = this_cluster_machine()->ip;
-  int num_connections = this_cluster_machine()->num_connections;
+  //uint32_t ip = this_cluster_machine()->ip;
+  //int num_connections = this_cluster_machine()->num_connections;
 
+  int i;
+  int k;
+  ClusterMachine *m;
   if (l) {
-    for (int i = 0; i < l->n; i++) {
+    for (i = 0; i < l->n; i++) {
+      struct in_addr in;
+      in.s_addr = l->machine[i].ip;
+      m = add_machine(l->machine[i].ip, l->machine[i].port);
+      if (m != NULL) {
+        machine_make_connections(m);
+      }
+
+      Debug(CL_NOTE, "do connect hostname: %u.%u.%u.%u:%d, %s, cluster_machine_count: %d\n",
+          DOT_SEPARATED(l->machine[i].ip), l->machine[i].port, inet_ntoa(in), cluster_machine_count);
+
+    /*
 #ifdef LOCAL_CLUSTER_TEST_MODE
       if (ip < l->machine[i].ip || (ip == l->machine[i].ip && (cluster_port < l->machine[i].port))) {
 #else
@@ -175,6 +192,48 @@ make_cluster_connections(MachineList * l)
         }
       }
     }
+    */
+    }
+  }
+
+  if (old == NULL) {
+    return;
+  }
+
+  //found down machines
+  if (l == NULL) {
+    for (i = 0; i < old->n; i++) {
+      struct in_addr in;
+      in.s_addr = old->machine[i].ip;
+      Debug(CL_NOTE, "stop connect hostname: %u.%u.%u.%u:%d, %s\n",
+          DOT_SEPARATED(old->machine[i].ip), old->machine[i].port, inet_ntoa(in));
+      m = get_machine(old->machine[i].ip, old->machine[i].port);
+      if (m != NULL) {
+        machine_stop_reconnect(m);
+      }
+    }
+  }
+  else {
+    for (i = 0; i < old->n; i++) {
+      for (k = 0; k < l->n; k++) {
+        if (l->machine[k].ip == old->machine[i].ip &&
+            l->machine[k].port == old->machine[i].port)
+        {
+          break;
+        }
+      }
+
+      if (k == l->n) {  //not found, machine down
+        struct in_addr in;
+        in.s_addr = old->machine[i].ip;
+        Debug(CL_NOTE, "stop connect hostname: %u.%u.%u.%u:%d, %s\n",
+            DOT_SEPARATED(old->machine[i].ip), old->machine[i].port, inet_ntoa(in));
+        m = get_machine(old->machine[i].ip, old->machine[i].port);
+        if (m != NULL) {
+          machine_stop_reconnect(m);
+        }
+      }
+    }
   }
 }
 
@@ -201,7 +260,7 @@ machine_config_change(const char * /* name ATS_UNUSED */, RecDataT /* data_type
   case CLUSTER_CONFIG:
     old = cluster_config;
     cluster_config = l;
-    make_cluster_connections(l);
+    make_cluster_connections(l, old);
     break;
   }
 #else
@@ -209,7 +268,7 @@ machine_config_change(const char * /* name ATS_UNUSED */, RecDataT /* data_type
   old = cluster_config;
   machines_config = l;
   cluster_config = l;
-  make_cluster_connections(l);
+  make_cluster_connections(l, old);
 #endif
   if (old)
     free_MachineList(old);
@@ -291,8 +350,10 @@ configuration_add_machine(ClusterConfiguration * c, ClusterMachine * m)
   // Build a new cluster configuration with the new machine.
   // Machines are stored in ip sorted order.
   //
+  /*
   EThread *thread = this_ethread();
   ProxyMutex *mutex = thread->mutex;
+  */
   int i = 0;
   ClusterConfiguration *cc = NEW(new ClusterConfiguration(*c));
 
@@ -319,7 +380,7 @@ configuration_add_machine(ClusterConfiguration * c, ClusterMachine * m)
 
   build_cluster_hash_table(cc);
   INK_MEMORY_BARRIER;           // commit writes before freeing old hash table
-  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CONFIGURATION_CHANGES_STAT);
+  //CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CONFIGURATION_CHANGES_STAT);
 
   free_configuration(c, cc);
   return cc;
@@ -328,9 +389,6 @@ configuration_add_machine(ClusterConfiguration * c, ClusterMachine * m)
 ClusterConfiguration *
 configuration_remove_machine(ClusterConfiguration * c, ClusterMachine * m)
 {
-  EThread *thread = this_ethread();
-  ProxyMutex *mutex = thread->mutex;
-
   //
   // Build a new cluster configuration without a machine
   //
@@ -350,7 +408,7 @@ configuration_remove_machine(ClusterConfiguration * c, ClusterMachine * m)
 
   build_cluster_hash_table(cc);
   INK_MEMORY_BARRIER;           // commit writes before freeing old hash table
-  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CONFIGURATION_CHANGES_STAT);
+  //CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CONFIGURATION_CHANGES_STAT);
 
   free_configuration(c, cc);
   return cc;

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/ClusterMachine.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/ClusterMachine.cc b/iocore/cluster/ClusterMachine.cc
index 00e99e6..1ac4fae 100644
--- a/iocore/cluster/ClusterMachine.cc
+++ b/iocore/cluster/ClusterMachine.cc
@@ -77,9 +77,9 @@ ClusterMachine::ClusterMachine(char *ahostname, unsigned int aip, int aport)
     msg_proto_minor(0),
     clusterHandlers(0)
 {
-  EThread *thread = this_ethread();
-  ProxyMutex *mutex = thread->mutex;
-  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_MACHINES_ALLOCATED_STAT);
+  //EThread *thread = this_ethread();
+  //ProxyMutex *mutex = thread->mutex;
+  //CLUSTER_INCREMENT_DYN_STAT(CLUSTER_MACHINES_ALLOCATED_STAT);
   if (!aip) {
     char localhost[1024];
     if (!ahostname) {
@@ -166,7 +166,7 @@ ClusterHandler *ClusterMachine::pop_ClusterHandler(int no_rr)
 ClusterMachine::~ClusterMachine()
 {
   ats_free(hostname);
-  ats_free(clusterHandlers);
+  // ats_free(clusterHandlers);
 }
 
 struct MachineTimeoutContinuation;
@@ -193,10 +193,10 @@ struct MachineTimeoutContinuation: public Continuation
 void
 free_ClusterMachine(ClusterMachine * m)
 {
-  EThread *thread = this_ethread();
-  ProxyMutex *mutex = thread->mutex;
+  //EThread *thread = this_ethread();
+  //ProxyMutex *mutex = thread->mutex;
   // delay before the final free
-  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_MACHINES_FREED_STAT);
+  //CLUSTER_INCREMENT_DYN_STAT(CLUSTER_MACHINES_FREED_STAT);
   m->dead = true;
   eventProcessor.schedule_in(NEW(new MachineTimeoutContinuation(m)), MACHINE_TIMEOUT, ET_CALL);
 }

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/ClusterProcessor.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/ClusterProcessor.cc b/iocore/cluster/ClusterProcessor.cc
index b01e0ff..ab6c0a0 100644
--- a/iocore/cluster/ClusterProcessor.cc
+++ b/iocore/cluster/ClusterProcessor.cc
@@ -28,21 +28,34 @@
 ****************************************************************************/
 
 #include "P_Cluster.h"
+#include "global.h"
+#include "connection.h"
+
 /*************************************************************************/
 // ClusterProcessor member functions (Public class)
 /*************************************************************************/
 int cluster_port_number = DEFAULT_CLUSTER_PORT_NUMBER;
 int cache_clustering_enabled = 0;
 int num_of_cluster_threads = DEFAULT_NUMBER_OF_CLUSTER_THREADS;
+int num_of_cluster_connections = 0;
 
 ClusterProcessor clusterProcessor;
 RecRawStatBlock *cluster_rsb = NULL;
 int ET_CLUSTER;
 
+void cluster_main_handler(ClusterSession session, void *context,
+    const int func_id, IOBufferBlock *data, const int data_len);
 ClusterProcessor::ClusterProcessor():accept_handler(NULL), this_cluster(NULL)
 {
 }
 
+
+//void cluster_error_handler(int event, void *arg);
+//void cluster_main_handler(ClusterSession *session, const int func_id,
+//  void *data, const int data_len) {
+//  ClusterRPC[func_id](session, data, data_len);
+//}
+
 ClusterProcessor::~ClusterProcessor()
 {
   if (accept_handler) {
@@ -55,98 +68,111 @@ int
 ClusterProcessor::internal_invoke_remote(ClusterHandler *ch, int cluster_fn,
                                          void *data, int len, int options, void *cmsg)
 {
-  EThread *thread = this_ethread();
-  ProxyMutex *mutex = thread->mutex;
-  //
-  // RPC facility for intercluster communication available to other
-  //  subsystems.
-  //
-  bool steal = (options & CLUSTER_OPT_STEAL ? true : false);
-  bool delay = (options & CLUSTER_OPT_DELAY ? true : false);
-  bool data_in_ocntl = (options & CLUSTER_OPT_DATA_IS_OCONTROL ? true : false);
-  bool malloced = (cluster_fn == CLUSTER_FUNCTION_MALLOCED);
-  OutgoingControl *c;
-
-  if (!ch || (!malloced && !((unsigned int) cluster_fn < (uint32_t) SIZE_clusterFunction))) {
-    // Invalid message or node is down, free message data
-    if (malloced) {
-      ats_free(data);
-    }
-    if (cmsg) {
-      invoke_remote_data_args *args = (invoke_remote_data_args *)
-        (((OutgoingControl *) cmsg)->data + sizeof(int32_t));
-      ink_assert(args->magicno == invoke_remote_data_args::MagicNo);
+//  EThread *thread = this_ethread();
+//  ProxyMutex *mutex = thread->mutex;
+//  //
+//  // RPC facility for intercluster communication available to other
+//  //  subsystems.
+//  //
+//  bool steal = (options & CLUSTER_OPT_STEAL ? true : false);
+//  bool delay = (options & CLUSTER_OPT_DELAY ? true : false);
+//  bool data_in_ocntl = (options & CLUSTER_OPT_DATA_IS_OCONTROL ? true : false);
+//  bool malloced = (cluster_fn == CLUSTER_FUNCTION_MALLOCED);
+//  OutgoingControl *c;
+//
+//  if (!ch || (!malloced && !((unsigned int) cluster_fn < (uint32_t) SIZE_clusterFunction))) {
+//    // Invalid message or node is down, free message data
+//    if (malloced) {
+//      ats_free(data);
+//    }
+//    if (cmsg) {
+//      invoke_remote_data_args *args = (invoke_remote_data_args *)
+//        (((OutgoingControl *) cmsg)->data + sizeof(int32_t));
+//      ink_assert(args->magicno == invoke_remote_data_args::MagicNo);
+//
+//      args->data_oc->freeall();
+//      ((OutgoingControl *) cmsg)->freeall();
+//    }
+//    if (data_in_ocntl) {
+//      c = *((OutgoingControl **) ((char *) data - sizeof(OutgoingControl *)));
+//      c->freeall();
+//    }
+//    return -1;
+//  }
+//
+//  if (data_in_ocntl) {
+//    c = *((OutgoingControl **) ((char *) data - sizeof(OutgoingControl *)));
+//  } else {
+//    c = OutgoingControl::alloc();
+//  }
+//  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CTRL_MSGS_SENT_STAT);
+//  c->submit_time = ink_get_hrtime();
+//  if ((c->zero_body = zero_body)) {
+//    c->free_proc = &CacheContinuation::disposeOfDataBuffer;
+//    c->free_proc_arg = cc;
+//  }
+//
+//  if (malloced) {
+//    c->set_data((char *) data, len);
+//  } else {
+//    if (!data_in_ocntl) {
+//      c->len = len + sizeof(int32_t);
+//      c->alloc_data();
+//    }
+//    if (!c->fast_data()) {
+//      CLUSTER_INCREMENT_DYN_STAT(CLUSTER_SLOW_CTRL_MSGS_SENT_STAT);
+//    }
+//    *(int32_t *) c->data = cluster_fn;
+//    if (!data_in_ocntl) {
+//      memcpy(c->data + sizeof(int32_t), data, len);
+//    }
+//  }
+//
+//  SET_CONTINUATION_HANDLER(c, (OutgoingCtrlHandler) & OutgoingControl::startEvent);
+//
+//  /////////////////////////////////////
+//  // Compound message adjustments
+//  /////////////////////////////////////
+//  if (cmsg) {
+//    invoke_remote_data_args *args = (invoke_remote_data_args *)
+//      (((OutgoingControl *) cmsg)->data + sizeof(int32_t));
+//    ink_assert(args->magicno == invoke_remote_data_args::MagicNo);
+//    args->msg_oc = c;
+//    c = (OutgoingControl *) cmsg;
+//  }
+//#ifndef CLUSTER_THREAD_STEALING
+//  delay = true;
+//#endif
+//  if (!delay) {
+//    EThread *tt = this_ethread();
+//    {
+//      int q = ClusterFuncToQpri(cluster_fn);
+//      ink_atomiclist_push(&ch->outgoing_control_al[q], (void *) c);
+//
+//      MUTEX_TRY_LOCK(lock, ch->mutex, tt);
+//      if (!lock) {
+//		if(ch->thread && ch->thread->signal_hook)
+//		  ch->thread->signal_hook(ch->thread);
+//		return 1;
+//      }
+//      if (steal)
+//        ch->steal_thread(tt);
+//      return 1;
+//    }
+//  } else {
+//    c->mutex = ch->mutex;
+//    eventProcessor.schedule_imm_signal(c);
+//    return 0;
+//  }
+
+  (void) ch;
+  (void) cluster_fn;
+  (void) data;
+  (void) len;
+  (void) options;
+  (void) cmsg;
 
-      args->data_oc->freeall();
-      ((OutgoingControl *) cmsg)->freeall();
-    }
-    if (data_in_ocntl) {
-      c = *((OutgoingControl **) ((char *) data - sizeof(OutgoingControl *)));
-      c->freeall();
-    }
-    return -1;
-  }
-
-  if (data_in_ocntl) {
-    c = *((OutgoingControl **) ((char *) data - sizeof(OutgoingControl *)));
-  } else {
-    c = OutgoingControl::alloc();
-  }
-  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CTRL_MSGS_SENT_STAT);
-  c->submit_time = ink_get_hrtime();
-
-  if (malloced) {
-    c->set_data((char *) data, len);
-  } else {
-    if (!data_in_ocntl) {
-      c->len = len + sizeof(int32_t);
-      c->alloc_data();
-    }
-    if (!c->fast_data()) {
-      CLUSTER_INCREMENT_DYN_STAT(CLUSTER_SLOW_CTRL_MSGS_SENT_STAT);
-    }
-    *(int32_t *) c->data = cluster_fn;
-    if (!data_in_ocntl) {
-      memcpy(c->data + sizeof(int32_t), data, len);
-    }
-  }
-
-  SET_CONTINUATION_HANDLER(c, (OutgoingCtrlHandler) & OutgoingControl::startEvent);
-
-  /////////////////////////////////////
-  // Compound message adjustments
-  /////////////////////////////////////
-  if (cmsg) {
-    invoke_remote_data_args *args = (invoke_remote_data_args *)
-      (((OutgoingControl *) cmsg)->data + sizeof(int32_t));
-    ink_assert(args->magicno == invoke_remote_data_args::MagicNo);
-    args->msg_oc = c;
-    c = (OutgoingControl *) cmsg;
-  }
-#ifndef CLUSTER_THREAD_STEALING
-  delay = true;
-#endif
-  if (!delay) {
-    EThread *tt = this_ethread();
-    {
-      int q = ClusterFuncToQpri(cluster_fn);
-      ink_atomiclist_push(&ch->outgoing_control_al[q], (void *) c);
-
-      MUTEX_TRY_LOCK(lock, ch->mutex, tt);
-      if (!lock) {
-		if(ch->thread && ch->thread->signal_hook)
-		  ch->thread->signal_hook(ch->thread);
-		return 1;
-      }
-      if (steal)
-        ch->steal_thread(tt);
-      return 1;
-    }
-  } else {
-    c->mutex = ch->mutex;
-    eventProcessor.schedule_imm_signal(c);
-    return 0;
-  }
+  return 0;
 }
 
 int
@@ -162,45 +188,56 @@ ClusterProcessor::invoke_remote_data(ClusterHandler *ch, int cluster_fn,
                                      int dest_channel, ClusterVCToken * token,
                                      void (*bufdata_free_proc) (void *), void *bufdata_free_proc_arg, int options)
 {
-  if (!buf) {
-    // No buffer data, translate this into a invoke_remote() request
-    return internal_invoke_remote(ch, cluster_fn, data, data_len, options, (void *) NULL);
-  }
-  ink_assert(data);
-  ink_assert(data_len);
-  ink_assert(dest_channel);
-  ink_assert(token);
-  ink_assert(bufdata_free_proc);
-  ink_assert(bufdata_free_proc_arg);
-
-  /////////////////////////////////////////////////////////////////////////
-  // Build the compound message as described by invoke_remote_data_args.
-  /////////////////////////////////////////////////////////////////////////
-
-  // Build OutgoingControl for buffer data
-  OutgoingControl *bufdata_oc = OutgoingControl::alloc();
-  bufdata_oc->set_data(buf, bufdata_free_proc, bufdata_free_proc_arg);
-
-  // Build OutgoingControl for compound message header
-  invoke_remote_data_args mh;
-  mh.msg_oc = 0;
-  mh.data_oc = bufdata_oc;
-  mh.dest_channel = dest_channel;
-  mh.token = *token;
-
-  OutgoingControl *chdr = OutgoingControl::alloc();
-  chdr->submit_time = ink_get_hrtime();
-  chdr->len = sizeof(int32_t) + sizeof(mh);
-  chdr->alloc_data();
-  *(int32_t *) chdr->data = -1;   // always -1 for compound message
-  memcpy(chdr->data + sizeof(int32_t), (char *) &mh, sizeof(mh));
-
-  return internal_invoke_remote(ch, cluster_fn, data, data_len, options, (void *) chdr);
+//  if (!buf) {
+//    // No buffer data, translate this into a invoke_remote() request
+//    return internal_invoke_remote(ch, cluster_fn, data, data_len, options, (void *) NULL, zero_body, bufdata_free_proc_arg);
+//  }
+//  ink_assert(data);
+//  ink_assert(data_len);
+//  ink_assert(dest_channel);
+//  ink_assert(token);
+//  ink_assert(bufdata_free_proc);
+//  ink_assert(bufdata_free_proc_arg);
+//
+//  /////////////////////////////////////////////////////////////////////////
+//  // Build the compound message as described by invoke_remote_data_args.
+//  /////////////////////////////////////////////////////////////////////////
+//
+//  // Build OutgoingControl for buffer data
+//  OutgoingControl *bufdata_oc = OutgoingControl::alloc();
+//  bufdata_oc->set_data(buf, bufdata_free_proc, bufdata_free_proc_arg);
+//
+//  // Build OutgoingControl for compound message header
+//  invoke_remote_data_args mh;
+//  mh.msg_oc = 0;
+//  mh.data_oc = bufdata_oc;
+//  mh.dest_channel = dest_channel;
+//  mh.token = *token;
+//
+//  OutgoingControl *chdr = OutgoingControl::alloc();
+//  chdr->submit_time = ink_get_hrtime();
+//  chdr->len = sizeof(int32_t) + sizeof(mh);
+//  chdr->alloc_data();
+//  *(int32_t *) chdr->data = -1;   // always -1 for compound message
+//  memcpy(chdr->data + sizeof(int32_t), (char *) &mh, sizeof(mh));
+//
+//  return internal_invoke_remote(ch, cluster_fn, data, data_len, options, (void *) chdr);
+
+  (void) ch;
+  (void) cluster_fn;
+  (void) data;
+  (void) data_len;
+  (void) buf;
+  (void) dest_channel;
+  (void) token;
+  (void) bufdata_free_proc;
+  (void) bufdata_free_proc_arg;
+  (void) options;
+  return 0;
 }
 
-// TODO: Why pass in the length here if not used ?
 void
-ClusterProcessor::free_remote_data(char *p, int /* l ATS_UNUSED */)
+ClusterProcessor::free_remote_data(char *p, int /* l */)
 {
   char *d = p - sizeof(int32_t);  // reset to ptr to function code
   int data_hdr = ClusterControl::DATA_HDR;
@@ -225,65 +262,70 @@ ClusterProcessor::free_remote_data(char *p, int /* l ATS_UNUSED */)
 }
 
 ClusterVConnection *
-ClusterProcessor::open_local(Continuation * cont, ClusterMachine */* m ATS_UNUSED */, ClusterVCToken & token, int options)
+ClusterProcessor::open_local(Continuation * cont, ClusterMachine * m, ClusterVCToken & token, int options)
 {
-  //
-  //  New connect protocol.
-  //  As a VC initiator, establish the VC connection to the remote node
-  //  by allocating the VC locally and requiring the caller to pass the
-  //  token and channel id in the remote request.  The remote handler calls
-  //  connect_local to establish the remote side of the connection.
-  //
-  bool immediate = ((options & CLUSTER_OPT_IMMEDIATE) ? true : false);
-  bool allow_immediate = ((options & CLUSTER_OPT_ALLOW_IMMEDIATE) ? true : false);
-
-  ClusterHandler *ch = ((CacheContinuation *)cont)->ch;
-  if (!ch)
-    return NULL;
-  EThread *t = ch->thread;
-  if (!t)
-    return NULL;
-
-  EThread *thread = this_ethread();
-  ProxyMutex *mutex = thread->mutex;
-  ClusterVConnection *vc = clusterVCAllocator.alloc();
-  vc->new_connect_read = (options & CLUSTER_OPT_CONN_READ ? 1 : 0);
-  vc->start_time = ink_get_hrtime();
-  vc->last_activity_time = vc->start_time;
-  vc->ch = ch;
-  vc->token.alloc();
-  vc->token.ch_id = ch->id;
-  token = vc->token;
-#ifdef CLUSTER_THREAD_STEALING
-  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CONNECTIONS_OPENNED_STAT);
-  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CONNECTIONS_OPEN_STAT);
-  MUTEX_TRY_LOCK(lock, ch->mutex, thread);
-  if (!lock) {
-#endif
-    if (immediate) {
-      clusterVCAllocator_free(vc);
-      return NULL;
-    }
-    vc->action_ = cont;
-    ink_atomiclist_push(&ch->external_incoming_open_local, (void *) vc);
-	if(ch->thread && ch->thread->signal_hook)
-	  ch->thread->signal_hook(ch->thread);
-    return CLUSTER_DELAYED_OPEN;
-
-#ifdef CLUSTER_THREAD_STEALING
-  } else {
-    if (!(immediate || allow_immediate))
-      vc->action_ = cont;
-    if (vc->start(thread) < 0) {
-      return NULL;
-    }
-    if (immediate || allow_immediate) {
-      return vc;
-    } else {
-      return CLUSTER_DELAYED_OPEN;
-    }
-  }
-#endif
+//  //
+//  //  New connect protocol.
+//  //  As a VC initiator, establish the VC connection to the remote node
+//  //  by allocating the VC locally and requiring the caller to pass the
+//  //  token and channel id in the remote request.  The remote handler calls
+//  //  connect_local to establish the remote side of the connection.
+//  //
+//  bool immediate = ((options & CLUSTER_OPT_IMMEDIATE) ? true : false);
+//  bool allow_immediate = ((options & CLUSTER_OPT_ALLOW_IMMEDIATE) ? true : false);
+//
+//  ClusterHandler *ch = ((CacheContinuation *)cont)->ch;
+//  if (!ch)
+//    return NULL;
+//  EThread *t = ch->thread;
+//  if (!t)
+//    return NULL;
+//
+//  EThread *thread = this_ethread();
+//  ProxyMutex *mutex = thread->mutex;
+//  ClusterVConnection *vc = clusterVCAllocator.alloc();
+//  vc->new_connect_read = (options & CLUSTER_OPT_CONN_READ ? 1 : 0);
+//  vc->start_time = ink_get_hrtime();
+//  vc->last_activity_time = vc->start_time;
+//  vc->ch = ch;
+//  vc->token.alloc();
+//  vc->token.ch_id = ch->id;
+//  token = vc->token;
+//#ifdef CLUSTER_THREAD_STEALING
+//  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CONNECTIONS_OPENNED_STAT);
+//  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CONNECTIONS_OPEN_STAT);
+//  MUTEX_TRY_LOCK(lock, ch->mutex, thread);
+//  if (!lock) {
+//#endif
+//    if (immediate) {
+//      clusterVCAllocator_free(vc);
+//      return NULL;
+//    }
+//    vc->action_ = cont;
+//    ink_atomiclist_push(&ch->external_incoming_open_local, (void *) vc);
+//	if(ch->thread && ch->thread->signal_hook)
+//	  ch->thread->signal_hook(ch->thread);
+//    return CLUSTER_DELAYED_OPEN;
+//
+//#ifdef CLUSTER_THREAD_STEALING
+//  } else {
+//    if (!(immediate || allow_immediate))
+//      vc->action_ = cont;
+//    if (vc->start(thread) < 0) {
+//      return NULL;
+//    }
+//    if (immediate || allow_immediate) {
+//      return vc;
+//    } else {
+//      return CLUSTER_DELAYED_OPEN;
+//    }
+//  }
+//#endif
+  (void) cont;
+  (void) m;
+  (void) token;
+  (void) options;
+  return NULL;
 }
 
 ClusterVConnection *
@@ -367,9 +409,10 @@ bool ClusterProcessor::disable_remote_cluster_ops(ClusterMachine * m)
 // Simplify debug access to stats
 ////////////////////////////////////////////////////////////////////////////
 ////////////////////////////////////////////////////////////////////////////
-
+/*
 GlobalClusterPeriodicEvent *
   PeriodicClusterEvent;
+*/
 
 #ifdef CLUSTER_TOMCAT
 extern int cache_clustering_enabled;
@@ -386,6 +429,77 @@ unsigned long cluster_packet_tos = 0;
 int RPC_only_CacheCluster = 0;
 #endif
 
+static int machine_change_notify(ClusterMachine * m)
+{
+  //char textbuf[sizeof("255.255.255.255:65535")];
+  int result;
+  // Sync the cluster configuration with m->dead: returns 0 if it changed, ENOENT (down, not present) or EEXIST (up, already present).
+  Debug("cluster_io", "start notify, machine %s %hhu.%hhu.%hhu.%hhu:%d, version: %d.%d",
+      m->dead ? "down" : "up",
+      DOT_SEPARATED(m->ip), m->cluster_port, m->msg_proto_major,
+      m->msg_proto_minor);
+
+  if (m->dead) {  // machine is down: remove it from the current configuration if it is listed
+    ClusterConfiguration *c = this_cluster()->current_configuration();
+    if (c->find(m->ip, m->cluster_port)) {
+      ClusterConfiguration *cc = configuration_remove_machine(c, m);
+      //CLUSTER_DECREMENT_DYN_STAT(CLUSTER_NODES_STAT);
+      this_cluster()->configurations.push(cc);  // push the rebuilt configuration
+      result = 0;
+    }
+    else {
+      result = ENOENT;  // nothing to remove
+    }
+
+    Note("machine down %hhu.%hhu.%hhu.%hhu:%d, version=%d.%d",  // logged even when result is ENOENT
+        DOT_SEPARATED(m->ip), m->cluster_port, m->msg_proto_major,
+        m->msg_proto_minor);
+    /*
+    snprintf(textbuf, sizeof(textbuf), "%hhu.%hhu.%hhu.%hhu:%d", DOT_SEPARATED(m->ip), m->cluster_port);
+    REC_SignalManager(REC_SIGNAL_MACHINE_DOWN, textbuf);
+    */
+  }
+  else {  // machine is up: add it to the current configuration unless it is already listed
+    ClusterConfiguration *c = this_cluster()->current_configuration();
+    if (c->find(m->ip, m->cluster_port)) {
+      Warning("machine %hhu.%hhu.%hhu.%hhu:%d already up",
+        DOT_SEPARATED(m->ip), m->cluster_port);
+      result = EEXIST;
+    }
+    else {
+        ClusterConfiguration *cconf = configuration_add_machine(c, m);
+        //CLUSTER_INCREMENT_DYN_STAT(CLUSTER_NODES_STAT);
+        this_cluster()->configurations.push(cconf);  // push the rebuilt configuration
+        result = 0;
+    }
+
+    Note("machine up %hhu.%hhu.%hhu.%hhu:%d, version=%d.%d",  // logged even when result is EEXIST
+        DOT_SEPARATED(m->ip), m->cluster_port, m->msg_proto_major,
+        m->msg_proto_minor);
+
+    /*
+    snprintf(textbuf, sizeof(textbuf), "%hhu.%hhu.%hhu.%hhu:%d", DOT_SEPARATED(m->ip), m->cluster_port);
+    REC_SignalManager(REC_SIGNAL_MACHINE_UP, textbuf);
+    */
+  }
+
+  return result;
+}
+
+// Config-update callback for the two ping settings: stores data.rec_int scaled by HRTIME_MSECOND; other names are ignored.
+static int
+cluster_ping_config_cb(const char *name, RecDataT /* data_type */, RecData data, void * /* cookie */)
+{
+  if (strcmp(name, "proxy.config.cluster.ping_send_interval_msecs") == 0) {
+    cluster_ping_send_interval = data.rec_int * HRTIME_MSECOND;
+  }
+  else if (strcmp(name, "proxy.config.cluster.ping_latency_threshold_msecs") == 0) {
+    cluster_ping_latency_threshold =  data.rec_int * HRTIME_MSECOND;
+  }
+
+  return 0;  // always reports success
+}
+
 int
 ClusterProcessor::init()
 {
@@ -686,6 +800,11 @@ ClusterProcessor::init()
   if (num_of_cluster_threads == DEFAULT_NUMBER_OF_CLUSTER_THREADS)
     REC_ReadConfigInteger(num_of_cluster_threads, "proxy.config.cluster.threads");
 
+  REC_ReadConfigInteger(num_of_cluster_connections, "proxy.config.cluster.connections");
+  if (num_of_cluster_connections == 0) {
+    num_of_cluster_connections = num_of_cluster_threads;
+  }
+
   REC_EstablishStaticConfigInt32(CacheClusterMonitorEnabled, "proxy.config.cluster.enable_monitor");
   REC_EstablishStaticConfigInt32(CacheClusterMonitorIntervalSecs, "proxy.config.cluster.monitor_interval_secs");
   REC_ReadConfigInteger(cluster_receive_buffer_size, "proxy.config.cluster.receive_buffer_size");
@@ -695,17 +814,28 @@ ClusterProcessor::init()
   REC_ReadConfigInteger(cluster_packet_tos, "proxy.config.cluster.sock_packet_tos");
   REC_EstablishStaticConfigInt32(RPC_only_CacheCluster, "proxy.config.cluster.rpc_cache_cluster");
 
+  REC_EstablishStaticConfigInteger(cluster_flow_ctrl_min_bps, "proxy.config.cluster.flow_ctrl.min_bps");
+  REC_EstablishStaticConfigInteger(cluster_flow_ctrl_max_bps, "proxy.config.cluster.flow_ctrl.max_bps");
+  REC_EstablishStaticConfigInt32(cluster_send_min_wait_time, "proxy.config.cluster.flow_ctrl.min_send_wait_time");
+  REC_EstablishStaticConfigInt32(cluster_send_max_wait_time, "proxy.config.cluster.flow_ctrl.max_send_wait_time");
+  REC_EstablishStaticConfigInt32(cluster_min_loop_interval, "proxy.config.cluster.flow_ctrl.min_loop_interval");
+  REC_EstablishStaticConfigInt32(cluster_max_loop_interval, "proxy.config.cluster.flow_ctrl.max_loop_interval");
+
   int cluster_type = 0;
   REC_ReadConfigInteger(cluster_type, "proxy.local.cluster.type");
 
   create_this_cluster_machine();
+
+  /*
 #ifdef NON_MODULAR
   // Cluster API Initializations
   clusterAPI_init();
 #endif
+
   // Start global Cluster periodic event
   PeriodicClusterEvent = NEW(new GlobalClusterPeriodicEvent);
   PeriodicClusterEvent->init();
+  */
 
   this_cluster = NEW(new Cluster);
   ClusterConfiguration *cc = NEW(new ClusterConfiguration);
@@ -713,19 +843,61 @@ ClusterProcessor::init()
   cc->n_machines = 1;
   cc->machines[0] = this_cluster_machine();
   memset(cc->hash_table, 0, CLUSTER_HASH_TABLE_SIZE);
-  // 0 dummy output data
 
+  /*
+  // 0 dummy output data
   memset(channel_dummy_output, 0, sizeof(channel_dummy_output));
+  */
+
+  int result;
 
   if (cluster_type == 1) {
-    cache_clustering_enabled = 1;
-    Note("cache clustering enabled");
-    compute_cluster_mode();
+    REC_ReadConfigInteger(cluster_ping_send_interval, "proxy.config.cluster.ping_send_interval_msecs");
+    REC_ReadConfigInteger(cluster_ping_latency_threshold, "proxy.config.cluster.ping_latency_threshold_msecs");
+    cluster_ping_send_interval *= HRTIME_MSECOND;
+    cluster_ping_latency_threshold *= HRTIME_MSECOND;
+
+    REC_RegisterConfigUpdateFunc("proxy.config.cluster.ping_send_interval_msecs", cluster_ping_config_cb, NULL);
+    REC_RegisterConfigUpdateFunc("proxy.config.cluster.ping_latency_threshold_msecs", cluster_ping_config_cb, NULL);
+    REC_EstablishStaticConfigInt32(cluster_ping_retries, "proxy.config.cluster.ping_retries");
+
+    REC_ReadConfigInteger(max_session_count_per_machine, "proxy.config.cluster.max_sessions_per_machine");
+    REC_ReadConfigInteger(session_lock_count_per_machine, "proxy.config.cluster.session_locks_per_machine");
+
+    bool found;
+    IpEndpoint cluster_ip;    // ip addr of the cluster interface
+    char *intrName;               // Name of the interface we are to use
+    intrName = REC_readString("proxy.config.cluster.ethernet_interface", &found);
+    ink_assert(found && intrName != NULL);
+
+    found = mgmt_getAddrForIntr(intrName, &cluster_ip.sa);
+    if (!found) {
+      ink_fatal(1, "[ClusterProcessor::init] Unable to find network interface %s.  Exiting...\n", intrName);
+    } else if (!ats_is_ip4(&cluster_ip)) {
+      ink_fatal(1, "[ClusterProcessor::init] Unable to find IPv4 network interface %s.  Exiting...\n", intrName);
+    }
+
+    if (num_of_cluster_connections % 2 != 0) {
+      num_of_cluster_connections++;
+    }
+    cluster_global_init(cluster_main_handler, machine_change_notify);
+
+    result = connection_manager_init(cluster_ip.sin.sin_addr.s_addr);
+    if (result == 0) {
+      cache_clustering_enabled = 1;
+      Note("cache clustering enabled");
+      compute_cluster_mode();
+    }
+    else {
+      cache_clustering_enabled = 0;
+      Note("init fail, cache clustering disabled");
+    }
   } else {
     cache_clustering_enabled = 0;
     Note("cache clustering disabled");
+    result = 0;
   }
-  return 0;
+  return result;
 }
 
 // function added to adhere to the name calling convention of init functions
@@ -742,13 +914,18 @@ ClusterProcessor::start()
   this_cluster_machine()->cluster_port = cluster_port;
 #endif
   if (cache_clustering_enabled && (cacheProcessor.IsCacheEnabled() == CACHE_INITIALIZED)) {
-    size_t stacksize;
 
-    REC_ReadConfigInteger(stacksize, "proxy.config.thread.default.stacksize");
-    ET_CLUSTER = eventProcessor.spawn_event_threads(num_of_cluster_threads, "ET_CLUSTER", stacksize);
+    /*
+    ET_CLUSTER = eventProcessor.spawn_event_threads(num_of_cluster_threads, "ET_CLUSTER");
     for (int i = 0; i < eventProcessor.n_threads_for_type[ET_CLUSTER]; i++) {
-      initialize_thread_for_net(eventProcessor.eventthread[ET_CLUSTER][i]);
+      initialize_thread_for_net(eventProcessor.eventthread[ET_CLUSTER][i], i);
+#ifndef STANDALONE_IOCORE
+      extern void initialize_thread_for_http_sessions(EThread *thread, int thread_index);
+      initialize_thread_for_http_sessions(eventProcessor.eventthread[ET_CLUSTER][i], i);
+#endif
     }
+    */
+
     REC_RegisterConfigUpdateFunc("proxy.config.cluster.cluster_configuration", machine_config_change, (void *) CLUSTER_CONFIG);
     do_machine_config_change((void *) CLUSTER_CONFIG, "proxy.config.cluster.cluster_configuration");
     // TODO: Remove this?
@@ -757,9 +934,12 @@ ClusterProcessor::start()
     do_machine_config_change((void *) MACHINE_CONFIG, "proxy.config.cluster.machine_configuration");
 #endif
 
-    accept_handler = NEW(new ClusterAccept(&cluster_port, cluster_receive_buffer_size, cluster_send_buffer_size));
-    accept_handler->Init();
+    //accept_handler = NEW(new ClusterAccept(&cluster_port, cluster_receive_buffer_size, cluster_send_buffer_size));
+    //accept_handler->Init();
+
+    connection_manager_start();
   }
+
   return 0;
 }
 
@@ -846,4 +1026,53 @@ ClusterProcessor::compute_cluster_mode()
   }
 }
 
+// Dispatch one incoming cluster message: |func_id| selects the event and decides which thread the handling is scheduled on.
+void cluster_main_handler(ClusterSession session, void *context,
+    const int func_id, IOBufferBlock *data, const int data_len)
+{
+  int event = func_id < 0 ? -func_id: func_id;  // the sign of func_id is dropped; its magnitude is the event
+  switch (event) {
+    case CLUSTER_CACHE_DATA_ABORT:
+    case CLUSTER_CACHE_DATA_READ_REENABLE: {
+      ink_assert(data_len == 0 && context && data == NULL);  // NOTE(review): context is dereferenced below; confirm it cannot be NULL if asserts are disabled
+      CacheContinuation *cc = (CacheContinuation *) context;
+      cc->thread->schedule_imm(cc, event);  // delivered directly to the CacheContinuation via cc->thread
+      return;
+    }
+    default: {  // every other event is wrapped in a ClusterCont that carries the message fields
+      ClusterCont *cc = clusterContAllocator.alloc();
+      SET_CONTINUATION_HANDLER(cc, &ClusterCont::handleEvent);
+      cc->session = session;
+      cc->context = context;
+      cc->func_id = event;
+      cc->data = data;
+      cc->data_len = data_len;
+      cc->_action = (Continuation *) context;
+      if (cc->_action.continuation) {  // when a target continuation exists, use the action's mutex
+        cc->mutex = cc->_action.mutex;
+      }
+#ifdef DEBUG
+      int64_t nbytes = 0;
+      for (IOBufferBlock *b = data; b; b = b->next) {
+        nbytes += b->read_avail();
+      }
+      ink_assert(data_len == nbytes);  // data_len must equal the bytes readable across the block chain
+#endif
+
+      if (event == CLUSTER_CACHE_DATA_READ_DONE
+          || event == CLUSTER_CACHE_DATA_ERROR
+          || event == CLUSTER_CACHE_OP_RESULT_CLUSTER_FUNCTION) {
+        ink_assert(context);
+        ClusterCacheVC *cvc = (ClusterCacheVC *) context;
+        cvc->initial_thread->schedule_imm(cc);  // these three events are scheduled on the ClusterCacheVC's initial_thread
+        return;
+      }
+
+      eventProcessor.schedule_imm(cc);  // all remaining events go through eventProcessor
+      return;
+    }
+  }
+}
+
+
 // End of ClusterProcessor.cc

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/ClusterVConnection.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/ClusterVConnection.cc b/iocore/cluster/ClusterVConnection.cc
index 0630edf..a76c912 100644
--- a/iocore/cluster/ClusterVConnection.cc
+++ b/iocore/cluster/ClusterVConnection.cc
@@ -30,6 +30,17 @@
 #include "P_Cluster.h"
 ClassAllocator<ClusterVConnection> clusterVCAllocator("clusterVCAllocator");
 ClassAllocator<ByteBankDescriptor> byteBankAllocator("byteBankAllocator");
+ClassAllocator<ClusterCacheVC> clusterCacheVCAllocator("custerCacheVCAllocator");
+
+int ClusterCacheVC::size_to_init = -1;
+
+#define CLUSTER_WRITE_MIN_SIZE (1 << 16)
+
+#define CLUSTER_CACHE_VC_CLOSE_SESSION \
+{ \
+  cluster_close_session(cs); \
+  session_closed = true; \
+}
 
 ByteBankDescriptor *
 ByteBankDescriptor::ByteBankDescriptor_alloc(IOBufferBlock * iob)
@@ -271,13 +282,13 @@ ClusterVConnection::do_io_write(Continuation * c, int64_t nbytes, IOBufferReader
 void
 ClusterVConnection::do_io_close(int alerrno)
 {
-  if ((type == VC_CLUSTER) && current_cont) {
-    if (((CacheContinuation *)current_cont)->read_cluster_vc == this)
-      type = VC_CLUSTER_READ;
-    else if (((CacheContinuation *)current_cont)->write_cluster_vc == this)
-      type = VC_CLUSTER_WRITE;
-  }
-  ch->vcs_push(this, type);
+//  if ((type == VC_CLUSTER) && current_cont) {
+//    if (((CacheContinuation *)current_cont)->read_cluster_vc == this)
+//      type = VC_CLUSTER_READ;
+//    else if (((CacheContinuation *)current_cont)->write_cluster_vc == this)
+//      type = VC_CLUSTER_WRITE;
+//  }
+//  ch->vcs_push(this, type);
 
   ClusterVConnectionBase::do_io_close(alerrno);
 }
@@ -650,4 +661,527 @@ ClusterVConnection::get_disk_io_priority()
   return disk_io_priority;
 }
 
+/**
+ * Constructor: zero-fills the object from the 'vio' member through the end
+ * of the object.  The static size_to_init is recomputed on every
+ * construction as sizeof(ClusterCacheVC) minus the offset of 'vio'.
+ * NOTE(review): members declared before 'vio' are not touched by the memset;
+ * presumably they are initialised by their own constructors - confirm
+ * against the struct declaration.
+ */
+ClusterCacheVC::ClusterCacheVC() {
+  size_to_init = sizeof(ClusterCacheVC) - (size_t) & ((ClusterCacheVC *) 0)->vio;  // offset of 'vio' taken through a null object pointer
+  memset((char *) &vio, 0, size_to_init);
+}
+
+// Request more data from the remote node.
+//
+// Installs openReadReadDone as the handler, then sends either a READ_BEGIN
+// message carrying the requested byte count and seek offset (when nothing has
+// been received yet for a non-empty VIO) or a READ_REENABLE message.
+//
+// Returns EVENT_CONT with in_progress set when the message was sent; if the
+// send reports failure, the session is closed and the result of
+// calluser(VC_EVENT_ERROR) is returned.
+int
+ClusterCacheVC::handleRead(int, void *)
+{
+  ink_assert(!in_progress && !remote_closed);
+  PUSH_HANDLER(&ClusterCacheVC::openReadReadDone);
+
+  // cluster_send_message() yields a non-zero value on failure.
+  bool send_failed;
+  if (vio.nbytes > 0 && total_len == 0) {
+    SetIOReadMessage msg;
+    msg.nbytes = vio.nbytes;
+    msg.offset = seek_to;
+    send_failed = cluster_send_message(cs, -CLUSTER_CACHE_DATA_READ_BEGIN, (char *) &msg,
+        sizeof(msg), PRIORITY_HIGH);
+  } else {
+    send_failed = cluster_send_message(cs, -CLUSTER_CACHE_DATA_READ_REENABLE, NULL, 0,
+        PRIORITY_HIGH);
+  }
+
+  if (send_failed) {
+    CLUSTER_CACHE_VC_CLOSE_SESSION;
+    return calluser(VC_EVENT_ERROR);
+  }
+
+  in_progress = true;
+  cluster_set_events(cs, RESPONSE_EVENT_NOTIFY_DEALER);
+  return EVENT_CONT;
+}
+/**
+ * Handler installed by handleRead() while a read request is outstanding.
+ * EVENT_IMMEDIATE is ignored.  For any other event the request is marked
+ * finished, the saved handler is restored with POP_HANDLER, and:
+ *  - CLUSTER_CACHE_DATA_ERROR: the first int of the payload becomes the
+ *    event and the remote side is marked closed;
+ *  - CLUSTER_CACHE_DATA_READ_DONE: the payload is stored in 'blocks',
+ *    d_len/total_len are updated, and the remote side is marked closed once
+ *    total_len reaches vio.nbytes;
+ *  - anything else: the event becomes VC_EVENT_ERROR and the remote side is
+ *    marked closed.
+ * If the user already closed this VC, an ABORT is sent (unless the remote
+ * side is closed) and the VC is freed; otherwise the event is re-dispatched
+ * through handleEvent().
+ */
+int
+ClusterCacheVC::openReadReadDone(int event, void *data)
+{
+  cancel_trigger();
+  ink_assert(in_progress);
+  if (event == EVENT_IMMEDIATE)  // keep waiting for the cluster reply
+    return EVENT_CONT;
+
+  in_progress = false;
+  POP_HANDLER;
+
+  switch (event) {
+    case CLUSTER_CACHE_DATA_ERROR:
+    {
+      ClusterCont *cc = (ClusterCont *) data;
+      ink_assert(cc && cc->data_len > 0);
+      remote_closed = true;
+      event = *(int *) cc->data->start();  // payload starts with the event code to deliver
+      break;
+    }
+    case CLUSTER_CACHE_DATA_READ_DONE:
+    {
+      ClusterCont *cc = (ClusterCont *) data;
+      ink_assert(cc && d_len == 0);
+
+      d_len = cc->data_len;
+      total_len += d_len;
+      blocks = cc->data;
+      if (total_len >= vio.nbytes)  // everything that was requested has arrived
+        remote_closed = true;
+      break;
+    }
+    case CLUSTER_INTERNEL_ERROR:
+    default:
+      event = VC_EVENT_ERROR;
+      remote_closed = true;
+      break;
+  }
+
+  if (closed) {
+    if (!remote_closed)
+      cluster_send_message(cs, CLUSTER_CACHE_DATA_ABORT, NULL, 0, PRIORITY_HIGH);
+
+    free_ClusterCacheVC(this);
+    return EVENT_DONE;
+  }
+  // received data from cluster
+
+  return handleEvent(event, data);
+}
+
+// Handles the reply to an open-read request.
+//
+//  - action cancelled: send ABORT (unless the remote side is closed), free
+//    the VC and return EVENT_DONE;
+//  - CACHE_EVENT_OPEN_READ: switch to openReadMain, notify the caller via
+//    callcont() and return EVENT_CONT;
+//  - CACHE_EVENT_OPEN_WRITE: the remote side performed the pre-write, so the
+//    VC becomes a write VC (openWriteMain) and the caller is handed this VC
+//    with CACHE_EVENT_OPEN_READ_FAILED; returns EVENT_DONE;
+//  - anything else: close the session, report CACHE_EVENT_OPEN_READ_FAILED
+//    with the original data, free the VC and return EVENT_DONE.
+int
+ClusterCacheVC::openReadStart(int event, void *data)
+{
+  ink_assert(in_progress);
+  in_progress = false;
+
+  if (_action.cancelled) {
+    if (!remote_closed)
+      cluster_send_message(cs, CLUSTER_CACHE_DATA_ABORT, NULL, 0, PRIORITY_HIGH);
+    free_ClusterCacheVC(this);
+    return EVENT_DONE;
+  }
+
+  if (event == CACHE_EVENT_OPEN_READ) {
+    SET_HANDLER(&ClusterCacheVC::openReadMain);
+    callcont(CACHE_EVENT_OPEN_READ);
+    return EVENT_CONT;
+  }
+
+  if (event == CACHE_EVENT_OPEN_WRITE) {
+    // the remote side did the pre_write
+    vio.op = VIO::WRITE;
+    SET_HANDLER(&ClusterCacheVC::openWriteMain);
+    _action.continuation->handleEvent(CACHE_EVENT_OPEN_READ_FAILED, this);
+    return EVENT_DONE;
+  }
+
+  // open failed: prevent any further trigger before tearing down
+  remote_closed = true;
+  CLUSTER_CACHE_VC_CLOSE_SESSION;
+  _action.continuation->handleEvent(CACHE_EVENT_OPEN_READ_FAILED, data);
+  free_ClusterCacheVC(this);
+  return EVENT_DONE;
+}
+int  // read-side main handler: hands received blocks to the user's buffer and requests more via handleRead()
+ClusterCacheVC::openReadMain(int event, void *e)
+{
+  cancel_trigger();
+  ink_assert(!in_progress);
+  if (event == VC_EVENT_ERROR || event == VC_EVENT_EOS) {  // terminal event: close the session and pass it on
+    remote_closed = true;
+    CLUSTER_CACHE_VC_CLOSE_SESSION;
+    return calluser(event);
+  }
+
+  int64_t bytes = d_len;
+  int64_t ntodo = vio.ntodo();
+  if (ntodo <= 0)  // the VIO is already satisfied
+    return EVENT_CONT;
+  if (vio.buffer.writer()->max_read_avail() > vio.buffer.writer()->water_mark && vio.ndone) // initiate read of first block
+    return EVENT_CONT;
+  if (!blocks && vio.ntodo() > 0)  // nothing buffered locally: go ask the remote side
+    goto Lread;
+
+  if (bytes > vio.ntodo())  // credit at most what the VIO still wants
+    bytes = vio.ntodo();
+  vio.buffer.writer()->append_block(blocks);
+  vio.ndone += bytes;
+  blocks = NULL;
+  d_len -= bytes;
+
+  if (vio.ntodo() <= 0)
+    return calluser(VC_EVENT_READ_COMPLETE);
+  else {
+    if (calluser(VC_EVENT_READ_READY) == EVENT_DONE)
+      return EVENT_DONE;
+    // we have to keep reading until we give the user all the
+    // bytes it wanted or we hit the watermark.
+    if (vio.ntodo() > 0 && !vio.buffer.writer()->high_water())
+      goto Lread;
+    return EVENT_CONT;
+  }
+Lread:
+  if (vio.ndone >= (int64_t) doc_len) {
+    // reached the end of the document and the user still wants more
+    return calluser(VC_EVENT_EOS);
+  }
+  // if the state machine calls reenable on the callback from the cache,
+  // we set up a schedule_imm event. The openReadReadDone discards
+  // EVENT_IMMEDIATE events. So, we have to cancel that trigger and set
+  // a new EVENT_INTERVAL event.
+  cancel_trigger();
+  return handleRead(event, e);
+}
+
+// Handles the reply to an open-write request.
+//
+// On CACHE_EVENT_OPEN_WRITE (and an uncancelled action) the VC switches to
+// openWriteMain and the result of callcont(CACHE_EVENT_OPEN_WRITE) is
+// returned.  Otherwise the VC is torn down and EVENT_DONE is returned: a
+// cancelled action sends ABORT unless the remote side is closed; a failed
+// open closes the session and reports CACHE_EVENT_OPEN_WRITE_FAILED.
+int
+ClusterCacheVC::openWriteStart(int event, void *data)
+{
+  ink_assert(in_progress);
+  in_progress = false;
+
+  const bool cancelled = _action.cancelled;
+  if (!cancelled && event == CACHE_EVENT_OPEN_WRITE) {
+    SET_HANDLER(&ClusterCacheVC::openWriteMain);
+    return callcont(CACHE_EVENT_OPEN_WRITE);
+  }
+
+  if (cancelled) {
+    if (!remote_closed)
+      cluster_send_message(cs, CLUSTER_CACHE_DATA_ABORT, NULL, 0, PRIORITY_HIGH);
+  } else {
+    // open failed: prevent any further trigger before notifying the caller
+    remote_closed = true;
+    CLUSTER_CACHE_VC_CLOSE_SESSION;
+    _action.continuation->handleEvent(CACHE_EVENT_OPEN_WRITE_FAILED, data);
+  }
+
+  free_ClusterCacheVC(this);
+  return EVENT_DONE;
+}
+// Main handler for a ClusterCacheVC opened for writing.
+//
+// Takes whatever the user's reader currently holds (bounded by vio.ntodo())
+// and forwards it to the remote node as CLUSTER_CACHE_DATA_WRITE_DONE
+// messages: first in chunks of cache_config_target_fragment_size, then the
+// remainder once it reaches CLUSTER_WRITE_MIN_SIZE or the VIO is complete.
+// Data below that threshold stays queued in 'blocks'/'offset'/'length' for a
+// later call.  The user is called back with WRITE_READY or WRITE_COMPLETE,
+// or with VC_EVENT_ERROR once a send marks the remote side closed.
+//
+// Returns EVENT_DONE as soon as a user callback returns it; otherwise
+// EVENT_CONT or the value of the final calluser().
+int
+ClusterCacheVC::openWriteMain(int , void *)
+{
+  cancel_trigger();
+  ink_assert(!in_progress);
+
+Lagain:
+  if (remote_closed) {
+    if (calluser(VC_EVENT_ERROR) == EVENT_DONE)
+      return EVENT_DONE;
+    return EVENT_CONT;
+  }
+
+  // No reader attached yet: give the user a chance to set one up.
+  if (!vio.buffer.writer()) {
+    if (calluser(VC_EVENT_WRITE_READY) == EVENT_DONE)
+      return EVENT_DONE;
+    if (!vio.buffer.writer())
+      return EVENT_CONT;
+  }
+
+  int64_t ntodo = vio.ntodo();
+
+  if (ntodo <= 0) {
+    if (calluser(VC_EVENT_WRITE_COMPLETE) == EVENT_DONE)
+      return EVENT_DONE;
+    ink_assert(!"close expected after write COMPLETE");
+    if (vio.ntodo() <= 0)
+      return EVENT_CONT;
+  }
+
+  // 'length' bytes are already queued in 'blocks'; add what the reader holds,
+  // but never go beyond what the VIO still expects.
+  ntodo = vio.ntodo() + length;
+  int64_t total_avail = vio.buffer.reader()->read_avail();
+  int64_t avail = total_avail;
+  int64_t towrite = avail + length;
+  if (towrite > ntodo) {
+    avail -= (towrite - ntodo);
+    towrite = ntodo;
+  }
+
+  if (!blocks && towrite) {
+    blocks = vio.buffer.reader()->block;
+    offset = vio.buffer.reader()->start_offset;
+  }
+
+  if (avail > 0) {
+    vio.buffer.reader()->consume(avail);
+    vio.ndone += avail;
+    total_len += avail;
+  }
+
+  ink_assert(towrite >= 0);
+  length = towrite;
+
+  int flen = cache_config_target_fragment_size;
+
+  // Send full target-size fragments first.
+  while (length >= flen) {
+    IOBufferBlock *r = clone_IOBufferBlockList(blocks, offset, flen);
+    blocks = iobufferblock_skip(blocks, &offset, &length, flen);
+
+    remote_closed = cluster_send_message(cs, CLUSTER_CACHE_DATA_WRITE_DONE, r, -1,
+        priority);
+    if (remote_closed)
+      goto Lagain;
+
+    data_sent += flen;
+    Debug("data_sent", "sent bytes %d, reminds %" PRId64 "", flen, length);
+  }
+  // to make read_from_writer work better,
+  // especially with a slow origin
+  flen = CLUSTER_WRITE_MIN_SIZE;
+  if (length >= flen || (vio.ntodo() <= 0 && length > 0)) {
+    // iobufferblock_skip() updates 'length', so remember how much this
+    // message carries for the accounting and the debug line below.
+    int64_t sent = length;
+    data_sent += sent;
+    IOBufferBlock *r = clone_IOBufferBlockList(blocks, offset, sent);
+    blocks = iobufferblock_skip(blocks, &offset, &length, sent);
+    remote_closed = cluster_send_message(cs, CLUSTER_CACHE_DATA_WRITE_DONE, r,
+              -1, priority);
+    if (remote_closed)
+      goto Lagain;
+    Debug("data_sent", "sent bytes %" PRId64 ", reminds %" PRId64 "", sent, length);
+  }
+
+  if (vio.ntodo() <= 0) {
+    ink_assert(length == 0 && total_len == vio.nbytes);
+    return calluser(VC_EVENT_WRITE_COMPLETE);
+  }
+  return calluser(VC_EVENT_WRITE_READY);
+}
+/**
+ * Handler that finishes an outstanding request: clears in_progress, marks
+ * the remote side closed, closes the session, forwards (event, data)
+ * unchanged to the action's continuation unless the action was cancelled,
+ * and frees this VC.  Always returns EVENT_DONE.
+ * NOTE(review): judging by the name this is the reply handler for cache
+ * remove operations - confirm at the place where the handler is installed.
+ */
+int
+ClusterCacheVC::removeEvent(int event, void *data)
+{
+  ink_assert(in_progress);
+  in_progress = false;
+  remote_closed = true;
+  CLUSTER_CACHE_VC_CLOSE_SESSION;
+  if (!_action.cancelled)
+    _action.continuation->handleEvent(event, data);
+  free_ClusterCacheVC(this);
+  return EVENT_DONE;
+}
+
+// Start reading 'nbytes' from the beginning of the object into 'abuf' on
+// behalf of continuation 'c'.  Returns the VC's VIO.
+//
+// A plain read is a positional read at offset 0, so this forwards to
+// do_io_pread().  The call is class-qualified so that it always runs this
+// class's implementation.
+VIO *
+ClusterCacheVC::do_io_read(Continuation *c, int64_t nbytes, MIOBuffer *abuf)
+{
+  return ClusterCacheVC::do_io_pread(c, nbytes, abuf, 0);
+}
+/**
+ * Starts a positional read of 'nbytes' at 'offset' into 'abuf' on behalf of
+ * continuation 'c'.  Binds the buffer and continuation to the VIO, resets
+ * ndone, records the seek offset, and schedules this VC on the thread that
+ * holds c's mutex unless a trigger is already set or 'recursive' is
+ * non-zero.  Returns the VC's VIO.
+ */
+VIO *
+ClusterCacheVC::do_io_pread(Continuation *c, int64_t nbytes, MIOBuffer *abuf, int64_t offset)
+{
+  ink_assert(vio.op == VIO::READ && alternate.valid());
+  vio.buffer.writer_for(abuf);
+  vio.set_continuation(c);
+  vio.ndone = 0;
+  vio.nbytes = nbytes;
+  vio.vc_server = this;
+  seek_to = offset;  // sent to the remote side as msg.offset by handleRead()
+  ink_assert(c->mutex->thread_holding);
+
+  ink_assert(!in_progress);
+  if (!trigger && !recursive)
+    trigger = c->mutex->thread_holding->schedule_imm_local(this);
+  return &vio;
+}
+/**
+ * Starts a write of 'nbytes' taken from reader 'abuf' on behalf of
+ * continuation 'c'; 'owner' must be false.  Binds the reader and
+ * continuation to the VIO, picks the message priority from the size
+ * (PRIORITY_MID below 1 MB, PRIORITY_LOW otherwise) and sends a
+ * WRITE_BEGIN message: a SetIOWriteMessage followed by the marshalled
+ * alternate when it is valid, or the bare SetIOWriteMessage otherwise.
+ * The send result is stored in remote_closed.  Finally schedules this VC on
+ * the thread holding c's mutex unless a trigger is already set or
+ * 'recursive' is non-zero.  Returns the VC's VIO.
+ */
+VIO *
+ClusterCacheVC::do_io_write(Continuation *c, int64_t nbytes, IOBufferReader *abuf, bool owner)
+{
+  ink_assert(vio.op == VIO::WRITE);
+  ink_assert(!owner && !in_progress);
+  vio.buffer.reader_for(abuf);
+  vio.set_continuation(c);
+  vio.ndone = 0;
+  vio.nbytes = nbytes;
+  doc_len = nbytes; // note: the doc_len maybe not the real length of the body
+  vio.vc_server = this;
+  ink_assert(c->mutex->thread_holding);
+
+  if (nbytes < (1 << 20))
+    priority = PRIORITY_MID;
+  else
+    priority = PRIORITY_LOW;
+
+  CacheHTTPInfo *r = &alternate;
+  SetIOWriteMessage msg;
+  msg.nbytes = nbytes;
+  int len = r->valid() ? r->marshal_length() : 0;  // header length, 0 when there is no valid alternate
+  msg.hdr_len = len;
+  ink_assert(total_len == 0);
+  ink_assert((frag_type == CACHE_FRAG_TYPE_HTTP && len > 0) ||
+      (frag_type != CACHE_FRAG_TYPE_HTTP && len == 0));
+
+  if (len > 0) {
+    Ptr<IOBufferData> data;
+    data = new_IOBufferData(iobuffer_size_to_index(sizeof msg + len, MAX_BUFFER_SIZE_INDEX));
+    memcpy((char *) data->data(), &msg, sizeof(msg));
+    char *p = (char *) data->data() + sizeof msg;  // marshalled header goes right after the message struct
+    int res = r->marshal(p, len);
+    ink_assert(res >= 0);
+    IOBufferBlock *ret = new_IOBufferBlock(data, sizeof msg + len, 0);
+    ret->_buf_end = ret->_end;
+    remote_closed = cluster_send_message(cs, CLUSTER_CACHE_DATA_WRITE_BEGIN, ret, -1, priority);
+  } else
+    remote_closed = cluster_send_message(cs, -CLUSTER_CACHE_DATA_WRITE_BEGIN, &msg, sizeof msg, priority);
+
+  if (!trigger && !recursive)
+    trigger = c->mutex->thread_holding->schedule_imm_local(this);
+  return &vio;
+}
+/**
+ * Closes this VC from the user's side.
+ * alerrno == -1 selects a normal close (closed = 1), any other value an
+ * abort (closed = -1); f.force_empty always forces closed = 1.
+ * Unless the remote side is already marked closed:
+ *  - normal close of a WRITE: when f.update is set with vio.nbytes == 0, or
+ *    f.force_empty is set, a header-only update or a DATA_CLOSE message is
+ *    sent; otherwise the data still held by the reader and in 'blocks' is
+ *    flushed as WRITE_DONE messages, or an ABORT is sent if the VIO is still
+ *    short of vio.nbytes.  When doc_len differs from vio.nbytes a DATA_CLOSE
+ *    carrying total_len follows.
+ *  - abort of a WRITE: CLUSTER_CACHE_DATA_ABORT is sent.
+ *  - READ with no request in flight: CLUSTER_CACHE_DATA_ABORT is sent.
+ * The object is freed here only if it was not closed before and both
+ * 'recursive' and 'in_progress' are clear.
+ */
+void
+ClusterCacheVC::do_io_close(int alerrno)
+{
+  ink_assert(mutex->thread_holding == this_ethread());
+  int previous_closed = closed;
+  closed = (alerrno == -1) ? 1 : -1;    // Stupid default arguments
+  DDebug("cache_close", "do_io_close %p %d %d", this, alerrno, closed);
+
+  // special case: to cache 0 bytes document
+  if (f.force_empty)
+    closed = 1;
+
+  if (!remote_closed) {
+    if (closed > 0 && vio.op == VIO::WRITE) {
+      if ((f.update && vio.nbytes == 0) || f.force_empty) {
+        //header only update
+        //
+        if (frag_type == CACHE_FRAG_TYPE_HTTP) {
+          if (alternate.valid()) {
+            SetIOCloseMessage msg;
+            msg.h_len = alternate.marshal_length();
+            msg.d_len = 0;
+            msg.total_len = 0;
+
+            Ptr<IOBufferData> d;
+            d = new_IOBufferData(iobuffer_size_to_index(sizeof msg + msg.h_len));
+            char *data = d->data();
+            memcpy(data, &msg, sizeof msg);
+
+            int res = alternate.marshal((char *) data + sizeof msg, msg.h_len);  // header follows the message struct
+            ink_assert(res >= 0 && res <= msg.h_len);
+
+            IOBufferBlock *ret = new_IOBufferBlock(d, sizeof msg + msg.h_len, 0);
+            ret->_buf_end = ret->_end;
+
+            remote_closed = cluster_send_message(cs,
+                -CLUSTER_CACHE_HEADER_ONLY_UPDATE, ret, -1, PRIORITY_HIGH);
+          } else
+            remote_closed = cluster_send_message(cs, -CLUSTER_CACHE_DATA_CLOSE, &total_len,
+                sizeof total_len, PRIORITY_HIGH);
+        } else {
+          remote_closed = cluster_send_message(cs, -CLUSTER_CACHE_DATA_CLOSE,
+              &total_len, sizeof total_len, priority);
+        }
+
+        goto Lfree;
+      } else if ((total_len < vio.nbytes) || length > 0) {
+        int64_t ntodo = vio.ntodo() + length;  // same accounting as in openWriteMain()
+        int64_t total_avail = vio.buffer.reader()->read_avail();
+        int64_t avail = total_avail;
+        int64_t towrite = avail + length;
+        if (towrite > ntodo) {
+          avail -= (towrite - ntodo);
+          towrite = ntodo;
+        }
+
+        if (!blocks && towrite) {
+          blocks = vio.buffer.reader()->block;
+          offset = vio.buffer.reader()->start_offset;
+        }
+
+        if (avail > 0) {
+          vio.buffer.reader()->consume(avail);
+          vio.ndone += avail;
+          total_len += avail;
+        }
+
+        if (vio.ntodo() > 0) {  // user closed "successfully" without supplying all of vio.nbytes
+          Warning("writer closed success but still want more data");
+          remote_closed = cluster_send_message(cs, CLUSTER_CACHE_DATA_ABORT, NULL, 0,
+                        priority);
+          goto Lfree;
+        }
+
+        length = towrite;
+        ink_assert(total_len == vio.nbytes);
+        int flen = cache_config_target_fragment_size;
+        while (length >= flen) {  // full target-size fragments first
+          IOBufferBlock *ret = clone_IOBufferBlockList(blocks, offset, flen);
+          blocks = iobufferblock_skip(blocks, &offset, &length, flen);
+
+          remote_closed = cluster_send_message(cs, CLUSTER_CACHE_DATA_WRITE_DONE, ret,
+              -1, priority);
+          if (remote_closed)
+            goto Lfree;
+
+          data_sent += flen;
+          Debug("data_sent", "sent bytes %d, reminds %"PRId64"", flen, length);
+        }
+
+        if (length > 0) {  // then whatever is left, regardless of size
+          data_sent += length;
+          IOBufferBlock *ret = clone_IOBufferBlockList(blocks, offset, length);
+          blocks = iobufferblock_skip(blocks, &offset, &length, length);
+          remote_closed = cluster_send_message(cs, CLUSTER_CACHE_DATA_WRITE_DONE, ret, -1,
+              priority);
+          if (remote_closed)
+            goto Lfree;
+          Debug("data_sent", "sent bytes done: %"PRId64", reminds %"PRId64"", data_sent, length);
+        }
+      }
+
+      if (doc_len != vio.nbytes) {
+        // for trunk
+        ink_assert(total_len == vio.nbytes && length == 0);
+        remote_closed = cluster_send_message(cs, CLUSTER_CACHE_DATA_CLOSE,
+            &total_len, sizeof total_len, priority);
+        goto Lfree;
+      }
+      ink_assert(data_sent == total_len);
+    }
+
+    if (closed < 0 && vio.op == VIO::WRITE)  // aborted write
+      remote_closed = cluster_send_message(cs, CLUSTER_CACHE_DATA_ABORT, NULL, 0, PRIORITY_HIGH);
+
+    if (vio.op == VIO::READ && !in_progress) {  // read VC with no outstanding request
+      remote_closed = cluster_send_message(cs, CLUSTER_CACHE_DATA_ABORT, NULL, 0, PRIORITY_HIGH);
+    }
+  }
+Lfree:
+  if (!previous_closed && !recursive && !in_progress) {
+    free_ClusterCacheVC(this);
+  }
+}
+/**
+ * Re-arms the VC: schedules an immediate local event on the thread that
+ * holds avio's mutex, unless a trigger is already set or a request is in
+ * progress.
+ */
+void
+ClusterCacheVC::reenable(VIO *avio)
+{
+  DDebug("cache_reenable", "reenable %p, trigger %p, in_progress %d", this, trigger, in_progress);
+  (void) avio;
+  ink_assert(avio->mutex->thread_holding);
+  if (!trigger && !in_progress) {
+    trigger = avio->mutex->thread_holding->schedule_imm_local(this);
+  }
+}
+/**
+ * Re-entrant variant of reenable().  With no trigger set: if no request is
+ * in progress and 'recursive' is zero the handler runs synchronously via
+ * handleEvent(EVENT_NONE); if no request is in progress but 'recursive' is
+ * non-zero an immediate local event is scheduled instead.  Does nothing
+ * while a request is in progress or a trigger is already set.
+ */
+void
+ClusterCacheVC::reenable_re(VIO *avio)
+{
+  DDebug("cache_reenable", "reenable %p", this);
+  (void) avio;
+  ink_assert(avio->mutex->thread_holding);
+
+  if (!trigger) {
+    if (!in_progress && !recursive) {
+      handleEvent(EVENT_NONE, (void *) 0);
+    } else if (!in_progress)
+      trigger = avio->mutex->thread_holding->schedule_imm_local(this);
+  }
+}
 // End of ClusterVConnection.cc

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/EventPoll.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/EventPoll.cc b/iocore/cluster/EventPoll.cc
new file mode 100644
index 0000000..8a6b9e6
--- /dev/null
+++ b/iocore/cluster/EventPoll.cc
@@ -0,0 +1,158 @@
+/** @file
+
+  A brief file description
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#include "EventPoll.h"
+/**
+ * Creates the platform poll descriptor (epoll, kqueue or event port) and
+ * allocates an array of 'size' event records for poll() to fill.
+ * 'timeout' is in milliseconds: it is stored as-is for epoll and converted
+ * to a timespec (seconds plus nanoseconds) for kqueue and event ports.
+ * NOTE(review): the descriptor returned by epoll_create()/kqueue()/
+ * port_create() is not checked here - confirm callers cope with failure.
+ */
+EventPoll::EventPoll(const int size, int timeout) : _size(size)
+{
+  int bytes;
+
+#if TS_USE_EPOLL
+  _extra_events = EPOLLET;  // OR-ed into every registration by attach()/modify()
+  _timeout = timeout;
+  _poll_fd = epoll_create(_size);
+  bytes = sizeof(struct epoll_event) * size;
+  _events = (struct epoll_event *)ats_malloc(bytes);
+#elif TS_USE_KQUEUE
+  _extra_events = INK_EV_EDGE_TRIGGER;
+  _timeout.tv_sec = timeout / 1000;
+  _timeout.tv_nsec = 1000000 * (timeout % 1000);
+  _poll_fd = kqueue();
+  bytes = sizeof(struct kevent) * size;
+  _events = (struct kevent *)ats_malloc(bytes);
+#elif TS_USE_PORT
+  _extra_events = 0;
+  _timeout.tv_sec = timeout / 1000;
+  _timeout.tv_nsec = 1000000 * (timeout % 1000);
+  _poll_fd = port_create();
+  bytes = sizeof(port_event_t) * size;
+  _events = (port_event_t *)ats_malloc(bytes);
+#endif
+}
+/** Releases the event array and closes the poll descriptor. */
+EventPoll::~EventPoll()
+{
+  ats_free(_events);
+  close(_poll_fd);
+}
+/**
+ * Registers 'fd' with the poll descriptor for the events in 'e', with
+ * 'data' as the user pointer later returned by getData().  Returns the
+ * result of the underlying epoll_ctl()/kevent()/port_associate() call.
+ * On kqueue one filter is added per requested direction
+ * (EVENTIO_READ / EVENTIO_WRITE).
+ */
+int EventPoll::attach(const int fd, const int e, void *data)
+{
+#if TS_USE_EPOLL
+  struct epoll_event ev;
+  memset(&ev, 0, sizeof(ev));
+  ev.events = e | _extra_events;
+  ev.data.ptr = data;
+  return epoll_ctl(_poll_fd, EPOLL_CTL_ADD, fd, &ev);
+#elif TS_USE_KQUEUE
+  struct kevent ev[2];
+  int n = 0;  // number of filters queued in ev[]
+  if (e & EVENTIO_READ) {
+    EV_SET(&ev[n++], fd, EVFILT_READ, EV_ADD | _extra_events, 0, 0, data);
+  }
+  if (e & EVENTIO_WRITE) {
+    EV_SET(&ev[n++], fd, EVFILT_WRITE, EV_ADD | _extra_events, 0, 0, data);
+  }
+  return kevent(_poll_fd, ev, n, NULL, 0, NULL);
+#elif TS_USE_PORT
+  return port_associate(_poll_fd, PORT_SOURCE_FD, fd, e, data);
+#endif
+}
+/**
+ * Changes the event set and user pointer registered for 'fd'.  Returns the
+ * result of the underlying epoll_ctl()/kevent()/port_associate() call.
+ * On kqueue each direction is either (re)added or deleted depending on
+ * whether its bit is present in 'e'.
+ * NOTE(review): the kqueue path issues EV_DELETE for a filter even if it
+ * was never added; presumably kevent() reports that as an error - confirm
+ * how callers treat the return value.
+ */
+int EventPoll::modify(const int fd, const int e, void *data)
+{
+#if TS_USE_EPOLL
+  struct epoll_event ev;
+  memset(&ev, 0, sizeof(ev));
+  ev.events = e | _extra_events;
+  ev.data.ptr = data;
+  return epoll_ctl(_poll_fd, EPOLL_CTL_MOD, fd, &ev);
+#elif TS_USE_KQUEUE
+  struct kevent ev[2];
+  int n = 0;  // always ends up as 2: one change per filter
+  if (e & EVENTIO_READ) {
+    EV_SET(&ev[n++], fd, EVFILT_READ, EV_ADD | _extra_events, 0, 0, data);
+  }
+  else {
+    EV_SET(&ev[n++], fd, EVFILT_READ, EV_DELETE, 0, 0, data);
+  }
+
+  if (e & EVENTIO_WRITE) {
+    EV_SET(&ev[n++], fd, EVFILT_WRITE, EV_ADD | _extra_events, 0, 0, data);
+  }
+  else {
+    EV_SET(&ev[n++], fd, EVFILT_WRITE, EV_DELETE, 0, 0, data);
+  }
+  return kevent(_poll_fd, ev, n, NULL, 0, NULL);
+#elif TS_USE_PORT
+  return port_associate(_poll_fd, PORT_SOURCE_FD, fd, e, data);
+#endif
+}
+/**
+ * Removes 'fd' from the poll descriptor.  Returns the result of
+ * epoll_ctl()/port_dissociate(); on every other backend (including kqueue)
+ * nothing is done and 0 is returned.
+ * NOTE(review): the kqueue case presumably relies on the kernel dropping
+ * the filters when the descriptor is closed - confirm.
+ */
+int EventPoll::detach(const int fd)
+{
+#if TS_USE_EPOLL
+  return epoll_ctl(_poll_fd, EPOLL_CTL_DEL, fd, NULL);
+#elif TS_USE_PORT
+  return port_dissociate(_poll_fd, PORT_SOURCE_FD, fd);
+#else
+  return 0;
+#endif
+}
+
+// Waits for events on the poll descriptor for at most the configured timeout
+// and stores them in _events.
+//
+// Returns the value of epoll_wait()/kevent() on those backends.  On the
+// event-port backend it returns the number of events fetched by port_getn();
+// EINTR, EAGAIN and ETIME are not treated as errors (the count fetched so far
+// is returned), any other failure yields -1.
+int EventPoll::poll()
+{
+#if TS_USE_EPOLL
+  return epoll_wait(_poll_fd, _events, _size, _timeout);
+#elif TS_USE_KQUEUE
+  return kevent(_poll_fd, NULL, 0, _events, _size, &_timeout);
+#elif TS_USE_PORT
+  int result;
+  unsigned nget = 1;
+  if (port_getn(_poll_fd, _events, _size, &nget, &_timeout) == 0) {
+    result = (int)nget;
+  } else {
+    switch(errno) {
+      case EINTR:
+      case EAGAIN:
+      case ETIME:
+        // interrupted or timed out: report whatever was fetched
+        if (nget > 0) {
+          result = (int)nget;
+        }
+        else {
+          result = 0;
+        }
+        break;
+      default:
+        result = -1;
+        break;
+    }
+  }
+  return result;
+#else
+#error port me
+#endif
+}
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/EventPoll.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/EventPoll.h b/iocore/cluster/EventPoll.h
new file mode 100644
index 0000000..ccb65de
--- /dev/null
+++ b/iocore/cluster/EventPoll.h
@@ -0,0 +1,105 @@
+/** @file
+
+  A brief file description
+
+  @section license License
+
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+      http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+ */
+
+#ifndef __EVENT_POLL_H__
+#define __EVENT_POLL_H__
+
+#include "P_Net.h"
+/**
+ * Thin wrapper over the platform event-notification facility selected at
+ * build time (TS_USE_EPOLL, TS_USE_KQUEUE or TS_USE_PORT).  Owns the poll
+ * descriptor and the array that poll() fills; getEvents()/getData() read
+ * entries of that array by index.
+ * NOTE(review): the class owns a heap buffer and a descriptor released in
+ * the destructor but declares no copy constructor or assignment operator,
+ * so a copied object would release them twice - confirm it is never copied.
+ */
+class EventPoll {
+  public:
+    EventPoll(const int size, int timeout);
+    ~EventPoll();
+    int attach(const int fd, const int e, void *data);
+    int modify(const int fd, const int e, void *data);
+    int detach(const int fd);
+    int poll();
+
+#if TS_USE_KQUEUE
+    /* we define these here as numbers, because for kqueue mapping them to a combination of
+     * filters / flags is hard to do. */
+    inline int kq_event_convert(int16_t event, uint16_t flags)
+    {
+      int r = 0;
+
+      if (event == EVFILT_READ) {
+        r |= INK_EVP_IN;
+      }
+      else if (event == EVFILT_WRITE) {
+        r |= INK_EVP_OUT;
+      }
+
+      if (flags & EV_EOF) {
+        r |= INK_EVP_HUP;
+      }
+      return r;
+    }
+#endif
+    /* Event bits of the index-th entry filled in by the last poll(). */
+    inline int getEvents(const int index)
+    {
+#if TS_USE_EPOLL
+      return _events[index].events;
+#elif TS_USE_KQUEUE
+      /* kqueue reports a filter plus flags; translate the pair into
+       * INK_EVP_* bits with kq_event_convert(). */
+      return kq_event_convert(_events[index].filter, _events[index].flags);
+#elif TS_USE_PORT
+      return _events[index].portev_events;
+#else
+#error port me
+#endif
+    }
+    /* User pointer registered through attach()/modify() for the index-th entry. */
+    inline void *getData(const int index)
+    {
+#if TS_USE_EPOLL
+      return _events[index].data.ptr;
+#elif TS_USE_KQUEUE
+      return _events[index].udata;
+#elif TS_USE_PORT
+      return _events[index].portev_user;
+#else
+#error port me
+#endif
+    }
+
+  protected:
+    int _size;  //max events (fd)
+    int _extra_events;  // backend-specific flags OR-ed into registrations
+    int _poll_fd;  // descriptor from epoll_create()/kqueue()/port_create()
+
+#if TS_USE_EPOLL
+    struct epoll_event *_events;
+    int _timeout;
+#elif TS_USE_KQUEUE
+    struct kevent *_events;
+    struct timespec _timeout;
+#elif TS_USE_PORT
+    port_event_t *_events;
+    timespec_t _timeout;
+#endif
+};
+
+#endif
+

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/Makefile.am
----------------------------------------------------------------------
diff --git a/iocore/cluster/Makefile.am b/iocore/cluster/Makefile.am
index 1d8266d..b49046d 100644
--- a/iocore/cluster/Makefile.am
+++ b/iocore/cluster/Makefile.am
@@ -57,5 +57,12 @@ libinkcluster_a_SOURCES = \
   P_ClusterLoadMonitor.h \
   P_ClusterMachine.h \
   P_TimeTrace.h \
-  Inline.cc
+  Inline.cc \
+  global.cc  \
+  nio.cc \
+  session.cc  \
+  message.cc \
+  connection.cc \
+  machine.cc \
+	EventPoll.cc
 

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/P_Cluster.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/P_Cluster.h b/iocore/cluster/P_Cluster.h
index aa3d6a5..f24d3de 100644
--- a/iocore/cluster/P_Cluster.h
+++ b/iocore/cluster/P_Cluster.h
@@ -123,6 +123,8 @@ enum
   cluster_stat_count
 };
 
+#define SIZE_OF_FRAGEMENT ((1 << 20) - 128)
+
 extern RecRawStatBlock *cluster_rsb;
 #define CLUSTER_INCREMENT_DYN_STAT(x) \
 	RecIncrRawStat(cluster_rsb, mutex->thread_holding, (int) x, 1);

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/P_ClusterCache.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/P_ClusterCache.h b/iocore/cluster/P_ClusterCache.h
index 10fc46a..162fef0 100644
--- a/iocore/cluster/P_ClusterCache.h
+++ b/iocore/cluster/P_ClusterCache.h
@@ -53,6 +53,9 @@
 /****************************************************************************/
 
 #include "P_ClusterMachine.h"
+#include "clusterinterface.h"
+
+extern int enable_cache_empty_http_doc;
 
 //
 // Cluster Processor
@@ -310,6 +313,7 @@ struct ClusterVCToken
 //
 typedef void ClusterFunction(ClusterHandler * ch, void *data, int len);
 typedef ClusterFunction *ClusterFunctionPtr;
+typedef void ClusterFunctionExt(ClusterSession cs, void *context, void *data);
 
 struct ClusterVConnectionBase;
 
@@ -512,7 +516,9 @@ struct ClusterVConnection: public ClusterVConnectionBase
   ClusterVConnection(int is_new_connect_read = 0);
   ~ClusterVConnection();
   void free();                  // Destructor actions (we are using ClassAllocator)
-
+  virtual bool is_read_from_writer() {
+    return false;
+  }
   virtual void do_io_close(int lerrno = -1);
   virtual VIO *do_io_read(Continuation * c, int64_t nbytes, MIOBuffer * buf);
   virtual VIO *do_io_write(Continuation * c, int64_t nbytes, IOBufferReader * buf, bool owner = false);
@@ -739,9 +745,9 @@ extern ClusterFunction close_channel_ClusterFunction;
 extern ClusterFunction get_hostinfo_ClusterFunction;
 extern ClusterFunction put_hostinfo_ClusterFunction;
 extern ClusterFunction cache_lookup_ClusterFunction;
-extern ClusterFunction cache_op_ClusterFunction;
+//extern ClusterFunction cache_op_ClusterFunction;
 extern ClusterFunction cache_op_malloc_ClusterFunction;
-extern ClusterFunction cache_op_result_ClusterFunction;
+//extern ClusterFunction cache_op_result_ClusterFunction;
 extern ClusterFunction set_channel_data_ClusterFunction;
 extern ClusterFunction post_setchan_send_ClusterFunction;
 extern ClusterFunction set_channel_pin_ClusterFunction;
@@ -750,6 +756,9 @@ extern ClusterFunction set_channel_priority_ClusterFunction;
 extern ClusterFunction post_setchan_priority_ClusterFunction;
 extern ClusterFunction default_api_ClusterFunction;
 
+extern ClusterFunctionExt cache_op_ClusterFunction;
+extern ClusterFunctionExt cache_op_result_ClusterFunction;
+
 struct ClusterFunctionDescriptor
 {
   bool fMalloced;               // the function will free the data
@@ -767,7 +776,8 @@ struct ClusterFunctionDescriptor
 #ifndef DEFINE_CLUSTER_FUNCTIONS
 extern
 #endif
-ClusterFunctionDescriptor clusterFunction[]
+ClusterFunctionDescriptor clusterFunction[1]
+#if 0
 #ifdef DEFINE_CLUSTER_FUNCTIONS
   = {
   {false, true, CMSG_LOW_PRI, test_ClusterFunction, 0},
@@ -863,7 +873,7 @@ ClusterFunctionDescriptor clusterFunction[]
   // ********** ADD NEW ENTRIES ABOVE THIS LINE ************
 }
 #endif
-
+#endif
 ;
 extern unsigned SIZE_clusterFunction;        // clusterFunction[] entries
 
@@ -983,10 +993,27 @@ ClusterFuncToQpri(int cluster_func)
 #define API_F29_CLUSTER_FUNCTION  	     	     79
 #define API_F30_CLUSTER_FUNCTION  	     	     80
 
-#define API_STARECT_CLUSTER_FUNCTION		     API_F01_CLUSTER_FUNCTION
-#define API_END_CLUSTER_FUNCTION		     API_F30_CLUSTER_FUNCTION
+#define CLUSTER_CACHE_OP_CLUSTER_FUNCTION      (CLUSTER_MSG_START+81)
+#define CLUSTER_CACHE_DATA_READ_BEGIN          (CLUSTER_MSG_START+82)
+#define CLUSTER_CACHE_DATA_READ_REENABLE       (CLUSTER_MSG_START+83)
+#define CLUSTER_CACHE_DATA_WRITE_BEGIN         (CLUSTER_MSG_START+84)
+#define CLUSTER_CACHE_HEADER_ONLY_UPDATE       (CLUSTER_MSG_START+85)
+#define CLUSTER_CACHE_DATA_CLOSE               (CLUSTER_MSG_START+86)
+#define CLUSTER_CACHE_DATA_ABORT               (CLUSTER_MSG_START+87)
+#define CLUSTER_CACHE_DATA_WRITE_DONE          (CLUSTER_MSG_START+88)
+
+#define CLUSTER_CACHE_OP_RESULT_CLUSTER_FUNCTION (CLUSTER_MSG_START+89)
+#define CLUSTER_CACHE_DATA_READ_DONE           (CLUSTER_MSG_START+90)
+#define CLUSTER_CACHE_DATA_ERROR               (CLUSTER_MSG_START+91)
 
-#define UNDEFINED_CLUSTER_FUNCTION                   0xFDEFFDEF
+#define CLUSTER_INTERNEL_ERROR                 (CLUSTER_MSG_START+100)
+#define CLUSTER_PING_CLUSTER_FUNCTION          (CLUSTER_MSG_START+101)
+#define CLUSTER_PING_REPLY_CLUSTER_FUNCTION    (CLUSTER_MSG_START+102)
+
+#define API_STARECT_CLUSTER_FUNCTION           API_F01_CLUSTER_FUNCTION
+#define API_END_CLUSTER_FUNCTION               API_F30_CLUSTER_FUNCTION
+
+#define UNDEFINED_CLUSTER_FUNCTION             0xFDEFFDEF
 
 //////////////////////////////////////////////
 // Initial cluster connect exchange message
@@ -1171,4 +1198,328 @@ ClusterVC_remove_write(ClusterVConnectionBase * vc)
 }
 
 
+struct ClusterCacheVC: public CacheVConnection // CacheVConnection whose I/O entry points are marked "invoke remote" below
+{
+  static int size_to_init; // number of bytes free_ClusterCacheVC() zeroes, starting at 'vio'
+  Action _action; // set from the continuation given to new_ClusterCacheVC(); callcont() calls it back
+  Ptr<IOBufferData> buf;          // for read
+  Ptr<IOBufferData> first_buf;    // the head fragment
+  Ptr<IOBufferBlock> blocks; // data available to write
+
+  CacheHTTPInfo alternate;
+
+  VIO vio; // calluser() delivers events to vio._cont with &vio as the event data
+  ink_hrtime start_time; // stamped with ink_get_hrtime() in new_ClusterCacheVC()
+  CacheFragType frag_type;
+  int64_t seek_to;                // pread offset
+  int64_t offset;                 // offset into 'blocks' of data to write
+  int64_t length;                 // length of data available to write
+  int64_t total_len;
+  int64_t data_sent;
+  int64_t doc_len;
+
+  int doc_pos;                // read position in 'buf'
+  int d_len;                  // the length of data in 'buf'
+
+  int closed; // when set and !in_progress, calluser()/callcont() free this VC after the callback
+  int recursive; // raised around the callbacks made by calluser()/callcont()
+  int disk_io_priority;
+  int probe_depth;
+  MessagePriority priority;
+
+  time_t time_pin;
+  EThread *initial_thread;  // initial thread open_XX was called on
+  ClusterSession cs; // closed by free_ClusterCacheVC() unless session_closed is set
+  Event *trigger; // cancelled and reset to NULL by cancel_trigger()
+  ContinuationHandler save_handler;
+
+
+  bool in_progress; // while set, calluser()/callcont() will not free a closed VC
+  bool remote_closed;
+  bool session_closed; // when set, free_ClusterCacheVC() skips cluster_close_session(cs)
+
+  union
+  {
+    uint32_t flags;
+    struct
+    {
+      unsigned int use_first_key:1;
+      unsigned int overwrite:1; // overwrite first_key Dir if it exists
+      unsigned int close_complete:1; // WRITE_COMPLETE is final
+      unsigned int sync:1; // write to be committed to durable storage before WRITE_COMPLETE
+      unsigned int evacuator:1;
+      unsigned int single_fragment:1;
+      unsigned int evac_vector:1;
+      unsigned int lookup:1;
+      unsigned int update:1;
+      unsigned int remove:1;
+      unsigned int remove_aborted_writers:1;
+      unsigned int open_read_timeout:1; // UNUSED
+      unsigned int data_done:1;
+      unsigned int read_from_writer_called:1;
+      unsigned int not_from_ram_cache:1;        // is_ram_cache_hit() returns the negation of this bit (NOTE(review): the old comment said "entire object was from ram cache")
+      unsigned int rewrite_resident_alt:1;
+      unsigned int readers:1;
+      unsigned int doc_from_ram_cache:1;
+#ifdef HIT_EVACUATE
+      unsigned int hit_evacuate:1;
+#endif
+#ifdef HTTP_CACHE
+      unsigned int force_empty:1; // used for cache empty http document
+#endif
+#ifdef SSD_CACHE
+      unsigned int read_from_ssd:1;
+      unsigned int write_into_ssd:1;
+      unsigned int ram_fixup:1;
+      unsigned int transistor:1;
+#endif
+    } f;
+  };
+  ClusterCacheVC();
+  VIO *do_io_read(Continuation *c, int64_t nbytes, MIOBuffer *buf); // invoke remote
+  VIO *do_io_pread(Continuation *c, int64_t nbytes, MIOBuffer *buf, int64_t offset); // invoke remote
+  VIO *do_io_write(Continuation *c, int64_t nbytes, IOBufferReader *buf, bool owner = false); // invoke remote
+  void do_io_close(int lerrno = -1); // invoke remote ?
+  void reenable(VIO *avio); // invoke remote ?
+  void reenable_re(VIO *avio); // invoke remote ?
+
+  void do_remote_close(); // invoke remote, for cancel or error
+
+  virtual int get_header(void **, int *) // unsupported here: trips the assert (when enabled) and returns -1
+  {
+    ink_assert(!"implemented");
+    return -1;
+  }
+  virtual int set_header(void *, int) // unsupported here: trips the assert (when enabled) and returns -1
+  {
+    ink_assert(!"implemented");
+    return -1;
+  }
+  virtual int get_single_data(void **, int *) // unsupported here: trips the assert (when enabled) and returns -1
+  {
+    ink_assert(!"implemented");
+    return -1;
+  }
+
+#ifdef HTTP_CACHE
+  virtual void set_http_info(CacheHTTPInfo *info) {
+    if (enable_cache_empty_http_doc) {
+      MIMEField *field = info->m_alt->m_response_hdr.field_find(
+          MIME_FIELD_CONTENT_LENGTH, MIME_LEN_CONTENT_LENGTH);
+      if (field && !field->value_get_int64()) // Content-Length header present with value 0
+        f.force_empty = 1;
+      else
+        f.force_empty = 0;
+    } else
+      f.force_empty = 0;
+    alternate.copy_shallow(info); // shallow-copy into 'alternate', then clear the caller's 'info'
+    info->clear();
+  }
+  virtual void get_http_info(CacheHTTPInfo ** info) {
+    *info = &alternate; // hands out a pointer to the member, not a copy
+  }
+#endif
+
+  bool is_ram_cache_hit() { // non-const overload; the const virtual below omits the READ assertion
+    ink_assert(vio.op == VIO::READ);
+    return !f.not_from_ram_cache;
+  }
+  virtual bool set_disk_io_priority(int priority)
+  {
+    disk_io_priority = priority;
+    return true;
+  }
+  virtual int get_disk_io_priority() {
+    return disk_io_priority;
+  }
+  virtual bool set_pin_in_cache(time_t t) {
+    time_pin = t;
+    return true;
+  }
+  virtual time_t get_pin_in_cache() {
+    return time_pin;
+  }
+  virtual int64_t get_object_size()
+  {
+    return alternate.object_size_get();
+  }
+  virtual bool is_read_from_writer()
+  {
+    return f.read_from_writer_called;
+  }
+  virtual bool is_ram_cache_hit() const
+  {
+    return !f.not_from_ram_cache;
+  }
+  virtual bool is_pread_capable()
+  {
+    return true;
+  }
+  void
+  cancel_trigger() // cancels the event held in 'trigger', if any, and forgets it
+  {
+    if (trigger) {
+      trigger->cancel_action();
+      trigger = NULL;
+    }
+  }
+
+  int calluser(int event); // delivers 'event' to vio._cont; may free this VC
+  int callcont(int event); // delivers 'event' to _action.continuation; may free this VC
+  int handleRead(int event, void *data);
+  int openReadReadDone(int event, void *data);
+//  int handleWrite(int event, void *data);
+//  int openWriteWriteDone(int event, void *data);
+  int openReadStart(int event, void *data);
+  int openWriteStart(int event, void *data);
+  int openReadMain(int event, void *data);
+  int openWriteMain(int event, void *data);
+  int removeEvent(int event, void *data);
+};
+
+
+
+// Message bodies exchanged for ClusterCacheVC I/O requests. All integer fields
+// use explicit widths so the layout does not depend on the platform's 'int'
+// (int32_t is the same width as the 'int' previously used on the supported
+// platforms). NOTE(review): the per-field meanings below are inferred from the
+// field names -- confirm against the sender/receiver code.
+struct SetIOReadMessage: public ClusterMessageHeader
+{
+  int64_t nbytes;   // presumably the number of bytes requested
+  int64_t offset;   // presumably the pread offset
+};
+
+struct SetIOWriteMessage: public ClusterMessageHeader
+{
+  int32_t hdr_len;  // presumably the length of the header that follows
+  int64_t nbytes;   // presumably the number of bytes to be written
+};
+
+struct SetIOCloseMessage: public ClusterMessageHeader
+{
+  int32_t h_len;    // presumably header length
+  int32_t d_len;    // presumably data length
+  int64_t total_len;
+};
+
+struct SetIOReenableMessage: public ClusterMessageHeader
+{
+  int32_t reenable;
+};
+
+// Carries no payload beyond the ClusterMessageHeader itself.
+struct SetResponseMessage: public ClusterMessageHeader
+{
+
+};
+
+// Clone the part of block chain 'ab' that starts 'offset' readable bytes in and
+// spans 'len' bytes. Every source block that contributes is clone()d; the
+// first clone is advanced with consume(offset) and, when more than 'len' bytes
+// were gathered, the last clone is passed the (negative) surplus via fill().
+// Returns the head of the new chain, or NULL when no block contributed.
+// NOTE(review): the loop runs while len >= 0, so after exactly 'len' bytes
+// have been gathered the next non-empty block is still cloned and handed to
+// fill() with a negative count; len == 0 therefore yields a block rather than
+// NULL. Confirm this is intended before tightening the bound.
+inline IOBufferBlock *
+clone_IOBufferBlockList(IOBufferBlock *ab, int64_t offset, int64_t len)
+{
+  IOBufferBlock *first = NULL;
+  IOBufferBlock *last = NULL;
+
+  for (IOBufferBlock *src = ab; src && len >= 0; src = src->next) {
+    int64_t usable = src->read_avail() - offset;
+    if (usable <= 0) {
+      // this block lies entirely before the start offset; carry the rest over
+      offset = -usable;
+      continue;
+    }
+
+    IOBufferBlock *copy = src->clone();
+    if (first) {
+      last->next = copy;
+    } else {
+      copy->consume(offset);
+      first = copy;
+    }
+    last = copy;
+
+    len -= usable;
+    offset = 0;
+  }
+
+  if (last && len < 0)
+    last->fill(len);
+  return first;
+}
+
+ClusterCacheVC *new_ClusterCacheVC(); // NOTE(review): the inline definition later in this header takes a Continuation *; confirm a zero-argument overload is really intended
+void free_ClusterCacheVC(ClusterCacheVC *ccvc); // declared here because calluser()/callcont() call it ahead of its definition
+
+// Deliver 'event' to the continuation attached to the VIO, passing &vio as the
+// event data. 'recursive' stays raised for the duration of the callback.
+// If the VC is closed afterwards and nothing is in progress, it is freed here
+// and EVENT_DONE is returned; otherwise EVENT_CONT is returned.
+inline int
+ClusterCacheVC::calluser(int event)
+{
+  ++recursive;
+  ink_assert(this_ethread() == vio._cont->mutex->thread_holding);
+  vio._cont->handleEvent(event, (void *) &vio);
+  --recursive;
+
+  if (!closed || in_progress)
+    return EVENT_CONT;
+
+  free_ClusterCacheVC(this);
+  return EVENT_DONE;
+}
+
+// Deliver 'event' to the continuation stored in _action, passing this VC as
+// the event data. 'recursive' stays raised for the duration of the callback.
+// Afterwards the VC is freed if it is closed and nothing is in progress;
+// otherwise, when vio.vc_server is set, the VC sends itself EVENT_IMMEDIATE.
+// Always returns EVENT_DONE.
+inline int
+ClusterCacheVC::callcont(int event)
+{
+  ++recursive;
+  ink_assert(this_ethread() == _action.mutex->thread_holding);
+  _action.continuation->handleEvent(event, this);
+  --recursive;
+
+  if (closed && !in_progress)
+    free_ClusterCacheVC(this);
+  else if (vio.vc_server)
+    handleEvent(EVENT_IMMEDIATE, 0);
+
+  return EVENT_DONE;
+}
+
+extern ClassAllocator<ClusterCacheVC> clusterCacheVCAllocator;
+
+// Allocate a ClusterCacheVC from clusterCacheVCAllocator on behalf of 'cont'.
+// The VC shares cont's mutex, records the thread currently holding that mutex
+// as its initial thread, and has its start time stamped. 'cont' and its mutex
+// must be non-NULL.
+inline ClusterCacheVC *
+new_ClusterCacheVC(Continuation *cont)
+{
+  EThread *caller_thread = cont->mutex->thread_holding;
+  ClusterCacheVC *vc = clusterCacheVCAllocator.alloc();
+
+  vc->_action = cont;
+  vc->initial_thread = caller_thread;
+  vc->mutex = cont->mutex;
+  vc->start_time = ink_get_hrtime();
+  ink_assert(vc->trigger == NULL);
+
+  Debug("cluster_cache_new", "new %p", vc);
+  return vc;
+}
+
+inline void
+free_ClusterCacheVC(ClusterCacheVC *cont) // tears down 'cont' and returns it to clusterCacheVCAllocator
+{
+  Debug("cluster_cache_free", "free %p", cont);
+  ink_assert(cont->mutex->thread_holding == this_ethread()); // the calling thread must hold the VC's mutex
+
+  if (cont->trigger)
+    cont->trigger->cancel(); // the pointer is not reset here; NOTE(review): presumably zeroed by the memset below -- confirm size_to_init spans 'trigger'
+  ink_assert(!cont->in_progress);
+
+  if (!cont->session_closed)
+    cluster_close_session(cont->cs);
+
+  cont->vio.buffer.clear();
+  cont->vio.mutex.clear();
+#ifdef HTTP_CACHE
+  if (cont->vio.op == VIO::WRITE)
+    cont->alternate.destroy(); // writers destroy() the alternate ...
+  else
+    cont->alternate.clear(); // ... every other op only clear()s it
+#endif
+  cont->_action.cancelled = 0;
+  cont->_action.mutex.clear();
+  cont->mutex.clear();
+  cont->buf.clear();
+  cont->first_buf.clear();
+  cont->blocks.clear();
+
+  memset((char *) &cont->vio, 0, cont->size_to_init); // zero size_to_init bytes starting at 'vio'; the members declared before 'vio' were released one by one above
+
+  clusterCacheVCAllocator.free(cont);
+}
 #endif /* _Cluster_h */

http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/P_ClusterCacheInternal.h
----------------------------------------------------------------------
diff --git a/iocore/cluster/P_ClusterCacheInternal.h b/iocore/cluster/P_ClusterCacheInternal.h
index 8b62d44..44b675c 100644
--- a/iocore/cluster/P_ClusterCacheInternal.h
+++ b/iocore/cluster/P_ClusterCacheInternal.h
@@ -81,6 +81,88 @@ extern int ET_CLUSTER;
 #define PROBE_LOCAL_CACHE_FIRST        DO_REPLICATION
 #define PROBE_LOCAL_CACHE_LAST         false
 
+struct ClusterCont: public Continuation // one received cluster message plus the information needed to dispatch it
+{
+  ClusterSession session; // passed to cache_op_ClusterFunction()/cache_op_result_ClusterFunction()
+  Ptr<IOBufferBlock> data; // block chain read by copy_data() and advanced by consume()
+  void *context; // opaque pointer forwarded to the cache_op_* handlers
+  int func_id; // selects the dispatch target in handleEvent()
+  int data_len; // byte count tracked alongside 'data'; reduced by consume()
+
+  Action _action; // continuation called back for func_ids that handleEvent() does not handle itself
+  int handleEvent(int event, void *d); // dispatches on func_id, then frees this object; NOTE(review): confirm callers reach this rather than Continuation::handleEvent
+  IOBufferData *copy_data(); // copies the whole chain into a newly allocated IOBufferData
+  int copy_data(char *buf, int size); // copies up to 'size' bytes into 'buf'; returns the number copied
+  void consume(int size); // drops 'size' bytes from the front of 'data'
+};
+
+// Flatten the 'data' chain into one newly allocated IOBufferData and return it.
+// The buffer is sized from data_len with the size index capped at
+// MAX_BUFFER_SIZE_INDEX, so it can be smaller than the chain if data_len
+// exceeds the largest buffer class or is out of step with the chain. The copy
+// is therefore clamped to the buffer's capacity instead of writing past it;
+// hitting the clamp trips an assertion because it indicates inconsistent state.
+inline IOBufferData *
+ClusterCont::copy_data() {
+  IOBufferData *buf = new_IOBufferData(iobuffer_size_to_index(data_len, MAX_BUFFER_SIZE_INDEX));
+  char *p = buf->data();
+  int64_t room = buf->block_size();   // bytes still free in 'buf'
+
+  for (IOBufferBlock *b = data; b; b = b->next) {
+    int64_t n = b->_end - b->_start;
+    if (n > room) {
+      ink_assert(!"ClusterCont::copy_data: block chain larger than buffer");
+      n = room;
+    }
+    memcpy(p, b->_start, n);
+    p += n;
+    room -= n;
+  }
+  return buf;
+}
+
+// Drop 'size' bytes from the front of the 'data' chain. Blocks that are fully
+// covered are unlinked by advancing 'data'; the first block that is only
+// partly covered has its _start moved forward by the remainder. data_len is
+// reduced by 'size' and never goes below zero.
+inline void
+ClusterCont::consume(int size) {
+  int64_t remaining = size;
+
+  while (data) {
+    int64_t avail = data->read_avail();
+    if (remaining < avail)
+      break;
+    remaining -= avail;
+    data = data->next;
+  }
+  if (data)
+    data->_start += remaining;
+
+  if (data_len > size)
+    data_len -= size;
+  else
+    data_len = 0;
+}
+
+// Copy up to 'len' bytes from the front of the 'data' chain into 'buf' without
+// modifying the chain. Asserts that data_len >= len. Returns the number of
+// bytes copied, which is less than 'len' only if the chain ends first, and 0
+// when len <= 0.
+inline int
+ClusterCont::copy_data(char *buf, int len)
+{
+  ink_assert(data_len >= len);
+  if (len <= 0)
+    return 0;
+
+  int64_t left = len;   // bytes still to be copied
+  for (IOBufferBlock *blk = data; blk; blk = blk->next) {
+    int64_t avail = blk->read_avail();
+    if (avail > left) {
+      // this block holds more than is still needed: take a prefix and stop
+      memcpy(buf, blk->_start, left);
+      left = 0;
+      break;
+    }
+    memcpy(buf, blk->_start, avail);
+    buf += avail;
+    left -= avail;
+  }
+  return len - (int) left;
+}
+extern ClassAllocator<ClusterCont> clusterContAllocator;
+
+// Run the handler selected by func_id, then release this object.
+//  - CLUSTER_CACHE_OP_CLUSTER_FUNCTION        -> cache_op_ClusterFunction()
+//  - CLUSTER_CACHE_OP_RESULT_CLUSTER_FUNCTION -> cache_op_result_ClusterFunction()
+//  - anything else is forwarded to _action.continuation with func_id as the
+//    event; the data pointer is NULL for CLUSTER_INTERNEL_ERROR and this
+//    object otherwise.
+// The incoming event and data arguments are ignored. Always returns
+// EVENT_DONE; the object has been handed back to clusterContAllocator by then.
+inline int
+ClusterCont::handleEvent(int, void *) {
+  if (func_id == CLUSTER_CACHE_OP_CLUSTER_FUNCTION) {
+    cache_op_ClusterFunction(session, context, this);
+  } else if (func_id == CLUSTER_CACHE_OP_RESULT_CLUSTER_FUNCTION) {
+    cache_op_result_ClusterFunction(session, context, this);
+  } else {
+    void *payload = this;
+    if (func_id == CLUSTER_INTERNEL_ERROR)
+      payload = NULL;
+    _action.continuation->handleEvent(func_id, payload);
+  }
+
+  // drop the references held by this object before returning it to the pool
+  mutex.clear();
+  _action.mutex.clear();
+  data = NULL;
+
+  clusterContAllocator.free(this);
+  return EVENT_DONE;
+}
+
 //
 // This continuation handles all cache cluster traffic, on both
 // sides (state machine client and cache server)
@@ -89,111 +171,91 @@ struct CacheContinuation;
 typedef int (CacheContinuation::*CacheContHandler) (int, void *);
 struct CacheContinuation:public Continuation
 {
+  static int size_to_init;
   enum
   {
     MagicNo = 0x92183123
   };
   int magicno;
-  void *callback_data;
-  void *callback_data_2;
   INK_MD5 url_md5;
-  Event *timeout;
-  Action action;
-  ClusterMachine *target_machine;
-  int probe_depth;
+
   ClusterMachine *past_probes[CONFIGURATION_HISTORY_PROBE_DEPTH];
+
+  ClusterVCToken token;
+
+  CacheHTTPInfo cache_vc_info; // for get_http_info
+//  MIOBuffer doc_data;
+  Ptr<IOBufferBlock> doc_data;
+  // Incoming data generated from unmarshaling request/response ops
+  Ptr<IOBufferData> rw_buf_msg;
+  Arena ic_arena;
+  CacheHTTPHdr ic_request; // for lookup or read
+  CacheHTTPInfo ic_old_info; // for update
+  CacheHTTPInfo ic_new_info; // for set_http_info
+
+  ClusterSession cs;
+  char *ic_hostname;
+  int ic_hostname_len;
+
   ink_hrtime start_time;
-  ClusterMachine *from;
-  ClusterHandler *ch;
-  VConnection *cache_vc;
+  ClusterMachine *target_machine;
+  int probe_depth;
+
+  CacheVC *cache_vc;
+  Action *pending_action;
   bool cache_read;
+  bool request_purge;
+  bool have_all_data;           // all object data in response
+  bool expect_next;
+  bool writer_aborted;
   int result;                   // return event code
   int result_error;             // error code associated with event
-  ClusterVCToken token;
-  unsigned int seq_number;
   uint16_t cfl_flags;             // Request flags; see CFL_XXX defines
+
+  unsigned int seq_number;
   CacheFragType frag_type;
-  int nbytes;
+  int nbytes;       // the msg nbyts
   unsigned int target_ip;
   int request_opcode;
-  bool request_purge;
-  bool local_lookup_only;
-  bool no_reply_message;
-  bool request_timeout;         // timeout occurred before
-  //   op complete
-  bool expect_cache_callback;
-
-  // remove_and_delete() specific data
-  bool use_deferred_callback;
-
-  // open_read/write data
-
-  time_t pin_in_cache;
-
-  // setMsgBufferLen(), allocMsgBuffer() and freeMsgBuffer() data
-
-    Ptr<IOBufferData> rw_buf_msg;
+  int header_len;
   int rw_buf_msg_len;
 
-  // open data
-
-  ClusterVConnection *read_cluster_vc;
-  ClusterVConnection *write_cluster_vc;
-  int cluster_vc_channel;
-  ClusterVCToken open_local_token;
-
-  // Readahead on open read specific data
-
-  int caller_buf_freebytes;     // remote bufsize for
-  //  initial data
-  VIO *readahead_vio;
-  IOBufferReader *readahead_reader;
-    Ptr<IOBufferBlock> readahead_data;
-  bool have_all_data;           // all object data in response
-
-  CacheHTTPInfo cache_vc_info;
-  OneWayTunnel *tunnel;
-    Ptr<ProxyMutex> tunnel_mutex;
-  CacheContinuation *tunnel_cont;
-  bool tunnel_closed;
-  Action *cache_action;
-  Event *lookup_open_write_vc_event;
-
-  // Incoming data generated from unmarshaling request/response ops
-
-  Arena ic_arena;
-  CacheHTTPHdr ic_request;
-  CacheHTTPHdr ic_response;
+  time_t pin_in_cache;
+  int64_t doc_size;
+  int64_t total_length;
+  VIO *vio;                  //
+  IOBufferReader *reader;    // for normal read
   CacheLookupHttpConfig *ic_params;
-  CacheHTTPInfo ic_old_info;
-  CacheHTTPInfo ic_new_info;
-    Ptr<IOBufferData> ic_hostname;
-  int ic_hostname_len;
-
-  // debugging
-  int cache_op_ClusterFunction;
-
-  int lookupEvent(int event, void *d);
-  int probeLookupEvent(int event, void *d);
-  int remoteOpEvent(int event, Event * e);
-  int replyLookupEvent(int event, void *d);
-  int replyOpEvent(int event, VConnection * vc);
-  int handleReplyEvent(int event, Event * e);
-  int callbackEvent(int event, Event * e);
-  int setupVCdataRead(int event, VConnection * vc);
-  int VCdataRead(int event, VIO * target_vio);
-  int setupReadWriteVC(int, VConnection *);
-  ClusterVConnection *lookupOpenWriteVC();
-  int lookupOpenWriteVCEvent(int, Event *);
-  int localVCsetupEvent(int event, ClusterVConnection * vc);
-  void insert_cache_callback_user(ClusterVConnection *, int, void *);
-  int insertCallbackEvent(int, Event *);
-  void callback_user(int result, void *d);
-  void defer_callback_result(int result, void *d);
-  int callbackResultEvent(int event, Event * e);
-  void setupReadBufTunnel(VConnection *, VConnection *);
-  int tunnelClosedEvent(int event, void *);
-  int remove_and_delete(int, Event *);
+  MIOBuffer *mbuf;
+  EThread *thread;
+
+//  int lookupEvent(int event, void *d);
+//  int probeLookupEvent(int event, void *d);
+//  int remoteOpEvent(int event, Event * e);
+//  int replyLookupEvent(int event, void *d);
+  int replyOpEvent();
+//  int handleReplyEvent(int event, Event * e);
+//  int callbackEvent(int event, Event * e);
+  int setupVCdataRead(int event, void *data);
+  int setupVCdataWrite(int event, void *data);
+  int setupVCdataRemove(int event, void *data);
+  int setupVCdataLink(int event, void *data);
+  int setupVCdataDeref(int event, void *data);
+  int VCdataRead(int event, void *data);
+  int VCdataWrite(int event, void *data);
+  int VCSmallDataRead(int event, void *data);
+//  int setupReadWriteVC(int, VConnection *);
+//  ClusterVConnection *lookupOpenWriteVC();
+//  int lookupOpenWriteVCEvent(int, Event *);
+//  int localVCsetupEvent(int event, ClusterVConnection * vc);
+//  void insert_cache_callback_user(ClusterVConnection *, int, void *);
+//  int insertCallbackEvent(int, Event *);
+//  void callback_user(int result, void *d);
+//  void defer_callback_result(int result, void *d);
+//  int callbackResultEvent(int event, Event * e);
+//  void setupReadBufTunnel(VConnection *, VConnection *);
+//  int tunnelClosedEvent(int event, void *);
+//  int remove_and_delete(int, Event *);
 
 
   inline void setMsgBufferLen(int l, IOBufferData * b = 0) {
@@ -254,66 +316,26 @@ struct CacheContinuation:public Continuation
     if (ic_request.valid()) {
       ic_request.clear();
     }
-    if (ic_response.valid()) {
-      ic_response.clear();
-    }
+//    if (ic_response.valid()) {
+//      ic_response.clear();
+//    }
     if (ic_old_info.valid()) {
       ic_old_info.destroy();
     }
     if (ic_new_info.valid()) {
       ic_new_info.destroy();
     }
-    ic_arena.reset();
+//    ic_arena.reset();
     freeMsgBuffer();
-
-    tunnel_mutex = 0;
-    readahead_data = 0;
+//
+//    tunnel_mutex = 0;
+//    readahead_data = 0;
     ic_hostname = 0;
   }
 
-CacheContinuation():
-  Continuation(NULL),
-    magicno(MagicNo),
-    callback_data(0),
-    callback_data_2(0),
-    timeout(0),
-    target_machine(0),
-    probe_depth(0),
-    start_time(0),
-    cache_read(false),
-    result(0),
-    result_error(0),
-    seq_number(0),
-    cfl_flags(0),
-    frag_type(CACHE_FRAG_TYPE_NONE),
-    nbytes(0),
-    target_ip(0),
-    request_opcode(0),
-    request_purge(false),
-    local_lookup_only(0),
-    no_reply_message(0),
-    request_timeout(0),
-    expect_cache_callback(true),
-    use_deferred_callback(0),
-    pin_in_cache(0),
-    rw_buf_msg_len(0),
-    read_cluster_vc(0),
-    write_cluster_vc(0),
-    cluster_vc_channel(0),
-    caller_buf_freebytes(0),
-    readahead_vio(0),
-    readahead_reader(0),
-    have_all_data(false),
-    cache_vc_info(),
-    tunnel(0),
-    tunnel_cont(0),
-    tunnel_closed(0),
-    lookup_open_write_vc_event(0),
-    ic_arena(),
-    ic_request(),
-    ic_response(), ic_params(0), ic_old_info(), ic_new_info(), ic_hostname_len(0), cache_op_ClusterFunction(0) {
-    token.clear();
-    SET_HANDLER((CacheContHandler) & CacheContinuation::remoteOpEvent);
+  CacheContinuation(): magicno(MagicNo) {
+    size_to_init = sizeof(CacheContinuation) - (size_t) & ((CacheContinuation *) 0)->cs;
+    memset((char *) &cs, 0, size_to_init);
   }
 
   inline static bool is_ClusterThread(EThread * et)
@@ -334,14 +356,66 @@ CacheContinuation():
   static void cacheContAllocator_free(CacheContinuation *);
   inkcoreapi static Action *callback_failure(Action *, int, int, CacheContinuation * this_cc = 0);
   static Action *do_remote_lookup(Continuation *, CacheKey *, CacheContinuation *, CacheFragType, char *, int);
-  inkcoreapi static Action *do_op(Continuation *, ClusterMachine *, void *, int, char *, int,
-                                  int nbytes = -1, MIOBuffer * b = 0);
+  inkcoreapi static Action *do_op(Continuation * c, ClusterSession cs, void *args,
+                           int user_opcode, IOBufferData *data, int data_len, int nbytes = -1, MIOBuffer * b = 0);
   static int setup_local_vc(char *data, int data_len, CacheContinuation * cc, ClusterMachine * mp, Action **);
   static void disposeOfDataBuffer(void *buf);
   static int handleDisposeEvent(int event, CacheContinuation * cc);
-  static int32_t getObjectSize(VConnection *, int, CacheHTTPInfo *);
+  int32_t getObjectSize(VConnection *, int, CacheHTTPInfo *);
 };
 
+extern ClassAllocator<CacheContinuation> cacheContAllocator;
+
+// Allocate a CacheContinuation from cacheContAllocator for thread 't', which
+// must be the calling thread. The object gets a new ProxyMutex of its own and
+// has its start time stamped.
+inline CacheContinuation *
+new_CacheCont(EThread *t) {
+  ink_assert(t == this_ethread());
+
+  CacheContinuation *cc = cacheContAllocator.alloc();
+  cc->thread = t;
+  cc->mutex = new_ProxyMutex();
+  cc->start_time = ink_get_hrtime();
+  return cc;
+}
+
+inline void
+free_CacheCont(CacheContinuation *c) { // releases everything 'c' holds and returns it to cacheContAllocator
+  ink_assert(c->magicno == (int) c->MagicNo && !c->expect_next); // must be a live object that is not waiting for a further message
+//  ink_assert(!c->cache_op_ClusterFunction);
+  if (c->pending_action) {
+    c->pending_action->cancel();
+    c->pending_action = NULL;
+  }
+  if (c->cache_vc) {
+    if (c->cache_vc->vio.op == VIO::READ)
+      c->cache_vc->do_io(VIO::CLOSE); // a reading VC is closed ...
+    else
+      c->cache_vc->do_io(VIO::ABORT); // ... any other op is aborted
+    c->cache_vc = NULL;
+  }
+  if (c->mbuf) {
+    free_MIOBuffer(c->mbuf);
+    c->mbuf = NULL;
+  }
+
+  c->magicno = -1; // invalidate the magic number checked by the assert above
+  c->token.clear();
+  c->cache_vc_info.clear();
+  if (c->ic_params) {
+    delete c->ic_params;
+    c->ic_params = 0;
+  }
+  c->ic_request.clear();
+  c->ic_old_info.clear(); // NOTE(review): clear() here but destroy() for ic_new_info on the next line -- confirm ic_old_info owns nothing that needs destroy()
+  c->ic_new_info.destroy();
+  c->ic_arena.reset();
+  c->freeMsgBuffer();
+  c->ic_hostname = 0; // only the pointer is reset; NOTE(review): storage presumably belongs to ic_arena -- confirm
+  c->mutex.clear();
+
+  c->doc_data = NULL;
+
+  cacheContAllocator.free(c);
+}
+
 /////////////////////////////////////////
 // Cache OP specific args for do_op()  //
 /////////////////////////////////////////
@@ -595,18 +669,19 @@ struct CacheOpReplyMsg:public ClusterMessageHeader
 {
   uint32_t seq_number;
   int32_t result;
-  ClusterVCToken token;
-  bool is_ram_cache_hit;          // Entire object was from ram cache
-  Alias32 moi;                 // Used by CACHE_OPEN_READ & CACHE_LINK reply
+  int32_t h_len;
+  int32_t d_len;
+  int32_t reason; // // Used by CACHE_OPEN_READ & CACHE_LINK reply
+  int64_t doc_size;
+
   enum
   {
     MIN_VERSION = 1,
     MAX_VERSION = 1,
     CACHE_OP_REPLY_MESSAGE_VERSION = MAX_VERSION
   };
-  CacheOpReplyMsg(uint16_t vers = CACHE_OP_REPLY_MESSAGE_VERSION)
-    : ClusterMessageHeader(vers), seq_number(0), result(0), is_ram_cache_hit(false) {
-    moi.u32 = 0;
+  CacheOpReplyMsg(uint16_t vers = CACHE_OP_REPLY_MESSAGE_VERSION):
+  ClusterMessageHeader(vers), seq_number(0), result(0), h_len(0), d_len(0), reason(0), doc_size(0) {
   }
 
   //////////////////////////////////////////////////////////////////////////
@@ -617,7 +692,7 @@ struct CacheOpReplyMsg:public ClusterMessageHeader
   }
   static int sizeof_fixedlen_msg()
   {
-    return (int) ALIGN_DOUBLE(offsetof(CacheOpReplyMsg, moi));
+    return INK_ALIGN(sizeof (CacheOpReplyMsg), 16);
   }
   void init(uint16_t vers = CACHE_OP_REPLY_MESSAGE_VERSION) {
     _init(vers);
@@ -627,12 +702,12 @@ struct CacheOpReplyMsg:public ClusterMessageHeader
     if (NeedByteSwap()) {
       ats_swap32(&seq_number);
       ats_swap32((uint32_t *) & result);
-      token.SwapBytes();
+      // h_len and d_len are multi-byte fields of this message as well, so
+      // they are byte-swapped together with the other fields.
+      ats_swap32((uint32_t *) & h_len);
+      ats_swap32((uint32_t *) & d_len);
+      ats_swap32((uint32_t *) & reason);
+      ats_swap64((uint64_t *) & doc_size);
     }
   }
   //////////////////////////////////////////////////////////////////////////
 };
-
 inline int
 maxval(int a, int b)
 {
@@ -795,6 +870,7 @@ event_reply_may_have_moi(int event)
 {
   switch (event) {
   case CACHE_EVENT_OPEN_READ:
+  case CACHE_EVENT_OPEN_WRITE:
   case CACHE_EVENT_LINK:
   case CACHE_EVENT_LINK_FAILED:
   case CACHE_EVENT_OPEN_READ_FAILED:


[5/6] refine the codes of cluster

Posted by we...@apache.org.
http://git-wip-us.apache.org/repos/asf/trafficserver/blob/62504a9f/iocore/cluster/ClusterCache.cc
----------------------------------------------------------------------
diff --git a/iocore/cluster/ClusterCache.cc b/iocore/cluster/ClusterCache.cc
index 8d4b6e5..9b3be6c 100644
--- a/iocore/cluster/ClusterCache.cc
+++ b/iocore/cluster/ClusterCache.cc
@@ -28,6 +28,10 @@
 
 #include "P_Cluster.h"
 
+//ClassAllocator<ClusterBuffer> clusterBufferAllocator("clusterBufferAllocator");
+
+int CacheContinuation::size_to_init = -1;
+
 #ifdef DEBUG
 #define CLUSTER_TEST_DEBUG	1
 #endif
@@ -53,31 +57,34 @@ int open_delay_events = 0;
 // default will be read from config
 int cache_migrate_on_demand = false;
 
-/////////////////
-// Static Data //
-/////////////////
-static ClassAllocator<CacheContinuation> cacheContAllocator("cacheContAllocator");
+ClassAllocator<CacheContinuation> cacheContAllocator("cacheContAllocator");
+ClassAllocator<ClusterCont> clusterContAllocator("clusterContAllocator");
 
-static Queue<CacheContinuation> remoteCacheContQueue[REMOTE_CONNECT_HASH];
-static Ptr<ProxyMutex> remoteCacheContQueueMutex[REMOTE_CONNECT_HASH];
+//static Queue<CacheContinuation> remoteCacheContQueue[REMOTE_CONNECT_HASH];
+//static Ptr<ProxyMutex> remoteCacheContQueueMutex[REMOTE_CONNECT_HASH];
 
 // 0 is an illegal sequence number
 #define CACHE_NO_RESPONSE            0
 static int cluster_sequence_number = 1;
 
 #ifdef CLUSTER_TEST_DEBUG
-static ink_hrtime cache_cluster_timeout = HRTIME_SECONDS(65536);
+//static ink_hrtime cache_cluster_timeout = HRTIME_SECONDS(65536);
 #else
-static ink_hrtime cache_cluster_timeout = CACHE_CLUSTER_TIMEOUT;
+//static ink_hrtime cache_cluster_timeout = CACHE_CLUSTER_TIMEOUT;
 #endif
 
 ///////////////////
 // Declarations  //
 ///////////////////
-static CacheContinuation *find_cache_continuation(unsigned int, unsigned int);
+//static CacheContinuation *find_cache_continuation(unsigned int, unsigned int);
 
 static unsigned int new_cache_sequence_number();
 
+#ifdef DEBUG
+int64_t num_of_cachecontinuation = 0;
+int64_t num_of_cluster_cachevc = 0;
+#endif
+
 #define DOT_SEPARATED(_x)                             \
 ((unsigned char*)&(_x))[0], ((unsigned char*)&(_x))[1],   \
   ((unsigned char*)&(_x))[2], ((unsigned char*)&(_x))[3]
@@ -310,7 +317,7 @@ ClusterVConnectionCache::lookup(INK_MD5 * key)
 }
 
 int
-ClusterVConnectionCacheEvent::eventHandler(int /* event ATS_UNUSED */, Event * e)
+ClusterVConnectionCacheEvent::eventHandler(int , Event * e)
 {
   CLUSTER_INCREMENT_DYN_STAT(CLUSTER_VC_CACHE_SCANS_STAT);
   MUTEX_TRY_LOCK(lock, cache->hash_lock[hash_index], this_ethread());
@@ -358,12 +365,12 @@ ClusterVConnectionCacheEvent::eventHandler(int /* event ATS_UNUSED */, Event * e
 int
 CacheContinuation::init()
 {
-  int n;
-  for (n = 0; n < REMOTE_CONNECT_HASH; ++n)
-    remoteCacheContQueueMutex[n] = new_ProxyMutex();
-
-  GlobalOpenWriteVCcache = new ClusterVConnectionCache;
-  GlobalOpenWriteVCcache->init();
+//  int n;
+//  for (n = 0; n < REMOTE_CONNECT_HASH; ++n)
+//    remoteCacheContQueueMutex[n] = new_ProxyMutex();
+//
+//  GlobalOpenWriteVCcache = new ClusterVConnectionCache;
+//  GlobalOpenWriteVCcache->init();
   return 0;
 }
 
@@ -371,14 +378,269 @@ CacheContinuation::init()
 // do_op()
 //   Main function to do a cluster cache operation
 ///////////////////////////////////////////////////////////////////////
+//Action *
+//CacheContinuation::do_op(Continuation * c, ClusterMachine * mp, void *args,
+//                         int user_opcode, char *data, int data_len, int nbytes, MIOBuffer * b)
+//{
+//  CacheContinuation *cc = 0;
+//  Action *act = 0;
+//  char *msg = 0;
+//
+//  /////////////////////////////////////////////////////////////////////
+//  // Unconditionally map open read buffer interfaces to open read.
+//  // open read buffer interfaces are now deprecated.
+//  /////////////////////////////////////////////////////////////////////
+//  int opcode = user_opcode;
+//  switch (opcode) {
+//  case CACHE_OPEN_READ_BUFFER:
+//    opcode = CACHE_OPEN_READ;
+//    break;
+//  case CACHE_OPEN_READ_BUFFER_LONG:
+//    opcode = CACHE_OPEN_READ_LONG;
+//    break;
+//  default:
+//    break;
+//  }
+//
+//  if (!ch)
+//    goto no_send_exit;
+//
+//  if (c) {
+//    cc = cacheContAllocator_alloc();
+//    cc->ch = ch;
+//    cc->target_machine = mp;
+//    cc->request_opcode = opcode;
+//    cc->mutex = c->mutex;
+//    cc->action = c;
+//    cc->action.cancelled = false;
+//    cc->start_time = ink_get_hrtime();
+//    cc->from = mp;
+//    cc->result = op_failure(opcode);
+//    SET_CONTINUATION_HANDLER(cc, (CacheContHandler)
+//                             & CacheContinuation::remoteOpEvent);
+//    act = &cc->action;
+//
+//    // set up sequence number so we can find this continuation
+//
+//    cc->target_ip = mp->ip;
+//    cc->seq_number = new_cache_sequence_number();
+//
+//    // establish timeout for cache op
+//
+//    unsigned int hash = FOLDHASH(cc->target_ip, cc->seq_number);
+//    MUTEX_TRY_LOCK(queuelock, remoteCacheContQueueMutex[hash], this_ethread());
+//    if (!queuelock) {
+//
+//      // failed to acquire lock: no problem, retry later
+//      cc->timeout = eventProcessor.schedule_in(cc, CACHE_RETRY_PERIOD, ET_CACHE_CONT_SM);
+//    } else {
+//      remoteCacheContQueue[hash].enqueue(cc);
+//      MUTEX_RELEASE(queuelock);
+//      cc->timeout = eventProcessor.schedule_in(cc, cache_cluster_timeout, ET_CACHE_CONT_SM);
+//    }
+//  }
+//  //
+//  // Determine the type of the "Over The Wire" (OTW) message header and
+//  //   initialize it.
+//  //
+//  Debug("cache_msg",
+//        "do_op opcode=%d seqno=%d Machine=%p data=%p datalen=%d mio=%p",
+//        opcode, (c ? cc->seq_number : CACHE_NO_RESPONSE), mp, data, data_len, b);
+//
+//  switch (opcode) {
+//  case CACHE_OPEN_WRITE_BUFFER:
+//  case CACHE_OPEN_WRITE_BUFFER_LONG:
+//    {
+//      ink_release_assert(!"write buffer not supported");
+//      break;
+//    }
+//  case CACHE_OPEN_READ_BUFFER:
+//  case CACHE_OPEN_READ_BUFFER_LONG:
+//    {
+//      ink_release_assert(!"read buffer not supported");
+//      break;
+//    }
+//  case CACHE_OPEN_WRITE:
+//  case CACHE_OPEN_READ:
+//    {
+//      ink_release_assert(c > 0);
+//      //////////////////////
+//      // Use short format //
+//      //////////////////////
+//      if (!data) {
+//        data_len = op_to_sizeof_fixedlen_msg(opcode);
+//        data = (char *) ALLOCA_DOUBLE(data_len);
+//      }
+//      msg = (char *) data;
+//      CacheOpMsg_short *m = (CacheOpMsg_short *) msg;
+//      m->init();
+//      m->opcode = opcode;
+//      m->cfl_flags = ((CacheOpArgs_General *) args)->cfl_flags;
+//      m->md5 = *((CacheOpArgs_General *) args)->url_md5;
+//      cc->url_md5 = m->md5;
+//      m->seq_number = (c ? cc->seq_number : CACHE_NO_RESPONSE);
+//      m->frag_type = ((CacheOpArgs_General *) args)->frag_type;
+//      if (opcode == CACHE_OPEN_WRITE) {
+//        m->nbytes = nbytes;
+//        m->data = (uint32_t) ((CacheOpArgs_General *) args)->pin_in_cache;
+//      } else {
+//        m->nbytes = 0;
+//        m->data = 0;
+//      }
+//
+//      if (opcode == CACHE_OPEN_READ) {
+//        //
+//        // Set upper limit on initial data received with response
+//        // for open read response
+//        //
+//        m->buffer_size = DEFAULT_MAX_BUFFER_SIZE;
+//      } else {
+//        m->buffer_size = 0;
+//      }
+//
+//      //
+//      // Establish the local VC
+//      //
+//      int res = setup_local_vc(msg, data_len, cc, mp, &act);
+//      if (!res) {
+//        /////////////////////////////////////////////////////
+//        // Unable to setup local VC, request aborted.
+//        // Remove request from pending list and deallocate.
+//        /////////////////////////////////////////////////////
+//        cc->remove_and_delete(0, (Event *) 0);
+//        return act;
+//
+//      } else if (res != -1) {
+//        ///////////////////////////////////////
+//        // VC established, send request
+//        ///////////////////////////////////////
+//        break;
+//
+//      } else {
+//        //////////////////////////////////////////////////////
+//        // Unable to setup VC, delay required, await callback
+//        //////////////////////////////////////////////////////
+//        goto no_send_exit;
+//      }
+//    }
+//
+//  case CACHE_OPEN_READ_LONG:
+//  case CACHE_OPEN_WRITE_LONG:
+//    {
+//      ink_release_assert(c > 0);
+//      //////////////////////
+//      // Use long format  //
+//      //////////////////////
+//      msg = data;
+//      CacheOpMsg_long *m = (CacheOpMsg_long *) msg;
+//      m->init();
+//      m->opcode = opcode;
+//      m->cfl_flags = ((CacheOpArgs_General *) args)->cfl_flags;
+//      m->url_md5 = *((CacheOpArgs_General *) args)->url_md5;
+//      cc->url_md5 = m->url_md5;
+//      m->seq_number = (c ? cc->seq_number : CACHE_NO_RESPONSE);
+//      m->nbytes = nbytes;
+//      m->data = (uint32_t) ((CacheOpArgs_General *) args)->pin_in_cache;
+//      m->frag_type = (uint32_t) ((CacheOpArgs_General *) args)->frag_type;
+//
+//      if (opcode == CACHE_OPEN_READ_LONG) {
+//        //
+//        // Set upper limit on initial data received with response
+//        // for open read response
+//        //
+//        m->buffer_size = DEFAULT_MAX_BUFFER_SIZE;
+//      } else {
+//        m->buffer_size = 0;
+//      }
+//      //
+//      // Establish the local VC
+//      //
+//      int res = setup_local_vc(msg, data_len, cc, mp, &act);
+//      if (!res) {
+//        /////////////////////////////////////////////////////
+//        // Unable to setup local VC, request aborted.
+//        // Remove request from pending list and deallocate.
+//        /////////////////////////////////////////////////////
+//        cc->remove_and_delete(0, (Event *) 0);
+//        return act;
+//
+//      } else if (res != -1) {
+//        ///////////////////////////////////////
+//        // VC established, send request
+//        ///////////////////////////////////////
+//        break;
+//
+//      } else {
+//        //////////////////////////////////////////////////////
+//        // Unable to setup VC, delay required, await callback
+//        //////////////////////////////////////////////////////
+//        goto no_send_exit;
+//      }
+//    }
+//  case CACHE_UPDATE:
+//  case CACHE_REMOVE:
+//  case CACHE_DEREF:
+//    {
+//      //////////////////////
+//      // Use short format //
+//      //////////////////////
+//      msg = data;
+//      CacheOpMsg_short *m = (CacheOpMsg_short *) msg;
+//      m->init();
+//      m->opcode = opcode;
+//      m->frag_type = ((CacheOpArgs_Deref *) args)->frag_type;
+//      m->cfl_flags = ((CacheOpArgs_Deref *) args)->cfl_flags;
+//      if (opcode == CACHE_DEREF)
+//        m->md5 = *((CacheOpArgs_Deref *) args)->md5;
+//      else
+//        m->md5 = *((CacheOpArgs_General *) args)->url_md5;
+//      m->seq_number = (c ? cc->seq_number : CACHE_NO_RESPONSE);
+//      break;
+//    }
+//  case CACHE_LINK:
+//    {
+//      ////////////////////////
+//      // Use short_2 format //
+//      ////////////////////////
+//      msg = data;
+//      CacheOpMsg_short_2 *m = (CacheOpMsg_short_2 *) msg;
+//      m->init();
+//      m->opcode = opcode;
+//      m->cfl_flags = ((CacheOpArgs_Link *) args)->cfl_flags;
+//      m->md5_1 = *((CacheOpArgs_Link *) args)->from;
+//      m->md5_2 = *((CacheOpArgs_Link *) args)->to;
+//      m->seq_number = (c ? cc->seq_number : CACHE_NO_RESPONSE);
+//      m->frag_type = ((CacheOpArgs_Link *) args)->frag_type;
+//      break;
+//    }
+//  default:
+//    msg = 0;
+//    break;
+//  }
+//#ifdef CACHE_MSG_TRACE
+//  log_cache_op_sndmsg((c ? cc->seq_number : CACHE_NO_RESPONSE), 0, "do_op");
+//#endif
+//  clusterProcessor.invoke_remote(ch,
+//                                 op_needs_marshalled_coi(opcode) ? CACHE_OP_MALLOCED_CLUSTER_FUNCTION
+//                                 : CACHE_OP_CLUSTER_FUNCTION, (char *) msg, data_len);
+//
+//no_send_exit:
+//  if (c) {
+//    return act;
+//  } else {
+//    return (Action *) 0;
+//  }
+//}
+
+
 Action *
-CacheContinuation::do_op(Continuation * c, ClusterMachine * mp, void *args,
-                         int user_opcode, char *data, int data_len, int nbytes, MIOBuffer * b)
+CacheContinuation::do_op(Continuation * c, ClusterSession cs, void *args,
+                         int user_opcode, IOBufferData *data, int data_len, int nbytes, MIOBuffer * b)
 {
-  CacheContinuation *cc = 0;
-  Action *act = 0;
-  char *msg = 0;
-  ClusterHandler *ch = mp->pop_ClusterHandler();
+  ink_assert(data && !b);
+
+  ClusterCacheVC *ccvc = 0;
+  char *msg = data->data();
 
   /////////////////////////////////////////////////////////////////////
   // Unconditionally map open read buffer interfaces to open read.
@@ -396,50 +658,29 @@ CacheContinuation::do_op(Continuation * c, ClusterMachine * mp, void *args,
     break;
   }
 
-  if (!ch)
-    goto no_send_exit;
-
   if (c) {
-    cc = cacheContAllocator_alloc();
-    cc->ch = ch;
-    cc->target_machine = mp;
-    cc->request_opcode = opcode;
-    cc->mutex = c->mutex;
-    cc->action = c;
-    cc->action.cancelled = false;
-    cc->start_time = ink_get_hrtime();
-    cc->from = mp;
-    cc->result = op_failure(opcode);
-    SET_CONTINUATION_HANDLER(cc, (CacheContHandler)
-                             & CacheContinuation::remoteOpEvent);
-    act = &cc->action;
-
-    // set up sequence number so we can find this continuation
-
-    cc->target_ip = mp->ip;
-    cc->seq_number = new_cache_sequence_number();
-
-    // establish timeout for cache op
-
-    unsigned int hash = FOLDHASH(cc->target_ip, cc->seq_number);
-    MUTEX_TRY_LOCK(queuelock, remoteCacheContQueueMutex[hash], this_ethread());
-    if (!queuelock) {
-
-      // failed to acquire lock: no problem, retry later
-      cc->timeout = eventProcessor.schedule_in(cc, CACHE_RETRY_PERIOD, ET_CACHE_CONT_SM);
-    } else {
-      remoteCacheContQueue[hash].enqueue(cc);
-      MUTEX_RELEASE(queuelock);
-      cc->timeout = eventProcessor.schedule_in(cc, cache_cluster_timeout, ET_CACHE_CONT_SM);
+    ccvc = new_ClusterCacheVC(c);
+
+    if (opcode == CACHE_OPEN_READ || opcode == CACHE_OPEN_READ_LONG) {
+      SET_CONTINUATION_HANDLER(ccvc, &ClusterCacheVC::openReadStart);
+      ccvc->vio.op = VIO::READ;
+      ccvc->frag_type = ((CacheOpArgs_General *) args)->frag_type;
+    } else if (opcode == CACHE_OPEN_WRITE || opcode == CACHE_OPEN_WRITE_LONG) {
+      SET_CONTINUATION_HANDLER(ccvc, &ClusterCacheVC::openWriteStart);
+      ccvc->vio.op = VIO::WRITE;
+      ccvc->frag_type = ((CacheOpArgs_General *) args)->frag_type;
+    } else if (opcode == CACHE_REMOVE) {
+      SET_CONTINUATION_HANDLER(ccvc, &ClusterCacheVC::removeEvent);
+      ccvc->frag_type = ((CacheOpArgs_General *) args)->frag_type;
     }
+
+    cluster_bind_session(cs, ccvc);
+    ccvc->cs = cs;
   }
-  //
-  // Determine the type of the "Over The Wire" (OTW) message header and
-  //   initialize it.
-  //
+
   Debug("cache_msg",
-        "do_op opcode=%d seqno=%d Machine=%p data=%p datalen=%d mio=%p",
-        opcode, (c ? cc->seq_number : CACHE_NO_RESPONSE), mp, data, data_len, b);
+        "do_op opcode=%d data=%p datalen=%d mio=%p",
+        opcode, data, data_len, b);
 
   switch (opcode) {
   case CACHE_OPEN_WRITE_BUFFER:
@@ -461,61 +702,26 @@ CacheContinuation::do_op(Continuation * c, ClusterMachine * mp, void *args,
       //////////////////////
       // Use short format //
       //////////////////////
-      if (!data) {
-        data_len = op_to_sizeof_fixedlen_msg(opcode);
-        data = (char *) ALLOCA_DOUBLE(data_len);
-      }
-      msg = (char *) data;
       CacheOpMsg_short *m = (CacheOpMsg_short *) msg;
       m->init();
       m->opcode = opcode;
       m->cfl_flags = ((CacheOpArgs_General *) args)->cfl_flags;
       m->md5 = *((CacheOpArgs_General *) args)->url_md5;
-      cc->url_md5 = m->md5;
-      m->seq_number = (c ? cc->seq_number : CACHE_NO_RESPONSE);
+      //cc->url_md5 = m->md5;
+      m->seq_number = new_cache_sequence_number();
       m->frag_type = ((CacheOpArgs_General *) args)->frag_type;
       if (opcode == CACHE_OPEN_WRITE) {
         m->nbytes = nbytes;
         m->data = (uint32_t) ((CacheOpArgs_General *) args)->pin_in_cache;
+        ink_assert(ccvc);
+        ccvc->time_pin = ((CacheOpArgs_General *) args)->pin_in_cache;
       } else {
         m->nbytes = 0;
         m->data = 0;
       }
 
-      if (opcode == CACHE_OPEN_READ) {
-        //
-        // Set upper limit on initial data received with response
-        // for open read response
-        //
-        m->buffer_size = DEFAULT_MAX_BUFFER_SIZE;
-      } else {
-        m->buffer_size = 0;
-      }
-
-      //
-      // Establish the local VC
-      //
-      int res = setup_local_vc(msg, data_len, cc, mp, &act);
-      if (!res) {
-        /////////////////////////////////////////////////////
-        // Unable to setup local VC, request aborted.
-        // Remove request from pending list and deallocate.
-        /////////////////////////////////////////////////////
-        cc->remove_and_delete(0, (Event *) 0);
-        return act;
-
-      } else if (res != -1) {
-        ///////////////////////////////////////
-        // VC established, send request
-        ///////////////////////////////////////
-        break;
-
-      } else {
-        //////////////////////////////////////////////////////
-        // Unable to setup VC, delay required, await callback
-        //////////////////////////////////////////////////////
-        goto no_send_exit;
-      }
+      m->buffer_size = 0;
+      break;
     }
 
   case CACHE_OPEN_READ_LONG:
@@ -525,51 +731,21 @@ CacheContinuation::do_op(Continuation * c, ClusterMachine * mp, void *args,
       //////////////////////
       // Use long format  //
       //////////////////////
-      msg = data;
       CacheOpMsg_long *m = (CacheOpMsg_long *) msg;
       m->init();
       m->opcode = opcode;
       m->cfl_flags = ((CacheOpArgs_General *) args)->cfl_flags;
       m->url_md5 = *((CacheOpArgs_General *) args)->url_md5;
-      cc->url_md5 = m->url_md5;
-      m->seq_number = (c ? cc->seq_number : CACHE_NO_RESPONSE);
+      //cc->url_md5 = m->url_md5;
+      m->seq_number = new_cache_sequence_number();
       m->nbytes = nbytes;
       m->data = (uint32_t) ((CacheOpArgs_General *) args)->pin_in_cache;
+      ink_assert(ccvc);
+      ccvc->time_pin = (uint32_t) ((CacheOpArgs_General *) args)->pin_in_cache;
       m->frag_type = (uint32_t) ((CacheOpArgs_General *) args)->frag_type;
 
-      if (opcode == CACHE_OPEN_READ_LONG) {
-        //
-        // Set upper limit on initial data received with response
-        // for open read response
-        //
-        m->buffer_size = DEFAULT_MAX_BUFFER_SIZE;
-      } else {
-        m->buffer_size = 0;
-      }
-      //
-      // Establish the local VC
-      //
-      int res = setup_local_vc(msg, data_len, cc, mp, &act);
-      if (!res) {
-        /////////////////////////////////////////////////////
-        // Unable to setup local VC, request aborted.
-        // Remove request from pending list and deallocate.
-        /////////////////////////////////////////////////////
-        cc->remove_and_delete(0, (Event *) 0);
-        return act;
-
-      } else if (res != -1) {
-        ///////////////////////////////////////
-        // VC established, send request
-        ///////////////////////////////////////
-        break;
-
-      } else {
-        //////////////////////////////////////////////////////
-        // Unable to setup VC, delay required, await callback
-        //////////////////////////////////////////////////////
-        goto no_send_exit;
-      }
+      m->buffer_size = 0;
+      break;
     }
   case CACHE_UPDATE:
   case CACHE_REMOVE:
@@ -578,7 +754,6 @@ CacheContinuation::do_op(Continuation * c, ClusterMachine * mp, void *args,
       //////////////////////
       // Use short format //
       //////////////////////
-      msg = data;
       CacheOpMsg_short *m = (CacheOpMsg_short *) msg;
       m->init();
       m->opcode = opcode;
@@ -588,7 +763,7 @@ CacheContinuation::do_op(Continuation * c, ClusterMachine * mp, void *args,
         m->md5 = *((CacheOpArgs_Deref *) args)->md5;
       else
         m->md5 = *((CacheOpArgs_General *) args)->url_md5;
-      m->seq_number = (c ? cc->seq_number : CACHE_NO_RESPONSE);
+      m->seq_number = new_cache_sequence_number();
       break;
     }
   case CACHE_LINK:
@@ -596,369 +771,375 @@ CacheContinuation::do_op(Continuation * c, ClusterMachine * mp, void *args,
       ////////////////////////
       // Use short_2 format //
       ////////////////////////
-      msg = data;
       CacheOpMsg_short_2 *m = (CacheOpMsg_short_2 *) msg;
       m->init();
       m->opcode = opcode;
       m->cfl_flags = ((CacheOpArgs_Link *) args)->cfl_flags;
       m->md5_1 = *((CacheOpArgs_Link *) args)->from;
       m->md5_2 = *((CacheOpArgs_Link *) args)->to;
-      m->seq_number = (c ? cc->seq_number : CACHE_NO_RESPONSE);
+      m->seq_number = new_cache_sequence_number();
       m->frag_type = ((CacheOpArgs_Link *) args)->frag_type;
       break;
     }
   default:
-    msg = 0;
+    ink_release_assert(!"error request_op");
     break;
   }
 #ifdef CACHE_MSG_TRACE
   log_cache_op_sndmsg((c ? cc->seq_number : CACHE_NO_RESPONSE), 0, "do_op");
 #endif
-  clusterProcessor.invoke_remote(ch,
-                                 op_needs_marshalled_coi(opcode) ? CACHE_OP_MALLOCED_CLUSTER_FUNCTION
-                                 : CACHE_OP_CLUSTER_FUNCTION, (char *) msg, data_len);
-
-no_send_exit:
-  if (c) {
-    return act;
-  } else {
-    return (Action *) 0;
-  }
-}
-
-int
-CacheContinuation::setup_local_vc(char *data, int data_len, CacheContinuation * cc, ClusterMachine * mp, Action ** act)
-{
-  bool read_op = op_is_read(cc->request_opcode);
-  bool short_msg = op_is_shortform(cc->request_opcode);
-
-  // Alloc buffer, copy message and attach to continuation
-  cc->setMsgBufferLen(data_len);
-  cc->allocMsgBuffer();
-  memcpy(cc->getMsgBuffer(), data, data_len);
-
-  SET_CONTINUATION_HANDLER(cc, (CacheContHandler)
-                           & CacheContinuation::localVCsetupEvent);
-
-  if (short_msg) {
-    Debug("cache_proto", "open_local-s (%s) seqno=%d", (read_op ? "R" : "W"), ((CacheOpMsg_short *) data)->seq_number);
-  } else {
-    Debug("cache_proto", "open_local-l (%s) seqno=%d", (read_op ? "R" : "W"), ((CacheOpMsg_long *) data)->seq_number);
-  }
 
-  // Create local VC
-  ClusterVConnection *vc;
+  IOBufferBlock *ret = new_IOBufferBlock(data, data_len, 0);
+  ret->_buf_end = ret->_end;
 
-  if (!read_op && (cc->request_opcode == CACHE_OPEN_WRITE_LONG)) {
-    // Determine if the open_write has already been established.
-    vc = cc->lookupOpenWriteVC();
-
-  } else {
-    vc = clusterProcessor.open_local(cc, mp, cc->open_local_token,
-                                     (CLUSTER_OPT_ALLOW_IMMEDIATE |
-                                      (read_op ? CLUSTER_OPT_CONN_READ : CLUSTER_OPT_CONN_WRITE)));
-  }
-  if (!vc) {
-    // Error, abort request
-    if (short_msg) {
-      Debug("cache_proto", "0open_local-s (%s) failed, seqno=%d",
-            (read_op ? "R" : "W"), ((CacheOpMsg_short *) data)->seq_number);
-    } else {
-      Debug("cache_proto", "1open_local-l (%s) failed, seqno=%d",
-            (read_op ? "R" : "W"), ((CacheOpMsg_long *) data)->seq_number);
-    }
-    cc->freeMsgBuffer();
-    if (cc->timeout)
-      cc->timeout->cancel();
-    cc->timeout = NULL;
-
-    // Post async failure callback on a different continuation.
-    *act = callback_failure(&cc->action, (read_op ? CACHE_EVENT_OPEN_READ_FAILED : CACHE_EVENT_OPEN_WRITE_FAILED), 0);
-    return 0;
-
-  } else if (vc != CLUSTER_DELAYED_OPEN) {
-    // We have established the VC
-    if (read_op) {
-      cc->read_cluster_vc = vc;
-    } else {
-      cc->write_cluster_vc = vc;
-    }
-    cc->cluster_vc_channel = vc->channel;
-    vc->current_cont = cc;
-
-    if (short_msg) {
-      CacheOpMsg_short *ms = (CacheOpMsg_short *) data;
-      ms->channel = vc->channel;
-      ms->token = cc->open_local_token;
-      Debug("cache_proto",
-            "0open_local-s (%s) success, seqno=%d chan=%d token=%d,%d VC=%p",
-            (read_op ? "R" : "W"), ms->seq_number, vc->channel, ms->token.ip_created, ms->token.sequence_number, vc);
-    } else {
-      CacheOpMsg_long *ml = (CacheOpMsg_long *) data;
-      ml->channel = vc->channel;
-      ml->token = cc->open_local_token;
-      Debug("cache_proto",
-            "1open_local-l (%s) success, seqno=%d chan=%d token=%d,%d VC=%p",
-            (read_op ? "R" : "W"), ml->seq_number, vc->channel, ml->token.ip_created, ml->token.sequence_number, vc);
-    }
-    cc->freeMsgBuffer();
-    SET_CONTINUATION_HANDLER(cc, (CacheContHandler)
-                             & CacheContinuation::remoteOpEvent);
-    return 1;
-
-  } else {
-    //////////////////////////////////////////////////////
-    // Unable to setup VC, delay required, await callback
-    //////////////////////////////////////////////////////
-    return -1;
-  }
-}
-
-ClusterVConnection *
-CacheContinuation::lookupOpenWriteVC()
-{
-  ///////////////////////////////////////////////////////////////
-  // See if we already have an open_write ClusterVConnection
-  // which was established in a previous remote open_read which
-  // failed.
-  ///////////////////////////////////////////////////////////////
-  ClusterVConnection *vc;
-  CacheOpMsg_long *ml = (CacheOpMsg_long *) getMsgBuffer();
-
-  vc = GlobalOpenWriteVCcache->lookup(&ml->url_md5);
-
-  if (vc == ((ClusterVConnection *) 0)) {
-    // Retry lookup
-    SET_CONTINUATION_HANDLER(this, (CacheContHandler)
-                             & CacheContinuation::lookupOpenWriteVCEvent);
-    //
-    // Note: In the lookupOpenWriteVCEvent handler, we use EVENT_IMMEDIATE
-    //       to distinguish the lookup retry from a request timeout
-    //       which uses EVENT_INTERVAL.
-    //
-    lookup_open_write_vc_event = eventProcessor.schedule_imm(this, ET_CACHE_CONT_SM);
-
-  } else if (vc != ((ClusterVConnection *) - 1)) {
-    // Hit, found open_write VC in cache.
-    // Post open_write completion by simulating a
-    // remote cache op result message.
-
-    vc->action_ = action;       // establish new continuation
-
-    SET_CONTINUATION_HANDLER(this, (CacheContHandler)
-                             & CacheContinuation::localVCsetupEvent);
-    this->handleEvent(CLUSTER_EVENT_OPEN_EXISTS, vc);
-
-    CacheOpReplyMsg msg;
-    int msglen;
-
-    msglen = CacheOpReplyMsg::sizeof_fixedlen_msg();
-    msg.result = CACHE_EVENT_OPEN_WRITE;
-    msg.seq_number = seq_number;
-    msg.token = vc->token;
-
-    cache_op_result_ClusterFunction(ch, (void *) &msg, msglen);
-
-  } else {
-    // Miss, establish local VC and send remote open_write request
-
-    SET_CONTINUATION_HANDLER(this, (CacheContHandler)
-                             & CacheContinuation::localVCsetupEvent);
-    vc = clusterProcessor.open_local(this, from, open_local_token,
-                                     (CLUSTER_OPT_ALLOW_IMMEDIATE | CLUSTER_OPT_CONN_WRITE));
-    if (!vc) {
-      this->handleEvent(CLUSTER_EVENT_OPEN_FAILED, 0);
-
-    } else if (vc != CLUSTER_DELAYED_OPEN) {
-      this->handleEvent(CLUSTER_EVENT_OPEN, vc);
-    }
-  }
-  return CLUSTER_DELAYED_OPEN;  // force completion in callback
-}
-
-int
-CacheContinuation::lookupOpenWriteVCEvent(int event, Event * e)
-{
-  if (event == EVENT_IMMEDIATE) {
-    // Retry open_write VC lookup
-    lookupOpenWriteVC();
-
-  } else {
-    lookup_open_write_vc_event->cancel();
-    SET_CONTINUATION_HANDLER(this, (CacheContHandler)
-                             & CacheContinuation::localVCsetupEvent);
-    this->handleEvent(event, e);
-  }
-  return EVENT_DONE;
-}
-
-int
-CacheContinuation::remove_and_delete(int /* event ATS_UNUSED */, Event * e)
-{
-  unsigned int hash = FOLDHASH(target_ip, seq_number);
-  MUTEX_TRY_LOCK(queuelock, remoteCacheContQueueMutex[hash], this_ethread());
-  if (queuelock) {
-    if (remoteCacheContQueue[hash].in(this)) {
-      remoteCacheContQueue[hash].remove(this);
-    }
-    MUTEX_RELEASE(queuelock);
-    if (use_deferred_callback)
-      callback_failure(&action, result, result_error, this);
-    else
-      cacheContAllocator_free(this);
-
-  } else {
-    SET_HANDLER((CacheContHandler) & CacheContinuation::remove_and_delete);
-    if (!e) {
-      timeout = eventProcessor.schedule_in(this, cache_cluster_timeout, ET_CACHE_CONT_SM);
-    } else {
-      e->schedule_in(cache_cluster_timeout);
+  if (!cluster_send_message(cs, CLUSTER_CACHE_OP_CLUSTER_FUNCTION, ret, -1, PRIORITY_HIGH)) {
+    if (ccvc) {
+      ccvc->in_progress = true;
+      cluster_set_events(cs, RESPONSE_EVENT_NOTIFY_DEALER);
+      return &ccvc->_action;
     }
   }
-  return EVENT_DONE;
-}
-
-int
-CacheContinuation::localVCsetupEvent(int event, ClusterVConnection * vc)
-{
-  ink_assert(magicno == (int) MagicNo);
-  ink_assert(getMsgBuffer());
-  bool short_msg = op_is_shortform(request_opcode);
-  bool read_op = op_is_read(request_opcode);
-
-  if (event == EVENT_INTERVAL) {
-    Event *e = (Event *) vc;
-    unsigned int hash = FOLDHASH(target_ip, seq_number);
-
-    MUTEX_TRY_LOCK(queuelock, remoteCacheContQueueMutex[hash], e->ethread);
-    if (!queuelock) {
-      e->schedule_in(CACHE_RETRY_PERIOD);
-      return EVENT_CONT;
-    }
-
-    if (!remoteCacheContQueue[hash].in(this)) {
-      ////////////////////////////////////////////////////
-      // Not yet queued on outstanding operations list
-      ////////////////////////////////////////////////////
-      remoteCacheContQueue[hash].enqueue(this);
-      ink_assert(timeout == e);
-      MUTEX_RELEASE(queuelock);
-      e->schedule_in(cache_cluster_timeout);
-      return EVENT_CONT;
-
-    } else {
-      /////////////////////////////////////////////////////
-      // Timeout occurred
-      /////////////////////////////////////////////////////
-      remoteCacheContQueue[hash].remove(this);
-      MUTEX_RELEASE(queuelock);
-      Debug("cluster_timeout", "0cluster op timeout %d", seq_number);
-      CLUSTER_INCREMENT_DYN_STAT(CLUSTER_REMOTE_OP_TIMEOUTS_STAT);
-      timeout = (Event *) 1;    // Note timeout
-      /////////////////////////////////////////////////////////////////
-      // Note: Failure callback is sent now, but the deallocation of
-      //       the CacheContinuation is deferred until we receive the
-      //       open_local() callback.
-      /////////////////////////////////////////////////////////////////
-      if (!action.cancelled)
-        action.continuation->handleEvent((read_op ? CACHE_EVENT_OPEN_READ_FAILED : CACHE_EVENT_OPEN_WRITE_FAILED), 0);
-      return EVENT_DONE;
-    }
-
-  } else if (((event == CLUSTER_EVENT_OPEN) || (event == CLUSTER_EVENT_OPEN_EXISTS))
-             && (((ptrdiff_t) timeout & (ptrdiff_t) 1) == 0)) {
-    ink_hrtime now;
-    now = ink_get_hrtime();
-    CLUSTER_SUM_DYN_STAT(CLUSTER_OPEN_DELAY_TIME_STAT, now - start_time);
-    LOG_EVENT_TIME(start_time, open_delay_time_dist, open_delay_events);
-    if (read_op) {
-      read_cluster_vc = vc;
-    } else {
-      write_cluster_vc = vc;
-    }
-    cluster_vc_channel = vc->channel;
-    vc->current_cont = this;
-
-    if (short_msg) {
-      CacheOpMsg_short *ms = (CacheOpMsg_short *) getMsgBuffer();
-      ms->channel = vc->channel;
-      ms->token = open_local_token;
-
-      Debug("cache_proto",
-            "2open_local-s (%s) success, seqno=%d chan=%d token=%d,%d VC=%p",
-            (read_op ? "R" : "W"), ms->seq_number, vc->channel, ms->token.ip_created, ms->token.sequence_number, vc);
-
-    } else {
-      CacheOpMsg_long *ml = (CacheOpMsg_long *) getMsgBuffer();
-      ml->channel = vc->channel;
-      ml->token = open_local_token;
-
-      Debug("cache_proto",
-            "3open_local-l (%s) success, seqno=%d chan=%d token=%d,%d VC=%p",
-            (read_op ? "R" : "W"), ml->seq_number, vc->channel, ml->token.ip_created, ml->token.sequence_number, vc);
-    }
-    SET_HANDLER((CacheContHandler) & CacheContinuation::remoteOpEvent);
-
-    if (event != CLUSTER_EVENT_OPEN_EXISTS) {
-      // Send request message
-      clusterProcessor.invoke_remote(ch,
-                                     (op_needs_marshalled_coi(request_opcode) ?
-                                      CACHE_OP_MALLOCED_CLUSTER_FUNCTION :
-                                      CACHE_OP_CLUSTER_FUNCTION), (char *) getMsgBuffer(), getMsgBufferLen());
-    }
-
-  } else {
-    int send_failure_callback = 1;
-
-    if (((ptrdiff_t) timeout & (ptrdiff_t) 1) == 0) {
-      if (short_msg) {
-        Debug("cache_proto", "2open_local-s (%s) failed, seqno=%d",
-              (read_op ? "R" : "W"), ((CacheOpMsg_short *) getMsgBuffer())->seq_number);
-      } else {
-        Debug("cache_proto", "3open_local-l (%s) failed, seqno=%d",
-              (read_op ? "R" : "W"), ((CacheOpMsg_long *) getMsgBuffer())->seq_number);
-      }
-
-    } else {
-      Debug("cache_proto", "4open_local cancelled due to timeout, seqno=%d", seq_number);
-      this->timeout = 0;
-
-      // Deallocate VC if successfully acquired
-
-      if (event == CLUSTER_EVENT_OPEN) {
-        vc->pending_remote_fill = 0;
-        vc->remote_closed = 1;  // avoid remote close msg
-        vc->do_io(VIO::CLOSE);
-      }
-      send_failure_callback = 0;        // already sent.
-    }
-
-    if (this->timeout)
-      this->timeout->cancel();
-    this->timeout = NULL;
-
-    freeMsgBuffer();
-    if (send_failure_callback) {
-      //
-      // Action corresponding to "this" already sent back to user,
-      //   use "this" to establish the failure callback after
-      //   removing ourselves from the active list.
-      //
-      this->use_deferred_callback = true;
-      this->result = (read_op ? CACHE_EVENT_OPEN_READ_FAILED : CACHE_EVENT_OPEN_WRITE_FAILED);
-      this->result_error = 0;
-      remove_and_delete(0, (Event *) 0);
-
-    } else {
-      cacheContAllocator_free(this);
-    }
-    return EVENT_DONE;
+  cluster_close_session(cs);
+  if (ccvc) {
+    ccvc->session_closed = true;
+    free_ClusterCacheVC(ccvc);
   }
-  // Free message
-  freeMsgBuffer();
-
-  return EVENT_DONE;
+  return 0;
 }
+//int
+//CacheContinuation::setup_local_vc(char *data, int data_len, CacheContinuation * cc, ClusterMachine * mp, Action ** act)
+//{
+//  bool read_op = op_is_read(cc->request_opcode);
+//  bool short_msg = op_is_shortform(cc->request_opcode);
+//
+//  // Alloc buffer, copy message and attach to continuation
+//  cc->setMsgBufferLen(data_len);
+//  cc->allocMsgBuffer();
+//  memcpy(cc->getMsgBuffer(), data, data_len);
+//
+//  SET_CONTINUATION_HANDLER(cc, (CacheContHandler)
+//                           & CacheContinuation::localVCsetupEvent);
+//
+//  if (short_msg) {
+//    Debug("cache_proto", "open_local-s (%s) seqno=%d", (read_op ? "R" : "W"), ((CacheOpMsg_short *) data)->seq_number);
+//  } else {
+//    Debug("cache_proto", "open_local-l (%s) seqno=%d", (read_op ? "R" : "W"), ((CacheOpMsg_long *) data)->seq_number);
+//  }
+//
+//  // Create local VC
+//  ClusterVConnection *vc;
+//
+//  if (!read_op && (cc->request_opcode == CACHE_OPEN_WRITE_LONG)) {
+//    // Determine if the open_write has already been established.
+//    vc = cc->lookupOpenWriteVC();
+//
+//  } else {
+//    vc = clusterProcessor.open_local(cc, mp, cc->open_local_token,
+//                                     (CLUSTER_OPT_ALLOW_IMMEDIATE |
+//                                      (read_op ? CLUSTER_OPT_CONN_READ : CLUSTER_OPT_CONN_WRITE)));
+//  }
+//  if (!vc) {
+//    // Error, abort request
+//    if (short_msg) {
+//      Debug("cache_proto", "0open_local-s (%s) failed, seqno=%d",
+//            (read_op ? "R" : "W"), ((CacheOpMsg_short *) data)->seq_number);
+//    } else {
+//      Debug("cache_proto", "1open_local-l (%s) failed, seqno=%d",
+//            (read_op ? "R" : "W"), ((CacheOpMsg_long *) data)->seq_number);
+//    }
+//    cc->freeMsgBuffer();
+//    if (cc->timeout)
+//      cc->timeout->cancel();
+//    cc->timeout = NULL;
+//
+//    // Post async failure callback on a different continuation.
+//    *act = callback_failure(&cc->action, (read_op ? CACHE_EVENT_OPEN_READ_FAILED : CACHE_EVENT_OPEN_WRITE_FAILED), 0);
+//    return 0;
+//
+//  } else if (vc != CLUSTER_DELAYED_OPEN) {
+//    // We have established the VC
+//    if (read_op) {
+//      cc->read_cluster_vc = vc;
+//    } else {
+//      cc->write_cluster_vc = vc;
+//    }
+//    cc->cluster_vc_channel = vc->channel;
+//    vc->current_cont = cc;
+//
+//    if (short_msg) {
+//      CacheOpMsg_short *ms = (CacheOpMsg_short *) data;
+//      ms->channel = vc->channel;
+//      ms->token = cc->open_local_token;
+//      Debug("cache_proto",
+//            "0open_local-s (%s) success, seqno=%d chan=%d token=%d,%d VC=%p",
+//            (read_op ? "R" : "W"), ms->seq_number, vc->channel, ms->token.ip_created, ms->token.sequence_number, vc);
+//    } else {
+//      CacheOpMsg_long *ml = (CacheOpMsg_long *) data;
+//      ml->channel = vc->channel;
+//      ml->token = cc->open_local_token;
+//      Debug("cache_proto",
+//            "1open_local-l (%s) success, seqno=%d chan=%d token=%d,%d VC=%p",
+//            (read_op ? "R" : "W"), ml->seq_number, vc->channel, ml->token.ip_created, ml->token.sequence_number, vc);
+//    }
+//    cc->freeMsgBuffer();
+//    SET_CONTINUATION_HANDLER(cc, (CacheContHandler)
+//                             & CacheContinuation::remoteOpEvent);
+//    return 1;
+//
+//  } else {
+//    //////////////////////////////////////////////////////
+//    // Unable to setup VC, delay required, await callback
+//    //////////////////////////////////////////////////////
+//    return -1;
+//  }
+//}
+//
+//ClusterVConnection *
+//CacheContinuation::lookupOpenWriteVC()
+//{
+//  ///////////////////////////////////////////////////////////////
+//  // See if we already have an open_write ClusterVConnection
+//  // which was established in a previous remote open_read which
+//  // failed.
+//  ///////////////////////////////////////////////////////////////
+//  ClusterVConnection *vc;
+//  CacheOpMsg_long *ml = (CacheOpMsg_long *) getMsgBuffer();
+//
+//  vc = GlobalOpenWriteVCcache->lookup(&ml->url_md5);
+//
+//  if (vc == ((ClusterVConnection *) 0)) {
+//    // Retry lookup
+//    SET_CONTINUATION_HANDLER(this, (CacheContHandler)
+//                             & CacheContinuation::lookupOpenWriteVCEvent);
+//    //
+//    // Note: In the lookupOpenWriteVCEvent handler, we use EVENT_IMMEDIATE
+//    //       to distinguish the lookup retry from a request timeout
+//    //       which uses EVENT_INTERVAL.
+//    //
+//    lookup_open_write_vc_event = eventProcessor.schedule_imm(this, ET_CACHE_CONT_SM);
+//
+//  } else if (vc != ((ClusterVConnection *) - 1)) {
+//    // Hit, found open_write VC in cache.
+//    // Post open_write completion by simulating a
+//    // remote cache op result message.
+//
+//    vc->action_ = action;       // establish new continuation
+//
+//    SET_CONTINUATION_HANDLER(this, (CacheContHandler)
+//                             & CacheContinuation::localVCsetupEvent);
+//    this->handleEvent(CLUSTER_EVENT_OPEN_EXISTS, vc);
+//
+//    CacheOpReplyMsg msg;
+//    int msglen;
+//
+//    msglen = CacheOpReplyMsg::sizeof_fixedlen_msg();
+//    msg.result = CACHE_EVENT_OPEN_WRITE;
+//    msg.seq_number = seq_number;
+//    msg.token = vc->token;
+//
+//    cache_op_result_ClusterFunction(ch, (void *) &msg, msglen);
+//
+//  } else {
+//    // Miss, establish local VC and send remote open_write request
+//
+//    SET_CONTINUATION_HANDLER(this, (CacheContHandler)
+//                             & CacheContinuation::localVCsetupEvent);
+//    vc = clusterProcessor.open_local(this, from, open_local_token,
+//                                     (CLUSTER_OPT_ALLOW_IMMEDIATE | CLUSTER_OPT_CONN_WRITE));
+//    if (!vc) {
+//      this->handleEvent(CLUSTER_EVENT_OPEN_FAILED, 0);
+//
+//    } else if (vc != CLUSTER_DELAYED_OPEN) {
+//      this->handleEvent(CLUSTER_EVENT_OPEN, vc);
+//    }
+//  }
+//  return CLUSTER_DELAYED_OPEN;  // force completion in callback
+//}
+//
+//int
+//CacheContinuation::lookupOpenWriteVCEvent(int event, Event * e)
+//{
+//  if (event == EVENT_IMMEDIATE) {
+//    // Retry open_write VC lookup
+//    lookupOpenWriteVC();
+//
+//  } else {
+//    lookup_open_write_vc_event->cancel();
+//    SET_CONTINUATION_HANDLER(this, (CacheContHandler)
+//                             & CacheContinuation::localVCsetupEvent);
+//    this->handleEvent(event, e);
+//  }
+//  return EVENT_DONE;
+//}
+//
+//int
+//CacheContinuation::remove_and_delete(int event, Event * e)
+//{
+//  NOWARN_UNUSED(event);
+//  unsigned int hash = FOLDHASH(target_ip, seq_number);
+//  MUTEX_TRY_LOCK(queuelock, remoteCacheContQueueMutex[hash], this_ethread());
+//  if (queuelock) {
+//    if (remoteCacheContQueue[hash].in(this)) {
+//      remoteCacheContQueue[hash].remove(this);
+//    }
+//    MUTEX_RELEASE(queuelock);
+//    if (use_deferred_callback)
+//      callback_failure(&action, result, result_error, this);
+//    else
+//      cacheContAllocator_free(this);
+//
+//  } else {
+//    SET_HANDLER((CacheContHandler) & CacheContinuation::remove_and_delete);
+//    if (!e) {
+//      timeout = eventProcessor.schedule_in(this, cache_cluster_timeout, ET_CACHE_CONT_SM);
+//    } else {
+//      e->schedule_in(cache_cluster_timeout);
+//    }
+//  }
+//  return EVENT_DONE;
+//}
+//
+//int
+//CacheContinuation::localVCsetupEvent(int event, ClusterVConnection * vc)
+//{
+//  ink_assert(magicno == (int) MagicNo);
+//  ink_assert(getMsgBuffer());
+//  bool short_msg = op_is_shortform(request_opcode);
+//  bool read_op = op_is_read(request_opcode);
+//
+//  if (event == EVENT_INTERVAL) {
+//    Event *e = (Event *) vc;
+//    unsigned int hash = FOLDHASH(target_ip, seq_number);
+//
+//    MUTEX_TRY_LOCK(queuelock, remoteCacheContQueueMutex[hash], e->ethread);
+//    if (!queuelock) {
+//      e->schedule_in(CACHE_RETRY_PERIOD);
+//      return EVENT_CONT;
+//    }
+//
+//    if (!remoteCacheContQueue[hash].in(this)) {
+//      ////////////////////////////////////////////////////
+//      // Not yet queued on outstanding operations list
+//      ////////////////////////////////////////////////////
+//      remoteCacheContQueue[hash].enqueue(this);
+//      ink_assert(timeout == e);
+//      MUTEX_RELEASE(queuelock);
+//      e->schedule_in(cache_cluster_timeout);
+//      return EVENT_CONT;
+//
+//    } else {
+//      /////////////////////////////////////////////////////
+//      // Timeout occurred
+//      /////////////////////////////////////////////////////
+//      remoteCacheContQueue[hash].remove(this);
+//      MUTEX_RELEASE(queuelock);
+//      Debug("cluster_timeout", "0cluster op timeout %d", seq_number);
+//      CLUSTER_INCREMENT_DYN_STAT(CLUSTER_REMOTE_OP_TIMEOUTS_STAT);
+//      timeout = (Event *) 1;    // Note timeout
+//      /////////////////////////////////////////////////////////////////
+//      // Note: Failure callback is sent now, but the deallocation of
+//      //       the CacheContinuation is deferred until we receive the
+//      //       open_local() callback.
+//      /////////////////////////////////////////////////////////////////
+//      if (!action.cancelled)
+//        action.continuation->handleEvent((read_op ? CACHE_EVENT_OPEN_READ_FAILED : CACHE_EVENT_OPEN_WRITE_FAILED), 0);
+//      return EVENT_DONE;
+//    }
+//
+//  } else if (((event == CLUSTER_EVENT_OPEN) || (event == CLUSTER_EVENT_OPEN_EXISTS))
+//             && (((ptrdiff_t) timeout & (ptrdiff_t) 1) == 0)) {
+//    ink_hrtime now;
+//    now = ink_get_hrtime();
+//    CLUSTER_SUM_DYN_STAT(CLUSTER_OPEN_DELAY_TIME_STAT, now - start_time);
+//    LOG_EVENT_TIME(start_time, open_delay_time_dist, open_delay_events);
+//    if (read_op) {
+//      read_cluster_vc = vc;
+//    } else {
+//      write_cluster_vc = vc;
+//    }
+//    cluster_vc_channel = vc->channel;
+//    vc->current_cont = this;
+//
+//    if (short_msg) {
+//      CacheOpMsg_short *ms = (CacheOpMsg_short *) getMsgBuffer();
+//      ms->channel = vc->channel;
+//      ms->token = open_local_token;
+//
+//      Debug("cache_proto",
+//            "2open_local-s (%s) success, seqno=%d chan=%d token=%d,%d VC=%p",
+//            (read_op ? "R" : "W"), ms->seq_number, vc->channel, ms->token.ip_created, ms->token.sequence_number, vc);
+//
+//    } else {
+//      CacheOpMsg_long *ml = (CacheOpMsg_long *) getMsgBuffer();
+//      ml->channel = vc->channel;
+//      ml->token = open_local_token;
+//
+//      Debug("cache_proto",
+//            "3open_local-l (%s) success, seqno=%d chan=%d token=%d,%d VC=%p",
+//            (read_op ? "R" : "W"), ml->seq_number, vc->channel, ml->token.ip_created, ml->token.sequence_number, vc);
+//    }
+//    SET_HANDLER((CacheContHandler) & CacheContinuation::remoteOpEvent);
+//
+//    if (event != CLUSTER_EVENT_OPEN_EXISTS) {
+//      // Send request message
+//      clusterProcessor.invoke_remote(ch,
+//                                     (op_needs_marshalled_coi(request_opcode) ?
+//                                      CACHE_OP_MALLOCED_CLUSTER_FUNCTION :
+//                                      CACHE_OP_CLUSTER_FUNCTION), (char *) getMsgBuffer(), getMsgBufferLen());
+//    }
+//
+//  } else {
+//    int send_failure_callback = 1;
+//
+//    if (((ptrdiff_t) timeout & (ptrdiff_t) 1) == 0) {
+//      if (short_msg) {
+//        Debug("cache_proto", "2open_local-s (%s) failed, seqno=%d",
+//              (read_op ? "R" : "W"), ((CacheOpMsg_short *) getMsgBuffer())->seq_number);
+//      } else {
+//        Debug("cache_proto", "3open_local-l (%s) failed, seqno=%d",
+//              (read_op ? "R" : "W"), ((CacheOpMsg_long *) getMsgBuffer())->seq_number);
+//      }
+//
+//    } else {
+//      Debug("cache_proto", "4open_local cancelled due to timeout, seqno=%d", seq_number);
+//      this->timeout = 0;
+//
+//      // Deallocate VC if successfully acquired
+//
+//      if (event == CLUSTER_EVENT_OPEN) {
+//        vc->pending_remote_fill = 0;
+//        vc->remote_closed = 1;  // avoid remote close msg
+//        vc->do_io(VIO::CLOSE);
+//      }
+//      send_failure_callback = 0;        // already sent.
+//    }
+//
+//    if (this->timeout)
+//      this->timeout->cancel();
+//    this->timeout = NULL;
+//
+//    freeMsgBuffer();
+//    if (send_failure_callback) {
+//      //
+//      // Action corresponding to "this" already sent back to user,
+//      //   use "this" to establish the failure callback after
+//      //   removing ourselves from the active list.
+//      //
+//      this->use_deferred_callback = true;
+//      this->result = (read_op ? CACHE_EVENT_OPEN_READ_FAILED : CACHE_EVENT_OPEN_WRITE_FAILED);
+//      this->result_error = 0;
+//      remove_and_delete(0, (Event *) 0);
+//
+//    } else {
+//      cacheContAllocator_free(this);
+//    }
+//    return EVENT_DONE;
+//  }
+//  // Free message
+//  freeMsgBuffer();
+//
+//  return EVENT_DONE;
+//}
 
 ///////////////////////////////////////////////////////////////////////////
 // cache_op_ClusterFunction()
@@ -995,14 +1176,13 @@ unmarshal_CacheOpMsg_short_2(void *data, int NeedByteSwap)
 
 // init_from_long() support routine for cache_op_ClusterFunction()
 inline void
-init_from_long(CacheContinuation * cont, CacheOpMsg_long * msg, ClusterMachine * m)
+init_from_long(CacheContinuation * cont, CacheOpMsg_long * msg)
 {
-  cont->no_reply_message = (msg->seq_number == CACHE_NO_RESPONSE);
+//  cont->no_reply_message = (msg->seq_number == CACHE_NO_RESPONSE);
   cont->seq_number = msg->seq_number;
   cont->cfl_flags = msg->cfl_flags;
-  cont->from = m;
   cont->url_md5 = msg->url_md5;
-  cont->cluster_vc_channel = msg->channel;
+//  cont->cluster_vc_channel = msg->channel;
   cont->frag_type = (CacheFragType) msg->frag_type;
   if ((cont->request_opcode == CACHE_OPEN_WRITE_LONG)
       || (cont->request_opcode == CACHE_OPEN_READ_LONG)) {
@@ -1013,23 +1193,22 @@ init_from_long(CacheContinuation * cont, CacheOpMsg_long * msg, ClusterMachine *
   cont->token = msg->token;
   cont->nbytes = (((int) msg->nbytes < 0) ? 0 : msg->nbytes);
 
-  if (cont->request_opcode == CACHE_OPEN_READ_LONG) {
-    cont->caller_buf_freebytes = msg->buffer_size;
-  } else {
-    cont->caller_buf_freebytes = 0;
-  }
+//  if (cont->request_opcode == CACHE_OPEN_READ_LONG) {
+//    cont->caller_buf_freebytes = msg->buffer_size;
+//  } else {
+//    cont->caller_buf_freebytes = 0;
+//  }
 }
 
 // init_from_short() support routine for cache_op_ClusterFunction()
 inline void
-init_from_short(CacheContinuation * cont, CacheOpMsg_short * msg, ClusterMachine * m)
+init_from_short(CacheContinuation * cont, CacheOpMsg_short * msg)
 {
-  cont->no_reply_message = (msg->seq_number == CACHE_NO_RESPONSE);
+//  cont->no_reply_message = (msg->seq_number == CACHE_NO_RESPONSE);
   cont->seq_number = msg->seq_number;
   cont->cfl_flags = msg->cfl_flags;
-  cont->from = m;
   cont->url_md5 = msg->md5;
-  cont->cluster_vc_channel = msg->channel;
+//  cont->cluster_vc_channel = msg->channel;
   cont->token = msg->token;
   cont->nbytes = (((int) msg->nbytes < 0) ? 0 : msg->nbytes);
   cont->frag_type = (CacheFragType) msg->frag_type;
@@ -1040,29 +1219,436 @@ init_from_short(CacheContinuation * cont, CacheOpMsg_short * msg, ClusterMachine
     cont->pin_in_cache = 0;
   }
 
-  if (cont->request_opcode == CACHE_OPEN_READ) {
-    cont->caller_buf_freebytes = msg->buffer_size;
-  } else {
-    cont->caller_buf_freebytes = 0;
-  }
+//  if (cont->request_opcode == CACHE_OPEN_READ) {
+//    cont->caller_buf_freebytes = msg->buffer_size;
+//  } else {
+//    cont->caller_buf_freebytes = 0;
+//  }
 }
 
 // init_from_short_2() support routine for cache_op_ClusterFunction()
 inline void
-init_from_short_2(CacheContinuation * cont, CacheOpMsg_short_2 * msg, ClusterMachine * m)
+init_from_short_2(CacheContinuation * cont, CacheOpMsg_short_2 * msg)
 {
-  cont->no_reply_message = (msg->seq_number == CACHE_NO_RESPONSE);
+//  cont->no_reply_message = (msg->seq_number == CACHE_NO_RESPONSE);
   cont->seq_number = msg->seq_number;
   cont->cfl_flags = msg->cfl_flags;
-  cont->from = m;
   cont->url_md5 = msg->md5_1;
   cont->frag_type = (CacheFragType) msg->frag_type;
 }
 
+//void
+//cache_op_ClusterFunction(ClusterHandler * ch, void *data, int len)
+//{
+//  EThread *thread = this_ethread();
+//  ProxyMutex *mutex = thread->mutex;
+//  ////////////////////////////////////////////////////////
+//  // Note: we are running on the ET_CLUSTER thread
+//  ////////////////////////////////////////////////////////
+//  CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CACHE_OUTSTANDING_STAT);
+//
+//  int opcode;
+//  ClusterMessageHeader *mh = (ClusterMessageHeader *) data;
+//
+//  if (mh->GetMsgVersion() != CacheOpMsg_long::CACHE_OP_LONG_MESSAGE_VERSION) {  ////////////////////////////////////////////////
+//    // Convert from old to current message format
+//    ////////////////////////////////////////////////
+//    ink_release_assert(!"cache_op_ClusterFunction() bad msg version");
+//  }
+//  opcode = ((CacheOpMsg_long *) data)->opcode;
+//
+//  // If necessary, create a continuation to reflect the response back
+//
+//  CacheContinuation *c = CacheContinuation::cacheContAllocator_alloc();
+//  c->mutex = new_ProxyMutex();
+//  MUTEX_TRY_LOCK(lock, c->mutex, this_ethread());
+//  c->request_opcode = opcode;
+//  c->token.clear();
+//  c->start_time = ink_get_hrtime();
+//  c->ch = ch;
+//  SET_CONTINUATION_HANDLER(c, (CacheContHandler)
+//                           & CacheContinuation::replyOpEvent);
+//
+//  switch (opcode) {
+//  case CACHE_OPEN_WRITE_BUFFER:
+//  case CACHE_OPEN_WRITE_BUFFER_LONG:
+//    ink_release_assert(!"cache_op_ClusterFunction WRITE_BUFFER not supported");
+//    break;
+//
+//  case CACHE_OPEN_READ_BUFFER:
+//  case CACHE_OPEN_READ_BUFFER_LONG:
+//    ink_release_assert(!"cache_op_ClusterFunction READ_BUFFER not supported");
+//    break;
+//
+//  case CACHE_OPEN_READ:
+//    {
+//      CacheOpMsg_short *msg = unmarshal_CacheOpMsg_short(data, mh->NeedByteSwap());
+//      init_from_short(c, msg, ch->machine);
+//      Debug("cache_msg",
+//            "cache_op-s op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+//      //
+//      // Establish the remote side of the ClusterVConnection
+//      //
+//      c->write_cluster_vc = clusterProcessor.connect_local((Continuation *) 0,
+//                                                           &c->token,
+//                                                           c->cluster_vc_channel,
+//                                                           (CLUSTER_OPT_IMMEDIATE | CLUSTER_OPT_CONN_READ));
+//      if (!c->write_cluster_vc) {
+//        // Unable to setup channel, abort processing.
+//        CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CHAN_INUSE_STAT);
+//        Debug("chan_inuse",
+//              "1Remote chan=%d inuse tok.ip=%u.%u.%u.%u tok.seqno=%d seqno=%d",
+//              c->cluster_vc_channel, DOT_SEPARATED(c->token.ip_created), c->token.sequence_number, c->seq_number);
+//
+//        // Send cluster op failed reply
+//        c->replyOpEvent(CACHE_EVENT_OPEN_READ_FAILED, (VConnection *) - ECLUSTER_CHANNEL_INUSE);
+//        break;
+//
+//      } else {
+//        c->write_cluster_vc->current_cont = c;
+//      }
+//      ink_release_assert(c->write_cluster_vc != CLUSTER_DELAYED_OPEN);
+//      ink_release_assert((opcode == CACHE_OPEN_READ)
+//                         || c->write_cluster_vc->pending_remote_fill);
+//
+//      SET_CONTINUATION_HANDLER(c, (CacheContHandler)
+//                               & CacheContinuation::setupVCdataRead);
+//      Debug("cache_proto",
+//            "0read op, seqno=%d chan=%d bufsize=%d token=%d,%d",
+//            msg->seq_number, msg->channel, msg->buffer_size, msg->token.ip_created, msg->token.sequence_number);
+//#ifdef CACHE_MSG_TRACE
+//      log_cache_op_msg(msg->seq_number, len, "cache_op_open_read");
+//#endif
+//      CacheKey key(msg->md5);
+//
+//      char *hostname = NULL;
+//      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
+//      if (host_len) {
+//        hostname = (char *) msg->moi;
+//      }
+//      Cache *call_cache = caches[c->frag_type];
+//      c->cache_action = call_cache->open_read(c, &key, c->frag_type, hostname, host_len);
+//      break;
+//    }
+//  case CACHE_OPEN_READ_LONG:
+//    {
+//      // Cache needs message data, copy it.
+//      c->setMsgBufferLen(len);
+//      c->allocMsgBuffer();
+//      memcpy(c->getMsgBuffer(), (char *) data, len);
+//
+//      int flen = CacheOpMsg_long::sizeof_fixedlen_msg();
+//      CacheOpMsg_long *msg = unmarshal_CacheOpMsg_long(c->getMsgBuffer(), mh->NeedByteSwap());
+//      init_from_long(c, msg, ch->machine);
+//      Debug("cache_msg",
+//            "cache_op-l op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+//#ifdef CACHE_MSG_TRACE
+//      log_cache_op_msg(msg->seq_number, len, "cache_op_open_read_long");
+//#endif
+//      //
+//      // Establish the remote side of the ClusterVConnection
+//      //
+//      c->write_cluster_vc = clusterProcessor.connect_local((Continuation *) 0,
+//                                                           &c->token,
+//                                                           c->cluster_vc_channel,
+//                                                           (CLUSTER_OPT_IMMEDIATE | CLUSTER_OPT_CONN_READ));
+//      if (!c->write_cluster_vc) {
+//        // Unable to setup channel, abort processing.
+//        CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CHAN_INUSE_STAT);
+//        Debug("chan_inuse",
+//              "2Remote chan=%d inuse tok.ip=%u.%u.%u.%u tok.seqno=%d seqno=%d",
+//              c->cluster_vc_channel, DOT_SEPARATED(c->token.ip_created), c->token.sequence_number, c->seq_number);
+//
+//        // Send cluster op failed reply
+//        c->replyOpEvent(CACHE_EVENT_OPEN_READ_FAILED, (VConnection *) - ECLUSTER_CHANNEL_INUSE);
+//        break;
+//
+//      } else {
+//        c->write_cluster_vc->current_cont = c;
+//      }
+//      ink_release_assert(c->write_cluster_vc != CLUSTER_DELAYED_OPEN);
+//      ink_release_assert((opcode == CACHE_OPEN_READ_LONG)
+//                         || c->write_cluster_vc->pending_remote_fill);
+//
+//      SET_CONTINUATION_HANDLER(c, (CacheContHandler)
+//                               & CacheContinuation::setupReadWriteVC);
+//      Debug("cache_proto",
+//            "1read op, seqno=%d chan=%d bufsize=%d token=%d,%d",
+//            msg->seq_number, msg->channel, msg->buffer_size, msg->token.ip_created, msg->token.sequence_number);
+//
+//      const char *p = (const char *) msg + flen;
+//      int moi_len = len - flen;
+//      int res;
+//
+//      ink_assert(moi_len > 0);
+//
+//      // Unmarshal CacheHTTPHdr
+//      res = c->ic_request.unmarshal((char *) p, moi_len, NULL);
+//      ink_assert(res > 0);
+//      ink_assert(c->ic_request.valid());
+//      c->request_purge = c->ic_request.method_get_wksidx() == HTTP_WKSIDX_PURGE || c->ic_request.method_get_wksidx() == HTTP_WKSIDX_DELETE;
+//      moi_len -= res;
+//      p += res;
+//      ink_assert(moi_len > 0);
+//      // Unmarshal CacheLookupHttpConfig
+//      c->ic_params = new(CacheLookupHttpConfigAllocator.alloc())
+//        CacheLookupHttpConfig();
+//      res = c->ic_params->unmarshal(&c->ic_arena, (const char *) p, moi_len);
+//      ink_assert(res > 0);
+//
+//      moi_len -= res;
+//      p += res;
+//
+//      CacheKey key(msg->url_md5);
+//
+//      char *hostname = NULL;
+//      int host_len = 0;
+//
+//      if (moi_len) {
+//        hostname = (char *) p;
+//        host_len = moi_len;
+//
+//        // Save hostname and attach it to the continuation since we may
+//        //  need it if we convert this to an open_write.
+//
+//        c->ic_hostname = new_IOBufferData(iobuffer_size_to_index(host_len));
+//        c->ic_hostname_len = host_len;
+//
+//        memcpy(c->ic_hostname->data(), hostname, host_len);
+//      }
+//
+//      Cache *call_cache = caches[c->frag_type];
+//      Action *a = call_cache->open_read(c, &key, &c->ic_request,
+//                                        c->ic_params,
+//                                        c->frag_type, hostname, host_len);
+//      // Get rid of purify warnings since 'c' can be freed by open_read.
+//      if (a != ACTION_RESULT_DONE) {
+//        c->cache_action = a;
+//      }
+//      break;
+//    }
+//  case CACHE_OPEN_WRITE:
+//    {
+//      CacheOpMsg_short *msg = unmarshal_CacheOpMsg_short(data, mh->NeedByteSwap());
+//      init_from_short(c, msg, ch->machine);
+//      Debug("cache_msg",
+//            "cache_op-s op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+//#ifdef CACHE_MSG_TRACE
+//      log_cache_op_msg(msg->seq_number, len, "cache_op_open_write");
+//#endif
+//      //
+//      // Establish the remote side of the ClusterVConnection
+//      //
+//      c->read_cluster_vc = clusterProcessor.connect_local((Continuation *) 0,
+//                                                          &c->token,
+//                                                          c->cluster_vc_channel,
+//                                                          (CLUSTER_OPT_IMMEDIATE | CLUSTER_OPT_CONN_WRITE));
+//      if (!c->read_cluster_vc) {
+//        // Unable to setup channel, abort processing.
+//        CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CHAN_INUSE_STAT);
+//        Debug("chan_inuse",
+//              "3Remote chan=%d inuse tok.ip=%u.%u.%u.%u tok.seqno=%d seqno=%d",
+//              c->cluster_vc_channel, DOT_SEPARATED(c->token.ip_created), c->token.sequence_number, c->seq_number);
+//
+//        // Send cluster op failed reply
+//        c->replyOpEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (VConnection *) - ECLUSTER_CHANNEL_INUSE);
+//        break;
+//
+//      } else {
+//        c->read_cluster_vc->current_cont = c;
+//      }
+//      ink_release_assert(c->read_cluster_vc != CLUSTER_DELAYED_OPEN);
+//
+//      CacheKey key(msg->md5);
+//
+//      char *hostname = NULL;
+//      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
+//      if (host_len) {
+//        hostname = (char *) msg->moi;
+//      }
+//
+//      Cache *call_cache = caches[c->frag_type];
+//      Action *a = call_cache->open_write(c, &key, c->frag_type,
+//                                         !!(c->cfl_flags & CFL_OVERWRITE_ON_WRITE),
+//                                         c->pin_in_cache, hostname, host_len);
+//      if (a != ACTION_RESULT_DONE) {
+//        c->cache_action = a;
+//      }
+//      break;
+//    }
+//  case CACHE_OPEN_WRITE_LONG:
+//    {
+//      // Cache needs message data, copy it.
+//      c->setMsgBufferLen(len);
+//      c->allocMsgBuffer();
+//      memcpy(c->getMsgBuffer(), (char *) data, len);
+//
+//      int flen = CacheOpMsg_long::sizeof_fixedlen_msg();
+//      CacheOpMsg_long *msg = unmarshal_CacheOpMsg_long(c->getMsgBuffer(), mh->NeedByteSwap());
+//      init_from_long(c, msg, ch->machine);
+//      Debug("cache_msg",
+//            "cache_op-l op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+//#ifdef CACHE_MSG_TRACE
+//      log_cache_op_msg(msg->seq_number, len, "cache_op_open_write_long");
+//#endif
+//      //
+//      // Establish the remote side of the ClusterVConnection
+//      //
+//      c->read_cluster_vc = clusterProcessor.connect_local((Continuation *) 0,
+//                                                          &c->token,
+//                                                          c->cluster_vc_channel,
+//                                                          (CLUSTER_OPT_IMMEDIATE | CLUSTER_OPT_CONN_WRITE));
+//      if (!c->read_cluster_vc) {
+//        // Unable to setup channel, abort processing.
+//        CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CHAN_INUSE_STAT);
+//        Debug("chan_inuse",
+//              "4Remote chan=%d inuse tok.ip=%u.%u.%u.%u tok.seqno=%d seqno=%d",
+//              c->cluster_vc_channel, DOT_SEPARATED(c->token.ip_created), c->token.sequence_number, c->seq_number);
+//
+//        // Send cluster op failed reply
+//        c->replyOpEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (VConnection *) - ECLUSTER_CHANNEL_INUSE);
+//        break;
+//
+//      } else {
+//        c->read_cluster_vc->current_cont = c;
+//      }
+//      ink_release_assert(c->read_cluster_vc != CLUSTER_DELAYED_OPEN);
+//
+//      CacheHTTPInfo *ci = 0;
+//      const char *p;
+//      int res = 0;
+//      int moi_len = len - flen;
+//
+//      if (moi_len && c->cfl_flags & CFL_LOPENWRITE_HAVE_OLDINFO) {
+//        p = (const char *) msg + flen;
+//
+//        // Unmarshal old CacheHTTPInfo
+//        res = HTTPInfo::unmarshal((char *) p, moi_len, NULL);
+//        ink_assert(res > 0);
+//        c->ic_old_info.get_handle((char *) p, moi_len);
+//        ink_assert(c->ic_old_info.valid());
+//        ci = &c->ic_old_info;
+//      } else {
+//        p = (const char *) 0;
+//      }
+//      if (c->cfl_flags & CFL_ALLOW_MULTIPLE_WRITES) {
+//        ink_assert(!ci);
+//        ci = (CacheHTTPInfo *) CACHE_ALLOW_MULTIPLE_WRITES;
+//      }
+//      moi_len -= res;
+//      p += res;
+//
+//      CacheKey key(msg->url_md5);
+//      char *hostname = NULL;
+//
+//      if (moi_len) {
+//        hostname = (char *) p;
+//      }
+//
+//      Cache *call_cache = caches[c->frag_type];
+//      Action *a = call_cache->open_write(c, &key, ci, c->pin_in_cache,
+//                                         NULL, c->frag_type, hostname, len);
+//      if (a != ACTION_RESULT_DONE) {
+//        c->cache_action = a;
+//      }
+//      break;
+//    }
+//  case CACHE_REMOVE:
+//    {
+//      CacheOpMsg_short *msg = unmarshal_CacheOpMsg_short(data, mh->NeedByteSwap());
+//      init_from_short(c, msg, ch->machine);
+//      Debug("cache_msg",
+//            "cache_op op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+//#ifdef CACHE_MSG_TRACE
+//      log_cache_op_msg(msg->seq_number, len, "cache_op_remove");
+//#endif
+//      CacheKey key(msg->md5);
+//
+//      char *hostname = NULL;
+//      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
+//      if (host_len) {
+//        hostname = (char *) msg->moi;
+//      }
+//
+//      Cache *call_cache = caches[c->frag_type];
+//      Action *a = call_cache->remove(c, &key, c->frag_type,
+//                                     !!(c->cfl_flags & CFL_REMOVE_USER_AGENTS),
+//                                     !!(c->cfl_flags & CFL_REMOVE_LINK),
+//                                     hostname, host_len);
+//      if (a != ACTION_RESULT_DONE) {
+//        c->cache_action = a;
+//      }
+//      break;
+//    }
+//  case CACHE_LINK:
+//    {
+//      CacheOpMsg_short_2 *msg = unmarshal_CacheOpMsg_short_2(data, mh->NeedByteSwap());
+//      init_from_short_2(c, msg, ch->machine);
+//      Debug("cache_msg",
+//            "cache_op op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+//#ifdef CACHE_MSG_TRACE
+//      log_cache_op_msg(msg->seq_number, len, "cache_op_link");
+//#endif
+//
+//      CacheKey key1(msg->md5_1);
+//      CacheKey key2(msg->md5_2);
+//
+//      char *hostname = NULL;
+//      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
+//      if (host_len) {
+//        hostname = (char *) msg->moi;
+//      }
+//
+//      Cache *call_cache = caches[c->frag_type];
+//      Action *a = call_cache->link(c, &key1, &key2, c->frag_type,
+//                                   hostname, host_len);
+//      if (a != ACTION_RESULT_DONE) {
+//        c->cache_action = a;
+//      }
+//      break;
+//    }
+//  case CACHE_DEREF:
+//    {
+//      CacheOpMsg_short *msg = unmarshal_CacheOpMsg_short(data, mh->NeedByteSwap());
+//      init_from_short(c, msg, ch->machine);
+//      Debug("cache_msg",
+//            "cache_op op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+//#ifdef CACHE_MSG_TRACE
+//      log_cache_op_msg(msg->seq_number, len, "cache_op_deref");
+//#endif
+//
+//      CacheKey key(msg->md5);
+//
+//      char *hostname = NULL;
+//      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
+//      if (host_len) {
+//        hostname = (char *) msg->moi;
+//      }
+//
+//      Cache *call_cache = caches[c->frag_type];
+//      Action *a = call_cache->deref(c, &key, c->frag_type,
+//                                    hostname, host_len);
+//      if (a != ACTION_RESULT_DONE) {
+//        c->cache_action = a;
+//      }
+//      break;
+//    }
+//
+//  default:
+//    {
+//      ink_release_assert(0);
+//    }
+//  }                             // End of switch
+//}
+
+
 void
-cache_op_ClusterFunction(ClusterHandler * ch, void *data, int len)
+cache_op_ClusterFunction(ClusterSession cs, void *context, void *d)
 {
-  EThread *thread = this_ethread();
+  ClusterCont *cc = (ClusterCont *) d;
+  ink_assert(cc && !context);
+
+  EThread *thread = cc->mutex->thread_holding;
   ProxyMutex *mutex = thread->mutex;
   ////////////////////////////////////////////////////////
   // Note: we are running on the ET_CLUSTER thread
@@ -1070,26 +1656,37 @@ cache_op_ClusterFunction(ClusterHandler * ch, void *data, int len)
   CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CACHE_OUTSTANDING_STAT);
 
   int opcode;
-  ClusterMessageHeader *mh = (ClusterMessageHeader *) data;
+  int len = cc->data_len;
+
+  // Copy the cc->data block chain into one contiguous private buffer so the reference to the source data can be released early
+  Ptr<IOBufferData> buf;
+  buf = new_IOBufferData(iobuffer_size_to_index(len, MAX_BUFFER_SIZE_INDEX));
+  char *data = buf.m_ptr->data();
+  for (IOBufferBlock *b = cc->data; b; b = b->next) {
+    memcpy(data, b->_start, b->_end - b->_start);
+    data += b->_end - b->_start;
+  }
+  data = buf->data();
 
-  if (mh->GetMsgVersion() != CacheOpMsg_long::CACHE_OP_LONG_MESSAGE_VERSION) {  ////////////////////////////////////////////////
-    // Convert from old to current message format
-    ////////////////////////////////////////////////
-    ink_release_assert(!"cache_op_ClusterFunction() bad msg version");
+  ClusterMessageHeader *mh = (ClusterMessageHeader *) data;
+  ink_assert(mh->GetMsgVersion() == CacheOpMsg_long::CACHE_OP_LONG_MESSAGE_VERSION);
+
+  opcode = ((CacheOpMsg_long *) mh)->opcode;
+  CacheContinuation *c = new_CacheCont(thread);
+  if (cluster_bind_session(cs, c)) {
+    cluster_close_session(cs);
+    free_CacheCont(c);
+    return;
   }
-  opcode = ((CacheOpMsg_long *) data)->opcode;
-
-  // If necessary, create a continuation to reflect the response back
 
-  CacheContinuation *c = CacheContinuation::cacheContAllocator_alloc();
-  c->mutex = new_ProxyMutex();
-  MUTEX_TRY_LOCK(lock, c->mutex, this_ethread());
   c->request_opcode = opcode;
+  c->frag_type = (CacheFragType) ((CacheOpMsg_long *) mh)->frag_type;
   c->token.clear();
-  c->start_time = ink_get_hrtime();
-  c->ch = ch;
-  SET_CONTINUATION_HANDLER(c, (CacheContHandler)
-                           & CacheContinuation::replyOpEvent);
+  c->rw_buf_msg = buf;
+  c->rw_buf_msg_len = len;
+  c->cs = cs;
+
+  MUTEX_TRY_LOCK(lock, c->mutex, c->thread);
 
   switch (opcode) {
   case CACHE_OPEN_WRITE_BUFFER:
@@ -1105,33 +1702,9 @@ cache_op_ClusterFunction(ClusterHandler * ch, void *data, int len)
   case CACHE_OPEN_READ:
     {
       CacheOpMsg_short *msg = unmarshal_CacheOpMsg_short(data, mh->NeedByteSwap());
-      init_from_short(c, msg, ch->machine);
+      init_from_short(c, msg);
       Debug("cache_msg",
-            "cache_op-s op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
-      //
-      // Establish the remote side of the ClusterVConnection
-      //
-      c->write_cluster_vc = clusterProcessor.connect_local((Continuation *) 0,
-                                                           &c->token,
-                                                           c->cluster_vc_channel,
-                                                           (CLUSTER_OPT_IMMEDIATE | CLUSTER_OPT_CONN_READ));
-      if (!c->write_cluster_vc) {
-        // Unable to setup channel, abort processing.
-        CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CHAN_INUSE_STAT);
-        Debug("chan_inuse",
-              "1Remote chan=%d inuse tok.ip=%u.%u.%u.%u tok.seqno=%d seqno=%d",
-              c->cluster_vc_channel, DOT_SEPARATED(c->token.ip_created), c->token.sequence_number, c->seq_number);
-
-        // Send cluster op failed reply
-        c->replyOpEvent(CACHE_EVENT_OPEN_READ_FAILED, (VConnection *) - ECLUSTER_CHANNEL_INUSE);
-        break;
-
-      } else {
-        c->write_cluster_vc->current_cont = c;
-      }
-      ink_release_assert(c->write_cluster_vc != CLUSTER_DELAYED_OPEN);
-      ink_release_assert((opcode == CACHE_OPEN_READ)
-                         || c->write_cluster_vc->pending_remote_fill);
+            "cache_op-s op=%d seqno=%d data=%p len=%d", opcode, c->seq_number, data, len);
 
       SET_CONTINUATION_HANDLER(c, (CacheContHandler)
                                & CacheContinuation::setupVCdataRead);
@@ -1143,57 +1716,31 @@ cache_op_ClusterFunction(ClusterHandler * ch, void *data, int len)
 #endif
       CacheKey key(msg->md5);
 
-      char *hostname = NULL;
-      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
-      if (host_len) {
-        hostname = (char *) msg->moi.byte;
-      }
+      int flen = op_to_sizeof_fixedlen_msg(opcode);
+      c->ic_hostname_len = len - flen;
+      c->ic_hostname = (char *) msg + flen;
       Cache *call_cache = caches[c->frag_type];
-      c->cache_action = call_cache->open_read(c, &key, c->frag_type, hostname, host_len);
+      c->pending_action = call_cache->open_read(c, &key, c->frag_type, c->ic_hostname, c->ic_hostname_len);
       break;
     }
   case CACHE_OPEN_READ_LONG:
     {
       // Cache needs message data, copy it.
-      c->setMsgBufferLen(len);
-      c->allocMsgBuffer();
-      memcpy(c->getMsgBuffer(), (char *) data, len);
+//      c->setMsgBufferLen(len);
+//      c->allocMsgBuffer();
+//      memcpy(c->getMsgBuffer(), (char *) data, len);
 
       int flen = CacheOpMsg_long::sizeof_fixedlen_msg();
-      CacheOpMsg_long *msg = unmarshal_CacheOpMsg_long(c->getMsgBuffer(), mh->NeedByteSwap());
-      init_from_long(c, msg, ch->machine);
+      CacheOpMsg_long *msg = unmarshal_CacheOpMsg_long(data, mh->NeedByteSwap());
+      init_from_long(c, msg);
       Debug("cache_msg",
-            "cache_op-l op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+            "cache_op-l op=%d seqno=%d data=%p len=%d", opcode, c->seq_number, data, len);
 #ifdef CACHE_MSG_TRACE
       log_cache_op_msg(msg->seq_number, len, "cache_op_open_read_long");
 #endif
-      //
-      // Establish the remote side of the ClusterVConnection
-      //
-      c->write_cluster_vc = clusterProcessor.connect_local((Continuation *) 0,
-                                                           &c->token,
-                                                           c->cluster_vc_channel,
-                                                           (CLUSTER_OPT_IMMEDIATE | CLUSTER_OPT_CONN_READ));
-      if (!c->write_cluster_vc) {
-        // Unable to setup channel, abort processing.
-        CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CHAN_INUSE_STAT);
-        Debug("chan_inuse",
-              "2Remote chan=%d inuse tok.ip=%u.%u.%u.%u tok.seqno=%d seqno=%d",
-              c->cluster_vc_channel, DOT_SEPARATED(c->token.ip_created), c->token.sequence_number, c->seq_number);
-
-        // Send cluster op failed reply
-        c->replyOpEvent(CACHE_EVENT_OPEN_READ_FAILED, (VConnection *) - ECLUSTER_CHANNEL_INUSE);
-        break;
-
-      } else {
-        c->write_cluster_vc->current_cont = c;
-      }
-      ink_release_assert(c->write_cluster_vc != CLUSTER_DELAYED_OPEN);
-      ink_release_assert((opcode == CACHE_OPEN_READ_LONG)
-                         || c->write_cluster_vc->pending_remote_fill);
 
       SET_CONTINUATION_HANDLER(c, (CacheContHandler)
-                               & CacheContinuation::setupReadWriteVC);
+                                     & CacheContinuation::setupVCdataRead);
       Debug("cache_proto",
             "1read op, seqno=%d chan=%d bufsize=%d token=%d,%d",
             msg->seq_number, msg->channel, msg->buffer_size, msg->token.ip_created, msg->token.sequence_number);
@@ -1215,6 +1762,11 @@ cache_op_ClusterFunction(ClusterHandler * ch, void *data, int len)
       // Unmarshal CacheLookupHttpConfig
       c->ic_params = new(CacheLookupHttpConfigAllocator.alloc())
         CacheLookupHttpConfig();
+      memcpy(c->ic_params, p, sizeof(CacheLookupHttpConfig));
+      moi_len -= sizeof(CacheLookupHttpConfig);
+      p += sizeof(CacheLookupHttpConfig);
+
+      ink_assert(moi_len > 0);
       res = c->ic_params->unmarshal(&c->ic_arena, (const char *) p, moi_len);
       ink_assert(res > 0);
 
@@ -1223,132 +1775,81 @@ cache_op_ClusterFunction(ClusterHandler * ch, void *data, int len)
 
       CacheKey key(msg->url_md5);
 
-      char *hostname = NULL;
-      int host_len = 0;
-
       if (moi_len) {
-        hostname = (char *) p;
-        host_len = moi_len;
-
-        // Save hostname and attach it to the continuation since we may
-        //  need it if we convert this to an open_write.
-
-        c->ic_hostname = new_IOBufferData(iobuffer_size_to_index(host_len));
-        c->ic_hostname_len = host_len;
-
-        memcpy(c->ic_hostname->data(), hostname, host_len);
+        c->ic_hostname = (char *) p;
+        c->ic_hostname_len = moi_len;
       }
 
       Cache *call_cache = caches[c->frag_type];
       Action *a = call_cache->open_read(c, &key, &c->ic_request,
                                         c->ic_params,
-                                        c->frag_type, hostname, host_len);
+                                        c->frag_type, c->ic_hostname, c->ic_hostname_len);
       // Get rid of purify warnings since 'c' can be freed by open_read.
       if (a != ACTION_RESULT_DONE) {
-        c->cache_action = a;
+        c->pending_action = a;
       }
       break;
     }
   case CACHE_OPEN_WRITE:
     {
       CacheOpMsg_short *msg = unmarshal_CacheOpMsg_short(data, mh->NeedByteSwap());
-      init_from_short(c, msg, ch->machine);
+      init_from_short(c, msg);
       Debug("cache_msg",
-            "cache_op-s op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+            "cache_op-s op=%d seqno=%d data=%p len=%d", opcode, c->seq_number, data, len);
 #ifdef CACHE_MSG_TRACE
       log_cache_op_msg(msg->seq_number, len, "cache_op_open_write");
 #endif
-      //
-      // Establish the remote side of the ClusterVConnection
-      //
-      c->read_cluster_vc = clusterProcessor.connect_local((Continuation *) 0,
-                                                          &c->token,
-                                                          c->cluster_vc_channel,
-                                                          (CLUSTER_OPT_IMMEDIATE | CLUSTER_OPT_CONN_WRITE));
-      if (!c->read_cluster_vc) {
-        // Unable to setup channel, abort processing.
-        CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CHAN_INUSE_STAT);
-        Debug("chan_inuse",
-              "3Remote chan=%d inuse tok.ip=%u.%u.%u.%u tok.seqno=%d seqno=%d",
-              c->cluster_vc_channel, DOT_SEPARATED(c->token.ip_created), c->token.sequence_number, c->seq_number);
-
-        // Send cluster op failed reply
-        c->replyOpEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (VConnection *) - ECLUSTER_CHANNEL_INUSE);
-        break;
-
-      } else {
-        c->read_cluster_vc->current_cont = c;
-      }
-      ink_release_assert(c->read_cluster_vc != CLUSTER_DELAYED_OPEN);
 
       CacheKey key(msg->md5);
 
-      char *hostname = NULL;
-      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
-      if (host_len) {
-        hostname = (char *) msg->moi.byte;
+      int flen = op_to_sizeof_fixedlen_msg(opcode);
+      c->ic_hostname_len = len - flen;
+      if (c->ic_hostname_len) {
+        c->ic_hostname = (char *) msg + flen;
       }
 
+      SET_CONTINUATION_HANDLER(c, (CacheContHandler)
+                                           & CacheContinuation::setupVCdataWrite);
       Cache *call_cache = caches[c->frag_type];
       Action *a = call_cache->open_write(c, &key, c->frag_type,
                                          !!(c->cfl_flags & CFL_OVERWRITE_ON_WRITE),
-                                         c->pin_in_cache, hostname, host_len);
+                                         c->pin_in_cache, c->ic_hostname, c->ic_hostname_len);
       if (a != ACTION_RESULT_DONE) {
-        c->cache_action = a;
+        c->pending_action = a;
       }
       break;
     }
   case CACHE_OPEN_WRITE_LONG:
     {
       // Cache needs message data, copy it.
-      c->setMsgBufferLen(len);
-      c->allocMsgBuffer();
-      memcpy(c->getMsgBuffer(), (char *) data, len);
+//      c->setMsgBufferLen(len);
+//      c->allocMsgBuffer();
+//      memcpy(c->getMsgBuffer(), (char *) data, len);
 
       int flen = CacheOpMsg_long::sizeof_fixedlen_msg();
       CacheOpMsg_long *msg = unmarshal_CacheOpMsg_long(c->getMsgBuffer(), mh->NeedByteSwap());
-      init_from_long(c, msg, ch->machine);
+      init_from_long(c, msg);
       Debug("cache_msg",
-            "cache_op-l op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+            "cache_op-l op=%d seqno=%d data=%p len=%d", opcode, c->seq_number, data, len);
 #ifdef CACHE_MSG_TRACE
       log_cache_op_msg(msg->seq_number, len, "cache_op_open_write_long");
 #endif
-      //
-      // Establish the remote side of the ClusterVConnection
-      //
-      c->read_cluster_vc = clusterProcessor.connect_local((Continuation *) 0,
-                                                          &c->token,
-                                                          c->cluster_vc_channel,
-                                                          (CLUSTER_OPT_IMMEDIATE | CLUSTER_OPT_CONN_WRITE));
-      if (!c->read_cluster_vc) {
-        // Unable to setup channel, abort processing.
-        CLUSTER_INCREMENT_DYN_STAT(CLUSTER_CHAN_INUSE_STAT);
-        Debug("chan_inuse",
-              "4Remote chan=%d inuse tok.ip=%u.%u.%u.%u tok.seqno=%d seqno=%d",
-              c->cluster_vc_channel, DOT_SEPARATED(c->token.ip_created), c->token.sequence_number, c->seq_number);
-
-        // Send cluster op failed reply
-        c->replyOpEvent(CACHE_EVENT_OPEN_WRITE_FAILED, (VConnection *) - ECLUSTER_CHANNEL_INUSE);
-        break;
-
-      } else {
-        c->read_cluster_vc->current_cont = c;
-      }
-      ink_release_assert(c->read_cluster_vc != CLUSTER_DELAYED_OPEN);
 
       CacheHTTPInfo *ci = 0;
-      const char *p = (const char *) msg + flen;
+      const char *p;
       int res = 0;
       int moi_len = len - flen;
 
-      if (moi_len && c->cfl_flags & CFL_LOPENWRITE_HAVE_OLDINFO) {
-
+      if (moi_len && (c->cfl_flags & CFL_LOPENWRITE_HAVE_OLDINFO)) {
+        p = (const char *) msg + flen;
         // Unmarshal old CacheHTTPInfo
         res = HTTPInfo::unmarshal((char *) p, moi_len, NULL);
         ink_assert(res > 0);
         c->ic_old_info.get_handle((char *) p, moi_len);
         ink_assert(c->ic_old_info.valid());
         ci = &c->ic_old_info;
+      } else {
+        p = (const char *) 0;
       }
       if (c->cfl_flags & CFL_ALLOW_MULTIPLE_WRITES) {
         ink_assert(!ci);
@@ -1358,53 +1859,60 @@ cache_op_ClusterFunction(ClusterHandler * ch, void *data, int len)
       p += res;
 
       CacheKey key(msg->url_md5);
-      char *hostname = NULL;
 
       if (moi_len) {
-        hostname = (char *) p;
+        c->ic_hostname = (char *) p;
+        c->ic_hostname_len = moi_len;
       }
 
+      SET_CONTINUATION_HANDLER(c, (CacheContHandler)
+                                                 & CacheContinuation::setupVCdataWrite);
+
       Cache *call_cache = caches[c->frag_type];
       Action *a = call_cache->open_write(c, &key, ci, c->pin_in_cache,
-                                         NULL, c->frag_type, hostname, moi_len);
+                                         NULL, c->frag_type, c->ic_hostname, c->ic_hostname_len);
       if (a != ACTION_RESULT_DONE) {
-        c->cache_action = a;
+        c->pending_action = a;
       }
       break;
     }
   case CACHE_REMOVE:
     {
       CacheOpMsg_short *msg = unmarshal_CacheOpMsg_short(data, mh->NeedByteSwap());
-      init_from_short(c, msg, ch->machine);
+      init_from_short(c, msg);
       Debug("cache_msg",
-            "cache_op op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+            "cache_op op=%d seqno=%d data=%p len=%d", opcode, c->seq_number, data, len);
 #ifdef CACHE_MSG_TRACE
       log_cache_op_msg(msg->seq_number, len, "cache_op_remove");
 #endif
       CacheKey key(msg->md5);
 
-      char *hostname = NULL;
-      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
+      int flen = op_to_sizeof_fixedlen_msg(opcode);
+      int host_len = len - flen;
       if (host_len) {
-        hostname = (char *) msg->moi.byte;
+        c->ic_hostname = (char *) msg + flen;
+        c->ic_hostname_len = host_len;
       }
 
+      SET_CONTINUATION_HANDLER(c, (CacheContHandler)
+                                                       & CacheContinuation::setupVCdataRemove);
+
       Cache *call_cache = caches[c->frag_type];
       Action *a = call_cache->remove(c, &key, c->frag_type,
                                      !!(c->cfl_flags & CFL_REMOVE_USER_AGENTS),
                                      !!(c->cfl_flags & CFL_REMOVE_LINK),
-                                     hostname, host_len);
+                                     c->ic_hostname, c->ic_hostname_len);
       if (a != ACTION_RESULT_DONE) {
-        c->cache_action = a;
+        c->pending_action = a;
       }
       break;
     }
   case CACHE_LINK:
     {
       CacheOpMsg_short_2 *msg = unmarshal_CacheOpMsg_short_2(data, mh->NeedByteSwap());
-      init_from_short_2(c, msg, ch->machine);
+      init_from_short_2(c, msg);
       Debug("cache_msg",
-            "cache_op op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+            "cache_op op=%d seqno=%d data=%p len=%d", opcode, c->seq_number, data, len);
 #ifdef CACHE_MSG_TRACE
       log_cache_op_msg(msg->seq_number, len, "cache_op_link");
 #endif
@@ -1412,364 +1920,671 @@ cache_op_ClusterFunction(ClusterHandler * ch, void *data, int len)
       CacheKey key1(msg->md5_1);
       CacheKey key2(msg->md5_2);
 
-      char *hostname = NULL;
-      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
+      int flen = op_to_sizeof_fixedlen_msg(opcode);
+      int host_len = len - flen;
       if (host_len) {
-        hostname = (char *) msg->moi.byte;
+        c->ic_hostname = (char *) msg + flen;
+        c->ic_hostname_len = host_len;
       }
 
+      SET_CONTINUATION_HANDLER(c, (CacheContHandler)
+                                                       & CacheContinuation::setupVCdataLink);
+
       Cache *call_cache = caches[c->frag_type];
       Action *a = call_cache->link(c, &key1, &key2, c->frag_type,
-                                   hostname, host_len);
+                                   c->ic_hostname, c->ic_hostname_len);
       if (a != ACTION_RESULT_DONE) {
-        c->cache_action = a;
+        c->pending_action = a;
       }
       break;
     }
   case CACHE_DEREF:
     {
       CacheOpMsg_short *msg = unmarshal_CacheOpMsg_short(data, mh->NeedByteSwap());
-      init_from_short(c, msg, ch->machine);
+      init_from_short(c, msg);
       Debug("cache_msg",
-            "cache_op op=%d seqno=%d data=%p len=%d machine=%p", opcode, c->seq_number, data, len, ch->machine);
+            "cache_op op=%d seqno=%d data=%p len=%d", opcode, c->seq_number, data, len);
 #ifdef CACHE_MSG_TRACE
       log_cache_op_msg(msg->seq_number, len, "cache_op_deref");
 #endif
 
       CacheKey key(msg->md5);
 
-      char *hostname = NULL;
-      int host_len = len - op_to_sizeof_fixedlen_msg(opcode);
+      int flen = op_to_sizeof_fixedlen_msg(opcode);
+      int host_len = len - flen;
       if (host_len) {
-        hostname = (char *) msg->moi.byte;
+        c->ic_hostname = (char *) msg + flen;
+        c->ic_hostname_len = host_len;
       }
 
+      SET_CONTINUATION_HANDLER(c, (CacheContHandler)
+                                                       & CacheContinuation::setupVCdataDeref);
+
       Cache *call_cache = caches[c->frag_type];
       Action *a = call_cache->deref(c, &key, c->frag_type,
-                                    hostname, host_len);
+          c->ic_hostname, c->ic_hostname_len);
       if (a != ACTION_RESULT_DONE) {
-        c->cache_action = a;
+        c->pending_action = a;
       }
       break;
     }
 
   default:
     {
-      ink_release_assert(0);
+      ink_assert(0);
+      break;
     }
   }                             // End of switch
 }
-
 void
 cache_op_malloc_ClusterFunction(ClusterHandler *ch, void *data, int len)
 {
-  cache_op_ClusterFunction(ch, data, len);
-  // We own the message data, free it back to the Cluster subsystem
-  clusterProcessor.free_remote_data((char *) data, len);
+//  Intentionally a no-op stub. The former body forwarded to cache_op_ClusterFunction()
+//  and then handed the message buffer back via clusterProcessor.free_remote_data().
+//  NOTE(review): confirm no caller still relies on this function releasing 'data'.
+  (void) ch;
+  (void) data;
+  (void) len;
+  return;
 }
 
+//struct HeadData
+//{
+//  int32_t magic; // feedbabe
+//  int32_t h_len;
+//  int32_t d_len;
+//  uint32_t flags;
+//
+//  char *hdr() {
+//    return (char *)this + sizeof(HeadData);
+//  }
+//
+//  int32_t hdr_len() {
+//    return h_len;
+//  }
+//
+//  int32_t data_len() {
+//    return d_len;
+//  }
+//
+//  char *data() {
+//    return (char *)this + hdr_len() + sizeof(HeadData);
+//  }
+//};
+
 int
-CacheContinuation::setupVCdataRead(int event, VConnection * vc)
+CacheContinuation::setupVCdataRead(int event, void *data)
 {
   ink_assert(magicno == (int) MagicNo);
   //
   // Setup the initial data read for the given Cache VC.
   // This data is sent back in the response message.
   //
+  if (event > CLUSTER_MSG_START && event <= CLUSTER_INTERNEL_ERROR) {
+    Debug("cache_proto", "replyOpEvent: freeing this=%p", this);
+    ink_assert(cluster_close_session(cs));
+    free_CacheCont(this);
+    return EVENT_DONE;
+  }
+
+  pending_action = NULL;
+  result = (event == CACHE_EVENT_OPEN_READ ? CACHE_EVENT_OPEN_READ : CACHE_EVENT_OPEN_READ_FAILED);
+
   if (event == CACHE_EVENT_OPEN_READ) {
     //////////////////////////////////////////
     // Allocate buffer and initiate read.
     //////////////////////////////////////////
     Debug("cache_proto", "setupVCdataRead CACHE_EVENT_OPEN_READ seqno=%d", seq_number);
-    ink_release_assert(c

<TRUNCATED>