You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafodion.apache.org by db...@apache.org on 2018/03/01 21:07:59 UTC
[1/6] trafodion git commit: [TRAFODION-2883] Preliminary Scale
Enhacements - Increased cluster node limit to 1024 - Added timestamps to node
down system message - Added timestamps and values to registry change
notifications - Fixed monitor trace causing
Repository: trafodion
Updated Branches:
refs/heads/master aec5108ad -> d48a88741
[TRAFODION-2883] Preliminary Scale Enhacements
- Increased cluster node limit to 1024
- Added timestamps to node down system message
- Added timestamps and values to registry change notifications
- Fixed monitor trace causing memory overwrites
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/887051c8
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/887051c8
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/887051c8
Branch: refs/heads/master
Commit: 887051c8a1ba57b0ad47278dce2fc9361a2afb9c
Parents: 087af70
Author: Zalo Correa <za...@esgyn.com>
Authored: Tue Feb 20 16:57:40 2018 -0800
Committer: Zalo Correa <za...@esgyn.com>
Committed: Tue Feb 20 16:57:40 2018 -0800
----------------------------------------------------------------------
core/sqf/monitor/linux/cluster.cxx | 3 +-
core/sqf/monitor/linux/monitor.cxx | 170 ++++++++++++-----
core/sqf/monitor/linux/msgdef.h | 48 +----
core/sqf/monitor/linux/pnode.cxx | 13 ++
core/sqf/monitor/linux/reqprocinfo.cxx | 83 +++++----
core/sqf/monitor/linux/reqqueue.cxx | 9 +-
core/sqf/monitor/linux/shell.cxx | 273 ++++++++++++++++------------
core/sqf/monitor/test/runtest | 3 +-
core/sqf/sqenvcom.sh | 7 +
core/sqf/sql/scripts/sqnodes.pm | 4 +-
core/sqf/src/tm/tm_internal.h | 1 -
core/sqf/src/tm/tmlibmsg.h | 4 +-
core/sqf/src/tm/tools/dtmci.cpp | 2 -
core/sqf/src/tm/tools/pwd.cpp | 2 -
core/sqf/src/tm/tools/tmshutdown.cpp | 1 -
15 files changed, 379 insertions(+), 244 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/monitor/linux/cluster.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx
index c22e8ea..49697dd 100644
--- a/core/sqf/monitor/linux/cluster.cxx
+++ b/core/sqf/monitor/linux/cluster.cxx
@@ -4166,8 +4166,8 @@ void CCluster::ReIntegrateSock( int initProblem )
{
for (int i=0; i<pnodeCount; i++)
{
- if (Node[nodeInfo[i].pnid] == NULL) continue;
if (nodeInfo[i].pnid == -1) continue;
+ if (Node[nodeInfo[i].pnid] == NULL) continue;
trace_printf( "%s@%d - Node info for pnid=%d (%s)\n"
" Node[%d] commPort=%s\n"
" Node[%d] syncPort=%s\n"
@@ -4181,6 +4181,7 @@ void CCluster::ReIntegrateSock( int initProblem )
}
for ( int i =0; i < pnodeCount; i++ )
{
+ if (nodeInfo[i].pnid == -1) continue;
trace_printf( "%s@%d socks_[%d]=%d, sockPorts_[%d]=%d\n"
, method_name, __LINE__
, nodeInfo[i].pnid, socks_[nodeInfo[i].pnid]
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/monitor/linux/monitor.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/monitor.cxx b/core/sqf/monitor/linux/monitor.cxx
index c5dd28b..70df7cc 100755
--- a/core/sqf/monitor/linux/monitor.cxx
+++ b/core/sqf/monitor/linux/monitor.cxx
@@ -524,6 +524,7 @@ char * CMonitor::ProcCopy(char *bufPtr, CProcess *process)
const char method_name[] = "CMonitor::ProcCopy";
TRACE_ENTRY;
+ int stringDataLen = 0;
struct clone_def *procObj = (struct clone_def *)bufPtr;
procObj->nid = process->GetNid();
@@ -555,36 +556,94 @@ char * CMonitor::ProcCopy(char *bufPtr, CProcess *process)
, process->GetPid()
, process->GetVerifier() );
- char * stringData = &procObj->stringData;
+ char *stringData = &procObj->stringData;
- // Copy the program name
- procObj->nameLen = strlen(process->GetName()) + 1;
- memcpy(stringData, process->GetName(), procObj->nameLen );
- stringData += procObj->nameLen;
+ if (strlen(process->GetName()))
+ {
+ // Copy the program name
+ procObj->nameLen = strlen(process->GetName()) + 1;
+ memcpy(stringData, process->GetName(), procObj->nameLen );
+ stringData += procObj->nameLen;
+ stringDataLen = procObj->nameLen;
+ }
+ else
+ {
+ procObj->nameLen = 0;
+ }
- // Copy the port
- procObj->portLen = strlen(process->GetPort()) + 1;
- memcpy(stringData, process->GetPort(), procObj->portLen );
- stringData += procObj->portLen;
+ if (strlen(process->GetPort()))
+ {
+ // Copy the port
+ procObj->portLen = strlen(process->GetPort()) + 1;
+ memcpy(stringData, process->GetPort(), procObj->portLen );
+ stringData += procObj->portLen;
+ stringDataLen += procObj->portLen;
+ }
+ else
+ {
+ procObj->portLen = 0;
+ }
if (process->IsPersistent())
{
- // Copy the standard in file name
- procObj->infileLen = strlen(process->infile()) + 1;
- memcpy(stringData, process->infile(), procObj->infileLen);
- stringData += procObj->infileLen;
+ if (strlen(process->infile()))
+ {
+ // Copy the standard in file name
+ procObj->infileLen = strlen(process->infile()) + 1;
+ memcpy(stringData, process->infile(), procObj->infileLen);
+ stringData += procObj->infileLen;
+ stringDataLen += procObj->infileLen;
+ }
+ else
+ {
+ procObj->infileLen = 0;
+ }
- // Copy the standard out file name
- procObj->outfileLen = strlen(process->outfile()) + 1;
- memcpy(stringData, process->outfile(), procObj->outfileLen );
- stringData += procObj->outfileLen;
+ if (strlen(process->outfile()))
+ {
+ // Copy the standard out file name
+ procObj->outfileLen = strlen(process->outfile()) + 1;
+ memcpy(stringData, process->outfile(), procObj->outfileLen );
+ stringData += procObj->outfileLen;
+ stringDataLen += procObj->outfileLen;
+ }
+ else
+ {
+ procObj->outfileLen = 0;
+ }
- // Copy the program argument strings
procObj->argvLen = process->userArgvLen();
- memcpy(stringData, process->userArgv(), procObj->argvLen);
- stringData += procObj->argvLen;
+ if (procObj->argvLen)
+ {
+ // Copy the program argument strings
+ memcpy(stringData, process->userArgv(), procObj->argvLen);
+ stringData += procObj->argvLen;
+ stringDataLen += procObj->argvLen;
+ }
+
+ procObj->persistent = true;
- procObj->persistent = true;
+ if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
+ trace_printf( "%s@%d - Packing process string data:\n"
+ " name(%d) =%s\n"
+ " port(%d) =%s\n"
+ " infile(%d) =%s\n"
+ " outfile(%d) =%s\n"
+ " userArgv(%d) =%s\n"
+ " stringData(%d) =%s\n"
+ , method_name, __LINE__
+ , procObj->nameLen
+ , process->GetName()
+ , procObj->portLen
+ , process->GetPort()
+ , procObj->infileLen
+ , process->infile()
+ , procObj->outfileLen
+ , process->outfile()
+ , procObj->argvLen
+ , procObj->argvLen?process->userArgv():""
+ , stringDataLen
+ , stringDataLen?&procObj->stringData:"" );
}
else
{
@@ -658,6 +717,9 @@ void CMonitor::UnpackProcObjs( char *&buffer, int procCount )
CNode * node = NULL;
CProcess * process = NULL;
+ int stringDataLen;
+ char *name = NULL;
+ char *port = NULL;
char *infile = NULL;
char *outfile = NULL;
char *userargv = NULL;
@@ -671,45 +733,72 @@ void CMonitor::UnpackProcObjs( char *&buffer, int procCount )
{
procObj = (struct clone_def *)buffer;
+ stringDataLen = 0;
stringData = &procObj->stringData;
node = Nodes->GetLNode (procObj->nid)->GetNode();
- if (procObj->infileLen)
+ if (procObj->nameLen)
{
- infile = &stringData[procObj->nameLen + procObj->portLen];
+ name = &procObj->stringData;
+ stringDataLen += procObj->nameLen;
}
- else
+
+ if (procObj->portLen)
{
- infile = NULL;
+ port = &stringData[stringDataLen];
+ stringDataLen += procObj->portLen;
}
- if (procObj->outfileLen)
+ if (procObj->infileLen)
{
- outfile = &stringData[procObj->nameLen + procObj->portLen + procObj->infileLen];
+ infile = &stringData[stringDataLen];
+ stringDataLen += procObj->infileLen;
}
- else
+
+ if (procObj->outfileLen)
{
- outfile = NULL;
+ outfile = &stringData[stringDataLen];
+ stringDataLen += procObj->outfileLen;
}
if (procObj->argvLen)
{
- userargv = &stringData[procObj->nameLen + procObj->portLen
- + procObj->infileLen + procObj->outfileLen];
- }
- else
- {
- userargv = NULL;
+ userargv = &stringData[stringDataLen];
+ stringDataLen += procObj->argvLen;
}
+ if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
+ trace_printf( "%s@%d - Unpacking process string data:\n"
+ " stringData(%d) =%s\n"
+ " name(%d) =%s\n"
+ " port(%d) =%s\n"
+ " infile(%d) =%s\n"
+ " outfile(%d) =%s\n"
+ " userArgc =%d\n"
+ " userArgv(%d) =%s\n"
+ , method_name, __LINE__
+ , stringDataLen
+ , stringDataLen?&procObj->stringData:""
+ , procObj->nameLen
+ , procObj->nameLen?name:""
+ , procObj->portLen
+ , procObj->portLen?port:""
+ , procObj->infileLen
+ , procObj->infileLen?infile:""
+ , procObj->outfileLen
+ , procObj->outfileLen?outfile:""
+ , procObj->argc
+ , procObj->argvLen
+ , procObj->argvLen?userargv:"" );
+
process = node->CloneProcess (procObj->nid,
procObj->type,
procObj->priority,
procObj->backup,
procObj->unhooked,
- &stringData[0], // process name
- &stringData[procObj->nameLen], // port
+ procObj->nameLen?name:(char *)"",
+ procObj->portLen?port:(char *)"",
procObj->os_pid,
procObj->verifier,
procObj->parent_nid,
@@ -720,8 +809,8 @@ void CMonitor::UnpackProcObjs( char *&buffer, int procCount )
procObj->pathStrId,
procObj->ldpathStrId,
procObj->programStrId,
- infile,
- outfile,
+ procObj->infileLen?infile:(char *)"",
+ procObj->outfileLen?outfile:(char *)"",
&procObj->creation_time);
if ( process && procObj->argvLen )
@@ -734,8 +823,7 @@ void CMonitor::UnpackProcObjs( char *&buffer, int procCount )
process->SetPersistent(true);
}
- buffer = &stringData[procObj->nameLen + procObj->portLen + procObj->infileLen
- + procObj->outfileLen + procObj->argvLen];
+ buffer = &stringData[stringDataLen];
}
TRACE_EXIT;
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/monitor/linux/msgdef.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/msgdef.h b/core/sqf/monitor/linux/msgdef.h
index 3f5c8c8..3532ac8 100644
--- a/core/sqf/monitor/linux/msgdef.h
+++ b/core/sqf/monitor/linux/msgdef.h
@@ -61,9 +61,11 @@
// NOTE: Increase with caution as this number
// is also used to gather local CPU statistics
// and a large number may degrade performance
-#define MAX_NODES 256 // This can be higher when needed and will
+#define MAX_NODES TC_NODES_MAX // This can be higher when needed and will
// have performance implications
- // Increment by 64 to match node state bitmask
+ // NOTE: Must increment by 64 to match node state
+ // bitmask. See trafconfig.h TC_NODES_MAX in
+ // Trafodion Configuration API
#define MAX_LNODES_PER_NODE 1 // The 1 is a per physical node limit
// (it can be more, but it is not currently used)
#define MAX_LNODES (MAX_NODES*MAX_LNODES_PER_NODE)
@@ -211,20 +213,9 @@ typedef enum {
RoleType_Aggregation = 0x0002, // Maps to ZoneType_Aggregation, Backend or Any
RoleType_Storage = 0x0004 // Maps to ZoneType_Storage, Backend or Any
} RoleType;
-#if 0
-typedef enum {
- ZoneType_Undefined = 0x0000, // No zone type defined
- ZoneType_Edge = 0x0001, // Zone of service only nodes
- ZoneType_Aggregation = 0x0002, // Zone of compute only nodes
- ZoneType_Storage = 0x0004, // Zone of storage only nodes
- ZoneType_Excluded = 0x0010, // Excluded cores
- ZoneType_Any = ( ZoneType_Edge | ZoneType_Aggregation | ZoneType_Storage ),
- ZoneType_Frontend = ( ZoneType_Edge | ZoneType_Aggregation ),
- ZoneType_Backend = ( ZoneType_Aggregation | ZoneType_Storage )
-} ZoneType;
-#else
+
typedef TcZoneType_t ZoneType;
-#endif
+
// Service Request types
// note: other data structures depend on the ordering of the REQTYPE elements.
// if the ordering changes corresponding changes must be made to
@@ -330,32 +321,9 @@ typedef enum {
// types, add any new message types
// before this one
} MSGTYPE;
-#if 0
-typedef enum {
- ProcessType_Undefined=0, // No process type as been defined
- ProcessType_TSE, // Identifies a Table Storage Engine (DP2)
- ProcessType_DTM, // Identifies a Distributed Transaction Monitor process
- ProcessType_ASE, // Identifies a Audit Storage Engine (ADP)
- ProcessType_Generic, // Identifies a generic process
- ProcessType_Watchdog, // Identifies the monitor's watchdog processes
- ProcessType_AMP, // Identifies a AMP process
- ProcessType_Backout, // Identifies a Backout process
- ProcessType_VolumeRecovery, // Identifies a Volume Recovery process
- ProcessType_MXOSRVR, // Identifies a MXOSRVR process
- ProcessType_SPX, // Identifies a SeaPilot ProXy process
- ProcessType_SSMP, // Identifies a SQL Statistics Merge Process (SSMP)
- ProcessType_PSD, // Identifies the monitor's process start daemon processes
- ProcessType_SMS, // Identifies a SeaMonster Service process
- ProcessType_TMID, // Identifies a Transaction Management ID process
- ProcessType_PERSIST, // Identifies a generic persistent process
-
- ProcessType_Invalid // marks the end of the process
- // types, add any new process
- // types before this one
-} PROCESSTYPE;
-#else
+
typedef TcProcessType_t PROCESSTYPE;
-#endif
+
typedef enum {
ShutdownLevel_Undefined=-1,
ShutdownLevel_Normal=0, // Wait for all transactions and processes to end
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/monitor/linux/pnode.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx
index 8aa0671..75c2137 100644
--- a/core/sqf/monitor/linux/pnode.cxx
+++ b/core/sqf/monitor/linux/pnode.cxx
@@ -1977,6 +1977,13 @@ void CNodeContainer::UnpackNodeMappings( intBuffPtr_t &buffer, int nodeMapCount
int pnid, pnidConfig;
+ // lock sync thread since we are making a change the monitor's
+ // operational view of the cluster
+ if ( !Emulate_Down )
+ {
+ Monitor->EnterSyncCycle();
+ }
+
for (int count = 0; count < nodeMapCount; count++)
{
pnidConfig = *buffer++;
@@ -1991,6 +1998,12 @@ void CNodeContainer::UnpackNodeMappings( intBuffPtr_t &buffer, int nodeMapCount
UpdateCluster();
+ // unlock sync thread
+ if ( !Emulate_Down )
+ {
+ Monitor->ExitSyncCycle();
+ }
+
TRACE_EXIT;
return;
}
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/monitor/linux/reqprocinfo.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqprocinfo.cxx b/core/sqf/monitor/linux/reqprocinfo.cxx
index f7eeefe..4148f36 100644
--- a/core/sqf/monitor/linux/reqprocinfo.cxx
+++ b/core/sqf/monitor/linux/reqprocinfo.cxx
@@ -287,11 +287,14 @@ void CExtProcInfoReq::performRequest()
verifier_ = msg_->u.request.u.process_info.verifier;
processName_ = msg_->u.request.u.process_info.process_name;
+ int pnid = -1;
int target_nid = -1;
int target_pid = -1;
string target_process_name;
Verifier_t target_verifier = -1;
- CProcess *requester = NULL;
+ CClusterConfig *clusterConfig = NULL;
+ CLNodeConfig *lnodeConfig = NULL;
+ CProcess *requester = NULL;
target_nid = msg_->u.request.u.process_info.target_nid;
target_pid = msg_->u.request.u.process_info.target_pid;
@@ -390,42 +393,54 @@ void CExtProcInfoReq::performRequest()
, msg_->u.request.u.process_info.type);
}
- if (target_pid == -1)
+ clusterConfig = Nodes->GetClusterConfig();
+ if (clusterConfig)
{
- // get info for all processes in node
- if (target_nid >= 0 && target_nid < Nodes->GetLNodesConfigMax())
+ if (clusterConfig->IsConfigReady())
{
- count = ProcessInfo_BuildReply(Nodes->GetNode(target_nid)->GetFirstProcess(),
- msg_,
- msg_->u.request.u.process_info.type,
- false,
- msg_->u.request.u.process_info.target_process_pattern);
- }
- }
- else
- {
- // get info for single process in node
- if ((requester->GetType() == ProcessType_TSE ||
- requester->GetType() == ProcessType_ASE ||
- requester->GetType() == ProcessType_AMP) &&
- (requester->GetNid() == target_nid &&
- requester->GetPid() == target_pid))
- {
- ProcessInfo_CopyData(requester,
- msg_->u.reply.u.process_info.process[0]);
- count = 1;
- }
- else if (target_nid >= 0 && target_nid < Nodes->GetLNodesConfigMax())
- { // find by nid/pid (check node state, don't check process state, backup is Ok)
- CProcess *process = Nodes->GetProcess( target_nid
- , target_pid
- , target_verifier
- , true, false, true );
- if (process)
+ lnodeConfig = clusterConfig->GetLNodeConfig( target_nid );
+ if (lnodeConfig)
{
- ProcessInfo_CopyData(process,
- msg_->u.reply.u.process_info.process[0]);
- count = 1;
+
+ if (target_pid == -1)
+ {
+ // get info for all processes in node
+ if (target_nid >= 0 && target_nid < Nodes->GetLNodesConfigMax())
+ {
+ count = ProcessInfo_BuildReply(Nodes->GetNode(target_nid)->GetFirstProcess(),
+ msg_,
+ msg_->u.request.u.process_info.type,
+ false,
+ msg_->u.request.u.process_info.target_process_pattern);
+ }
+ }
+ else
+ {
+ // get info for single process in node
+ if ((requester->GetType() == ProcessType_TSE ||
+ requester->GetType() == ProcessType_ASE ||
+ requester->GetType() == ProcessType_AMP) &&
+ (requester->GetNid() == target_nid &&
+ requester->GetPid() == target_pid))
+ {
+ ProcessInfo_CopyData(requester,
+ msg_->u.reply.u.process_info.process[0]);
+ count = 1;
+ }
+ else if (target_nid >= 0 && target_nid < Nodes->GetLNodesConfigMax())
+ { // find by nid/pid (check node state, don't check process state, backup is Ok)
+ CProcess *process = Nodes->GetProcess( target_nid
+ , target_pid
+ , target_verifier
+ , true, false, true );
+ if (process)
+ {
+ ProcessInfo_CopyData(process,
+ msg_->u.reply.u.process_info.process[0]);
+ count = 1;
+ }
+ }
+ }
}
}
}
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/monitor/linux/reqqueue.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqqueue.cxx b/core/sqf/monitor/linux/reqqueue.cxx
index becb0cd..e4e8dbd 100644
--- a/core/sqf/monitor/linux/reqqueue.cxx
+++ b/core/sqf/monitor/linux/reqqueue.cxx
@@ -2355,7 +2355,6 @@ void CIntReviveReq::performRequest()
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
trace_printf("%s@%d - Spare Nodes List unpacked\n", method_name, __LINE__);
- //Nodes->UnpackNodeMappings( (intBuffPtr_t&)buffer, header.nodeMapCount_ );
Nodes->UnpackNodeMappings( (intBuffPtr_t&)buffer, header.nodeMapCount_ );
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
@@ -2469,8 +2468,8 @@ void CIntSnapshotReq::performRequest()
}
// estimate size of snapshot buffer
- // about 100 bytes per process, 2 times total
- int procSize = Nodes->ProcessCount() * 2 * 100;
+ // about 500 bytes per process, 2 times total
+ int procSize = Nodes->ProcessCount() * 2 * 500;
int idsSize = Nodes->GetSNodesCount() * sizeof(int); // spare pnids
idsSize += (Nodes->GetPNodesCount() + Nodes->GetLNodesCount()) * sizeof(int); // pnid/nid map
idsSize += Nodes->GetLNodesCount() * sizeof(int); // nids
@@ -2481,7 +2480,7 @@ void CIntSnapshotReq::performRequest()
mem_log_write(MON_REQQUEUE_SNAPSHOT_4, procSize, idsSize);
- snapshotBuf = (char *) malloc (procSize + idsSize);
+ snapshotBuf = (char *) malloc (procSize + idsSize);
if (!snapshotBuf)
{
@@ -2497,6 +2496,7 @@ void CIntSnapshotReq::performRequest()
clock_gettime(CLOCK_REALTIME, &startTime);
+ memset( snapshotBuf, 0, (procSize + idsSize) );
char *buf = snapshotBuf;
CCluster::snapShotHeader_t header;
@@ -2550,6 +2550,7 @@ void CIntSnapshotReq::performRequest()
return;
}
+ memset( compBuf, 0, compSize );
z_result = compress((Bytef *)compBuf, (unsigned long *)&compSize,
(Bytef *)snapshotBuf, header.fullSize_);
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/monitor/linux/shell.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/shell.cxx b/core/sqf/monitor/linux/shell.cxx
index 5ad73ab..7d7db50 100644
--- a/core/sqf/monitor/linux/shell.cxx
+++ b/core/sqf/monitor/linux/shell.cxx
@@ -1102,15 +1102,16 @@ void recv_notice_msg(struct message_def *recv_msg, int )
switch (recv_msg->type )
{
case MsgType_Change:
- printf ("[%s] Configuration Change Notice for Group: %s Key: %s\n",
- MyName,
+ printf ("[%s] %s - Configuration Change Notice for Group: %s Key: %s Value: %s\n",
+ MyName, time_string(),
recv_msg->u.request.u.change.group,
- recv_msg->u.request.u.change.key);
+ recv_msg->u.request.u.change.key,
+ recv_msg->u.request.u.change.value);
break;
case MsgType_Event:
- printf("[%s] Event %d received\n",
- MyName, recv_msg->u.request.u.event_notice.event_id);
+ printf("[%s] %s - Event %d received\n",
+ MyName, time_string(), recv_msg->u.request.u.event_notice.event_id);
break;
case MsgType_NodeAdded:
@@ -1177,8 +1178,8 @@ void recv_notice_msg(struct message_def *recv_msg, int )
break;
case MsgType_NodeDown:
- printf ("[%s] Node %d (%s) is DOWN\n",
- MyName, recv_msg->u.request.u.down.nid,
+ printf ("[%s] %s - Node %d (%s) is DOWN\n",
+ MyName, time_string(), recv_msg->u.request.u.down.nid,
recv_msg->u.request.u.down.node_name );
NodeState[recv_msg->u.request.u.down.nid] = false;
@@ -1211,15 +1212,15 @@ void recv_notice_msg(struct message_def *recv_msg, int )
case MsgType_NodePrepare:
- printf("[%s] Node %s (%d) node-up preparation, takeover=%s\n",
- MyName, recv_msg->u.request.u.prepare.node_name,
+ printf("[%s] %s - Node %s (%d) node-up preparation, takeover=%s\n",
+ MyName, time_string(), recv_msg->u.request.u.prepare.node_name,
recv_msg->u.request.u.prepare.nid,
((recv_msg->u.request.u.prepare.takeover)? "true": "false"));
break;
case MsgType_NodeQuiesce:
- printf ("[%s] Node %d (%s) is QUIESCEd\n",
- MyName, msg->u.request.u.quiesce.nid,
+ printf ("[%s] %s - Node %d (%s) is QUIESCEd\n",
+ MyName, time_string(), msg->u.request.u.quiesce.nid,
msg->u.request.u.quiesce.node_name );
NodeState[msg->u.request.u.quiesce.nid] = false;
if ( waitDeathPending )
@@ -1248,15 +1249,15 @@ void recv_notice_msg(struct message_def *recv_msg, int )
case MsgType_ProcessCreated:
if ( recv_msg->u.request.u.process_created.return_code == MPI_SUCCESS )
{
- printf ("[%s] Process %s successfully created. Nid=%d, Pid=%d\n",
- MyName, recv_msg->u.request.u.process_created.process_name,
+ printf ("[%s] %s - Process %s successfully created. Nid=%d, Pid=%d\n",
+ MyName, time_string(), recv_msg->u.request.u.process_created.process_name,
recv_msg->u.request.u.process_created.nid,
recv_msg->u.request.u.process_created.pid);
}
else
{
- printf ("[%s] Process %s NOT created. Nid=%d, Pid=%d\n",
- MyName, recv_msg->u.request.u.process_created.process_name,
+ printf ("[%s] %s - Process %s NOT created. Nid=%d, Pid=%d\n",
+ MyName, time_string(), recv_msg->u.request.u.process_created.process_name,
recv_msg->u.request.u.process_created.nid,
recv_msg->u.request.u.process_created.pid);
}
@@ -1265,15 +1266,15 @@ void recv_notice_msg(struct message_def *recv_msg, int )
case MsgType_ProcessDeath:
if ( recv_msg->u.request.u.death.aborted )
{
- printf ("[%s] Process %s abnormally terminated. Nid=%d, Pid=%d\n",
- MyName, recv_msg->u.request.u.death.process_name,
+ printf ("[%s] %s - Process %s abnormally terminated. Nid=%d, Pid=%d\n",
+ MyName, time_string(), recv_msg->u.request.u.death.process_name,
recv_msg->u.request.u.death.nid,
recv_msg->u.request.u.death.pid);
}
else
{
- printf ("[%s] Process %s terminated normally. Nid=%d, Pid=%d\n",
- MyName, recv_msg->u.request.u.death.process_name,
+ printf ("[%s] %s - Process %s terminated normally. Nid=%d, Pid=%d\n",
+ MyName, time_string(), recv_msg->u.request.u.death.process_name,
recv_msg->u.request.u.death.nid,
recv_msg->u.request.u.death.pid);
}
@@ -1298,18 +1299,18 @@ void recv_notice_msg(struct message_def *recv_msg, int )
break;
case MsgType_Shutdown:
- printf("[%s] Shutdown notice, level=%d received\n",
- MyName, recv_msg->u.request.u.shutdown.level);
+ printf("[%s] %s - Shutdown notice, level=%d received\n",
+ MyName, time_string(), recv_msg->u.request.u.shutdown.level);
nodePendingComplete();
break;
case MsgType_TmSyncAbort:
- printf("[%s] TmSync abort notice received\n",
- MyName);
+ printf("[%s] %s - TmSync abort notice received\n",
+ MyName, time_string());
break;
case MsgType_TmSyncCommit:
- printf("[%s] TmSync commit notice received\n",
- MyName);
+ printf("[%s] %s - TmSync commit notice received\n",
+ MyName, time_string());
break;
case MsgType_ReintegrationError:
@@ -1321,8 +1322,8 @@ void recv_notice_msg(struct message_def *recv_msg, int )
break;
default:
- printf("[%s] Unexpected notice type(%d) received\n",
- MyName, recv_msg->type);
+ printf("[%s] %s - Unexpected notice type(%d) received\n",
+ MyName, time_string(), recv_msg->type);
}
@@ -3868,7 +3869,7 @@ int node_up( int nid, char *node_name, bool nowait )
// If this is a real cluster
if ( nid == -1 )
{
- // Get current physical state of target nodes
+ // Get current physical state of all nodes
if ( !update_node_state( node_name, false ) )
{
return( rc ) ;
@@ -4133,7 +4134,8 @@ void persist_config( char *prefix )
}
if (!foundConfig)
{
- printf ("[%s] Persistent process configuration does not exist\n", MyName);
+ printf("[%s] %s - Persistent process configuration does not exist\n"
+ , MyName, time_string() );
}
}
@@ -4213,7 +4215,8 @@ void persist_info( char *prefix )
}
if (!foundConfig)
{
- printf ("[%s] Persistent process configuration does not exist\n", MyName);
+ printf("[%s] %s - Persistent process configuration does not exist\n"
+ , MyName, time_string() );
}
}
@@ -4268,7 +4271,9 @@ bool persist_process_kill( CPersistConfig *persistConfig )
, persistZones );
if ( !find_process( processName ) )
{
- printf( "Persistent process %s does not exist\n", processName);
+ printf( "[%s] %s - Persistent process %s does not exist\n"
+ , MyName, time_string()
+ , processName );
continue;
}
kill_process( -1, -1, processName, true );
@@ -4288,7 +4293,9 @@ bool persist_process_kill( CPersistConfig *persistConfig )
, persistZones );
if ( !find_process( processName ) )
{
- printf( "Persistent process %s does not exist\n", processName);
+ printf( "[%s] %s - Persistent process %s does not exist\n"
+ , MyName, time_string()
+ , processName );
break;
}
kill_process( -1, -1, processName, true );
@@ -4306,7 +4313,9 @@ bool persist_process_kill( CPersistConfig *persistConfig )
, persistZones );
if ( !find_process( processName ) )
{
- printf( "Persistent process %s does not exist\n", processName);
+ printf( "[%s] %s - Persistent process %s does not exist\n"
+ , MyName, time_string()
+ , processName );
break;
}
kill_process( -1, -1, processName, true );
@@ -4377,7 +4386,9 @@ bool persist_process_start( CPersistConfig *persistConfig )
, persistZones );
if ( find_process( processName ) )
{
- printf( "Persistent process %s already exists\n", processName);
+ printf( "[%s] %s - Persistent process %s already exists\n"
+ , MyName, time_string()
+ , processName );
continue;
}
if (programArgc)
@@ -4403,7 +4414,9 @@ bool persist_process_start( CPersistConfig *persistConfig )
//, (char *)persistConfig->GetProgramName() );
if (pid > 0)
{
- printf( "Persistent process %s created\n", processName);
+ printf( "[%s] %s - Persistent process %s created\n"
+ , MyName, time_string()
+ , processName );
if (process_type == ProcessType_DTM)
{
DTMexists = true;
@@ -4431,7 +4444,9 @@ bool persist_process_start( CPersistConfig *persistConfig )
, persistZones );
if ( find_process( processName ) )
{
- printf( "Persistent process %s already exists\n", processName);
+ printf( "[%s] %s - Persistent process %s already exists\n"
+ , MyName, time_string()
+ , processName );
break;
}
if (programArgc)
@@ -4456,7 +4471,9 @@ bool persist_process_start( CPersistConfig *persistConfig )
, programNameAndArgs );
if (pid > 0)
{
- printf( "Persistent process %s created\n", processName);
+ printf( "[%s] %s - Persistent process %s created\n"
+ , MyName, time_string()
+ , processName );
if (process_type == ProcessType_DTM)
{
DTMexists = true;
@@ -4482,7 +4499,9 @@ bool persist_process_start( CPersistConfig *persistConfig )
, persistZones );
if ( find_process( processName ) )
{
- printf( "Persistent process %s already exists\n", processName);
+ printf( "[%s] %s - Persistent process %s already exists\n"
+ , MyName, time_string()
+ , processName );
break;
}
if (programArgc)
@@ -4507,7 +4526,9 @@ bool persist_process_start( CPersistConfig *persistConfig )
, programNameAndArgs );
if (pid > 0)
{
- printf( "Persistent process %s created\n", processName);
+ printf( "[%s] %s - Persistent process %s created\n"
+ , MyName, time_string()
+ , processName );
if (process_type == ProcessType_DTM)
{
DTMexists = true;
@@ -4516,8 +4537,8 @@ bool persist_process_start( CPersistConfig *persistConfig )
else
{
if ( trace_settings & TRACE_SHELL_CMD )
- trace_printf("%s@%d [%s] persistexec failed!\n",
- method_name, __LINE__, MyName);
+ trace_printf("%s@%d [%s] persist exec failed!\n"
+ , method_name, __LINE__, MyName);
}
break;
default:
@@ -6177,7 +6198,7 @@ void help_cmd (void)
printf ("[%s] -- persist exec <persist-process-prefix>\n", MyName);
printf ("[%s] -- persist info [<persist-process-prefix>]\n", MyName);
printf ("[%s] -- persist kill <persist-process-prefix>\n", MyName);
- printf ("[%s] -- ps [{CS|DTM|GEN|PSD|SMS|SSMP|WDG}] [<process_name>|<nid,pid>]\n", MyName);
+ printf ("[%s] -- ps [{CS|DTM|GEN|PSD|SMS|SSMP|WDG}] [<nid>|<process_name>|<nid,pid>]\n", MyName);
printf ("[%s] -- pwd\n", MyName);
printf ("[%s] -- quit\n", MyName);
printf ("[%s] -- scanbufs\n", MyName);
@@ -6372,6 +6393,7 @@ void monstats_cmd (char *)
void node_cmd (char *cmd_tail)
{
int nid;
+ int pnid;
char token[MAX_TOKEN];
char delimiter;
char *cmd = cmd_tail;
@@ -6423,7 +6445,7 @@ void node_cmd (char *cmd_tail)
{
sprintf( msgString, "[%s] Node add is not available with Virtual Nodes!",MyName);
write_startup_log( msgString );
- printf ("[%s] Node add is not available with Virtual Nodes!\n", MyName);
+ printf ("%s\n", msgString);
}
else
{
@@ -6474,7 +6496,7 @@ void node_cmd (char *cmd_tail)
{
sprintf( msgString, "[%s] Node delete is not available with Virtual Nodes!",MyName);
write_startup_log( msgString );
- printf ("[%s] Node delete is not available with Virtual Nodes!\n", MyName);
+ printf ("%s\n", msgString);
}
else
{
@@ -6487,7 +6509,7 @@ void node_cmd (char *cmd_tail)
{
sprintf( msgString, "[%s] Node delete is not enabled, to enable export SQ_ELASTICY_ENABLED=1",MyName);
write_startup_log( msgString );
- printf ("[%s] Node delete is not enabled, to enable export SQ_ELASTICY_ENABLED=1\n", MyName);
+ printf ("%s\n", msgString);
}
}
}
@@ -6516,15 +6538,15 @@ void node_cmd (char *cmd_tail)
if ( *cmd )
{
nid = atoi (cmd);
- if ((!isNumeric(cmd)) || (nid >= LNodesConfigMax) || (nid < 0))
+ pnid = get_pnid_by_nid( nid );
+ if ( pnid == -1 )
{
- printf ("[%s] Invalid nid\n", MyName);
- }
- else
- {
- node_info(nid);
- CurNodes = NumLNodes-NumDown;
+ printf( "[%s] Node id %d does not exist in configuration!\n"
+ , MyName, nid );
+ return;
}
+ node_info(nid);
+ CurNodes = NumLNodes-NumDown;
}
else
{
@@ -6546,7 +6568,7 @@ void node_cmd (char *cmd_tail)
{
sprintf( msgString, "[%s] Node name is not available with Virtual Nodes!",MyName);
write_startup_log( msgString );
- printf ("[%s] Node name is not available with Virtual Nodes!\n", MyName);
+ printf ("%s\n", msgString);
}
else
{
@@ -6559,7 +6581,7 @@ void node_cmd (char *cmd_tail)
{
sprintf( msgString, "[%s] Node name is not enabled, to enable export SQ_ELASTICY_ENABLED=1",MyName);
write_startup_log( msgString );
- printf ("[%s] Node name is not enabled, to enable export SQ_ELASTICY_ENABLED=1\n", MyName);
+ printf ("%s\n", msgString);
}
}
}
@@ -6714,9 +6736,9 @@ void node_config_cmd( char *cmd )
char *cmd_tail = cmd;
char delim;
- char msgString[MAX_BUFFER] = { 0 };
char token[MAX_TOKEN] = { 0 };
int nid = -1;
+ int pnid = -1;
if ( trace_settings & TRACE_SHELL_CMD )
trace_printf ("%s@%d [%s] processing node config command.\n",
@@ -6730,33 +6752,22 @@ void node_config_cmd( char *cmd )
if ( isNumeric( token ) )
{
nid = atoi (token);
- if (nid < 0 || nid > LNodesConfigMax - 1)
+ pnid = get_pnid_by_nid( nid );
+ if ( pnid == -1 )
{
- sprintf( msgString, "[%s] Node id is not configured!",MyName);
- write_startup_log( msgString );
- printf ("%s\n", msgString);
- return;
+ printf( "[%s] Node id %d does not exist in configuration!\n"
+ , MyName, nid );
+ return;
}
- snprintf( msgString, sizeof(msgString)
- , "[%s] Executing node config. (nid=%s)"
- , MyName, token );
- write_startup_log( msgString );
}
else
{
if ( get_node_name( token ) != 0 )
{
- sprintf( msgString, "[%s] Node %s is not configured!"
- , MyName, token);
- write_startup_log( msgString );
- printf( "[%s] Node %s is not configured!\n"
- , MyName, token);
+ printf( "[%s] Node %s does not exist in configuration!\n"
+ , MyName, token );
return;
}
- snprintf( msgString, sizeof(msgString)
- , "[%s] Executing node config. (node_name=%s)"
- , MyName, token );
- write_startup_log( msgString );
}
}
@@ -6793,11 +6804,10 @@ void node_delete_cmd( char *cmd )
{
if ( get_node_name( token ) != 0 )
{
- sprintf( msgString, "[%s] Node %s is not configured!"
+ sprintf( msgString, "[%s] Node %s does not exist in configuration!"
, MyName, token);
write_startup_log( msgString );
- printf( "[%s] Node %s is not configured!\n"
- , MyName, token);
+ printf ("%s\n", msgString);
return;
}
STRCPY(node_name, token);
@@ -6805,13 +6815,14 @@ void node_delete_cmd( char *cmd )
, "[%s] Executing node delete. (node_name=%s)"
, MyName, node_name );
write_startup_log( msgString );
+ printf ("%s\n", msgString);
}
}
else
{
sprintf( msgString, "[%s] Invalid node delete options syntax!",MyName);
write_startup_log( msgString );
- printf ("[%s] Invalid node delete options syntax!\n", MyName);
+ printf ("%s\n", msgString);
return;
}
@@ -6824,6 +6835,7 @@ void node_down_cmd( char *cmd )
int numLNodes = -1;
int nid;
+ int pnid;
char *cmd_tail = cmd;
char delim;
char msgString[MAX_BUFFER] = { 0 };
@@ -6852,41 +6864,45 @@ void node_down_cmd( char *cmd )
}
write_startup_log( msgString );
printf ("%s\n", msgString);
+
nid = atoi (token);
- if (nid < 0 || nid > LNodesConfigMax - 1)
+ pnid = get_pnid_by_nid( nid );
+ if ( pnid == -1 )
{
- sprintf( msgString, "[%s] Invalid node id!",MyName);
+ sprintf( msgString, "[%s] Node id %d does not exist in configuration!"
+ , MyName, nid);
write_startup_log( msgString );
printf ("%s\n", msgString);
- return;
+ return;
}
}
else
{
- if ( get_node_name( token ) != 0 )
- {
- sprintf( msgString, "[%s] Node %s is not configured!"
- , MyName, token);
- write_startup_log( msgString );
- printf ("%s\n", msgString);
- return;
- }
- STRCPY(node_name, token);
- nid = get_first_nid( node_name );
if (cmd_tail[0] != 0)
{
snprintf( msgString, sizeof(msgString)
, "[%s] Executing node down. (node_name=%s) \"%s\""
- , MyName, node_name, cmd_tail );
+ , MyName, token, cmd_tail );
}
else
{
snprintf( msgString, sizeof(msgString)
, "[%s] Executing node down. (node_name=%s)"
- , MyName, node_name );
+ , MyName, token );
}
write_startup_log( msgString );
printf ("%s\n", msgString);
+
+ if ( get_node_name( token ) != 0 )
+ {
+ sprintf( msgString, "[%s] Node %s does not exist in configuration!"
+ , MyName, token);
+ write_startup_log( msgString );
+ printf ("%s\n", msgString);
+ return;
+ }
+ STRCPY(node_name, token);
+ nid = get_first_nid( node_name );
}
numLNodes = get_lnodes_count( nid );
@@ -6895,7 +6911,6 @@ void node_down_cmd( char *cmd )
return;
}
- int pnid;
int zid = -1;
STATE state;
@@ -6907,7 +6922,7 @@ void node_down_cmd( char *cmd )
{
sprintf( msgString, "[%s] Node is already down! (nid=%d, state=%s)\n", MyName, nid, StateString(state) );
write_startup_log( msgString );
- printf ("[%s] Node is already down! (nid=%d, state=%s)\n", MyName, nid, StateString(state) );
+ printf ("%s\n", msgString);
return;
}
else
@@ -6918,7 +6933,7 @@ void node_down_cmd( char *cmd )
{
sprintf( msgString, "[%s] Multiple logical nodes in physical node. Use <nid> '!' to down all logical nodes in physical node\n", MyName);
write_startup_log( msgString );
- printf ("[%s] Multiple logical nodes in physical node. Use <nid> '!' to down all logical nodes in physical node\n", MyName);
+ printf ("%s\n", msgString);
return;
}
}
@@ -7058,7 +7073,7 @@ void node_up_cmd( char *cmd, char delimiter )
{
sprintf( msgString, "[%s] Invalid up options syntax!",MyName);
write_startup_log( msgString );
- printf ("[%s] Invalid up options syntax!\n", MyName);
+ printf ("%s\n", msgString);
delimiter = ' ';
break;
}
@@ -7068,7 +7083,7 @@ void node_up_cmd( char *cmd, char delimiter )
{
sprintf( msgString, "[%s] Invalid up syntax!",MyName);
write_startup_log( msgString );
- printf ("[%s] Invalid up syntax!\n", MyName);
+ printf ("%s\n", msgString);
}
else if (delimiter == '}')
{
@@ -7080,6 +7095,11 @@ void node_up_cmd( char *cmd, char delimiter )
{
if ( VirtualNodes )
{
+ sprintf( msgString, "[%s] Executing node up. (nid=%s)"
+ , MyName, cmd_tail);
+ write_startup_log( msgString );
+ printf ("%s\n", msgString);
+
get_token( cmd_tail, token, &delim );
if ( isNumeric( token ) )
{
@@ -7088,7 +7108,7 @@ void node_up_cmd( char *cmd, char delimiter )
{
sprintf( msgString, "[%s] Invalid node id!",MyName);
write_startup_log( msgString );
- printf ("[%s] Invalid node id!\n", MyName);
+ printf ("%s\n", msgString);
}
else
{
@@ -7100,27 +7120,47 @@ void node_up_cmd( char *cmd, char delimiter )
{
sprintf( msgString, "[%s] Invalid node id!",MyName);
write_startup_log( msgString );
- printf ("[%s] Invalid node id!\n", MyName);
+ printf ("%s\n", msgString);
}
}
else
{
- if ( get_node_name( cmd_tail ) == 0 )
+ sprintf( msgString, "[%s] Executing node up. (node=%s)"
+ , MyName, cmd_tail);
+ write_startup_log( msgString );
+ printf ("%s\n", msgString);
+
+ get_token( cmd_tail, token, &delim );
+ if ( isNumeric( token ) )
+ {
+ sprintf( msgString, "[%s] Invalid node name (%s)!"
+ , MyName, token);
+ write_startup_log( msgString );
+ printf ("%s\n", msgString);
+ return;
+ }
+ else
{
- if ( ClusterConfig.GetStorageType() == TCDBSQLITE)
+ if ( get_node_name( token ) == 0 )
{
- if ( copy_config_db( cmd_tail ) == 0 )
+ if ( ClusterConfig.GetStorageType() == TCDBSQLITE)
{
- node_up( -1, cmd_tail, nowait );
+ if ( copy_config_db( cmd_tail ) != 0 )
+ {
+ return;
+ }
}
}
+ else
+ {
+ sprintf( msgString, "[%s] Node %s does not exist in configuration!"
+ , MyName, token);
+ write_startup_log( msgString );
+ printf ("%s\n", msgString);
+ return;
+ }
}
- else
- {
- sprintf( msgString, "[%s] Invalid node name!",MyName);
- write_startup_log( msgString );
- printf ("[%s] Invalid node name!\n", MyName);
- }
+ node_up( -1, cmd_tail, nowait );
}
}
}
@@ -7485,6 +7525,7 @@ void ps_cmd (char *cmd_tail, char delimiter)
{
int nid;
int pid;
+ int pnid;
char process_name[MAX_PROCESS_NAME];
char token[MAX_TOKEN];
PROCESSTYPE process_type = ProcessType_Undefined;
@@ -7546,7 +7587,7 @@ void ps_cmd (char *cmd_tail, char delimiter)
}
}
- // check if we have a process <name> or <nid,pid>
+ // check if we have a process <name> or <nid> or <nid,pid>
if (isdigit (*cmd_tail))
{
cmd_tail = get_token (cmd_tail, token, &delimiter);
@@ -7558,7 +7599,15 @@ void ps_cmd (char *cmd_tail, char delimiter)
}
else
{
- printf ("[%s] Invalid process Nid,Pid!\n", MyName);
+ nid = atoi (token);
+ pid = -1;
+ //printf ("[%s] Invalid process Nid,Pid!\n", MyName);
+ //return;
+ }
+ pnid = get_pnid_by_nid( nid );
+ if ( pnid == -1 )
+ {
+ printf( "[%s] Invalid node, nid=%d\n", MyName, nid );
return;
}
}
@@ -8251,8 +8300,6 @@ bool process_command( char *token, char *cmd_tail, char delimiter )
}
else if (strcmp (token, "up") == 0)
{
- sprintf( msgString, "[%s] Executing node up. (node=%s)",MyName,cmd_tail);
- write_startup_log( msgString );
if (Started)
{
node_up_cmd( cmd_tail, delimiter );
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/monitor/test/runtest
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/test/runtest b/core/sqf/monitor/test/runtest
index 9c08949..8c1193b 100755
--- a/core/sqf/monitor/test/runtest
+++ b/core/sqf/monitor/test/runtest
@@ -102,7 +102,8 @@ fi
#
# Setup test execution
#
-export PATH=$PATH:$PWD/Linux-x86_64/64/dbg
+ARCH=`arch`
+export PATH=$PATH:$PWD/Linux-${ARCH}/64/dbg
cd $TRAF_HOME/monitor/test
echo $PWD
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/sqenvcom.sh
----------------------------------------------------------------------
diff --git a/core/sqf/sqenvcom.sh b/core/sqf/sqenvcom.sh
index eec2a3b..7058637 100644
--- a/core/sqf/sqenvcom.sh
+++ b/core/sqf/sqenvcom.sh
@@ -707,6 +707,13 @@ export SQ_MON_EPOLL_RETRY_COUNT=4
# Trafodion Configuration Zookeeper store
#export TC_ZCONFIG_SESSION_TIMEOUT=120
+# increase SQ_MON,ZCLIENT,WDT timeout only to jenkins env.
+if [[ "$TRAF_HOME" == *"/home/jenkins"* ]]; then
+export SQ_MON_EPOLL_WAIT_TIMEOUT=20
+export SQ_MON_ZCLIENT_SESSION_TIMEOUT=360
+export SQ_WDT_KEEPALIVETIMERVALUE=360
+fi
+
# set to 0 to disable phandle verifier
export SQ_PHANDLE_VERIFIER=1
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/sql/scripts/sqnodes.pm
----------------------------------------------------------------------
diff --git a/core/sqf/sql/scripts/sqnodes.pm b/core/sqf/sql/scripts/sqnodes.pm
index 36d8f0c..0d09565 100644
--- a/core/sqf/sql/scripts/sqnodes.pm
+++ b/core/sqf/sql/scripts/sqnodes.pm
@@ -279,10 +279,10 @@ sub verifyParse
displayStmt($stmtOk);
print " Error: node-id not specified\n";
}
- elsif ($nodeId > 255)
+ elsif ($nodeId > 1023)
{
displayStmt($stmtOk);
- print " Error: node-id must be in the range 0..255.\n";
+ print " Error: node-id must be in the range 0..1023.\n";
}
if (@cores == 0)
{
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/src/tm/tm_internal.h
----------------------------------------------------------------------
diff --git a/core/sqf/src/tm/tm_internal.h b/core/sqf/src/tm/tm_internal.h
index bca0f77..cf735b2 100644
--- a/core/sqf/src/tm/tm_internal.h
+++ b/core/sqf/src/tm/tm_internal.h
@@ -46,7 +46,6 @@
#define MAX_FILE_NAME 64
#define MAX_NUM_TRANS 1024
-#define MAX_NODES 256
#define MAX_SYNC_TXS 50
#define MAX_RECEIVE_BUFFER 200000
// low number for testing
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/src/tm/tmlibmsg.h
----------------------------------------------------------------------
diff --git a/core/sqf/src/tm/tmlibmsg.h b/core/sqf/src/tm/tmlibmsg.h
index c8eea04..1049b7d 100644
--- a/core/sqf/src/tm/tmlibmsg.h
+++ b/core/sqf/src/tm/tmlibmsg.h
@@ -43,8 +43,8 @@
//#include "dumapp.h"
+#include "trafconf/trafconfig.h"
#include "dtm/tm.h"
-
#include "dtm/xa.h"
#include "rm.h"
#include "../../inc/fs/feerrors.h" //legacy error codes for SQL
@@ -86,7 +86,7 @@
#define MAX_NUM_TRANS 5000
#define STEADYSTATE_LOW_TRANS 5
#define STEADYSTATE_HIGH_TRANS 1000
-#define MAX_NODES 256
+#define MAX_NODES TC_NODES_MAX
#define MAX_SYNC_TXS 50
#define MAX_TXN_TAGS UINT_MAX
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/src/tm/tools/dtmci.cpp
----------------------------------------------------------------------
diff --git a/core/sqf/src/tm/tools/dtmci.cpp b/core/sqf/src/tm/tools/dtmci.cpp
index fff798f..5d19734 100644
--- a/core/sqf/src/tm/tools/dtmci.cpp
+++ b/core/sqf/src/tm/tools/dtmci.cpp
@@ -50,8 +50,6 @@ DEFINE_EXTERN_COMP_DOVERS(dtmci)
const char ga_timestamp[] = "v 3.1.0, Nov 26, 2014";
-#define MAX_NODES 256
-
using namespace std;
extern const char *ms_getenv_str(const char *pp_key);
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/src/tm/tools/pwd.cpp
----------------------------------------------------------------------
diff --git a/core/sqf/src/tm/tools/pwd.cpp b/core/sqf/src/tm/tools/pwd.cpp
index f95c126..97606a2 100644
--- a/core/sqf/src/tm/tools/pwd.cpp
+++ b/core/sqf/src/tm/tools/pwd.cpp
@@ -47,8 +47,6 @@ DEFINE_EXTERN_COMP_DOVERS(dtmci)
const char ga_timestamp[] = "v 3.1.0, Nov 26, 2014";
-#define MAX_NODES 256
-
using namespace std;
extern const char *ms_getenv_str(const char *pp_key);
http://git-wip-us.apache.org/repos/asf/trafodion/blob/887051c8/core/sqf/src/tm/tools/tmshutdown.cpp
----------------------------------------------------------------------
diff --git a/core/sqf/src/tm/tools/tmshutdown.cpp b/core/sqf/src/tm/tools/tmshutdown.cpp
index 5961c5b..b0ca8b0 100644
--- a/core/sqf/src/tm/tools/tmshutdown.cpp
+++ b/core/sqf/src/tm/tools/tmshutdown.cpp
@@ -37,7 +37,6 @@
// Version
DEFINE_EXTERN_COMP_DOVERS(tmshutdown)
-#define MAX_NODES 256
#define MAX_ARGLEN 32
using namespace std;
[2/6] trafodion git commit: [TRAFODION-2883] Preliminary Scale
Enhacements - Added timestamps to node down system message - Added timestamps
and values to registry change notifications - Fixed monitor trace causing
memory overwrites - Implemented AGENT m
Posted by db...@apache.org.
[TRAFODION-2883] Preliminary Scale Enhacements
- Added timestamps to node down system message
- Added timestamps and values to registry change notifications
- Fixed monitor trace causing memory overwrites
- Implemented AGENT mode monitor functionality
o This is a pre reliminary change to remove dependency on OpenMPI during
initialization of operational cluster by creating a cluster of one node
(MASTER monitor) where other remote nodes (SLAVE monitors) join the
cluster through the MASTER
- Implemented MASTER monitor selection logic
- Scale bug fixes found when creating clusters greater than 120 nodes-
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/bded0e84
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/bded0e84
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/bded0e84
Branch: refs/heads/master
Commit: bded0e843f8b600a5459c5353bbdc9f59d6d6551
Parents: 887051c
Author: Zalo Correa <za...@esgyn.com>
Authored: Tue Feb 27 17:34:25 2018 -0800
Committer: Zalo Correa <za...@esgyn.com>
Committed: Tue Feb 27 17:34:25 2018 -0800
----------------------------------------------------------------------
.../export/include/common/evl_sqlog_eventnum.h | 14 +
core/sqf/monitor/linux/cluster.cxx | 291 +++++++++++++--
core/sqf/monitor/linux/cluster.h | 7 +
core/sqf/monitor/linux/commaccept.cxx | 108 +++---
core/sqf/monitor/linux/mlio.cxx | 8 +-
core/sqf/monitor/linux/monitor.cxx | 370 ++++++++++++++++---
core/sqf/monitor/linux/monitor.h | 2 -
core/sqf/monitor/linux/pnode.cxx | 55 +--
core/sqf/monitor/linux/process.cxx | 75 +++-
core/sqf/monitor/linux/reqqueue.cxx | 1 +
core/sqf/monitor/linux/zclient.cxx | 343 ++++++++++++++++-
core/sqf/monitor/linux/zclient.h | 5 +
core/sqf/sqenvcom.sh | 12 +
core/sqf/sql/scripts/sqnodes.pm | 4 +-
core/sqf/src/trafconf/clusterconf.cpp | 10 +
core/sqf/src/trafconf/clusterconf.h | 4 +
16 files changed, 1131 insertions(+), 178 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/export/include/common/evl_sqlog_eventnum.h
----------------------------------------------------------------------
diff --git a/core/sqf/export/include/common/evl_sqlog_eventnum.h b/core/sqf/export/include/common/evl_sqlog_eventnum.h
index 96c3df9..0418c70 100644
--- a/core/sqf/export/include/common/evl_sqlog_eventnum.h
+++ b/core/sqf/export/include/common/evl_sqlog_eventnum.h
@@ -255,6 +255,12 @@
#define MON_MONITOR_MAIN_9 101020109
#define MON_MONITOR_MAIN_10 101020110
#define MON_MONITOR_MAIN_11 101020111
+#define MON_MONITOR_MAIN_12 101020112
+#define MON_MONITOR_MAIN_13 101020113
+#define MON_MONITOR_MAIN_14 101020114
+#define MON_MONITOR_MAIN_15 101020115
+#define MON_MONITOR_MAIN_16 101020116
+#define MON_MONITOR_MAIN_17 101020117
#define MON_MONITOR_TMLEADER_1 101020201
#define MON_MONITOR_TMLEADER_2 101020202
#define MON_MONITOR_DEATH_HANDLER_1 101020301
@@ -895,6 +901,14 @@
#define MON_ZCLIENT_ISZNODEEXPIRED_2 101371802
#define MON_ZCLIENT_CHECKMYZNODE_1 101371901
#define MON_ZCLIENT_CHECKMYZNODE_2 101371902
+#define MON_ZCLIENT_AMICONFIGUREDMASTER_1 101372101
+#define MON_ZCLIENT_AMICONFIGUREDMASTER_2 101372102
+#define MON_ZCLIENT_WAITFORANDRETURNMASTER 101372103
+#define MON_ZCLIENT_CREATEMASTERZNODE 101372104
+#define MON_ZCLIENT_WATCHMASTERNODEDELETE_1 101372105
+#define MON_ZCLIENT_WATCHMASTERNODEDELETE_2 101372106
+#define MON_ZCLIENT_WATCHMASTERNODEDELETE_3 101372107
+#define MON_ZCLIENT_CREATEORSETMASTERWATCH 101372108
/* Module: zconfig.cxx = 38 */
#define ZCONFIG_ZCONFIG_1 101380101
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/cluster.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx
index 49697dd..d1b3e91 100644
--- a/core/sqf/monitor/linux/cluster.cxx
+++ b/core/sqf/monitor/linux/cluster.cxx
@@ -67,6 +67,11 @@ using namespace std;
extern bool IAmIntegrating;
extern bool IAmIntegrated;
extern bool IsRealCluster;
+extern bool IsAgentMode;
+extern bool IsMaster;
+extern bool IsMPIChild;
+extern char MasterMonitorName[MAX_PROCESS_PATH];
+extern char Node_name[MPI_MAX_PROCESSOR_NAME];
extern bool ZClientEnabled;
extern char IntegratingMonitorPort[MPI_MAX_PORT_NAME];
extern char MyCommPort[MPI_MAX_PORT_NAME];
@@ -289,11 +294,12 @@ void CCluster::NodeTmReady( int nid )
if (trace_settings & (TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
- trace_printf( "%s@%d - TmReady, nid=%d, tm count=%d, soft node down=%d\n"
+ trace_printf( "%s@%d - TmReady, nid=%d, tm count=%d, soft node down=%d, LNodesCount=%d\n"
, method_name, __LINE__
, nid
, tmReadyCount_
- , MyNode->IsSoftNodeDown() );
+ , MyNode->IsSoftNodeDown()
+ , MyNode->GetLNodesCount() );
}
MyNode->StartPStartDPersistentDTM( nid );
@@ -352,6 +358,131 @@ void CCluster::NodeReady( CNode *spareNode )
TRACE_EXIT;
}
+// Assign leaders as required
+// Current leaders are TM Leader and Monitor Leader
+void CCluster::AssignLeaders( int pnid, bool checkProcess )
+{
+ const char method_name[] = "CCluster::AssignLeaders";
+ TRACE_ENTRY;
+
+ AssignTmLeader ( pnid, checkProcess );
+ AssignMonitorLeader ( pnid );
+
+ TRACE_EXIT;
+}
+
+// Assign montior lead in the case of failure
+void CCluster::AssignMonitorLeader( int pnid )
+{
+ const char method_name[] = "CCluster::AssignMonitorLeader";
+ TRACE_ENTRY;
+
+ int i = 0;
+ int rc = 0;
+
+ int lMonitorLeaderPNid = MonitorLeaderPNid;
+ CNode *node = NULL;
+
+ if (MonitorLeaderPNid != pnid)
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+ {
+ trace_printf( "%s@%d" " - (MasterMonitor) returning, pnid %d != monitorLead %d\n"
+ , method_name, __LINE__, pnid, MonitorLeaderPNid );
+ }
+ return;
+ }
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+ {
+ trace_printf( "%s@%d" " - (MasterMonitor) Node " "%d" " MonitorLeader failed!\n"
+ , method_name, __LINE__, MonitorLeaderPNid );
+ }
+
+ for (i=0; i<GetConfigPNodesMax(); i++)
+ {
+ lMonitorLeaderPNid++;
+
+ if (lMonitorLeaderPNid == GetConfigPNodesMax())
+ {
+ lMonitorLeaderPNid = 0; // restart with nid 0
+ }
+
+ if (lMonitorLeaderPNid == pnid)
+ {
+ continue; // this is the node that is going down, skip it
+ }
+
+ if (Node[lMonitorLeaderPNid] == NULL)
+ {
+ continue;
+ }
+
+ node = Node[lMonitorLeaderPNid];
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+ {
+ trace_printf( "%s@%d - Node pnid=%d (%s), phase=%s, isSoftNodeDown=%d\n"
+ , method_name, __LINE__
+ , node->GetPNid()
+ , node->GetName()
+ , NodePhaseString(node->GetPhase())
+ , node->IsSoftNodeDown());
+ }
+
+ if ( node->IsSpareNode() ||
+ node->IsSoftNodeDown() ||
+ node->GetState() != State_Up ||
+ node->GetPhase() != Phase_Ready )
+ {
+ continue; // skip this node for any of the above reasons
+ }
+
+ MonitorLeaderPNid = node->GetPNid();
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+ {
+ trace_printf("%s@%d" " - Node " "%d" " is the new MonitorLeaderPNid." "\n", method_name, __LINE__, MonitorLeaderPNid);
+ }
+
+ if (ZClientEnabled)
+ {
+ rc = ZClient->CreateMasterZNode ( node->GetName() );
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+ {
+ trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader CreateMasterZNode with rc = %d\n", method_name, __LINE__, rc);
+ }
+ if ( (rc == ZOK) || (rc == ZNODEEXISTS) )
+ {
+ if ( IsAgentMode )
+ {
+ rc = ZClient->WatchMasterNode( node->GetName( ) );
+ if ( trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC) )
+ {
+ trace_printf( "%s@%d" " (MasterMonitor) AssignMonitorLeader WatchMasterNode with rc = %d\n", method_name, __LINE__, rc );
+ }
+ }
+ }
+ else
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
+ {
+ trace_printf("%s@%d" " (MasterMonitor) AssignMonitorLeader Unable to set create or set watch\n", method_name, __LINE__);
+ }
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], Unable to set create or set watch on master node %s\n"
+ , method_name, node->GetName() );
+ mon_log_write(MON_ZCLIENT_CREATEORSETMASTERWATCH, SQ_LOG_ERR, buf);
+ }
+ }
+
+ break;
+ }
+
+ TRACE_EXIT;
+}
+
// Assigns a new TMLeader if given pnid is same as TmLeaderNid
// TmLeader is a logical node num.
// pnid has gone down, so if that node was previously the TM leader, a new one needs to be chosen.
@@ -494,6 +625,7 @@ CCluster::CCluster (void)
configPNodesMax_ (-1),
NodeMap (NULL),
TmLeaderNid (-1),
+ MonitorLeaderPNid (-1),
tmReadyCount_(0),
minRecvCount_(4096),
recvBuffer_(NULL),
@@ -529,6 +661,7 @@ CCluster::CCluster (void)
const char method_name[] = "CCluster::CCluster";
TRACE_ENTRY;
+ configMaster_ = -1;
MPI_Comm_set_errhandler(MPI_COMM_WORLD,MPI_ERRORS_RETURN);
char *env = getenv("SQ_MON_CHECK_SEQNUM");
@@ -548,6 +681,9 @@ CCluster::CCluster (void)
CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
configPNodesMax_ = clusterConfig->GetPNodesConfigMax();
+ // get master from CClusterConfig
+ configMaster_ = clusterConfig->GetConfigMaster();
+
// Compute minimum "sync cycles" per second. The minimum is 1/10
// the expected number, assuming "next_test_delay" cycles per second (where
// next_test_delay is in microseconds).
@@ -640,6 +776,21 @@ CCluster::~CCluster (void)
const char method_name[] = "CCluster::~CCluster";
TRACE_ENTRY;
+ if (epollFD_ != -1)
+ {
+ close( epollFD_ );
+ }
+
+ if (commSock_ != -1)
+ {
+ close( commSock_ );
+ }
+
+ if (syncSock_ != -1)
+ {
+ close( syncSock_ );
+ }
+
delete [] comms_;
delete [] otherMonRank_;
delete [] socks_;
@@ -677,26 +828,26 @@ unsigned long long CCluster::EnsureAndGetSeqNum(cluster_state_def_t nodestate[])
unsigned long long seqNum = 0;
- for (int i = 0; i < GetConfigPNodesMax(); i++)
+ for (int i = 0; i < GetConfigPNodesCount(); i++)
{
if (trace_settings & TRACE_RECOVERY)
{
- trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, i, nodestate[i].seq_num, seqNum );
+ trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, i, nodestate[indexToPnid_[i]].seq_num, seqNum );
}
- if (nodestate[i].seq_num > 1)
+ if (nodestate[indexToPnid_[i]].seq_num > 1)
{
if (seqNum == 0)
{
- seqNum = nodestate[i].seq_num;
+ seqNum = nodestate[indexToPnid_[i]].seq_num;
}
else
{
- assert(nodestate[i].seq_num == seqNum);
+ assert(nodestate[indexToPnid_[i]].seq_num == seqNum);
}
}
if (trace_settings & TRACE_RECOVERY)
{
- trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, i, nodestate[i].seq_num, seqNum );
+ trace_printf("%s@%d nodestate[%d].seq_num=%lld, seqNum=%lld\n", method_name, __LINE__, i, nodestate[indexToPnid_[i]].seq_num, seqNum );
}
}
@@ -857,6 +1008,7 @@ void CCluster::HardNodeDown (int pnid, bool communicate_state)
if ( ZClientEnabled )
{
ZClient->WatchNodeDelete( node->GetName() );
+ ZClient->WatchNodeMasterDelete( node->GetName() );
}
}
}
@@ -875,7 +1027,7 @@ void CCluster::HardNodeDown (int pnid, bool communicate_state)
if ( Emulate_Down )
{
IAmIntegrated = false;
- AssignTmLeader(pnid, false);
+ AssignLeaders(pnid, false);
}
TRACE_EXIT;
@@ -976,7 +1128,7 @@ void CCluster::SoftNodeDown( int pnid )
}
IAmIntegrated = false;
- AssignTmLeader(pnid, false);
+ AssignLeaders(pnid, false);
TRACE_EXIT;
}
@@ -1237,8 +1389,8 @@ int CCluster::HardNodeUp( int pnid, char *node_name )
TRACE_ENTRY;
if (trace_settings & (TRACE_REQUEST | TRACE_INIT | TRACE_RECOVERY))
- trace_printf( "%s@%d - pnid=%d, name=%s\n"
- , method_name, __LINE__, pnid, node_name );
+ trace_printf( "%s@%d - pnid=%d, name=%s (MyPNID = %d)\n"
+ , method_name, __LINE__, pnid, node_name, MyPNID );
if ( pnid == -1 )
{
@@ -2252,7 +2404,7 @@ void CCluster::HandleOtherNodeMsg (struct internal_msg_def *recv_msg,
{
case SyncType_TmData:
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
- trace_printf("%s@%d - TMSYNC(TmData) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
+ trace_printf("%s@%d - TMSYNC(TmData) on Node %s (pnid=%d), (phase=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid, MyNode->GetPhase());
if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready )
{
MyNode->CheckActivationPhase();
@@ -2871,16 +3023,49 @@ void CCluster::InitializeConfigCluster( void )
InitServerSock();
}
- // The new monitor in a real cluster initializes all
- // existing nodes to a down state.
- // ReIntegrate() will set the state to up when communication is established.
- if ( IAmIntegrating )
+ if (trace_settings & TRACE_INIT)
+ {
+ trace_printf( "%s@%d (MasterMonitor) IAmIntegrating=%d,"
+ " IsAgentMode=%d, IsMaster=%d,"
+ " MasterMonitorName=%s, Node_name=%s\n"
+ , method_name, __LINE__
+ , IAmIntegrating
+ , IsAgentMode, IsMaster, MasterMonitorName, Node_name );
+ }
+
+ if (IAmIntegrating || IsAgentMode)
{
+ int TmLeaderPNid = -1;
+ if (IsMaster)
+ {
+ TmLeaderNid = Nodes->GetFirstNid();
+ TmLeaderPNid = LNode[TmLeaderNid]->GetNode()->GetPNid();
+ }
+ // Non-master monitors in AGENT mode in a real cluster initialize all
+ // remote nodes to a down state. The master monitor and the joining
+ // monitors will set the joining node state to up as part of the node
+ // re-integration processing as monitor processes join the cluster
+ // through the master.
for (int i=0; i < clusterConfig->GetPNodesCount(); i++)
{
- if ( Node[indexToPnid_[i]] && Node[indexToPnid_[i]]->GetPNid() != MyPNID )
+ if (Node[indexToPnid_[i]])
{
- Node[indexToPnid_[i]]->SetState( State_Down );
+ if (Node[indexToPnid_[i]]->GetPNid() == MyPNID)
+ { // Set bit indicating node is up
+ upNodes_.upNodes[indexToPnid_[i]/MAX_NODE_BITMASK] |=
+ (1ull << (indexToPnid_[i]%MAX_NODE_BITMASK));
+ }
+ else
+ { // Set node state to down
+ Node[indexToPnid_[i]]->SetState( State_Down );
+ if (IsMaster)
+ {
+ if (TmLeaderPNid == indexToPnid_[i])
+ {
+ AssignTmLeader(indexToPnid_[i], false);
+ }
+ }
+ }
}
}
}
@@ -3060,6 +3245,23 @@ void CCluster::InitializeConfigCluster( void )
if (nodeNames) delete [] nodeNames;
}
+ if ( CommType == CommType_Sockets )
+ {
+ // Allgather() cluster sockets are established as remote
+ // monitor processes join the cluster
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ for ( int i =0; i < clusterConfig->GetPNodesCount() ; i++ )
+ {
+ trace_printf( "%s@%d %s (%d), state=%s, socks_[%d]=%d\n"
+ , method_name, __LINE__
+ , Node[indexToPnid_[i]]->GetName()
+ , Node[indexToPnid_[i]]->GetPNid()
+ , StateString(Node[indexToPnid_[i]]->GetState())
+ , indexToPnid_[i], socks_[indexToPnid_[i]]);
+ }
+ }
+ }
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
for ( int i =0; i < MAX_NODE_MASKS ; i++ )
@@ -3072,7 +3274,10 @@ void CCluster::InitializeConfigCluster( void )
// Kill the MPICH hydra_pmi_proxy to prevent it from killing all
// processes in cluster when mpirun or monitor processes are killed
- kill( getppid(), SIGKILL );
+ if (!IsAgentMode || (IsAgentMode && IsMPIChild))
+ {
+ kill( getppid(), SIGKILL );
+ }
TRACE_EXIT;
}
@@ -3807,10 +4012,31 @@ void CCluster::ReIntegrateSock( int initProblem )
TEST_POINT( TP010_NODE_UP );
// Connect with my creator monitor
- joinSock_ = Monitor->Connect( IntegratingMonitorPort );
- if ( joinSock_ < 0 )
+ bool lv_done = false;
+ bool lv_did_not_connect_in_first_attempt = false;
+ while ( ! lv_done )
{
- HandleReintegrateError( joinSock_, Reintegrate_Err1, -1, NULL, true );
+ joinSock_ = Monitor->Connect( IntegratingMonitorPort );
+ if ( joinSock_ < 0 )
+ {
+ if ( IsAgentMode )
+ {
+ lv_did_not_connect_in_first_attempt = true;
+ sleep( 15 );
+ }
+ else
+ {
+ HandleReintegrateError( joinSock_, Reintegrate_Err1, -1, NULL, true );
+ }
+ }
+ else
+ {
+ if ( lv_did_not_connect_in_first_attempt )
+ {
+ sleep( 10 );
+ }
+ lv_done = true;
+ }
}
mem_log_write(CMonLog::MON_REINTEGRATE_4, MyPNID);
@@ -4281,8 +4507,6 @@ void CCluster::SetIntegratingPNid( int pnid )
TRACE_ENTRY;
integratingPNid_ = pnid;
- // Indicate to the commAcceptor thread to stop accepting connections
- CommAccept.stopAccepting();
TRACE_EXIT;
}
@@ -6181,8 +6405,8 @@ void CCluster::HandleDownNode( int pnid )
if (trace_settings & TRACE_INIT)
trace_printf("%s@%d - Added down node to list, pnid=%d, name=(%s)\n", method_name, __LINE__, downNode->GetPNid(), downNode->GetName());
- // assign new TmLeader if TMLeader node is dead.
- AssignTmLeader(pnid, false);
+ // assign new leaders if needed
+ AssignLeaders(pnid, false);
// Build available list of spare nodes
CNode *spareNode;
@@ -8276,6 +8500,19 @@ int CCluster::MkSrvSock( int *pport )
return ( -1 );
}
+ int reuse = 1; // sockopt reuse option
+ if ( setsockopt( sock, SOL_SOCKET, SO_REUSEADDR, (char *) &reuse, sizeof(int) ) )
+ {
+ char la_buf[MON_STRING_BUF_SIZE];
+ int err = errno;
+ sprintf( la_buf, "[%s], setsockopt(SO_REUSEADDR) failed! errno=%d (%s)\n"
+ , method_name, err, strerror( err ));
+ mon_log_write(MON_CLUSTER_MKSRVSOCK_4, SQ_LOG_ERR, la_buf);
+ close( sock );
+ return ( -1 );
+ }
+
+
// Bind socket.
size = sizeof(sockinfo);
memset( (char *) &sockinfo, 0, size );
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/cluster.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.h b/core/sqf/monitor/linux/cluster.h
index 58d3540..6b658ae 100644
--- a/core/sqf/monitor/linux/cluster.h
+++ b/core/sqf/monitor/linux/cluster.h
@@ -113,7 +113,9 @@ public:
#ifndef USE_BARRIER
void ArmWakeUpSignal (void);
#endif
+ void AssignLeaders( int pnid, bool checkProcess );
void AssignTmLeader( int pnid, bool checkProcess );
+ void AssignMonitorLeader( int pnid );
void stats();
void CompleteSyncCycle()
{ syncCycle_.lock(); syncCycle_.wait(); syncCycle_.unlock(); }
@@ -124,6 +126,8 @@ public:
void ExpediteDown( void );
inline int GetTmLeader( void ) { return( TmLeaderNid); }
inline void SetTmLeader( int tmLeaderNid ) { TmLeaderNid = tmLeaderNid; }
+ inline int GetMonitorLeader( void ) { return( MonitorLeaderPNid); }
+ inline void SetMonitorLeader( int monitorLeaderPNid ) { MonitorLeaderPNid = monitorLeaderPNid; }
int GetDownedNid( void );
inline int GetTmSyncPNid( void ) { return( TmSyncPNid ); } // Physical Node ID of current TmSync operations master
void InitClusterComm(int worldSize, int myRank, int *rankToPnid);
@@ -177,6 +181,7 @@ public:
bool ReinitializeConfigCluster( bool nodeAdded, int pnid );
int incrGetVerifierNum();
+ int getConfigMaster() { return configMaster_; }
enum { SYNC_MAX_RESPONSIVE = 1 }; // Max seconds before sync thread is "stuck"
@@ -201,6 +206,7 @@ protected:
int syncSock_;
int epollFD_;
int *indexToPnid_;
+ int configMaster_;
CNode **Node; // array of nodes
CLNode **LNode; // array of logical nodes
@@ -229,6 +235,7 @@ private:
int configPNodesMax_; // max # of physical nodes that can be configured
int *NodeMap; // Mapping of Node ranks to COMM_WORLD ranks
int TmLeaderNid; // Nid of currently assigned TM Leader node
+ int MonitorLeaderPNid; // PNid of currently assigned Monitor leader node
int tmReadyCount_; // # of DTM processes ready for transactions
size_t minRecvCount_; // minimum size of receive buffer for allgather
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/commaccept.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/commaccept.cxx b/core/sqf/monitor/linux/commaccept.cxx
index 21b30a6..11c12d7 100644
--- a/core/sqf/monitor/linux/commaccept.cxx
+++ b/core/sqf/monitor/linux/commaccept.cxx
@@ -556,6 +556,25 @@ void CCommAccept::processNewSock( int joinFd )
node= Nodes->GetNode( nodeId.nodeName );
+ if ( node == NULL )
+ {
+ close( joinFd );
+
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], got connection from unknown "
+ "node %d (%s). Ignoring it.\n"
+ , method_name
+ , nodeId.pnid
+ , nodeId.nodeName);
+ mon_log_write(MON_COMMACCEPT_9, SQ_LOG_ERR, buf);
+
+ // Requests is complete, begin accepting connections again
+ CommAccept.startAccepting();
+
+ return;
+ }
+
if ( nodeId.ping )
{
// Reply with my node info
@@ -595,6 +614,10 @@ void CCommAccept::processNewSock( int joinFd )
, method_name, node?node->GetName():"", ErrorMsg(rc));
mon_log_write(MON_COMMACCEPT_19, SQ_LOG_ERR, buf);
}
+
+ // Requests is complete, begin accepting connections again
+ CommAccept.startAccepting();
+
return;
}
@@ -607,53 +630,6 @@ void CCommAccept::processNewSock( int joinFd )
, nodeId.creatorShellVerifier );
}
- int pnid = -1;
- if ( node != NULL )
- { // Store port numbers for the node
- char commPort[MPI_MAX_PORT_NAME];
- char syncPort[MPI_MAX_PORT_NAME];
- strncpy(commPort, nodeId.commPort, MPI_MAX_PORT_NAME);
- strncpy(syncPort, nodeId.syncPort, MPI_MAX_PORT_NAME);
- char *pch1;
- char *pch2;
- pnid = nodeId.pnid;
-
- node->SetCommPort( commPort );
- pch1 = strtok (commPort,":");
- pch1 = strtok (NULL,":");
- node->SetCommSocketPort( atoi(pch1) );
-
- node->SetSyncPort( syncPort );
- pch2 = strtok (syncPort,":");
- pch2 = strtok (NULL,":");
- node->SetSyncSocketPort( atoi(pch2) );
-
- if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
- {
- trace_printf( "%s@%d - Setting node %d (%s), commPort=%s(%d), syncPort=%s(%d)\n"
- , method_name, __LINE__
- , node->GetPNid()
- , node->GetName()
- , pch1, atoi(pch1)
- , pch2, atoi(pch2) );
- }
- }
- else
- {
- close( joinFd );
-
- char buf[MON_STRING_BUF_SIZE];
- snprintf( buf, sizeof(buf)
- , "[%s], got connection from unknown "
- "node %d (%s). Ignoring it.\n"
- , method_name
- , nodeId.pnid
- , nodeId.nodeName);
- mon_log_write(MON_COMMACCEPT_9, SQ_LOG_ERR, buf);
-
- return;
- }
-
// Sanity check, re-integrating node must be down
if ( node->GetState() != State_Down )
{
@@ -672,9 +648,43 @@ void CCommAccept::processNewSock( int joinFd )
, StateString(node->GetState()));
mon_log_write(MON_COMMACCEPT_10, SQ_LOG_ERR, buf);
+ // Requests is complete, begin accepting connections again
+ CommAccept.startAccepting();
+
return;
}
+ int pnid = -1;
+
+ // Store port numbers for the node
+ char commPort[MPI_MAX_PORT_NAME];
+ char syncPort[MPI_MAX_PORT_NAME];
+ strncpy(commPort, nodeId.commPort, MPI_MAX_PORT_NAME);
+ strncpy(syncPort, nodeId.syncPort, MPI_MAX_PORT_NAME);
+ char *pch1;
+ char *pch2;
+ pnid = nodeId.pnid;
+
+ node->SetCommPort( commPort );
+ pch1 = strtok (commPort,":");
+ pch1 = strtok (NULL,":");
+ node->SetCommSocketPort( atoi(pch1) );
+
+ node->SetSyncPort( syncPort );
+ pch2 = strtok (syncPort,":");
+ pch2 = strtok (NULL,":");
+ node->SetSyncSocketPort( atoi(pch2) );
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d - Setting node %d (%s), commPort=%s(%d), syncPort=%s(%d)\n"
+ , method_name, __LINE__
+ , node->GetPNid()
+ , node->GetName()
+ , pch1, atoi(pch1)
+ , pch2, atoi(pch2) );
+ }
+
mem_log_write(CMonLog::MON_CONNTONEWMON_4, pnid);
if ( MyNode->IsCreator() )
@@ -916,6 +926,8 @@ void CCommAccept::commAcceptorIB()
interComm = MPI_COMM_NULL;
rc = MPI_Comm_accept( MyCommPort, MPI_INFO_NULL, 0, MPI_COMM_SELF,
&interComm );
+ // Stop accepting connections until this request completes
+ CommAccept.stopAccepting();
}
else
{
@@ -988,6 +1000,8 @@ void CCommAccept::commAcceptorSock()
mem_log_write(CMonLog::MON_CONNTONEWMON_1);
joinFd = Monitor->AcceptCommSock();
+ // Stop accepting connections until this request completes
+ CommAccept.stopAccepting();
}
else
{
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/mlio.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/mlio.cxx b/core/sqf/monitor/linux/mlio.cxx
index 61803f8..7db35ec 100644
--- a/core/sqf/monitor/linux/mlio.cxx
+++ b/core/sqf/monitor/linux/mlio.cxx
@@ -1261,7 +1261,13 @@ SQ_LocalIOToClient::SQ_LocalIOToClient(int nid)
if (cmid == -1)
{
if (trace_settings & TRACE_INIT)
- trace_printf("%s@%d" " failed shmget(" "%d" "), errno=" "%d" "\n", method_name, __LINE__, (shsize), errno);
+ {
+ int err = errno;
+ char la_buf[MON_STRING_BUF_SIZE];
+ trace_printf( "%s@%d" " failed shmget(%d), errno=%d (%s)\n"
+ , method_name, __LINE__
+ , (shsize), err, strerror(err) );
+ }
if ( errno == EEXIST)
{
// and try getting it with a smaller size
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/monitor.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/monitor.cxx b/core/sqf/monitor/linux/monitor.cxx
index 70df7cc..124b1ff 100755
--- a/core/sqf/monitor/linux/monitor.cxx
+++ b/core/sqf/monitor/linux/monitor.cxx
@@ -53,6 +53,7 @@ using namespace std;
#include "tmsync.h"
#include "cluster.h"
#include "monitor.h"
+#include "props.h"
#ifdef DMALLOC
#include "dm.h"
@@ -99,12 +100,16 @@ char MySyncPort[MPI_MAX_PORT_NAME] = {'\0'};
char Node_name[MPI_MAX_PROCESSOR_NAME] = {'\0'};
sigset_t SigSet;
bool Emulate_Down = false;
-long next_test_delay = 10000; // in usec.
-
+long next_test_delay = 100000; // in usec. (default 100 msec)
+CClusterConfig *ClusterConfig = NULL;
bool IAmIntegrating = false;
bool IAmIntegrated = false;
char IntegratingMonitorPort[MPI_MAX_PORT_NAME] = {'\0'};
bool IsRealCluster = true;
+bool IsAgentMode = false;
+bool IsMaster = false;
+bool IsMPIChild = false;
+char MasterMonitorName[MAX_PROCESS_PATH]= {'\0'};
CommType_t CommType = CommType_Undefined;
bool SMSIntegrating = false;
int CreatorShellPid = -1;
@@ -865,9 +870,9 @@ void HandleNodeExpiration( const char *nodeName )
TRACE_EXIT;
}
-void CMonitor::CreateZookeeperClient( void )
+void CreateZookeeperClient( void )
{
- const char method_name[] = "CMonitor::CreateZookeeperClient";
+ const char method_name[] = "CreateZookeeperClient";
TRACE_ENTRY;
if ( ZClientEnabled )
@@ -961,9 +966,9 @@ void CMonitor::CreateZookeeperClient( void )
TRACE_EXIT;
}
-void CMonitor::StartZookeeperClient( void )
+void StartZookeeperClient( void )
{
- const char method_name[] = "CMonitor::StartZookeeperClient";
+ const char method_name[] = "StartZookeeperClient";
TRACE_ENTRY;
int rc = -1;
@@ -1043,19 +1048,71 @@ int main (int argc, char *argv[])
char temp_fname[MAX_PROCESS_PATH];
char buf[MON_STRING_BUF_SIZE];
unsigned int initSleepTime = 1; // 1 second
+
mallopt(M_ARENA_MAX, 4); // call to limit the number of arena's of monitor to 4.This call doesn't seem to have any effect !
CALL_COMP_DOVERS(monitor, argc, argv);
const char method_name[] = "main";
+ if (argc < 2) {
+ printf("error: monitor needs an argument...exitting...\n");
+ exit(0);
+ }
+
+ int lv_arg_index = 1;
+ while ( lv_arg_index < argc )
+ {
+ // Installations like Cloudera Manager, the monitor is started in AGENT mode
+ if ( strcmp( argv[lv_arg_index], "COLD_AGENT" ) == 0 )
+ {
+ IsAgentMode = true;
+ }
+
+ lv_arg_index++;
+ }
+
// Set flag to indicate whether we are operating in a real cluster
// or a virtual cluster. This is used throughout the monitor when
// behavior differs for a real vs. virtual cluster environment.
- if ( getenv("SQ_VIRTUAL_NODES") )
+ if ( !IsAgentMode )
{
- IsRealCluster = false;
- Emulate_Down = true;
+ if ( getenv( "SQ_VIRTUAL_NODES" ) )
+ {
+ IsRealCluster = false;
+ Emulate_Down = true;
+ }
+ if (IsRealCluster)
+ {
+ // The monitor processes may be started by MPIrun utility
+ env = getenv("SQ_MON_CREATOR");
+ if ( env != NULL && strcmp(env, "MPIRUN") == 0 )
+ {
+ IsMPIChild = true;
+ }
+ // The monitor can be set to run in AGENT mode
+ env = getenv("SQ_MON_RUN_MODE");
+ if ( env != NULL && strcmp(env, "AGENT") == 0 )
+ {
+ IsAgentMode = true;
+ }
+ }
+ }
+
+ if ( IsAgentMode )
+ {
+ MON_Props xprops( true );
+ xprops.load( "monitor.env" );
+ MON_Smap_Enum xenum( &xprops );
+ while ( xenum.more( ) )
+ {
+ char *xkey = xenum.next( );
+ const char *xvalue = xprops.get( xkey );
+ if ( xkey && xkey[0] && xvalue )
+ {
+ setenv( xkey, xvalue, 1 );
+ }
+ }
}
MonLog = new CMonLog( "log4cxx.monitor.mon.config", "MON", "alt.mon", -1, -1, getpid(), "$MONITOR" );
@@ -1240,7 +1297,7 @@ int main (int argc, char *argv[])
abort();
}
- if (argc > 3 && strcmp (argv[2], "-integrate") == 0)
+ if ((!IsAgentMode) && (argc > 3 && strcmp (argv[2], "-integrate") == 0))
{
switch( CommType )
{
@@ -1257,13 +1314,13 @@ int main (int argc, char *argv[])
}
break;
case CommType_Sockets:
- if ( isdigit (*argv[3]) )
+ if ( IsAgentMode || isdigit (*argv[3]) )
{
// In agent mode and when re-integrating (node up), all
// monitors processes start as a cluster of 1 and join to the
// creator monitor to establish the real cluster.
- // Therefore, MyPNID will always be zero when in and
- // it is necessary to use the node name to obtain the correct
+ // Therefore, MyPNID will always be zero them it is
+ // necessary to use the node name to obtain the correct
// <pnid> from the configuration which occurs when creating the
// CMonitor object down below. By setting MyPNID to -1, when the
// CCluster::InitializeConfigCluster() invoked during the creation
@@ -1306,8 +1363,15 @@ int main (int argc, char *argv[])
// Trace cannot be specified on startup command but need to
// check for trace environment variable settings.
MonTrace->mon_trace_init("0", NULL);
+
+ }
+
+ if (IsAgentMode)
+ {
+ CreatorShellPid = 1000; // per monitor.sh
+ CreatorShellVerifier = 0;
}
- else
+
if (argc == 3 && isdigit(*argv[2]) )
{
MonTrace->mon_trace_init(argv[2], "STDOUT");
@@ -1398,8 +1462,12 @@ int main (int argc, char *argv[])
MonStats->MonitorBusyIncr();
snprintf(buf, sizeof(buf),
- "[CMonitor::main], %s, Started! CommType: %s\n"
- , CALL_COMP_GETVERS2(monitor), CommTypeString( CommType ));
+ "[CMonitor::main], %s, Started! CommType: %s (%s%s%s)\n"
+ , CALL_COMP_GETVERS2(monitor)
+ , CommTypeString( CommType )
+ , IsRealCluster?"RealCluster":"VirtualCluster"
+ , IsAgentMode?"/AgentMode":""
+ , IsMPIChild?"/MPIChild":"" );
mon_log_write(MON_MONITOR_MAIN_3, SQ_LOG_INFO, buf);
#ifdef DMALLOC
@@ -1420,11 +1488,230 @@ int main (int argc, char *argv[])
// Create thread for monitoring redirected i/o.
// This is also used for monitor logs, so start it early.
Redirector.start();
+
+ // Create global configuration now
+ ClusterConfig = new CClusterConfig();
+ if (ClusterConfig)
+ {
+ bool traceEnabled = (trace_settings & TRACE_TRAFCONFIG) ? true : false;
+ if (ClusterConfig->Initialize( traceEnabled, MonTrace->getTraceFileName()))
+ {
+ if (!ClusterConfig->LoadConfig())
+ {
+ char la_buf[MON_STRING_BUF_SIZE];
+ sprintf(la_buf, "[%s], Failed to load cluster configuration.\n", method_name);
+ mon_log_write(MON_MONITOR_MAIN_12, SQ_LOG_CRIT, la_buf);
+
+ abort();
+ }
+ }
+ else
+ {
+ char la_buf[MON_STRING_BUF_SIZE];
+ sprintf(la_buf, "[%s], Failed to open cluster configuration.\n", method_name);
+ mon_log_write(MON_MONITOR_MAIN_13, SQ_LOG_CRIT, la_buf);
+
+ abort();
+ }
+ }
+ else
+ {
+ char la_buf[MON_STRING_BUF_SIZE];
+ sprintf(la_buf, "[%s], Failed to allocate cluster configuration.\n", method_name);
+ mon_log_write(MON_MONITOR_MAIN_14, SQ_LOG_CRIT, la_buf);
+
+ abort();
+ }
+
+ // Set up zookeeper and determine the master
+ if ( IsAgentMode || IsRealCluster )
+ {
+ // Zookeeper client is enabled only in a real cluster
+ env = getenv("SQ_MON_ZCLIENT_ENABLED");
+
+ if ( env )
+ {
+ if ( env && isdigit(*env) )
+ {
+ if ( strcmp(env,"0")==0 )
+ {
+ ZClientEnabled = false;
+ }
+ }
+ }
+
+ if ( ZClientEnabled )
+ {
+ CreateZookeeperClient( );
+ }
+ }
+ else
+ {
+ ZClientEnabled = false;
+ }
+
+ if (IsAgentMode)
+ {
+ if ((ZClientEnabled) && (ZClient != NULL))
+ {
+ // Do not wait, just see if one exists
+ const char *masterMonitor = ZClient->WaitForAndReturnMaster(false);
- // CNodeContainer loads static configuration from database
- Nodes = new CNodeContainer ();
+ if (masterMonitor)
+ {
+ strcpy (MasterMonitorName, masterMonitor);
+ // unfortunately, we have to do this to see if we are the master before
+ // other things are set up. This is how we must do that
+ if (strcmp(Node_name, masterMonitor) == 0)
+ {
+ IsMaster = true;
+ }
+ else
+ {
+ IsMaster = false;
+ }
+ }
+ else
+ {
+ strcpy (MasterMonitorName, ClusterConfig->GetConfigMasterByName());
+ if (strcmp (Node_name, ClusterConfig->GetConfigMasterByName()) == 0)
+ {
+ IsMaster = true;
+ }
+ else
+ {
+ IsMaster = false;
+ }
+ }
+
+ }
+ }
+
+ if (IsAgentMode)
+ {
+ if (!IsMaster)
+ {
+ MyPNID=-1;
+ SMSIntegrating = IAmIntegrating = true;
+ char *monitorPort = getenv ("MONITOR_COMM_PORT");
+ if (monitorPort)
+ {
+ strcpy( IntegratingMonitorPort, MasterMonitorName);
+ strcat( IntegratingMonitorPort, ":");
+ strcat( IntegratingMonitorPort, monitorPort);
+ }
+ if (trace_settings & TRACE_INIT)
+ {
+ trace_printf( "%s@%d (MasterMonitor) IsAgentMode = TRUE, I am NOT the master, "
+ "MyPNID=%d, master port=%s\n"
+ , method_name, __LINE__
+ , MyPNID, IntegratingMonitorPort );
+ }
+ }
+ else
+ {
+ if (trace_settings & TRACE_INIT)
+ {
+ trace_printf( "%s@%d (MasterMonitor) IsAgentMode = TRUE, I am the master, MyPNID=%d\n"
+ , method_name, __LINE__, MyPNID );
+ }
+ IAmIntegrating = false;
+ }
+ }
+ Nodes = new CNodeContainer ();
Config = new CConfigContainer ();
- Monitor = new CMonitor (procTermSig);
+ Monitor = new CMonitor (procTermSig);
+
+ if ( IsAgentMode )
+ {
+ if (trace_settings & TRACE_INIT)
+ {
+ trace_printf( "%s@%d MyPNID=%d\n"
+ , method_name, __LINE__, MyPNID );
+ }
+ MonLog->setPNid( MyPNID );
+ }
+
+ if (IsAgentMode)
+ {
+ CNode *myNode = Nodes->GetNode(MyPNID);
+ const char *masterMonitor=NULL;
+ if (myNode == NULL)
+ {
+ char la_buf[MON_STRING_BUF_SIZE];
+ sprintf( la_buf
+ , "[%s], Failed to get my Node, MyPNID=%d\n"
+ , method_name, MyPNID );
+ mon_log_write(MON_MONITOR_MAIN_15, SQ_LOG_CRIT, la_buf);
+
+ abort();
+ }
+
+ if ((ZClientEnabled) && (ZClient != NULL))
+ {
+ CNode *masterNode = Nodes->GetNode(MasterMonitorName);
+ if (!masterNode)
+ {
+ if (trace_settings & TRACE_INIT)
+ {
+ trace_printf("%s@%d (MasterMonitor) IsMaster == %d, masterNode is NULL, with MasterMonitorName %s\n", method_name, __LINE__, IsMaster, MasterMonitorName);
+ }
+ char la_buf[MON_STRING_BUF_SIZE];
+ sprintf(la_buf, "[%s], Failed to get my Master Node.\n", method_name);
+ mon_log_write(MON_MONITOR_MAIN_16, SQ_LOG_CRIT, la_buf);
+
+ abort();
+ }
+ else
+ {
+ if (trace_settings & TRACE_INIT)
+ {
+ trace_printf("%s@%d (MasterMonitor) IsMaster == %d, masterNode=%s\n", method_name, __LINE__, IsMaster, masterNode->GetName() );
+ }
+ }
+ Monitor->SetMonitorLeader( masterNode->GetPNid() );
+ if (MyPNID == masterNode->GetPNid())
+ {
+ ZClient->CreateMasterZNode ( myNode->GetName() );
+ strcpy (MasterMonitorName, myNode->GetName());
+ if (trace_settings & TRACE_INIT)
+ {
+ trace_printf("%s@%d (MasterMonitor) IsMaster == %d, set monitor lead to %d\n", method_name, __LINE__, IsMaster, MyPNID);
+ }
+ }
+ else
+ {
+ masterMonitor = ZClient->WaitForAndReturnMaster(true);
+ CNode *masterNode = NULL;
+ if (masterMonitor)
+ {
+ strcpy (MasterMonitorName, masterMonitor);
+ masterNode = Nodes->GetNode(MasterMonitorName);
+ }
+
+ if (masterNode)
+ {
+ if (trace_settings & TRACE_INIT)
+ {
+ trace_printf("%s@%d (MasterMonitor) IsMaster == %d, set monitor lead to %d\n", method_name, __LINE__, IsMaster, masterNode->GetPNid());
+ }
+ Monitor->SetMonitorLeader( masterNode->GetPNid() );
+ }
+ else
+ {
+ if (trace_settings & TRACE_INIT)
+ {
+ trace_printf("%s@%d (MasterMonitor) IsMaster == %d, masterNode is NULL, with MasterMonitorName %s\n", method_name, __LINE__, IsMaster, MasterMonitorName);
+ }
+ char la_buf[MON_STRING_BUF_SIZE];
+ sprintf(la_buf, "[%s], Failed to get my Master Node.\n", method_name);
+ mon_log_write(MON_MONITOR_MAIN_17, SQ_LOG_CRIT, la_buf);
+
+ abort();
+ }
+ }
+ }
+ }
if (!IAmIntegrating)
{
Config->Init ();
@@ -1493,7 +1780,6 @@ int main (int argc, char *argv[])
{
strcpy (Node_name, myNode->GetName());
}
-
// create with no caching, user read/write, group read/write, other read
fd = open( port_fname
, O_RDWR | O_TRUNC | O_CREAT | O_DIRECT
@@ -1539,7 +1825,6 @@ int main (int argc, char *argv[])
MPI_Abort(MPI_COMM_SELF,99);
}
free( ioBuffer );
-
int ret = SQ_theLocalIOToClient->initWorker();
if (ret)
{
@@ -1566,33 +1851,7 @@ int main (int argc, char *argv[])
printf("%s@%d" " RLIMIT_SIGPENDING cur=%d, max=%d\n", method_name, __LINE__, (int)Rl.rlim_cur, (int)Rl.rlim_max);
}
}
-
- if ( IsRealCluster )
- {
- // Zookeeper client is enabled only in a real cluster
- env = getenv("SQ_MON_ZCLIENT_ENABLED");
- if ( env )
- {
- if ( env && isdigit(*env) )
- {
- if ( strcmp(env,"0")==0 )
- {
- ZClientEnabled = false;
- }
- }
- }
-
- if ( ZClientEnabled )
- {
- Monitor->CreateZookeeperClient();
- }
- }
- else
- {
- ZClientEnabled = false;
- }
-
- if ( IAmIntegrating )
+ if ( IAmIntegrating )
{
// This monitor is integrating to (joining) an existing cluster
Monitor->ReIntegrate( 0 );
@@ -1602,7 +1861,7 @@ int main (int argc, char *argv[])
trace_printf("%s@%d" " After UpdateCluster" "\n", method_name, __LINE__);
}
else
- {
+ {
Monitor->EnterSyncCycle();
done = Monitor->exchangeNodeData();
Monitor->ExitSyncCycle();
@@ -1618,7 +1877,18 @@ int main (int argc, char *argv[])
{
if ( ZClientEnabled )
{
- Monitor->StartZookeeperClient();
+ {
+ StartZookeeperClient();
+ // Set watch for master
+ if (IsAgentMode)
+ {
+ ZClient->WatchMasterNode( MasterMonitorName );
+ }
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d (MasterMonitor) set watch for MasterMonitorName %s\n", method_name, __LINE__, MasterMonitorName );
+ }
+ }
}
}
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/monitor.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/monitor.h b/core/sqf/monitor/linux/monitor.h
index 1b44c57..49308b9 100644
--- a/core/sqf/monitor/linux/monitor.h
+++ b/core/sqf/monitor/linux/monitor.h
@@ -63,7 +63,6 @@ public:
~CMonitor( void );
bool CompleteProcessStartup( struct message_def *msg );
- void CreateZookeeperClient( void );
void IncOpenCount(void);
void IncNoticeCount(void);
void IncProcessCount(void);
@@ -71,7 +70,6 @@ public:
void DecrNoticeCount(void);
void DecrProcessCount(void);
void StartPrimitiveProcesses( void );
- void StartZookeeperClient( void );
void openProcessMap ( void );
void writeProcessMapEntry ( const char * buf );
void writeProcessMapBegin( const char *name
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/pnode.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx
index 75c2137..485d013 100644
--- a/core/sqf/monitor/linux/pnode.cxx
+++ b/core/sqf/monitor/linux/pnode.cxx
@@ -49,6 +49,8 @@ using namespace std;
#include "replicate.h"
#include "reqqueue.h"
+#include "healthcheck.h"
+
extern CReqQueue ReqQueue;
extern char MyPath[MAX_PROCESS_PATH];
extern int MyPNID;
@@ -64,9 +66,13 @@ extern CNode *MyNode;
extern CMonStats *MonStats;
extern CRedirector Redirector;
extern CReplicate Replicator;
+extern CHealthCheck HealthCheck;
extern CMonTrace *MonTrace;
-
+extern bool IsAgentMode;
extern bool IAmIntegrating;
+extern char MasterMonitorName[MAX_PROCESS_PATH];
+extern char Node_name[MPI_MAX_PROCESSOR_NAME];
+extern CClusterConfig *ClusterConfig;
const char *StateString( STATE state);
const char *SyncStateString( SyncState state);
@@ -464,13 +470,14 @@ void CNode::CheckActivationPhase( void )
int tmCount = 0;
CLNode *lnode;
CProcess *process;
- bool tmReady;
+ bool tmReady = false;
const char method_name[] = "CNode::CheckActivationPhase";
TRACE_ENTRY;
// check for a TM process in each lnode
lnode = GetFirstLNode();
+
tmReady = lnode ? true : false;
for ( ; lnode ; lnode = lnode->GetNextP() )
{
@@ -1701,8 +1708,11 @@ void CNodeContainer::AddNodes( )
}
else
{
- if (pnid >= maxNode) // only for workstation acting as single node
- rank = -1;
+ if (pnid >= maxNode) // only for workstation acting as single node
+// || (IsAgentMode &&(strcmp( MasterMonitorName, Node_name ) != 0)))
+ {
+ rank = -1; // -1 creates node in down state
+ }
node = new CNode( (char *)pnodeConfig->GetName(), pnid, rank );
assert( node != NULL );
}
@@ -3134,7 +3144,7 @@ void CNodeContainer::SetupCluster( CNode ***pnode_list, CLNode ***lnode_list, in
if ( node->GetState() == State_Up && node->IsSpareNode() )
{
spareNodesConfigList_.push_back( node );
- if ( IAmIntegrating )
+ if (IAmIntegrating)
{
// do nothing. spareNodesList will get populated in the join phase.
}
@@ -3166,40 +3176,11 @@ void CNodeContainer::LoadConfig( void )
const char method_name[] = "CNodeContainer::LoadConfig";
TRACE_ENTRY;
+ // The configuration is now global. To minimize impact for the time being, just set the local
+ // pointer to the global configuration
if ( !clusterConfig_ )
{
- clusterConfig_ = new CClusterConfig();
- }
- if ( clusterConfig_ )
- {
- bool traceEnabled = (trace_settings & TRACE_TRAFCONFIG) ? true : false;
- if ( clusterConfig_->Initialize( traceEnabled, MonTrace->getTraceFileName() ) )
- {
- if ( ! clusterConfig_->LoadConfig() )
- {
- char la_buf[MON_STRING_BUF_SIZE];
- sprintf(la_buf, "[%s], Failed to load cluster configuration.\n", method_name);
- mon_log_write(MON_NODECONT_LOAD_CONFIG_1, SQ_LOG_CRIT, la_buf);
-
- abort();
- }
- }
- else
- {
- char la_buf[MON_STRING_BUF_SIZE];
- sprintf(la_buf, "[%s], Failed to open cluster configuration.\n", method_name);
- mon_log_write(MON_NODECONT_LOAD_CONFIG_2, SQ_LOG_CRIT, la_buf);
-
- abort();
- }
- }
- else
- {
- char la_buf[MON_STRING_BUF_SIZE];
- sprintf(la_buf, "[%s], Failed to allocate cluster configuration.\n", method_name);
- mon_log_write(MON_NODECONT_LOAD_CONFIG_3, SQ_LOG_CRIT, la_buf);
-
- abort();
+ clusterConfig_ = ClusterConfig;
}
TRACE_EXIT;
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/process.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/process.cxx b/core/sqf/monitor/linux/process.cxx
index 6a8e08b..bce018b 100644
--- a/core/sqf/monitor/linux/process.cxx
+++ b/core/sqf/monitor/linux/process.cxx
@@ -72,6 +72,9 @@ extern CReqQueue ReqQueue;
#include "replicate.h"
+extern bool IsAgentMode;
+extern bool IsMaster;
+
extern bool PidMap;
extern int Measure;
extern int trace_level;
@@ -1651,13 +1654,39 @@ bool CProcess::Create (CProcess *parent, int & result)
}
string LDpath;
- if ( ldpathStrId_.nid != -1 )
- Config->strIdToString(ldpathStrId_, LDpath);
- if ( !LDpath.empty() )
+ static bool sv_getenv_ld_library_path_done = false;
+ static string sv_ld_library_path;
+ if (IsAgentMode)
{
- setEnvStrVal ( childEnv, nextEnv, "LD_LIBRARY_PATH", LDpath.c_str() );
+ if (! sv_getenv_ld_library_path_done)
+ {
+ sv_getenv_ld_library_path_done = true;
+ sv_ld_library_path = getenv( "LD_LIBRARY_PATH" );
+ if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
+ {
+ trace_printf( "%s@%d" " - LD_LIBRARY_PATH = " "%s" "\n", method_name, __LINE__, sv_ld_library_path.c_str() );
+ }
+ }
+ LDpath = sv_ld_library_path;
if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
- trace_printf("%s@%d - LD_LIBRARY_PATH = %s\n", method_name, __LINE__, LDpath.c_str());
+ {
+ trace_printf( "%s@%d" " - LD_LIBRARY_PATH = " "%s" "\n", method_name, __LINE__, LDpath.c_str() );
+ }
+ }
+ else
+ {
+ if (ldpathStrId_.nid != -1)
+ {
+ Config->strIdToString( ldpathStrId_, LDpath );
+ }
+ }
+ if (!LDpath.empty())
+ {
+ setEnvStrVal( childEnv, nextEnv, "LD_LIBRARY_PATH", LDpath.c_str( ) );
+ if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
+ {
+ trace_printf( "%s@%d - LD_LIBRARY_PATH = %s\n", method_name, __LINE__, LDpath.c_str() );
+ }
}
setEnvStr ( childEnv, nextEnv, "LD_BIND_NOW=true" );
@@ -1695,15 +1724,39 @@ bool CProcess::Create (CProcess *parent, int & result)
trace_printf("%s@%d - PWD=%s\n", method_name, __LINE__,
pwd.c_str());
}
-
-
string path;
- if ( pathStrId_.nid != -1 )
- Config->strIdToString( pathStrId_, path);
- setEnvStrVal ( childEnv, nextEnv, "PATH", path.c_str() );
+ static bool sv_getenv_path_done = false;
+ static string sv_path;
+ if (IsAgentMode)
+ {
+ if (! sv_getenv_path_done)
+ {
+ sv_getenv_path_done = true;
+ sv_path = getenv( "PATH" );
+ if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
+ {
+ trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, sv_path.c_str() );
+ }
+ }
+ path = sv_path;
+ if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
+ {
+ trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, path.c_str() );
+ }
+ }
+ else
+ {
+ if (pathStrId_.nid != -1)
+ {
+ Config->strIdToString( pathStrId_, path );
+ }
+ }
+ setEnvStrVal( childEnv, nextEnv, "PATH", path.c_str( ) );
if (trace_settings & (TRACE_SYNC_DETAIL | TRACE_REQUEST_DETAIL | TRACE_PROCESS_DETAIL))
- trace_printf("%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, path.c_str());
+ {
+ trace_printf( "%s@%d" " - PATH = " "%s" "\n", method_name, __LINE__, path.c_str() );
+ }
// Set values from registry as environment variables
setEnvFromRegistry ( childEnv, nextEnv );
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/reqqueue.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/reqqueue.cxx b/core/sqf/monitor/linux/reqqueue.cxx
index e4e8dbd..b4d2529 100644
--- a/core/sqf/monitor/linux/reqqueue.cxx
+++ b/core/sqf/monitor/linux/reqqueue.cxx
@@ -56,6 +56,7 @@ extern int req_type_startup;
extern bool IAmIntegrating;
extern bool IAmIntegrated;
+
extern CommType_t CommType;
CReqResource::CReqResource()
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/zclient.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/zclient.cxx b/core/sqf/monitor/linux/zclient.cxx
index 36a0600..107cf32 100644
--- a/core/sqf/monitor/linux/zclient.cxx
+++ b/core/sqf/monitor/linux/zclient.cxx
@@ -488,6 +488,103 @@ int CZClient::ZooExistRetry(zhandle_t *zh, const char *path, int watch, struct S
return rc;
}
+const char* CZClient::WaitForAndReturnMaster( bool doWait )
+{
+ const char method_name[] = "CZClient::WaitForAndReturnMaster";
+ TRACE_ENTRY;
+
+ bool found = false;
+ int rc = -1;
+ int retries = 0;
+ Stat stat;
+
+ struct String_vector nodes = {0, NULL};
+ stringstream ss;
+ ss.str( "" );
+ ss << zkRootNode_.c_str()
+ << zkRootNodeInstance_.c_str()
+ << ZCLIENT_MASTER_ZNODE;
+ string masterMonitor( ss.str( ) );
+
+ // wait for 3 minutes for giving up.
+ while ( (!found) && (retries < 180))
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d trafCluster=%s\n"
+ , method_name, __LINE__, masterMonitor.c_str() );
+ }
+ // Verify the existence of the parent ZCLIENT_MASTER_ZNODE
+ rc = ZooExistRetry( ZHandle, masterMonitor.c_str( ), 0, &stat );
+
+ if ( rc == ZNONODE )
+ {
+ if (doWait == false)
+ {
+ break;
+ }
+ continue;
+ }
+ else if ( rc == ZOK )
+ {
+ // Now get the list of available znodes in the cluster.
+ //
+ // This will return child znodes for each monitor process that has
+ // registered, including this process.
+ rc = zoo_get_children( ZHandle, masterMonitor.c_str( ), 0, &nodes );
+ if ( nodes.count > 0 )
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d nodes.count=%d\n"
+ , method_name, __LINE__
+ , nodes.count );
+ }
+ found = true;
+ }
+ else
+ {
+ if (doWait == false)
+ {
+ break;
+ }
+ usleep(1000000); // sleep for a second as to not overwhelm the system
+ retries++;
+ continue;
+ }
+ }
+
+ else // error
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d Error (MasterMonitor) WaitForAndReturnMaster returned rc (%d), retries %d\n"
+ , method_name, __LINE__, rc, retries );
+ }
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], ZooExistRetry() for %s failed with error %s\n"
+ , method_name, masterMonitor.c_str( ), zerror(rc));
+ mon_log_write(MON_ZCLIENT_WAITFORANDRETURNMASTER, SQ_LOG_ERR, buf);
+ break;
+ }
+ }
+
+ //should we assert nodes.count == 1?
+ if (found)
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d (MasterMonitor) Master Monitor found (%s)\n"
+ , method_name, __LINE__, masterMonitor.c_str() );
+ }
+ return nodes.data[0];
+ }
+
+ TRACE_EXIT;
+ return NULL;
+}
+
int CZClient::GetClusterZNodes( String_vector *nodes )
{
const char method_name[] = "CZClient::GetClusterZNodes";
@@ -700,7 +797,7 @@ void CZClient::HandleExpiredZNode( void )
int CZClient::InitializeZClient( void )
{
- const char method_name[] = "CZClient::MakeClusterZNodes";
+ const char method_name[] = "CZClient::InitializeZClient";
TRACE_ENTRY;
int rc;
@@ -799,6 +896,67 @@ bool CZClient::IsZNodeExpired( const char *nodeName, int &zerr )
return( expired );
}
+int CZClient::CreateMasterZNode( const char *nodeName )
+{
+ const char method_name[] = "CZClient::CreateMasterZNode";
+ TRACE_ENTRY;
+
+ int rc;
+ int retries = 0;
+
+ stringstream masterpath;
+ masterpath.str( "" );
+ masterpath << zkRootNode_.c_str()
+ << zkRootNodeInstance_.c_str()
+ << ZCLIENT_MASTER_ZNODE<< "/"
+ << nodeName;
+
+ string monZnode = masterpath.str( );
+
+ stringstream ss;
+ ss.str( "" );
+ ss <<nodeName << ":" << MyPNID;
+ string monData = ss.str( );
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d RegisterZNode(%s:%s)\n"
+ , method_name, __LINE__
+ , monZnode.c_str()
+ , monData.c_str() );
+ }
+
+ rc = RegisterZNode( monZnode.c_str(), monData.c_str(), ZOO_EPHEMERAL );
+ while ( ((rc == ZCONNECTIONLOSS) || (rc == ZOPERATIONTIMEOUT)) && retries < ZOOKEEPER_RETRY_COUNT)
+ {
+ sleep(ZOOKEEPER_RETRY_WAIT);
+ retries++;
+ rc = RegisterZNode( monZnode.c_str(), monData.c_str(), ZOO_EPHEMERAL );
+ }
+
+ if (rc != ZOK)
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d Error (MasterMonitor) Create master node for %s with rc = %d)\n"
+ , method_name, __LINE__, monZnode.c_str( ), rc);
+ }
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], RegisterZNode(%s) failed with error %s\n"
+ , method_name, monData.c_str(), zerror(rc) );
+ mon_log_write(MON_ZCLIENT_CREATEMASTERZNODE, SQ_LOG_ERR, buf);
+ return(rc); // Return the error
+ }
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d (MasterMonitor) Created master node for %s with rc = %d)\n"
+ , method_name, __LINE__, monZnode.c_str( ), rc);
+ }
+ TRACE_EXIT;
+ return(rc);
+}
+
int CZClient::MakeClusterZNodes( void )
{
const char method_name[] = "CZClient::MakeClusterZNodes";
@@ -908,6 +1066,40 @@ int CZClient::MakeClusterZNodes( void )
break;
}
+ ss.str( "" );
+ ss << zkRootNode_.c_str()
+ << zkRootNodeInstance_.c_str()
+ << ZCLIENT_MASTER_ZNODE;
+ string masterDir( ss.str( ) );
+
+ rc = ZooExistRetry( ZHandle, masterDir.c_str( ), 0, &stat );
+ switch (rc)
+ {
+ case ZOK:
+ break;
+ case ZNONODE:
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d RegisterZNode(%s)\n"
+ , method_name, __LINE__
+ , masterDir.c_str() );
+ }
+ rc = RegisterZNode( masterDir.c_str(), NULL, 0 );
+ if ( rc && rc != ZNODEEXISTS )
+ {
+ return(rc); // Return the error
+ }
+ rc = ZOK;
+ break;
+ default:
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_exists(%s) failed with error %s\n"
+ , method_name, masterDir.c_str(), zerror(rc) );
+ mon_log_write(MON_ZCLIENT_CHECKCLUSTERZNODES_3, SQ_LOG_ERR, buf);
+ break;
+ }
+
TRACE_EXIT;
return(rc);
}
@@ -1484,6 +1676,53 @@ void CZClient::WatchCluster( void )
TRACE_EXIT;
}
+int CZClient::WatchMasterNode( const char *nodeName )
+{
+ const char method_name[] = "CZClient::WatchMasterNode";
+ TRACE_ENTRY;
+
+ int rc;
+ stringstream newpath;
+ newpath.str( "" );
+ newpath << zkRootNode_.c_str()
+ << zkRootNodeInstance_.c_str()
+ << ZCLIENT_MASTER_ZNODE << "/"
+ << nodeName;
+ string monZnode = newpath.str( );
+
+ lock();
+ rc = SetZNodeWatch( monZnode );
+ unlock();
+ if ( rc != ZOK )
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d Error (MasterMonitor) WatchMasterNode failed with rc = %d for %s\n"
+ , method_name, __LINE__
+ , rc
+ , nodeName);
+ }
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], SetZNodeWatch(%s) failed!\n"
+ , method_name
+ , monZnode.c_str() );
+ mon_log_write(MON_ZCLIENT_WATCHNODE_1, SQ_LOG_ERR, buf);
+ }
+ else
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d (MasterMonitor) WatchMasterNode set on monZnode=%s\n"
+ , method_name, __LINE__
+ , monZnode.c_str() );
+ }
+ }
+
+ TRACE_EXIT;
+ return(rc);
+}
+
int CZClient::WatchNode( const char *nodeName )
{
const char method_name[] = "CZClient::WatchNode";
@@ -1524,6 +1763,108 @@ int CZClient::WatchNode( const char *nodeName )
return(rc);
}
+int CZClient::WatchNodeMasterDelete( const char *nodeName )
+{
+ const char method_name[] = "CZClient::WatchMasterDelete";
+ TRACE_ENTRY;
+
+ int rc = -1;
+ stringstream newpath;
+ newpath.str( "" );
+ newpath << zkRootNode_.c_str()
+ << zkRootNodeInstance_.c_str()
+ << ZCLIENT_MASTER_ZNODE
+ << nodeName;
+
+ string monZnode = newpath.str( );
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d zoo_delete(%s)\n"
+ , method_name, __LINE__
+ , monZnode.c_str() );
+ }
+
+ rc = zoo_delete( ZHandle
+ , monZnode.c_str( )
+ , -1 );
+ if ( rc == ZOK )
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete deleted %s, with rc == ZOK\n"
+ , method_name, __LINE__
+ , nodeName );
+ }
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], znode (%s) deleted!\n"
+ , method_name, nodeName );
+ mon_log_write(MON_ZCLIENT_WATCHMASTERNODEDELETE_1, SQ_LOG_INFO, buf);
+ }
+ else if ( rc == ZNONODE )
+ {
+ // This is fine since we call it indiscriminately
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete deleted %s, with rc == ZNONODE (fine)\n"
+ , method_name, __LINE__
+ , nodeName );
+ }
+ }
+ else if ( rc == ZCONNECTIONLOSS ||
+ rc == ZOPERATIONTIMEOUT )
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete deleted %s, with rc == ZOK\n"
+ , method_name, __LINE__
+ , nodeName );
+ }
+ rc = ZOK;
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], znode (%s) already deleted or cannot be accessed!\n"
+ , method_name, nodeName );
+ mon_log_write(MON_ZCLIENT_WATCHMASTERNODEDELETE_2, SQ_LOG_INFO, buf);
+ }
+ else
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete deleted %s, with rc == ZOK\n"
+ , method_name, __LINE__
+ , nodeName );
+ }
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_delete(%s) failed with error %s\n"
+ , method_name, nodeName, zerror(rc) );
+ mon_log_write(MON_ZCLIENT_WATCHMASTERNODEDELETE_3, SQ_LOG_CRIT, buf);
+ switch ( rc )
+ {
+ case ZSYSTEMERROR:
+ case ZRUNTIMEINCONSISTENCY:
+ case ZDATAINCONSISTENCY:
+ case ZMARSHALLINGERROR:
+ case ZUNIMPLEMENTED:
+ case ZBADARGUMENTS:
+ case ZINVALIDSTATE:
+ case ZSESSIONEXPIRED:
+ case ZCLOSING:
+ // Treat these error like a session expiration, since
+ // we can't communicate with quorum servers
+ HandleMyNodeExpiration();
+ break;
+ default:
+ break;
+ }
+ }
+
+ TRACE_EXIT;
+ return( rc );
+}
+
int CZClient::WatchNodeDelete( const char *nodeName )
{
const char method_name[] = "CZClient::WatchNodeDelete";
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/monitor/linux/zclient.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/zclient.h b/core/sqf/monitor/linux/zclient.h
index ea9bca3..6108021 100644
--- a/core/sqf/monitor/linux/zclient.h
+++ b/core/sqf/monitor/linux/zclient.h
@@ -104,6 +104,7 @@ using namespace std;
#define ZCLIENT_TRAFODION_ZNODE "/trafodion"
#define ZCLIENT_INSTANCE_ZNODE "/instance"
+#define ZCLIENT_MASTER_ZNODE "/master"
typedef list<string> ZNodeList_t;
@@ -137,6 +138,7 @@ public:
, const char *instanceZNode );
~CZClient( void );
+ int CreateMasterZNode( const char *nodeName );
int GetSessionTimeout( void) { return( zkSessionTimeout_ ); }
bool IsZNodeExpired( const char *nodeName, int &zerr );
void MonitorZCluster( void );
@@ -148,8 +150,11 @@ public:
int StartWork( void );
void StopMonitoring( void );
void TriggerCheck( int type, const char *znodePath );
+ const char* WaitForAndReturnMaster( bool doWait );
int WatchNode( const char *nodeName );
+ int WatchMasterNode( const char *nodeName );
int WatchNodeDelete( const char *nodeName );
+ int WatchNodeMasterDelete( const char *nodeName );
private:
int ZooExistRetry(zhandle_t *zh, const char *path, int watch, struct Stat *stat);
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/sqenvcom.sh
----------------------------------------------------------------------
diff --git a/core/sqf/sqenvcom.sh b/core/sqf/sqenvcom.sh
index 7058637..c5a9164 100644
--- a/core/sqf/sqenvcom.sh
+++ b/core/sqf/sqenvcom.sh
@@ -673,6 +673,18 @@ export SQ_LUNMGR_VERBOSITY=1
# Control SQ default startup behavior (c=cold, w=warm, if removed sqstart will autocheck)
export SQ_STARTUP=r
+# Monitor process creator:
+# MPIRUN - monitor process is created by mpirun
+# Uncomment SQ_MON_CREATOR when running monitor in AGENT mode
+#export SQ_MON_CREATOR=MPIRUN
+
+# Monitor process run mode:
+# AGENT - monitor process runs in agent mode versus MPI collective
+# Uncomment the three environment variables below
+#export SQ_MON_RUN_MODE=AGENT
+#export MONITOR_COMM_PORT=23399
+#export MONITOR_SYNC_PORT=23398
+
# Alternative logging capability in monitor
export SQ_MON_ALTLOG=0
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/sql/scripts/sqnodes.pm
----------------------------------------------------------------------
diff --git a/core/sqf/sql/scripts/sqnodes.pm b/core/sqf/sql/scripts/sqnodes.pm
index 0d09565..36d8f0c 100644
--- a/core/sqf/sql/scripts/sqnodes.pm
+++ b/core/sqf/sql/scripts/sqnodes.pm
@@ -279,10 +279,10 @@ sub verifyParse
displayStmt($stmtOk);
print " Error: node-id not specified\n";
}
- elsif ($nodeId > 1023)
+ elsif ($nodeId > 255)
{
displayStmt($stmtOk);
- print " Error: node-id must be in the range 0..1023.\n";
+ print " Error: node-id must be in the range 0..255.\n";
}
if (@cores == 0)
{
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/src/trafconf/clusterconf.cpp
----------------------------------------------------------------------
diff --git a/core/sqf/src/trafconf/clusterconf.cpp b/core/sqf/src/trafconf/clusterconf.cpp
index 0ebffeb..039b87d 100644
--- a/core/sqf/src/trafconf/clusterconf.cpp
+++ b/core/sqf/src/trafconf/clusterconf.cpp
@@ -49,6 +49,7 @@ using namespace std;
CClusterConfig::CClusterConfig( void )
: CPNodeConfigContainer(TC_NODES_MAX)
, CLNodeConfigContainer(TC_NODES_MAX)
+ , configMaster_(-1)
, nodeReady_(false)
, persistReady_(false)
, newPNodeConfig_(true)
@@ -61,6 +62,8 @@ CClusterConfig::CClusterConfig( void )
const char method_name[] = "CClusterConfig::CClusterConfig";
TRACE_ENTRY;
+ memset( &configMasterName_, 0, TC_PROCESSOR_NAME_MAX );
+
TRACE_EXIT;
}
@@ -373,6 +376,13 @@ bool CClusterConfig::LoadNodeConfig( void )
for (int i =0; i < nodeCount; i++ )
{
ProcessLNode( nodeConfigData[i], pnodeConfigInfo, lnodeConfigInfo );
+ // We want to pick the first configured node so all monitors pick the same one
+ // This only comes into play for a Trafodion start from scratch
+ if (i == 0)
+ {
+ configMaster_ = pnodeConfigInfo.pnid;
+ strcpy (configMasterName_ ,pnodeConfigInfo.nodename);
+ }
AddNodeConfiguration( pnodeConfigInfo, lnodeConfigInfo );
}
http://git-wip-us.apache.org/repos/asf/trafodion/blob/bded0e84/core/sqf/src/trafconf/clusterconf.h
----------------------------------------------------------------------
diff --git a/core/sqf/src/trafconf/clusterconf.h b/core/sqf/src/trafconf/clusterconf.h
index 1a8942f..ff4b17e 100644
--- a/core/sqf/src/trafconf/clusterconf.h
+++ b/core/sqf/src/trafconf/clusterconf.h
@@ -43,6 +43,8 @@ public:
void Clear( void );
bool DeleteNodeConfig( int pnid );
+ int GetConfigMaster ( ) { return configMaster_;}
+ char * GetConfigMasterByName() {return configMasterName_;}
bool Initialize( void );
bool Initialize( bool traceEnabled, const char *traceFile );
void InitCoreMask( cpu_set_t &coreMask );
@@ -73,6 +75,8 @@ public:
protected:
private:
+ int configMaster_;
+ char configMasterName_[TC_PROCESSOR_NAME_MAX];
bool nodeReady_; // true when node configuration loaded
bool persistReady_; // true when persist configuration loaded
bool newPNodeConfig_;
[4/6] trafodion git commit: More code review fixes.
Posted by db...@apache.org.
More code review fixes.
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/717f9c3d
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/717f9c3d
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/717f9c3d
Branch: refs/heads/master
Commit: 717f9c3d25870c02d8a719f9fc15e55c0d4f1be8
Parents: 3d7855b
Author: Zalo Correa <za...@esgyn.com>
Authored: Wed Feb 28 15:52:37 2018 -0800
Committer: Zalo Correa <za...@esgyn.com>
Committed: Wed Feb 28 15:52:37 2018 -0800
----------------------------------------------------------------------
core/sqf/monitor/linux/mlio.cxx | 1 -
core/sqf/monitor/linux/zclient.cxx | 11 ++++++-----
2 files changed, 6 insertions(+), 6 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/trafodion/blob/717f9c3d/core/sqf/monitor/linux/mlio.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/mlio.cxx b/core/sqf/monitor/linux/mlio.cxx
index 7db35ec..b62cd46 100644
--- a/core/sqf/monitor/linux/mlio.cxx
+++ b/core/sqf/monitor/linux/mlio.cxx
@@ -1263,7 +1263,6 @@ SQ_LocalIOToClient::SQ_LocalIOToClient(int nid)
if (trace_settings & TRACE_INIT)
{
int err = errno;
- char la_buf[MON_STRING_BUF_SIZE];
trace_printf( "%s@%d" " failed shmget(%d), errno=%d (%s)\n"
, method_name, __LINE__
, (shsize), err, strerror(err) );
http://git-wip-us.apache.org/repos/asf/trafodion/blob/717f9c3d/core/sqf/monitor/linux/zclient.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/zclient.cxx b/core/sqf/monitor/linux/zclient.cxx
index 1c133ca..0ca03b1 100644
--- a/core/sqf/monitor/linux/zclient.cxx
+++ b/core/sqf/monitor/linux/zclient.cxx
@@ -580,6 +580,7 @@ const char* CZClient::WaitForAndReturnMaster( bool doWait )
trace_printf( "%s@%d (MasterMonitor) Master Monitor found (%s)\n"
, method_name, __LINE__, masterMonitor.c_str() );
}
+ TRACE_EXIT;
return nodes.data[0];
}
@@ -1811,7 +1812,7 @@ int CZClient::WatchNodeMasterDelete( const char *nodeName )
// This is fine since we call it indiscriminately
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
- trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete deleted %s, with rc == ZNONODE (fine)\n"
+ trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete already deleted %s, with rc == ZNONODE (fine)\n"
, method_name, __LINE__
, nodeName );
}
@@ -1821,15 +1822,15 @@ int CZClient::WatchNodeMasterDelete( const char *nodeName )
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
- trace_printf( "%s@%d (MasterMonitor) WatchNodeMasterDelete deleted %s, with rc == ZOK\n"
+ trace_printf( "%s@%d (MasterMonitor) znode (%s) already deleted or cannot be accessed, rc=%d (%s)\n"
, method_name, __LINE__
- , nodeName );
+ , nodeName, rc, zerror(rc) );
}
rc = ZOK;
char buf[MON_STRING_BUF_SIZE];
snprintf( buf, sizeof(buf)
- , "[%s], znode (%s) already deleted or cannot be accessed!\n"
- , method_name, nodeName );
+ , "[%s], znode (%s) already deleted or cannot be accessed, rc=%d (%s)\n"
+ , method_name, nodeName, rc, zerror(rc) );
mon_log_write(MON_ZCLIENT_WATCHMASTERNODEDELETE_2, SQ_LOG_INFO, buf);
}
else
[6/6] trafodion git commit: Merge [TRAFODION-2883] PR 1457
Preliminary scale enhancements
Posted by db...@apache.org.
Merge [TRAFODION-2883] PR 1457 Preliminary scale enhancements
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/d48a8874
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/d48a8874
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/d48a8874
Branch: refs/heads/master
Commit: d48a887413f69d7b0e573106ee10f6770f29533c
Parents: aec5108 87739b8
Author: Dave Birdsall <db...@apache.org>
Authored: Thu Mar 1 21:07:00 2018 +0000
Committer: Dave Birdsall <db...@apache.org>
Committed: Thu Mar 1 21:07:00 2018 +0000
----------------------------------------------------------------------
.../export/include/common/evl_sqlog_eventnum.h | 14 +
core/sqf/monitor/linux/cluster.cxx | 388 ++++++++++---
core/sqf/monitor/linux/cluster.h | 26 +-
core/sqf/monitor/linux/commaccept.cxx | 108 ++--
core/sqf/monitor/linux/mlio.cxx | 7 +-
core/sqf/monitor/linux/monitor.cxx | 540 +++++++++++++++----
core/sqf/monitor/linux/monitor.h | 2 -
core/sqf/monitor/linux/msgdef.h | 48 +-
core/sqf/monitor/linux/pnode.cxx | 67 ++-
core/sqf/monitor/linux/process.cxx | 75 ++-
core/sqf/monitor/linux/reqprocinfo.cxx | 83 +--
core/sqf/monitor/linux/reqqueue.cxx | 10 +-
core/sqf/monitor/linux/shell.cxx | 273 ++++++----
core/sqf/monitor/linux/tmsync.cxx | 10 +-
core/sqf/monitor/linux/zclient.cxx | 348 +++++++++++-
core/sqf/monitor/linux/zclient.h | 5 +
core/sqf/monitor/test/runtest | 3 +-
core/sqf/sqenvcom.sh | 19 +
core/sqf/src/tm/tm_internal.h | 1 -
core/sqf/src/tm/tmlibmsg.h | 4 +-
core/sqf/src/tm/tools/dtmci.cpp | 2 -
core/sqf/src/tm/tools/pwd.cpp | 2 -
core/sqf/src/tm/tools/tmshutdown.cpp | 1 -
core/sqf/src/trafconf/clusterconf.cpp | 10 +
core/sqf/src/trafconf/clusterconf.h | 4 +
25 files changed, 1570 insertions(+), 480 deletions(-)
----------------------------------------------------------------------
[3/6] trafodion git commit: Code review fixes.
Posted by db...@apache.org.
Code review fixes.
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/3d7855b6
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/3d7855b6
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/3d7855b6
Branch: refs/heads/master
Commit: 3d7855b6f64733ad3776fc421cb598883acbb6bf
Parents: bded0e8
Author: Zalo Correa <za...@esgyn.com>
Authored: Wed Feb 28 15:23:31 2018 -0800
Committer: Zalo Correa <za...@esgyn.com>
Committed: Wed Feb 28 15:23:31 2018 -0800
----------------------------------------------------------------------
core/sqf/monitor/linux/cluster.cxx | 128 ++++++++++++++++----------------
core/sqf/monitor/linux/cluster.h | 25 +++----
core/sqf/monitor/linux/pnode.cxx | 1 -
core/sqf/monitor/linux/tmsync.cxx | 10 +--
core/sqf/monitor/linux/zclient.cxx | 8 +-
5 files changed, 87 insertions(+), 85 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/trafodion/blob/3d7855b6/core/sqf/monitor/linux/cluster.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.cxx b/core/sqf/monitor/linux/cluster.cxx
index d1b3e91..83ea923 100644
--- a/core/sqf/monitor/linux/cluster.cxx
+++ b/core/sqf/monitor/linux/cluster.cxx
@@ -380,45 +380,46 @@ void CCluster::AssignMonitorLeader( int pnid )
int i = 0;
int rc = 0;
- int lMonitorLeaderPNid = MonitorLeaderPNid;
+ int monitorLeaderPNid = monitorLeaderPNid_;
CNode *node = NULL;
- if (MonitorLeaderPNid != pnid)
+ if (monitorLeaderPNid_ != pnid)
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - (MasterMonitor) returning, pnid %d != monitorLead %d\n"
- , method_name, __LINE__, pnid, MonitorLeaderPNid );
+ , method_name, __LINE__, pnid, monitorLeaderPNid_ );
}
- return;
+ TRACE_EXIT;
+ return;
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - (MasterMonitor) Node " "%d" " MonitorLeader failed!\n"
- , method_name, __LINE__, MonitorLeaderPNid );
+ , method_name, __LINE__, monitorLeaderPNid_ );
}
for (i=0; i<GetConfigPNodesMax(); i++)
{
- lMonitorLeaderPNid++;
+ monitorLeaderPNid++;
- if (lMonitorLeaderPNid == GetConfigPNodesMax())
+ if (monitorLeaderPNid == GetConfigPNodesMax())
{
- lMonitorLeaderPNid = 0; // restart with nid 0
+ monitorLeaderPNid = 0; // restart with nid 0
}
- if (lMonitorLeaderPNid == pnid)
+ if (monitorLeaderPNid == pnid)
{
continue; // this is the node that is going down, skip it
}
- if (Node[lMonitorLeaderPNid] == NULL)
+ if (Node[monitorLeaderPNid] == NULL)
{
continue;
}
- node = Node[lMonitorLeaderPNid];
+ node = Node[monitorLeaderPNid];
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
@@ -438,11 +439,11 @@ void CCluster::AssignMonitorLeader( int pnid )
continue; // skip this node for any of the above reasons
}
- MonitorLeaderPNid = node->GetPNid();
+ monitorLeaderPNid_ = node->GetPNid();
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
- trace_printf("%s@%d" " - Node " "%d" " is the new MonitorLeaderPNid." "\n", method_name, __LINE__, MonitorLeaderPNid);
+ trace_printf("%s@%d" " - Node " "%d" " is the new monitorLeaderPNid_." "\n", method_name, __LINE__, monitorLeaderPNid_);
}
if (ZClientEnabled)
@@ -483,7 +484,7 @@ void CCluster::AssignMonitorLeader( int pnid )
TRACE_EXIT;
}
-// Assigns a new TMLeader if given pnid is same as TmLeaderNid
+// Assigns a new TMLeader if given pnid is same as tmLeaderNid_
// TmLeader is a logical node num.
// pnid has gone down, so if that node was previously the TM leader, a new one needs to be chosen.
void CCluster::AssignTmLeader( int pnid, bool checkProcess )
@@ -495,15 +496,15 @@ void CCluster::AssignTmLeader( int pnid, bool checkProcess )
CNode *node = NULL;
CProcess *process = NULL;
- int TmLeaderPNid = LNode[TmLeaderNid]->GetNode()->GetPNid();
+ int TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid();
if (TmLeaderPNid != pnid)
{
- node = LNode[TmLeaderNid]->GetNode();
+ node = LNode[tmLeaderNid_]->GetNode();
if (checkProcess)
{
- process = LNode[TmLeaderNid]->GetProcessLByType( ProcessType_DTM );
+ process = LNode[tmLeaderNid_]->GetProcessLByType( ProcessType_DTM );
if (process)
{
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
@@ -544,7 +545,7 @@ void CCluster::AssignTmLeader( int pnid, bool checkProcess )
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
trace_printf( "%s@%d" " - Node " "%d" " TmLeader failed! (checkProcess=%d)\n"
- , method_name, __LINE__, TmLeaderNid, checkProcess );
+ , method_name, __LINE__, tmLeaderNid_, checkProcess );
}
for (i=0; i<GetConfigPNodesMax(); i++)
@@ -586,11 +587,11 @@ void CCluster::AssignTmLeader( int pnid, bool checkProcess )
continue; // skip this node for any of the above reasons
}
- TmLeaderNid = node->GetFirstLNode()->GetNid();
+ tmLeaderNid_ = node->GetFirstLNode()->GetNid();
if (checkProcess)
{
- process = LNode[TmLeaderNid]->GetProcessLByType( ProcessType_DTM );
+ process = LNode[tmLeaderNid_]->GetProcessLByType( ProcessType_DTM );
if (!process)
{
continue; // skip this node no DTM process exists
@@ -599,7 +600,7 @@ void CCluster::AssignTmLeader( int pnid, bool checkProcess )
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY | TRACE_REQUEST | TRACE_SYNC | TRACE_TMSYNC))
{
- trace_printf("%s@%d" " - Node " "%d" " is the new TmLeader." "\n", method_name, __LINE__, TmLeaderNid);
+ trace_printf("%s@%d" " - Node " "%d" " is the new TmLeader." "\n", method_name, __LINE__, tmLeaderNid_);
}
break;
@@ -618,14 +619,13 @@ CCluster::CCluster (void)
,epollFD_(-1),
Node (NULL),
LNode (NULL),
- TmSyncPNid (-1),
- CurNodes (0),
- CurProcs (0),
+ tmSyncPNid_ (-1),
+ currentNodes_ (0),
configPNodesCount_ (-1),
configPNodesMax_ (-1),
- NodeMap (NULL),
- TmLeaderNid (-1),
- MonitorLeaderPNid (-1),
+ nodeMap_ (NULL),
+ tmLeaderNid_ (-1),
+ monitorLeaderPNid_ (-1),
tmReadyCount_(0),
minRecvCount_(4096),
recvBuffer_(NULL),
@@ -795,10 +795,10 @@ CCluster::~CCluster (void)
delete [] otherMonRank_;
delete [] socks_;
delete [] sockPorts_;
- if (NodeMap)
+ if (nodeMap_)
{
- delete [] NodeMap;
- NodeMap = NULL;
+ delete [] nodeMap_;
+ nodeMap_ = NULL;
}
delete [] recvBuffer2_;
@@ -2416,7 +2416,7 @@ void CCluster::HandleOtherNodeMsg (struct internal_msg_def *recv_msg,
// Begin a Slave Sync Start
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Slave Sync Start on Node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
- TmSyncPNid = pnid;
+ tmSyncPNid_ = pnid;
Node[pnid]->SetTmSyncState( recv_msg->u.sync.state );
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
{
@@ -2430,12 +2430,12 @@ void CCluster::HandleOtherNodeMsg (struct internal_msg_def *recv_msg,
trace_printf("%s@%d - Sync State Collision! Node %s (pnid=%d) TmSyncState=(%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState()) );
if ( MyNode->GetTmSyncState() == SyncState_Continue )
{
- if ( pnid > TmSyncPNid )
+ if ( pnid > tmSyncPNid_ )
// highest node id will continue
{
// They take priority ... we abort
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
- trace_printf("%s@%d - Aborting Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[Monitor->TmSyncPNid]->GetName(), Monitor->TmSyncPNid);
+ trace_printf("%s@%d - Aborting Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[Monitor->tmSyncPNid_]->GetName(), Monitor->tmSyncPNid_);
MyNode->SetTmSyncState( SyncState_Null );
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Node %s (pnid=%d) TmSyncState updated (%d)(%s)\n", method_name, __LINE__, MyNode->GetName(), MyPNID, MyNode->GetTmSyncState(), SyncStateString( MyNode->GetTmSyncState() ) );
@@ -2443,7 +2443,7 @@ void CCluster::HandleOtherNodeMsg (struct internal_msg_def *recv_msg,
// Continue with other node's Slave TmSync Start request
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
- TmSyncPNid = pnid;
+ tmSyncPNid_ = pnid;
Node[pnid]->SetTmSyncState( recv_msg->u.sync.state );
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
{
@@ -2467,7 +2467,7 @@ void CCluster::HandleOtherNodeMsg (struct internal_msg_def *recv_msg,
// Continue with other node's Slave TmSync Start request
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - Slave Sync Start on node %s (pnid=%d)\n", method_name, __LINE__, Node[pnid]->GetName(), pnid);
- TmSyncPNid = pnid;
+ tmSyncPNid_ = pnid;
Node[pnid]->SetTmSyncState( recv_msg->u.sync.state );
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
{
@@ -2783,9 +2783,9 @@ void CCluster::HandleMyNodeMsg (struct internal_msg_def *recv_msg,
case SyncType_TmData:
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
trace_printf("%s@%d - TMSYNC(TmData) on Node %s (pnid=%d)\n", method_name, __LINE__, Node[MyPNID]->GetName(), MyPNID);
- TmSyncPNid = MyPNID;
+ tmSyncPNid_ = MyPNID;
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
- trace_printf("%s@%d - Sync communicated, TmSyncPNid=%d\n", method_name, __LINE__, TmSyncPNid);
+ trace_printf("%s@%d - Sync communicated, tmSyncPNid_=%d\n", method_name, __LINE__, tmSyncPNid_);
if ( ! MyNode->IsSpareNode() && MyNode->GetPhase() != Phase_Ready )
{
MyNode->CheckActivationPhase();
@@ -2974,7 +2974,7 @@ void CCluster::InitializeConfigCluster( void )
int rankToPnid[worldSize];
CClusterConfig *clusterConfig = Nodes->GetClusterConfig();
- CurNodes = worldSize;
+ currentNodes_ = worldSize;
if ( IsRealCluster )
{
@@ -3038,10 +3038,10 @@ void CCluster::InitializeConfigCluster( void )
int TmLeaderPNid = -1;
if (IsMaster)
{
- TmLeaderNid = Nodes->GetFirstNid();
- TmLeaderPNid = LNode[TmLeaderNid]->GetNode()->GetPNid();
+ tmLeaderNid_ = Nodes->GetFirstNid();
+ TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid();
}
- // Non-master monitors in AGENT mode in a real cluster initialize all
+ // Monitors processes in AGENT mode in a real cluster initialize all
// remote nodes to a down state. The master monitor and the joining
// monitors will set the joining node state to up as part of the node
// re-integration processing as monitor processes join the cluster
@@ -3179,8 +3179,8 @@ void CCluster::InitializeConfigCluster( void )
delete [] commPortNums;
delete [] syncPortNums;
- TmLeaderNid = Nodes->GetFirstNid();
- int TmLeaderPNid = LNode[TmLeaderNid]->GetNode()->GetPNid();
+ tmLeaderNid_ = Nodes->GetFirstNid();
+ int TmLeaderPNid = LNode[tmLeaderNid_]->GetNode()->GetPNid();
// Any nodes not in the initial MPI_COMM_WORLD are down.
for (int i=0; i<GetConfigPNodesCount(); ++i)
@@ -3220,7 +3220,7 @@ void CCluster::InitializeConfigCluster( void )
}
else
{
- TmLeaderNid = 0;
+ tmLeaderNid_ = 0;
}
// Initialize communicators for point-to-point communications
@@ -3828,7 +3828,7 @@ void CCluster::ReIntegrateMPI( int initProblem )
{ // Already connected to creator monitor
comms_[i] = intraCommCreatorMon;
otherMonRank_[i] = 0;
- ++CurNodes;
+ ++currentNodes_;
// Set bit indicating node is up
upNodes_.upNodes[i/MAX_NODE_BITMASK] |= (1ull << (i%MAX_NODE_BITMASK));
@@ -3922,7 +3922,7 @@ void CCluster::ReIntegrateMPI( int initProblem )
comms_[i] = intraComm;
otherMonRank_[i] = 0;
- ++CurNodes;
+ ++currentNodes_;
Node[i]->SetSyncPort( nodeInfo[i].syncPort );
Node[i]->SetState( State_Up );
@@ -4163,7 +4163,7 @@ void CCluster::ReIntegrateSock( int initProblem )
}
otherMonRank_[nodeInfo[i].pnid] = 0;
- ++CurNodes;
+ ++currentNodes_;
// Store port numbers for the node
strncpy(commPort, nodeInfo[i].commPort, MPI_MAX_PORT_NAME);
@@ -4310,7 +4310,7 @@ void CCluster::ReIntegrateSock( int initProblem )
}
otherMonRank_[nodeInfo[i].pnid] = 0;
- ++CurNodes;
+ ++currentNodes_;
// Store port numbers for the node
strncpy(commPort, nodeInfo[i].commPort, MPI_MAX_PORT_NAME);
@@ -4583,7 +4583,7 @@ void CCluster::setNewComm( int pnid )
close( socks_[pnid] );
socks_[pnid] = -1;
}
- --CurNodes;
+ --currentNodes_;
}
if (trace_settings & TRACE_RECOVERY)
@@ -4595,7 +4595,7 @@ void CCluster::setNewComm( int pnid )
comms_[it->pnid] = it->comm;
otherMonRank_[it->pnid] = it->otherRank;
- ++CurNodes;
+ ++currentNodes_;
// Set bit indicating node is up
upNodes_.upNodes[it->pnid/MAX_NODE_BITMASK] |= (1ull << (it->pnid%MAX_NODE_BITMASK));
@@ -4686,14 +4686,14 @@ void CCluster::setNewSock( int pnid )
shutdown( socks_[pnid], SHUT_RDWR);
close( socks_[pnid] );
socks_[pnid] = -1;
- --CurNodes;
+ --currentNodes_;
}
CNode *node= Nodes->GetNode( it->pnid );
socks_[it->pnid] = it->socket;
sockPorts_[it->pnid] = node->GetSyncSocketPort();
otherMonRank_[it->pnid] = it->otherRank;
- ++CurNodes;
+ ++currentNodes_;
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
@@ -6040,7 +6040,7 @@ void CCluster::ValidateClusterState( cluster_state_def_t nodestate[],
// Evaluate each active (up) node in the cluster
int pnodesCount = 0;
for (int index = 0;
- index < GetConfigPNodesMax() && pnodesCount < CurNodes;
+ index < GetConfigPNodesMax() && pnodesCount < currentNodes_;
++index)
{
if ( nodestate[index].seq_num != 0 )
@@ -6104,11 +6104,11 @@ void CCluster::ValidateClusterState( cluster_state_def_t nodestate[],
if (trace_settings & (TRACE_SYNC | TRACE_RECOVERY | TRACE_INIT))
{
- trace_printf("%s@%d concurringNodes=%d, CurNodes=%d\n",
- method_name, __LINE__, concurringNodes, CurNodes);
+ trace_printf("%s@%d concurringNodes=%d, currentNodes_=%d\n",
+ method_name, __LINE__, concurringNodes, currentNodes_);
}
- if (concurringNodes == CurNodes)
+ if (concurringNodes == currentNodes_)
{ // General agreement that node is down, proceed to mark it down
CNode *downNode = Nodes->GetNode( it->exitedPnid );
@@ -6149,7 +6149,7 @@ void CCluster::ValidateClusterState( cluster_state_def_t nodestate[],
"%d but only %d of %d nodes also lost the "
"connection. See up: %s. See down: %s. So node "
"%d is going down (at seq #%lld).\n", method_name,
- it->exitedPnid, concurringNodes, CurNodes,
+ it->exitedPnid, concurringNodes, currentNodes_,
setSeesUp.c_str(), setSeesDown.c_str(),
MyPNID, seqNum_ );
mon_log_write(MON_CLUSTER_VALIDATE_STATE_2, SQ_LOG_ERR, buf);
@@ -6198,7 +6198,7 @@ void CCluster::ValidateClusterState( cluster_state_def_t nodestate[],
int pnodesCount2 = 0;
for (int remIndex = 0;
- remIndex < GetConfigPNodesMax() && pnodesCount2 < CurNodes;
+ remIndex < GetConfigPNodesMax() && pnodesCount2 < currentNodes_;
++remIndex)
{
bool someExited = false;
@@ -6248,7 +6248,7 @@ void CCluster::ValidateClusterState( cluster_state_def_t nodestate[],
{ // This remote node sees node pnid as up
int pnodesCount3 = 0;
for (int exitedPNid = 0;
- exitedPNid < GetConfigPNodesMax() && pnodesCount3 < CurNodes;
+ exitedPNid < GetConfigPNodesMax() && pnodesCount3 < currentNodes_;
++exitedPNid)
{
CNode *exitedNode = Nodes->GetNode( /*indexToPnid_[remIndex]*/exitedPNid );
@@ -6666,7 +6666,7 @@ void CCluster::UpdateClusterState( bool &doShutdown,
abort();
}
Node[index]->SetState( State_Down );
- --CurNodes;
+ --currentNodes_;
// Clear bit in set of "up nodes"
upNodes_.upNodes[index/MAX_NODE_BITMASK] &= ~(1ull << (index%MAX_NODE_BITMASK));
}
@@ -6738,7 +6738,7 @@ void CCluster::UpdateClusterState( bool &doShutdown,
// Programmer bonehead!
abort();
}
- --CurNodes;
+ --currentNodes_;
// Clear bit in set of "up nodes"
upNodes_.upNodes[index/MAX_NODE_BITMASK] &= ~(1ull << (index%MAX_NODE_BITMASK));
@@ -7068,14 +7068,14 @@ bool CCluster::checkIfDone ( )
if (trace_settings & TRACE_SYNC_DETAIL)
trace_printf("%s@%d - Node %d shutdown level=%d, state=%s. Process "
- "count=%d, internal state=%d, CurNodes=%d, "
+ "count=%d, internal state=%d, currentNodes_=%d, "
"local process count=%d\n",
method_name, __LINE__, MyNode->GetPNid(),
MyNode->GetShutdownLevel(),
StateString(MyNode->GetState()),
Nodes->ProcessCount(),
MyNode->getInternalState(),
- CurNodes, MyNode->GetNumProcs());
+ currentNodes_, MyNode->GetNumProcs());
// Check if we are also done
if (( MyNode->GetState() != State_Down ) &&
@@ -7094,7 +7094,7 @@ bool CCluster::checkIfDone ( )
return false;
}
else if ( (Nodes->ProcessCount() <=
- (CurNodes*MAX_PRIMITIVES)) // only WDGs alive
+ (currentNodes_*MAX_PRIMITIVES)) // only WDGs alive
&& !MyNode->isInQuiesceState() // post-quiescing will
// expire WDG (cluster)
&& !waitForWatchdogExit_ ) // WDG not yet exiting
http://git-wip-us.apache.org/repos/asf/trafodion/blob/3d7855b6/core/sqf/monitor/linux/cluster.h
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/cluster.h b/core/sqf/monitor/linux/cluster.h
index 6b658ae..ff49e56 100644
--- a/core/sqf/monitor/linux/cluster.h
+++ b/core/sqf/monitor/linux/cluster.h
@@ -124,12 +124,12 @@ public:
void DoDeviceReq(char * ldevname);
void ExpediteDown( void );
- inline int GetTmLeader( void ) { return( TmLeaderNid); }
- inline void SetTmLeader( int tmLeaderNid ) { TmLeaderNid = tmLeaderNid; }
- inline int GetMonitorLeader( void ) { return( MonitorLeaderPNid); }
- inline void SetMonitorLeader( int monitorLeaderPNid ) { MonitorLeaderPNid = monitorLeaderPNid; }
+ inline int GetTmLeader( void ) { return( tmLeaderNid_ ); }
+ inline void SetTmLeader( int tmLeaderNid ) { tmLeaderNid_ = tmLeaderNid; }
+ inline int GetMonitorLeader( void ) { return( monitorLeaderPNid_); }
+ inline void SetMonitorLeader( int monitorLeaderPNid ) { monitorLeaderPNid_ = monitorLeaderPNid; }
int GetDownedNid( void );
- inline int GetTmSyncPNid( void ) { return( TmSyncPNid ); } // Physical Node ID of current TmSync operations master
+ inline int GetTmSyncPNid( void ) { return( tmSyncPNid_ ); } // Physical Node ID of current TmSync operations master
void InitClusterComm(int worldSize, int myRank, int *rankToPnid);
void addNewComm(int nid, int otherRank, MPI_Comm comm);
void addNewSock(int nid, int otherRank, int sockFd );
@@ -210,7 +210,7 @@ protected:
CNode **Node; // array of nodes
CLNode **LNode; // array of logical nodes
- int TmSyncPNid; // Physical Node ID of current TmSync operations master
+ int tmSyncPNid_; // Physical Node ID of current TmSync operations master
void AddTmsyncMsg( struct sync_buffer_def *tmSyncBuffer
@@ -229,15 +229,14 @@ protected:
CLock syncCycle_;
private:
- int CurNodes; // Current # of nodes in the cluster
- int CurProcs; // Current # if processes alive in MPI_COMM_WORLD
+ int currentNodes_; // Current # of nodes in the cluster
int configPNodesCount_; // # of physical nodes configured
int configPNodesMax_; // max # of physical nodes that can be configured
- int *NodeMap; // Mapping of Node ranks to COMM_WORLD ranks
- int TmLeaderNid; // Nid of currently assigned TM Leader node
- int MonitorLeaderPNid; // PNid of currently assigned Monitor leader node
- int tmReadyCount_; // # of DTM processes ready for transactions
- size_t minRecvCount_; // minimum size of receive buffer for allgather
+ int *nodeMap_; // Mapping of Node ranks to COMM_WORLD ranks
+ int tmLeaderNid_; // Nid of currently assigned TM Leader node
+ int monitorLeaderPNid_; // PNid of currently assigned Monitor leader node
+ int tmReadyCount_; // # of DTM processes ready for transactions
+ size_t minRecvCount_; // minimum size of receive buffer for allgather
// Pointer to array of "sync_buffer_def" structures. Used by
// ShareWithPeers in "Allgather" operation.
http://git-wip-us.apache.org/repos/asf/trafodion/blob/3d7855b6/core/sqf/monitor/linux/pnode.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/pnode.cxx b/core/sqf/monitor/linux/pnode.cxx
index 485d013..783640f 100644
--- a/core/sqf/monitor/linux/pnode.cxx
+++ b/core/sqf/monitor/linux/pnode.cxx
@@ -1709,7 +1709,6 @@ void CNodeContainer::AddNodes( )
else
{
if (pnid >= maxNode) // only for workstation acting as single node
-// || (IsAgentMode &&(strcmp( MasterMonitorName, Node_name ) != 0)))
{
rank = -1; // -1 creates node in down state
}
http://git-wip-us.apache.org/repos/asf/trafodion/blob/3d7855b6/core/sqf/monitor/linux/tmsync.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/tmsync.cxx b/core/sqf/monitor/linux/tmsync.cxx
index 60d9f40..b87f0f4 100644
--- a/core/sqf/monitor/linux/tmsync.cxx
+++ b/core/sqf/monitor/linux/tmsync.cxx
@@ -321,7 +321,7 @@ int CTmSync_Container::CoordinateTmDataBlock ( struct sync_def *sync )
exchangeTmSyncData( sync, false );
syncCycle_.unlock();
ExchangeTmSyncState( false );
- if (( Monitor->TmSyncPNid == MyPNID ) &&
+ if (( Monitor->tmSyncPNid_ == MyPNID ) &&
( Nodes->GetTmState( SyncState_Start ) == SyncState_Start ) )
{
// send unsolicited messages to other TMs in
@@ -353,7 +353,7 @@ int CTmSync_Container::CoordinateTmDataBlock ( struct sync_def *sync )
else
{
if (trace_settings & (TRACE_SYNC | TRACE_TMSYNC))
- trace_printf("%s@%d" " - Tm Sync failed to start, TmSyncPNid=%d, MyPNID=%d, " "TmSyncState=%d, expecting=%d\n", method_name, __LINE__, TmSyncPNid, MyPNID, Nodes->GetTmState( SyncState_Start ), SyncState_Start);
+ trace_printf("%s@%d" " - Tm Sync failed to start, tmSyncPNid_=%d, MyPNID=%d, " "TmSyncState=%d, expecting=%d\n", method_name, __LINE__, tmSyncPNid_, MyPNID, Nodes->GetTmState( SyncState_Start ), SyncState_Start);
if (MyNode->GetTmSyncState() == SyncState_Start)
{
MyNode->SetTmSyncState( SyncState_Null );
@@ -449,7 +449,7 @@ void CTmSync_Container::EndTmSync( MSGTYPE type )
{
trace_printf("%s@%d - Request (%p) nid=%d, handle=%d, tag=%d, unsol=%d, comp=%d\n", method_name, __LINE__, req, req->Nid, req->Handle, req->Tag, req->Unsolicited, req->Completed);
}
- if ( TmSyncPNid == MyPNID )
+ if ( tmSyncPNid_ == MyPNID )
{
if ( MyNode->GetLNodesCount() > 1 )
{
@@ -666,7 +666,7 @@ void CTmSync_Container::ProcessTmSyncReply( struct message_def * msg )
TmSyncReplyCode |= msg->u.reply.u.unsolicited_tm_sync.return_code;
tmsync_req->Completed = true;
UnsolicitedComplete( msg );
- if ( TmSyncPNid == MyPNID )
+ if ( tmSyncPNid_ == MyPNID )
{
if (trace_settings & (TRACE_REQUEST | TRACE_TMSYNC))
trace_printf("%s@%d - Local Unsolicited TmSync reply, handle="
@@ -1102,7 +1102,7 @@ bool CTmSync_Container::TmSyncPending( void )
trace_printf("%s@%d" " - PendingTmSync=%d, total=%d, replies=%d, pending=%d\n", method_name, __LINE__, PendingSlaveTmSync, GetTotalSlaveTmSyncCount(), GetTmSyncReplies(), GetPendingSlaveTmSyncCount() );
if (( MyNode->GetTmSyncState() == SyncState_Abort ) &&
- ( TmSyncPNid != MyPNID ) &&
+ ( tmSyncPNid_ != MyPNID ) &&
( GetTmSyncReplies() == GetTotalSlaveTmSyncCount() ) )
{
CommitTmDataBlock( MPI_ERR_UNKNOWN );
http://git-wip-us.apache.org/repos/asf/trafodion/blob/3d7855b6/core/sqf/monitor/linux/zclient.cxx
----------------------------------------------------------------------
diff --git a/core/sqf/monitor/linux/zclient.cxx b/core/sqf/monitor/linux/zclient.cxx
index 107cf32..1c133ca 100644
--- a/core/sqf/monitor/linux/zclient.cxx
+++ b/core/sqf/monitor/linux/zclient.cxx
@@ -523,6 +523,8 @@ const char* CZClient::WaitForAndReturnMaster( bool doWait )
{
break;
}
+ usleep(1000000); // sleep for a second as to not overwhelm the system
+ retries++;
continue;
}
else if ( rc == ZOK )
@@ -549,14 +551,14 @@ const char* CZClient::WaitForAndReturnMaster( bool doWait )
break;
}
usleep(1000000); // sleep for a second as to not overwhelm the system
- retries++;
+ retries++;
continue;
}
}
else // error
{
- if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
{
trace_printf( "%s@%d Error (MasterMonitor) WaitForAndReturnMaster returned rc (%d), retries %d\n"
, method_name, __LINE__, rc, retries );
@@ -946,6 +948,8 @@ int CZClient::CreateMasterZNode( const char *nodeName )
, "[%s], RegisterZNode(%s) failed with error %s\n"
, method_name, monData.c_str(), zerror(rc) );
mon_log_write(MON_ZCLIENT_CREATEMASTERZNODE, SQ_LOG_ERR, buf);
+
+ TRACE_EXIT;
return(rc); // Return the error
}
if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
[5/6] trafodion git commit: Code review fix. Missed one :-(
Posted by db...@apache.org.
Code review fix. Missed one :-(
Project: http://git-wip-us.apache.org/repos/asf/trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/trafodion/commit/87739b81
Tree: http://git-wip-us.apache.org/repos/asf/trafodion/tree/87739b81
Diff: http://git-wip-us.apache.org/repos/asf/trafodion/diff/87739b81
Branch: refs/heads/master
Commit: 87739b81a022ae6005d810aadac7fc3c6071cf1d
Parents: 717f9c3
Author: Zalo Correa <za...@esgyn.com>
Authored: Wed Feb 28 16:17:26 2018 -0800
Committer: Zalo Correa <za...@esgyn.com>
Committed: Wed Feb 28 16:17:26 2018 -0800
----------------------------------------------------------------------
core/sqf/src/trafconf/clusterconf.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/trafodion/blob/87739b81/core/sqf/src/trafconf/clusterconf.cpp
----------------------------------------------------------------------
diff --git a/core/sqf/src/trafconf/clusterconf.cpp b/core/sqf/src/trafconf/clusterconf.cpp
index 039b87d..907919d 100644
--- a/core/sqf/src/trafconf/clusterconf.cpp
+++ b/core/sqf/src/trafconf/clusterconf.cpp
@@ -381,7 +381,7 @@ bool CClusterConfig::LoadNodeConfig( void )
if (i == 0)
{
configMaster_ = pnodeConfigInfo.pnid;
- strcpy (configMasterName_ ,pnodeConfigInfo.nodename);
+ strncpy( configMasterName_ , pnodeConfigInfo.nodename, sizeof(configMasterName_) );
}
AddNodeConfiguration( pnodeConfigInfo, lnodeConfigInfo );
}