You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@trafodion.apache.org by su...@apache.org on 2017/05/31 18:47:20 UTC
[19/50] [abbrv] incubator-trafodion git commit: Merge remote branch
'origin/master' into TRAFODION-2001
Merge remote branch 'origin/master' into TRAFODION-2001
Conflicts:
core/sqf/export/include/common/evl_sqlog_eventnum.h
core/sqf/monitor/linux/makefile
core/sqf/sql/scripts/sqcheck
Project: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/commit/1e294233
Tree: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/tree/1e294233
Diff: http://git-wip-us.apache.org/repos/asf/incubator-trafodion/diff/1e294233
Branch: refs/heads/master
Commit: 1e2942337ed5888e7685406d786fe1fce6507d54
Parents: 52d45b7 1e94882
Author: Zalo Correa <za...@esgyn.com>
Authored: Wed Sep 28 15:37:47 2016 -0700
Committer: Zalo Correa <za...@esgyn.com>
Committed: Wed Sep 28 15:37:47 2016 -0700
----------------------------------------------------------------------
.../export/include/common/evl_sqlog_eventnum.h | 41 +
core/sqf/monitor/linux/cluster.cxx | 249 ++-
core/sqf/monitor/linux/cluster.h | 1 +
core/sqf/monitor/linux/makefile | 26 +-
core/sqf/monitor/linux/monitor.cxx | 187 +++
core/sqf/monitor/linux/monitor.h | 2 +
core/sqf/monitor/linux/zclient.cxx | 1432 ++++++++++++++++++
core/sqf/monitor/linux/zclient.h | 116 ++
core/sqf/monitor/linux/zootest.cxx | 283 ++++
core/sqf/monitor/linux/zootest.h | 34 +
core/sqf/sqenvcom.sh | 19 +-
core/sqf/sql/scripts/sqcheck | 27 +-
core/sql/bin/SqlciErrors.txt | 4 +-
core/sql/regress/compGeneral/EXPECTED023 | 2 +-
core/sql/regress/hive/EXPECTED018 | 183 ++-
core/sql/regress/privs1/EXPECTED132 | 100 +-
core/sql/regress/privs1/EXPECTED136 | 2 +
core/sql/regress/privs2/EXPECTED129 | 23 +-
core/sql/regress/privs2/EXPECTED135 | 17 +-
core/sql/regress/privs2/EXPECTED138 | 22 +
core/sql/regress/privs2/EXPECTED139 | 10 +
core/sql/regress/privs2/EXPECTED140 | 4 +-
core/sql/regress/privs2/EXPECTED142 | 6 +
core/sql/sqlcomp/PrivMgrDesc.cpp | 36 +-
core/sql/sqlcomp/PrivMgrDesc.h | 22 +-
core/sql/sqlcomp/PrivMgrPrivileges.cpp | 1349 +++++++----------
core/sql/sqlcomp/PrivMgrPrivileges.h | 78 +-
core/sql/sqlcomp/nadefaults.cpp | 2 +-
.../org/trafodion/dcs/server/ServerManager.java | 115 +-
.../phoenix/end2end/MultiCfQueryExecTest.java | 2 +-
30 files changed, 3345 insertions(+), 1049 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1e294233/core/sqf/export/include/common/evl_sqlog_eventnum.h
----------------------------------------------------------------------
diff --cc core/sqf/export/include/common/evl_sqlog_eventnum.h
index b15382f,e723d33..ec85b13
--- a/core/sqf/export/include/common/evl_sqlog_eventnum.h
+++ b/core/sqf/export/include/common/evl_sqlog_eventnum.h
@@@ -768,9 -727,38 +777,41 @@@
/* Module: pstartd.cxx = 35 */
#define MON_PSTARTD_MAIN_1 101350101
+/* Module: persistconfig.cxx = 36 */
+#define MON_PERSISTCONFIG_ADDCONFIG_1 101360101
+
+ /* Module: zclient.cxx = 37 */
+ #define MON_ZCLIENT_ZCLIENT_1 101370101
+ #define MON_ZCLIENT_ZCLIENT_2 101370102
+ #define MON_ZCLIENT_ZCLIENT_3 101370103
+ #define MON_ZCLIENT_SYNC_STRING_COMP_1 101370201
+ #define MON_ZCLIENT_CHECKCLUSTER_1 101370301
+ #define MON_ZCLIENT_CHECKCLUSTER_2 101370302
+ #define MON_ZCLIENT_CHECKCLUSTERZNODES_1 101370401
+ #define MON_ZCLIENT_CHECKCLUSTERZNODES_2 101370402
+ #define MON_ZCLIENT_CHECKCLUSTERZNODES_3 101370403
+ #define MON_ZCLIENT_GETCLUSTERZNODES_1 101370501
+ #define MON_ZCLIENT_GETCLUSTERZNODES_2 101370502
+ #define MON_ZCLIENT_REGISTERZNODE_1 101370601
+ #define MON_ZCLIENT_SHUTDOWNWORK_1 101370701
+ #define MON_ZCLIENT_ZCLIENTTHREAD_1 101370801
+ #define MON_ZCLIENT_STARTWORK_1 101370901
+ #define MON_ZCLIENT_MONITORZCLUSTER_1 101371001
+ #define MON_ZCLIENT_GETZNODEDATA_1 101371101
+ #define MON_ZCLIENT_GETZNODEDATA_2 101371102
+ #define MON_ZCLIENT_GETZNODEDATA_3 101371103
+ #define MON_ZCLIENT_WATCHCLUSTER_1 101371201
+ #define MON_ZCLIENT_WATCHCLUSTER_2 101371202
+ #define MON_ZCLIENT_SETZNODEWATCH_1 101371301
+ #define MON_ZCLIENT_SETZNODEWATCH_2 101371302
+ #define MON_ZCLIENT_WATCHNODE_1 101371401
+ #define MON_ZCLIENT_ZSESSIONWATCHER_1 101371501
+ #define MON_ZCLIENT_ZSESSIONWATCHER_2 101371502
+ #define MON_ZCLIENT_CHECKZNODE_1 101371601
+ #define MON_ZCLIENT_WATCHNODEDELETE_1 101371701
+ #define MON_ZCLIENT_WATCHNODEDELETE_2 101371702
+ #define MON_ZCLIENT_WATCHNODEDELETE_3 101371703
+
/**********************************************/
/*********** Seabed ***********/
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1e294233/core/sqf/monitor/linux/cluster.cxx
----------------------------------------------------------------------
diff --cc core/sqf/monitor/linux/cluster.cxx
index 5a46cf8,104f7d8..84768c9
--- a/core/sqf/monitor/linux/cluster.cxx
+++ b/core/sqf/monitor/linux/cluster.cxx
@@@ -4447,9 -4276,10 +4474,10 @@@ int CCluster::AllgatherSock( int nbytes
int p_n2recv;
bool p_sending;
bool p_receiving;
+ int p_timeout_count;
char *p_buff;
} peer_t;
- peer_t p[cfgPNodes_];
+ peer_t p[GetConfigPNodesMax()];
memset( p, 0, sizeof(p) );
tag = 0; // make compiler happy
@@@ -4481,18 -4312,116 +4510,116 @@@
inBarrier_ = true;
MonStats->BarrierWaitIncr( );
+ static int sv_epoll_wait_timeout = -2;
+ static int sv_epoll_retry_count = 1;
+ if ( sv_epoll_wait_timeout == -2 )
+ {
+ char *lv_epoll_wait_timeout_env = getenv( "SQ_MON_EPOLL_WAIT_TIMEOUT" );
+ if ( lv_epoll_wait_timeout_env )
+ {
+ // convert to milliseconds
+ sv_epoll_wait_timeout = atoi( lv_epoll_wait_timeout_env ) * 1000;
+ }
+ else
+ {
+ sv_epoll_wait_timeout = -1;
+ }
+
+ char *lv_epoll_retry_count_env = getenv( "SQ_MON_EPOLL_RETRY_COUNT" );
+ if ( lv_epoll_retry_count_env )
+ {
+ sv_epoll_retry_count = atoi( lv_epoll_retry_count_env );
+ }
+ if ( sv_epoll_retry_count < 0 )
+ {
+ sv_epoll_retry_count = 0;
+ }
+ if ( sv_epoll_retry_count > 100 )
+ {
+ sv_epoll_retry_count = 100;
+ }
+ }
+
// do the work
- struct epoll_event events[2*cfgPNodes_ + 1];
+ struct epoll_event events[2*GetConfigPNodesMax() + 1];
while ( 1 )
{
- int maxEvents = 2*cfgPNodes_ - nsent - nrecv;
+ int maxEvents = 2*GetConfigPNodesMax() - nsent - nrecv;
if ( maxEvents == 0 ) break;
int nw;
while ( 1 )
{
- nw = epoll_wait( epollFD_, events, maxEvents, -1 );
+ nw = epoll_wait( epollFD_, events, maxEvents, sv_epoll_wait_timeout );
if ( nw >= 0 || errno != EINTR ) break;
}
+ if ( nw == 0 )
+ {
- for ( int iPeer = 0; iPeer < cfgPNodes_; iPeer++ )
++ for ( int iPeer = 0; iPeer < GetConfigPNodesMax(); iPeer++ )
+ {
+ peer_t *peer = &p[iPeer];
+ if ( (iPeer != MyPNID) &&
+ (socks_[iPeer] != -1) )
+ {
+ if ( (peer->p_receiving) ||
+ (peer->p_sending) )
+ {
+
+ peer->p_timeout_count++;
+
+ if ( peer->p_timeout_count <= sv_epoll_retry_count )
+ {
+ continue;
+ }
+
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s@%d] Not heard from peer=%d\n"
+ , method_name
+ , __LINE__
+ , iPeer );
+
+ mon_log_write( MON_CLUSTER_ALLGATHERSOCK_1, SQ_LOG_CRIT, buf );
+ stats[iPeer].MPI_ERROR = MPI_ERR_EXITED;
+ err = MPI_ERR_IN_STATUS;
+ if ( peer->p_sending )
+ {
+ peer->p_sending = false;
+ nsent++;
+ }
+ if ( peer->p_receiving )
+ {
+ peer->p_receiving = false;
+ nrecv++;
+ }
+
+ // setup the epoll structures
+ struct epoll_event event;
+ event.data.fd = socks_[iPeer];
+ int op = 0;
+ if ( !peer->p_sending && !peer->p_receiving )
+ {
+ op = EPOLL_CTL_DEL;
+ event.events = 0;
+ }
+ else if ( peer->p_sending )
+ {
+ op = EPOLL_CTL_MOD;
+ event.events = EPOLLOUT | EPOLLET | EPOLLRDHUP;
+ }
+ else if ( peer->p_receiving )
+ {
+ op = EPOLL_CTL_MOD;
+ event.events = EPOLLIN | EPOLLET | EPOLLRDHUP;
+ }
+ if ( op == EPOLL_CTL_DEL || op == EPOLL_CTL_MOD )
+ {
+ EpollCtl( epollFD_, op, socks_[iPeer], &event );
+ }
+ }
+ }
+ }
+ }
+
if ( nw < 0 )
{
char ebuff[256];
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1e294233/core/sqf/monitor/linux/cluster.h
----------------------------------------------------------------------
diff --cc core/sqf/monitor/linux/cluster.h
index cfaf495,eba2d5c..4d429fd
mode 100644,100755..100644
--- a/core/sqf/monitor/linux/cluster.h
+++ b/core/sqf/monitor/linux/cluster.h
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1e294233/core/sqf/monitor/linux/makefile
----------------------------------------------------------------------
diff --cc core/sqf/monitor/linux/makefile
index 451e459,656e9e6..0660414
mode 100644,100755..100644
--- a/core/sqf/monitor/linux/makefile
+++ b/core/sqf/monitor/linux/makefile
@@@ -288,10 -288,13 +296,15 @@@ ALLOBJS += $(TRACE_LOG_OBJS
ALLOBJS += $(CONFOBJS)
ALLOBJS += $(MEMLOGOBJS)
ALLOBJS += $(RTSIGBLOCK_OBJS)
+ALLOBJS += $(TCONFOBJS)
+ ALLOBJS += $(ZOOMONOBJS)
- PGMS = $(BINEXPDIR)/monitor $(BINEXPDIR)/shell $(BINEXPDIR)/sqwatchdog $(BINEXPDIR)/monmemlog $(BINEXPDIR)/trafconf
+ PGMS = $(BINEXPDIR)/monitor
+ PGMS += $(BINEXPDIR)/shell
+ PGMS += $(BINEXPDIR)/sqwatchdog
+ PGMS += $(BINEXPDIR)/monmemlog
PGMS += $(BINEXPDIR)/pstartd
++PGMS += $(BINEXPDIR)/trafconf
PGMS += $(LIBEXPDIR)/libseabasesig.so
TEST_PGMS = $(OUTDIR)/client
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1e294233/core/sqf/monitor/linux/monitor.cxx
----------------------------------------------------------------------
diff --cc core/sqf/monitor/linux/monitor.cxx
index 34995f4,a4b114c..669f3ad
--- a/core/sqf/monitor/linux/monitor.cxx
+++ b/core/sqf/monitor/linux/monitor.cxx
@@@ -755,6 -750,150 +758,150 @@@ void CMonitor::StartPrimitiveProcesses
TRACE_EXIT;
}
+ void HandleZSessionExpiration( void ) // Called by the Zookeeper session watcher on session expiration or authentication failure.
+ {
+ const char method_name[] = "HandleZSessionExpiration";
+ TRACE_ENTRY;
+ ReqQueue.enqueueDownReq(MyPNID); // queue a node-down request for MyPNID
+ TRACE_EXIT;
+ }
+
+ void HandleNodeExpiration( const char *nodeName ) // Queues a node-down request for the node named nodeName, if Nodes has one.
+ {
+ const char method_name[] = "HandleNodeExpiration";
+ TRACE_ENTRY;
+ CNode *node = Nodes->GetNode((char *)nodeName); // casts away const for the GetNode() call
+ if (node) // a name with no matching node is ignored without logging
+ {
+ ReqQueue.enqueueDownReq(node->GetPNid());
+ }
+ TRACE_EXIT;
+ }
+
+ // Creates the global ZClient object from the Zookeeper quorum described by
+ // the ZOOKEEPER_NODES (comma-separated host names) and ZOOKEEPER_PORT
+ // environment variables. The two are combined into the
+ // "host:port,host:port,..." string passed to the CZClient constructor.
+ //
+ // Nothing is done when ZClientEnabled is false. If the port is missing or
+ // does not start with a digit, or if no host name can be extracted from
+ // ZOOKEEPER_NODES, a critical event is logged, ZClientEnabled is cleared
+ // and no client is created.
+ void CMonitor::CreateZookeeperClient( void )
+ {
+ const char method_name[] = "CMonitor::CreateZookeeperClient";
+ TRACE_ENTRY;
+
+ if ( ZClientEnabled )
+ {
+ string zkQuorumHosts;
+ stringstream zkQuorumPort;
+ char *env;
+
+ int zport;
+ env = getenv("ZOOKEEPER_PORT");
+ // <ctype.h> classifiers must be given an unsigned char value
+ if ( env && isdigit( static_cast<unsigned char>(*env) ) )
+ {
+ zport = atoi(env);
+ }
+ else
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf(buf, sizeof(buf),
+ "[%s], Zookeeper quorum port is not defined!\n"
+ , method_name);
+ mon_log_write(MON_MONITOR_CREATEZCLIENT_1, SQ_LOG_CRIT, buf);
+
+ ZClientEnabled = false;
+ TRACE_EXIT;
+ return;
+ }
+
+ env = getenv("ZOOKEEPER_NODES");
+ if ( env )
+ {
+ zkQuorumHosts = env;
+ }
+
+ // Split the host list on ',' directly on the std::string. Its length
+ // is controlled by the environment, so it must not be copied into a
+ // fixed-size buffer. Empty entries are skipped.
+ bool haveHost = false;
+ string::size_type start = 0;
+ zkQuorumPort.str( "" );
+ while ( start < zkQuorumHosts.length() )
+ {
+ string::size_type comma = zkQuorumHosts.find( ',', start );
+ if ( comma == string::npos )
+ {
+ comma = zkQuorumHosts.length();
+ }
+ if ( comma > start )
+ {
+ if ( haveHost )
+ {
+ zkQuorumPort << ",";
+ }
+ zkQuorumPort << zkQuorumHosts.substr( start, comma - start )
+ << ":"
+ << zport;
+ haveHost = true;
+ }
+ start = comma + 1;
+ }
+
+ // An unset, empty or separator-only host list leaves nothing to
+ // connect to. The CZClient constructor aborts on an empty quorum
+ // string, so report the problem and disable the client here instead.
+ if ( !haveHost )
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf(buf, sizeof(buf),
+ "[%s], Zookeeper quorum hosts are not defined!\n"
+ , method_name);
+ mon_log_write(MON_MONITOR_CREATEZCLIENT_2, SQ_LOG_CRIT, buf);
+
+ ZClientEnabled = false;
+ TRACE_EXIT;
+ return;
+ }
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d zkQuorumPort=%s\n"
+ , method_name, __LINE__
+ , zkQuorumPort.str().c_str() );
+ }
+
+ ZClient = new CZClient( zkQuorumPort.str().c_str()
+ , ZCLIENT_TRAFODION_ZNODE
+ , ZCLIENT_INSTANCE_ZNODE );
+ if ( ZClient == NULL )
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf(buf, sizeof(buf),
+ "[%s], Failed to allocate ZClient object!\n"
+ , method_name);
+ mon_log_write(MON_MONITOR_CREATEZCLIENT_3, SQ_LOG_CRIT, buf);
+ abort();
+ }
+ }
+
+ TRACE_EXIT;
+ }
+
+ void CMonitor::StartZookeeperClient( void ) // Calls ZClient->StartWork() and, when that returns 0, ZClient->StartMonitoring().
+ {
+ const char method_name[] = "CMonitor::StartZookeeperClient";
+ TRACE_ENTRY;
+
+ int rc = -1;
+
+ if ( ZClientEnabled )
+ {
+ if ( ZClient ) // nothing to start when no client object exists
+ {
+ rc = ZClient->StartWork();
+ if (rc == 0) // NOTE(review): a non-zero StartWork() result is neither logged nor returned here
+ {
+ ZClient->StartMonitoring();
+
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf(buf, sizeof(buf),
+ "[%s], ZClient node monitoring started\n"
+ , method_name);
+ mon_log_write(MON_MONITOR_STARTZCLIENT_1, SQ_LOG_INFO, buf);
+ }
+ }
+ }
+
+ TRACE_EXIT;
+ }
+
#ifdef USE_SEQUENCE_NUM
long long CMonitor::GetTimeSeqNum()
{
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1e294233/core/sqf/monitor/linux/monitor.h
----------------------------------------------------------------------
diff --cc core/sqf/monitor/linux/monitor.h
index 49308b9,1b44c57..1b44c57
mode 100644,100755..100644
--- a/core/sqf/monitor/linux/monitor.h
+++ b/core/sqf/monitor/linux/monitor.h
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1e294233/core/sqf/monitor/linux/zclient.cxx
----------------------------------------------------------------------
diff --cc core/sqf/monitor/linux/zclient.cxx
index 0000000,01ab4e5..c767639
mode 000000,100644..100644
--- a/core/sqf/monitor/linux/zclient.cxx
+++ b/core/sqf/monitor/linux/zclient.cxx
@@@ -1,0 -1,1432 +1,1432 @@@
+ /**********************************************************************
+ // @@@ START COPYRIGHT @@@
+ //
+ // Licensed to the Apache Software Foundation (ASF) under one
+ // or more contributor license agreements. See the NOTICE file
+ // distributed with this work for additional information
+ // regarding copyright ownership. The ASF licenses this file
+ // to you under the Apache License, Version 2.0 (the
+ // "License"); you may not use this file except in compliance
+ // with the License. You may obtain a copy of the License at
+ //
+ // http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing,
+ // software distributed under the License is distributed on an
+ // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ // KIND, either express or implied. See the License for the
+ // specific language governing permissions and limitations
+ // under the License.
+ //
+ // @@@ END COPYRIGHT @@@
+ ********************************************************************/
+ #include <stdlib.h>
+ #include <errno.h>
+ #include <sys/ioctl.h>
+ #include <sys/time.h>
+ #include <signal.h>
+ #include <ctype.h>
+ #include <string.h>
+ #include <ifaddrs.h>
+ #include <netdb.h>
+ #include <new>
+ #include <stdio.h>
+ #include <list>
+ #include <string>
+
+ #include "lock.h"
+ #include "msgdef.h"
+ #include "montrace.h"
+ #include "monlogging.h"
+ #include "reqqueue.h"
+ #include "pnode.h"
+ #include "zclient.h"
+
+ // The following specify the default values for the timers if the
+ // zclient cluster monitoring timer related variables are not defined.
+ //
+ // NOTE: It is recommended to set the checkrate to -1, which essentially
+ // makes the zclient event driven, meaning the watcher is called
+ // only when a watched znode is changed or is deleted (expires).
+ // Also, the session timeout must be kept at or below 60 seconds
+ // as this is enforced by Zookeeper. Any value above 60 seconds
+ // is renegotiated by Zookeeper to 60 seconds.
+ #define ZCLIENT_MONITORING_CHECKRATE -1 // seconds (disabled)
+ #define ZCLIENT_SESSION_TIMEOUT 60 // seconds (1 minute)
+
+ // The monitors register their znodes under the cluster znode
+ #define ZCLIENT_CLUSTER_ZNODE "/cluster"
+
+ using namespace std;
+
+ extern char Node_name[MPI_MAX_PROCESSOR_NAME];
+ extern int MyPNID;
+ extern int MyNid;
+ extern int MyPid;
+
+ extern CNodeContainer *Nodes;
+ extern CReqQueue ReqQueue;
+ extern CZClient *ZClient;
+ extern CMonLog *MonLog;
+ extern CMonLog *SnmpLog;
+
+ extern bool debugFlag;
+
+ static zhandle_t *ZHandle;
+ static clientid_t MyZooId;
+
+ void ZSessionWatcher( zhandle_t *zzh
+ , int type
+ , int state
+ , const char *path
+ , void *watcherCtx);
+
+ void FreeStringVector( struct String_vector *v ) // Frees every string in v->data and the array itself, then resets v to empty.
+ {
+ if ( v->data ) // a vector with no data array is left untouched
+ {
+ for ( int32_t i=0; i < v->count; i++ )
+ {
+ free( v->data[i] );
+ }
+ free( v->data );
+ v->data = NULL; // cleared so that a second call is a no-op
+ v->count = 0;
+ }
+ }
+
+ static const char *ZClientStateStr( CZClient::ZClientState_t state ) // Returns the enumerator name of state as a string literal.
+ {
+ switch (state)
+ {
+ case CZClient::ZC_DISABLED:
+ return "ZC_DISABLED";
+ case CZClient::ZC_START:
+ return "ZC_START";
+ case CZClient::ZC_CLUSTER:
+ return "ZC_CLUSTER";
+ case CZClient::ZC_ZNODE:
+ return "ZC_ZNODE";
+ case CZClient::ZC_WATCH:
+ return "ZC_WATCH";
+ case CZClient::ZC_STOP:
+ return "ZC_STOP";
+ case CZClient::ZC_SHUTDOWN:
+ return "ZC_SHUTDOWN";
+ default:
+ break;
+ }
+ return "ZClient State Invalid"; // any value not listed above
+ }
+
+ static const char *ZooConnectionTypeStr( int type ) // Returns the name of a Zookeeper watcher event type as a string literal.
+ {
+ if ( type == ZOO_CREATED_EVENT ) // an if-chain, not a switch, is used to compare against the ZOO_* event values
+ return "ZOO_CREATED_EVENT";
+ if ( type == ZOO_DELETED_EVENT )
+ return "ZOO_DELETED_EVENT";
+ if ( type == ZOO_CHANGED_EVENT )
+ return "ZOO_CHANGED_EVENT";
+ if ( type == ZOO_CHILD_EVENT )
+ return "ZOO_CHILD_EVENT";
+ if ( type == ZOO_SESSION_EVENT )
+ return "ZOO_SESSION_EVENT";
+ if ( type == ZOO_NOTWATCHING_EVENT )
+ return "ZOO_NOTWATCHING_EVENT";
+
+ return "INVALID_TYPE"; // any value not matched above
+ }
+
+ static const char *ZooConnectionStateStr( int state ) // Returns the name of a Zookeeper connection state as a string literal.
+ {
+ if ( state == 0 ) // 0 is reported as closed; it is compared literally, not against a ZOO_* constant
+ return "CLOSED_STATE";
+ if ( state == ZOO_EXPIRED_SESSION_STATE )
+ return "EXPIRED_SESSION_STATE";
+ if ( state == ZOO_AUTH_FAILED_STATE )
+ return "AUTH_FAILED_STATE";
+ if ( state == ZOO_CONNECTING_STATE )
+ return "CONNECTING_STATE";
+ if ( state == ZOO_ASSOCIATING_STATE )
+ return "ASSOCIATING_STATE";
+ if ( state == ZOO_CONNECTED_STATE )
+ return "CONNECTED_STATE";
+
+ return "INVALID_STATE"; // any value not matched above
+ }
+
+ const char *ZooErrorStr( int error ) // Returns the name of a Zookeeper error code, or its decimal value for codes not listed here.
+ {
+ if ( error == 0 )
+ return "ZOK";
+ if ( error == ZNONODE )
+ return "ZNONODE";
+ if ( error == ZNODEEXISTS )
+ return "ZNODEEXISTS";
+ if ( error == ZNOAUTH )
+ return "ZNOAUTH";
+ if ( error == ZNOCHILDRENFOREPHEMERALS )
+ return "ZNOCHILDRENFOREPHEMERALS";
+ if ( error == ZBADARGUMENTS )
+ return "ZBADARGUMENTS";
+ if ( error == ZINVALIDSTATE )
+ return "ZINVALIDSTATE";
+ if ( error == ZMARSHALLINGERROR )
+ return "ZMARSHALLINGERROR";
+ if ( error == ZCONNECTIONLOSS )
+ return "ZCONNECTIONLOSS";
+ if ( error == ZOPERATIONTIMEOUT )
+ return "ZOPERATIONTIMEOUT";
+
+ static char errorStr[20]; // shared static buffer: the returned text is overwritten by the next call that reaches this point
+ sprintf( errorStr, "%d", error );
+ return errorStr;
+ }
+
+ void ZSessionWatcher( zhandle_t *zzh // Watcher callback registered with zookeeper_init(); zzh is the handle the event belongs to
+ , int type // event type (one of the ZOO_*_EVENT values)
+ , int state // connection state (one of the ZOO_*_STATE values)
+ , const char *path // znode path the event refers to; may be NULL or empty
+ , void *watcherCtx) // unused
+ {
+ const char method_name[] = "ZSessionWatcher";
+ TRACE_ENTRY;
+
+ watcherCtx = watcherCtx; // Make compiler happy!
+
+ /*
+ * Use zzh rather than the global ZHandle here: this may be multi-threaded
+ * code, and the client lib may call the watcher before zookeeper_init returns
+ */
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ if ( path && strlen( path ) > 0 )
+ {
+ trace_printf( "%s@%d" " - Watcher %s state = %s for path %s\n"
+ , method_name, __LINE__
+ , ZooConnectionTypeStr( type )
+ , ZooConnectionStateStr( state )
+ , path );
+ }
+ else
+ {
+ trace_printf( "%s@%d" " - Watcher %s state = %s\n"
+ , method_name, __LINE__
+ , ZooConnectionTypeStr( type )
+ , ZooConnectionStateStr( state ) );
+ }
+ }
+
+ if ( type == ZOO_SESSION_EVENT ) // session events are handled here; all other event types are forwarded to ZClient below
+ {
+ if ( state == ZOO_CONNECTED_STATE )
+ {
+ const clientid_t *id = zoo_client_id( zzh );
+ if ( MyZooId.client_id == 0 || MyZooId.client_id != id->client_id ) // first connection, or the session id changed
+ {
+ MyZooId = *id; // remember the current session id
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d" " - Got a new session id: 0x%llx\n"
+ , method_name, __LINE__
+ , static_cast<long long unsigned int>(MyZooId.client_id) );
+ }
+ }
+ }
+ else if ( state == ZOO_AUTH_FAILED_STATE )
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], Error Zookeeper authentication failure. Node going down...\n"
+ , method_name );
+ mon_log_write(MON_ZCLIENT_ZSESSIONWATCHER_1, SQ_LOG_CRIT, buf);
+
+ HandleZSessionExpiration(); // queues the node-down request for this node
+
+ zookeeper_close( zzh ); // NOTE(review): the handle is closed from inside its own watcher callback - confirm the client library allows this
+ ZHandle=0;
+ }
+ else if ( state == ZOO_EXPIRED_SESSION_STATE ) // handled exactly like the authentication failure above, with its own event id and text
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], Error Zookeeper session expired. Node going down...\n"
+ , method_name );
+ mon_log_write(MON_ZCLIENT_ZSESSIONWATCHER_2, SQ_LOG_CRIT, buf);
+
+ HandleZSessionExpiration();
+
+ zookeeper_close( zzh );
+ ZHandle=0;
+ }
+ }
+ else if ( type == ZOO_CREATED_EVENT ) // the five branches below are identical: each forwards the event to ZClient->TriggerCheck()
+ {
+ ZClient->TriggerCheck( type, path );
+ }
+ else if ( type == ZOO_DELETED_EVENT )
+ {
+ ZClient->TriggerCheck( type, path );
+ }
+ else if ( type == ZOO_CHANGED_EVENT )
+ {
+ ZClient->TriggerCheck( type, path );
+ }
+ else if ( type == ZOO_CHILD_EVENT )
+ {
+ ZClient->TriggerCheck( type, path );
+ }
+ else if ( type == ZOO_NOTWATCHING_EVENT )
+ {
+ ZClient->TriggerCheck( type, path );
+ }
+
+ TRACE_EXIT;
+ }
+
+ CZClient::CZClient( const char *quorumHosts // Zookeeper connection string; must not be empty (the constructor aborts otherwise)
+ , const char *rootNode // copied to zkRootNode_, the first component of the znode paths built by this class
+ , const char *instanceNode ) // copied to zkRootNodeInstance_, appended to the root when building znode paths
+ :threadId_(0)
+ ,state_(ZC_DISABLED)
+ ,enabled_(false)
+ ,checkCluster_(false)
+ ,zcMonitoringRate_(ZCLIENT_MONITORING_CHECKRATE) // seconds
+ ,zkQuorumHosts_(quorumHosts)
+ ,zkRootNode_(rootNode)
+ ,zkRootNodeInstance_(instanceNode)
+ ,zkQuorumPort_("")
+ ,zkSessionTimeout_(ZCLIENT_SESSION_TIMEOUT) // seconds
+ {
+ const char method_name[] = "CZClient::CZClient";
+ TRACE_ENTRY;
+
+ memcpy(&eyecatcher_, "ZCLT", 4); // upper-case marker while the object is alive; the destructor overwrites it with "zclt"
+
+ char *zcMonitoringRateValueC;
+ int zcMonitoringRateValue;
+ if ( (zcMonitoringRateValueC = getenv( "SQ_MON_ZCLIENT_MONITORING_CHECKRATE" )) ) // optional override of the default check rate
+ {
+ // in seconds
+ zcMonitoringRateValue = atoi( zcMonitoringRateValueC );
+ zcMonitoringRate_ = zcMonitoringRateValue; // in seconds
+ }
+
+ char *zkSessionTimeoutC;
+ int zkSessionTimeoutValue;
+ if ( (zkSessionTimeoutC = getenv( "SQ_MON_ZCLIENT_SESSION_TIMEOUT" )) ) // optional override of the default session timeout
+ {
+ // in seconds
+ zkSessionTimeoutValue = atoi( zkSessionTimeoutC );
+ zkSessionTimeout_ = zkSessionTimeoutValue; // in seconds
+ }
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d" " - ZClient monitoring rate in seconds=%ld\n"
+ , method_name, __LINE__, zcMonitoringRate_ );
+ trace_printf( "%s@%d" " - ZClient session timeout in seconds =%d\n"
+ , method_name, __LINE__, zkSessionTimeout_ );
+ }
+
+ if ( zkQuorumHosts_.length() == 0 ) // no quorum to connect to: fatal
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], Zookeeper quorum port address not initialized\n"
+ , method_name);
+ mon_log_write(MON_ZCLIENT_ZCLIENT_1, SQ_LOG_ERR, buf);
+ abort();
+ }
+ else
+ {
+ zkQuorumPort_ << zkQuorumHosts_.c_str(); // the connection string is used as given
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d zkQuorumPort is: %s\n"
+ , method_name, __LINE__
+ , zkQuorumPort_.str( ).c_str( ));
+ }
+ }
+
+ // Initialize zookeeper
+ zoo_deterministic_conn_order( 0 ); // non-deterministic order for client connections
+ ZHandle = zookeeper_init( zkQuorumPort_.str( ).c_str( )
+ , ZSessionWatcher
+ , zkSessionTimeout_ * 1000 // seconds converted to milliseconds
+ , &MyZooId
+ , 0
+ , 0 );
+ if ( ZHandle == 0 ) // could not create the Zookeeper handle: fatal
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zookeeper_init() failed for host:port %s\n"
+ , method_name, zkQuorumPort_.str( ).c_str( ));
+ mon_log_write(MON_ZCLIENT_ZCLIENT_2, SQ_LOG_ERR, buf);
+ abort();
+ }
+
+ int rc = InitializeZClient(); // creates the cluster znodes and registers this node's znode
+ if ( rc ) // any failure here is fatal as well
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], Failed ZClient initialization (%s)\n"
+ , method_name, ZooErrorStr(rc) );
+ mon_log_write(MON_ZCLIENT_ZCLIENT_3, SQ_LOG_ERR, buf);
+ abort();
+ }
+
+ TRACE_EXIT;
+ }
+
+ CZClient::~CZClient( void ) // Marks the object as destroyed and closes the Zookeeper handle if one is still open.
+ {
+ const char method_name[] = "CZClient::~CZClient";
+ TRACE_ENTRY;
+
+ memcpy(&eyecatcher_, "zclt", 4); // lower-case marker; the constructor sets "ZCLT"
+
+ if (ZHandle) // ZSessionWatcher may already have closed and cleared the handle
+ {
+ zookeeper_close(ZHandle);
+ ZHandle = 0;
+ }
+
+ TRACE_EXIT;
+ }
+
+ void CZClient::CheckCluster( void ) // Reads the child znodes of the cluster znode and, for each, fetches and traces its node name and pnid.
+ {
+ const char method_name[] = "CZClient::CheckCluster";
+ TRACE_ENTRY;
+
+ int rc;
+ struct String_vector nodes;
+
+ if ( IsCheckCluster() )
+ {
+ rc = GetClusterZNodes( &nodes );
+ if ( rc != ZOK ) // cannot list the cluster: log, move to ZC_STOP and wake a waiter
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], GetClusterZNodes() failed!\n"
+ , method_name );
+ mon_log_write(MON_ZCLIENT_CHECKCLUSTER_1, SQ_LOG_ERR, buf);
+ SetState( CZClient::ZC_STOP );
+ CLock::wakeOne();
+ return; // NOTE(review): this path returns without TRACE_EXIT
+ }
+
+ stringstream newpath;
+ string monZnode; // NOTE(review): unused; shadowed by the monZnode declared inside the loop
+ string nodeName;
+ int pnid = -1;
+
+ if ( nodes.count > 0 )
+ {
+ for (int i = 0; i < nodes.count ; i++ )
+ {
+ newpath.str( "" );
+ newpath << zkRootNode_.c_str() // full path: <root><instance>/cluster/<child name>
+ << zkRootNodeInstance_.c_str()
+ << ZCLIENT_CLUSTER_ZNODE << "/"
+ << nodes.data[i];
+ string monZnode = newpath.str( );
+
+ rc = GetZNodeData( monZnode, nodeName, pnid );
+ if ( rc != ZOK ) // a failure on one znode is logged and the loop continues
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], GetZNodeData() failed!\n"
+ , method_name );
+ mon_log_write(MON_ZCLIENT_CHECKCLUSTER_2, SQ_LOG_ERR, buf);
+ }
+ else
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d monZnode=%s, nodeName=%s, pnid=%d)\n"
+ , method_name, __LINE__
+ , monZnode.c_str(), nodeName.c_str(), pnid );
+ }
+ }
+ }
+ FreeStringVector( &nodes ); // releases the child list filled in by GetClusterZNodes()
+ }
+ }
+ else
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d CheckCluster is NOT set!\n"
+ , method_name, __LINE__ );
+ }
+ }
+
+ TRACE_EXIT;
+ }
+
+ int CZClient::GetClusterZNodes( String_vector *nodes ) // Fills nodes with the children of <root><instance>/cluster; returns the last Zookeeper result code.
+ {
+ const char method_name[] = "CZClient::GetClusterZNodes";
+ TRACE_ENTRY;
+
+ bool found = false;
+ int rc = -1;
+ int retries = 0;
+ Stat stat;
+
+ stringstream ss;
+ ss.str( "" );
+ ss << zkRootNode_.c_str()
+ << zkRootNodeInstance_.c_str()
+ << ZCLIENT_CLUSTER_ZNODE;
+ string trafCluster( ss.str( ) );
+
+ nodes->count = 0; // start from an empty vector so callers can test count/data on any return path
+ nodes->data = NULL;
+
+ while ( !found ) // NOTE(review): retries below are immediate; there is no delay between attempts
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d trafCluster=%s\n"
+ , method_name, __LINE__, trafCluster.c_str() );
+ }
+ // Verify the existence of the parent ZCLIENT_CLUSTER_ZNODE
+ rc = zoo_exists( ZHandle, trafCluster.c_str( ), 0, &stat );
+ if ( rc == ZNONODE )
+ {
+ if (retries > 10) // give up; ZNONODE is returned to the caller
+ break;
+ retries++;
+ continue;
+ }
+ else if ( rc == ZOK )
+ {
+ // Now get the list of available znodes in the cluster.
+ //
+ // This will return child znodes for each monitor process that has
+ // registered, including this process.
+ rc = zoo_get_children( ZHandle, trafCluster.c_str( ), 0, nodes ); // NOTE(review): rc is not examined here; only nodes->count decides what happens next
+ if ( nodes->count > 0 )
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d nodes.count=%d\n"
+ , method_name, __LINE__
+ , nodes->count );
+ }
+ found = true;
+ }
+ else
+ {
+ if (retries > 10) // give up with whatever zoo_get_children() returned
+ break;
+ retries++;
+ continue;
+ }
+ }
+ else // error
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_exists() for %s failed with error %s\n"
+ , method_name, trafCluster.c_str( ), ZooErrorStr(rc));
+ mon_log_write(MON_ZCLIENT_GETCLUSTERZNODES_2, SQ_LOG_ERR, buf);
+ break;
+ }
+ }
+
+ TRACE_EXIT;
+ return( rc );
+ }
+
+ // Reads the data stored in the monitor znode monZnode and parses it as
+ // "<node name>:<pnid>".
+ //
+ // monZnode - full path of the znode to read
+ // nodeName - set to the text before the first ':' when the read succeeds
+ // pnid - set to the integer value of the text after that ':' when present
+ //
+ // Returns ZOK on success, ZNONODE when the znode does not exist, or the
+ // error returned by zoo_exists()/zoo_get(). nodeName and pnid are left
+ // untouched on failure or when the znode holds no data.
+ int CZClient::GetZNodeData( string &monZnode, string &nodeName, int &pnid )
+ {
+ const char method_name[] = "CZClient::GetZNodeData";
+ TRACE_ENTRY;
+
+ char *tkn = NULL;
+ char *savePtr = NULL;
- char zkData[MAX_PROCESSOR_NAME];
++ char zkData[MPI_MAX_PROCESSOR_NAME];
+ int rc = -1;
+ // zoo_get() does not NUL-terminate what it stores, so one byte of
+ // zkData is held back for the terminator added after the read.
+ int zkDataLen = sizeof(zkData) - 1;
+ Stat stat;
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d monZnode=%s\n"
+ , method_name, __LINE__, monZnode.c_str() );
+ }
+ rc = zoo_exists( ZHandle, monZnode.c_str( ), 0, &stat );
+ if ( rc == ZNONODE )
+ {
+ // return the error
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d monZnode=%s does not exist (ZNONODE)\n"
+ , method_name, __LINE__, monZnode.c_str() );
+ }
+ }
+ else if ( rc == ZOK )
+ {
+ // Get the pnid from the data part of znode
+ rc = zoo_get( ZHandle, monZnode.c_str( ), false, zkData, &zkDataLen, &stat );
+ if ( rc == ZOK )
+ {
+ // zkDataLen now holds the number of bytes stored, or -1 when
+ // the znode has no data; terminate before string parsing.
+ zkData[ (zkDataLen > 0) ? zkDataLen : 0 ] = '\0';
+
+ // strtok_r() keeps its position in savePtr rather than in hidden
+ // static state shared with every other strtok() caller.
+ // The first token is the node name
+ tkn = strtok_r( zkData, ":", &savePtr );
+ if ( tkn != NULL )
+ {
+ nodeName = tkn;
+ // The second token is the pnid; it is converted in place
+ // so that no fixed-size scratch copy is needed.
+ tkn = strtok_r( NULL, ":", &savePtr );
+ if ( tkn != NULL )
+ {
+ pnid = atoi( tkn );
+ }
+ }
+ // TODO: Save monZnode path in corresponding physical node object
+ // to match with when ZC_NODE is triggered
+ }
+ else
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_get() for %s failed with error %s\n"
+ , method_name, monZnode.c_str( ), ZooErrorStr(rc));
+ mon_log_write(MON_ZCLIENT_GETZNODEDATA_2, SQ_LOG_ERR, buf);
+ }
+ }
+ else
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_exists() for %s failed with error %s\n"
+ , method_name, monZnode.c_str( ), ZooErrorStr(rc));
+ mon_log_write(MON_ZCLIENT_GETZNODEDATA_3, SQ_LOG_ERR, buf);
+ }
+
+ TRACE_EXIT;
+ return( rc );
+ }
+
+ // Takes the znode path at the front of znodeQueue_, derives the node name
+ // from the last '/'-separated component of that path, logs the deletion
+ // and passes the name to HandleNodeExpiration().
+ //
+ // Only traces when cluster checking is not enabled. Returns without doing
+ // anything when the queue is empty.
+ void CZClient::HandleExpiredZNode( void )
+ {
+ const char method_name[] = "CZClient::HandleExpiredZNode";
+ TRACE_ENTRY;
+
+ if ( IsCheckCluster() )
+ {
+ // front() on an empty container is undefined behavior
+ if ( znodeQueue_.empty() )
+ {
+ TRACE_EXIT;
+ return;
+ }
+
+ string monZnode;
+ string nodeName;
+
+ monZnode.assign( znodeQueue_.front() );
+
+ if (trace_settings)
+ {
+ trace_printf("%s@%d" " - znodePath=%s, znodeQueue_.size=%ld\n"
+ , method_name, __LINE__
+ , monZnode.c_str(), znodeQueue_.size() );
+ }
+
+ znodeQueue_.pop_front();
+
+ // Gated like every other trace in this file
+ if (trace_settings)
+ {
+ trace_printf( "%s@%d" " - Checking znode=%s\n"
+ , method_name, __LINE__
+ , monZnode.c_str() );
+ }
+
+ // The node name is the last non-empty path component. It is taken
+ // straight from the std::string, so the path length is not limited
+ // by a fixed buffer and a path without any component yields an
+ // empty name instead of a NULL token.
+ string::size_type last = monZnode.find_last_not_of( '/' );
+ if ( last != string::npos )
+ {
+ string::size_type slash = monZnode.find_last_of( '/', last );
+ string::size_type first = (slash == string::npos) ? 0 : slash + 1;
+ nodeName = monZnode.substr( first, last - first + 1 );
+ }
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d nodeName=%s\n"
+ , method_name, __LINE__
+ , nodeName.c_str() );
+ }
+
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], %s was deleted, handling node (%s) as a down node!\n"
+ , method_name, monZnode.c_str(), nodeName.c_str() );
+ mon_log_write(MON_ZCLIENT_CHECKZNODE_1, SQ_LOG_ERR, buf);
+
+ // Without a name there is no node to look up
+ if ( !nodeName.empty() )
+ {
+ HandleNodeExpiration( nodeName.c_str() );
+ }
+ }
+ else
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d CheckCluster is NOT set!\n"
+ , method_name, __LINE__ );
+ }
+ }
+
+ TRACE_EXIT;
+ }
+
+ // Prepares this monitor's presence in Zookeeper: makes sure the cluster
+ // znodes exist (MakeClusterZNodes) and, only if that succeeds, registers
+ // this node's own znode (RegisterMyNodeZNode).
+ //
+ // Returns ZOK on success, otherwise the result code of the step that failed.
+ int CZClient::InitializeZClient( void )
+ {
+ // Must name this method: it labels the entry/exit traces.
+ const char method_name[] = "CZClient::InitializeZClient";
+ TRACE_ENTRY;
+
+ int rc;
+
+ rc = MakeClusterZNodes();
+ if ( rc == ZOK )
+ {
+ rc = RegisterMyNodeZNode();
+ }
+
+ TRACE_EXIT;
+ return( rc );
+ }
+
+ // Makes sure the three nested znodes used by the monitors exist, creating
+ // each one that is missing, from the top down:
+ // <root>
+ // <root><instance>
+ // <root><instance>/cluster
+ //
+ // Returns ZOK when all three exist on return. Otherwise returns the result
+ // code of the first step that failed; the steps below it are not attempted,
+ // so a later successful call cannot hide an earlier failure.
+ int CZClient::MakeClusterZNodes( void )
+ {
+ const char method_name[] = "CZClient::MakeClusterZNodes";
+ TRACE_ENTRY;
+
+ int rc;
+ Stat stat;
+
+ stringstream ss;
+ ss.str( "" );
+ ss << zkRootNode_.c_str();
+ string rootDir( ss.str( ) );
+
+ rc = zoo_exists( ZHandle, rootDir.c_str(), 0, &stat );
+ switch (rc)
+ {
+ case ZOK:
+ break;
+ case ZNONODE:
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d RegisterZNode(%s)\n"
+ , method_name, __LINE__
+ , rootDir.c_str() );
+ }
+ rc = RegisterZNode( rootDir.c_str(), NULL, 0 );
+ break;
+ default:
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_exists() failed with error %s\n"
+ , method_name, ZooErrorStr(rc) );
+ mon_log_write(MON_ZCLIENT_CHECKCLUSTERZNODES_1, SQ_LOG_ERR, buf);
+ }
+ break;
+ }
+ if ( rc != ZOK )
+ {
+ TRACE_EXIT;
+ return( rc ); // Return the error
+ }
+
+ ss.str( "" );
+ ss << zkRootNode_.c_str()
+ << zkRootNodeInstance_.c_str();
+ string instanceDir( ss.str( ) );
+
+ rc = zoo_exists( ZHandle, instanceDir.c_str( ), 0, &stat );
+ switch (rc)
+ {
+ case ZOK:
+ break;
+ case ZNONODE:
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d RegisterZNode(%s)\n"
+ , method_name, __LINE__
+ , instanceDir.c_str() );
+ }
+ rc = RegisterZNode( instanceDir.c_str(), NULL, 0 );
+ break;
+ default:
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_exists() failed with error %s\n"
+ , method_name, ZooErrorStr(rc) );
+ mon_log_write(MON_ZCLIENT_CHECKCLUSTERZNODES_2, SQ_LOG_ERR, buf);
+ }
+ break;
+ }
+ if ( rc != ZOK )
+ {
+ // Without this the result would be overwritten by the next
+ // zoo_exists() call and the failure lost.
+ TRACE_EXIT;
+ return( rc );
+ }
+
+ ss.str( "" );
+ ss << zkRootNode_.c_str()
+ << zkRootNodeInstance_.c_str()
+ << ZCLIENT_CLUSTER_ZNODE;
+ string clusterDir( ss.str( ) );
+
+ rc = zoo_exists( ZHandle, clusterDir.c_str( ), 0, &stat );
+ switch (rc)
+ {
+ case ZOK:
+ break;
+ case ZNONODE:
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d RegisterZNode(%s)\n"
+ , method_name, __LINE__
+ , clusterDir.c_str() );
+ }
+ rc = RegisterZNode( clusterDir.c_str(), NULL, 0 );
+ break;
+ default:
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_exists() failed with error %s\n"
+ , method_name, ZooErrorStr(rc) );
+ mon_log_write(MON_ZCLIENT_CHECKCLUSTERZNODES_3, SQ_LOG_ERR, buf);
+ }
+ break;
+ }
+
+ TRACE_EXIT;
+ return(rc);
+ }
+
+ // ZClient main processing loop
+ void CZClient::MonitorZCluster() // Loops until GetState() returns ZC_SHUTDOWN: waits on the lock, then dispatches on the current state.
+ {
+ const char method_name[] = "CZClient::MonitorZCluster";
+ TRACE_ENTRY;
+
+ int rc;
+ struct timespec timeout; // absolute wake-up time handed to CLock::timedWait()
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d thread %lx starting\n"
+ , method_name, __LINE__, threadId_);
+ }
+
+ if (zcMonitoringRate_ >= 0) // a negative rate selects the untimed wait below, so no wake-up time is needed
+ {
+ SetTimeToWakeUp( timeout );
+ }
+
+ while ( GetState() != ZC_SHUTDOWN )
+ {
+ lock(); // held across the wait and the state dispatch; released at the bottom of each iteration
+ if ( !IsEnabled() )
+ {
+ // Wait until timer started
+ CLock::wait();
+ }
+ else
+ {
+ if (zcMonitoringRate_ < 0)
+ {
+ // Wait until signaled
+ CLock::wait();
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d" " - ZCluster signaled, state_=%s\n"
+ , method_name, __LINE__
+ , ZClientStateStr(GetState()) );
+ }
+ }
+ else
+ {
+ // Wait until signaled or timer expires
+ rc = CLock::timedWait( &timeout );
+ if ( rc != ETIMEDOUT ) // woken before the timeout: either signaled (rc == 0) or the wait itself failed
+ {
+ if ( rc != 0 )
+ {
+ StopClusterMonitoring(); // any timedWait() error other than ETIMEDOUT disables monitoring
+ }
+ else
+ {
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d" " - ZCluster signaled, state_=%s\n"
+ , method_name, __LINE__
+ , ZClientStateStr(GetState()) );
+ }
+ }
+ }
+ }
+ }
+
+ switch ( GetState() ) // state is re-read after the wait; it may have been changed by whoever signaled
+ {
+ case ZC_START:
+ StartClusterMonitoring();
+ break;
+ case ZC_CLUSTER:
+ if ( IsCheckCluster() )
+ {
+ CheckCluster();
+ }
+ break;
+ case ZC_WATCH:
+ if ( !IsCheckCluster() ) // WatchCluster() is only invoked while CheckCluster is still off
+ {
+ WatchCluster();
+ }
+ break;
+ case ZC_ZNODE:
+ if ( IsCheckCluster() )
+ {
+ HandleExpiredZNode();
+ }
+ break;
+ case ZC_STOP:
+ StopClusterMonitoring();
+ break;
+ default: // all other states (including ZC_SHUTDOWN) need no work here
+ break;
+ }
+ if (zcMonitoringRate_ >= 0 )
+ {
+ SetTimeToWakeUp( timeout ); // re-arm the wake-up time for the next iteration
+ }
+ unlock();
+ }
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf("%s@%d thread %lx exiting\n"
+ , method_name,__LINE__, pthread_self());
+ }
+
+ TRACE_EXIT;
+ }
+
+ // Registers this monitor's znode, <root><instance><cluster>/<Node_name>,
+ // as an ephemeral znode whose data is "<Node_name>:<MyPNID>".
+ // Returns the code produced by RegisterZNode().
+ int CZClient::RegisterMyNodeZNode( void )
+ {
+     const char method_name[] = "CZClient::RegisterMyNodeZNode";
+     TRACE_ENTRY;
+
+     stringstream newpath;
+     newpath.str( "" );
+     newpath << zkRootNode_.c_str()
+             << zkRootNodeInstance_.c_str()
+             << ZCLIENT_CLUSTER_ZNODE << "/"
+             << Node_name;
+     string monZnode = newpath.str( );
+
+     // The pnid is streamed directly. The previous sprintf("%d") into a
+     // 10-byte buffer could overflow, since an int may need 12 bytes.
+     stringstream ss;
+     ss.str( "" );
+     ss << Node_name << ":" << MyPNID;
+     string monData = ss.str( );
+
+     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+     {
+         trace_printf( "%s@%d RegisterZNode(%s:%s)\n"
+                     , method_name, __LINE__
+                     , monZnode.c_str()
+                     , monData.c_str() );
+     }
+
+     int rc = RegisterZNode( monZnode.c_str(), monData.c_str(), ZOO_EPHEMERAL );
+
+     TRACE_EXIT;
+
+     return(rc);
+ }
+
+ // Creates a znode at znodePath through zoo_create().
+ //   znodePath - path of the znode to create
+ //   znodeData - data to store; NULL or "" creates the znode with no data
+ //               (value pointer NULL, length -1)
+ //   flags     - passed through unchanged to zoo_create()
+ // Returns the zoo_create() return code; failures are also logged.
+ int CZClient::RegisterZNode( const char *znodePath
+                            , const char *znodeData
+                            , int flags )
+ {
+     const char method_name[] = "CZClient::RegisterZNode";
+     TRACE_ENTRY;
+
+     // Receives the path reported back by zoo_create().
+     char createdPath[1024] = { 0 };
+
+     stringstream ss;
+     ss.str( "" );
+     ss << znodePath;
+     string zpath( ss.str( ) );
+
+     ss.str( "" );
+     ss << ((znodeData) ? znodeData : "");
+     string zdata( ss.str( ) );
+
+     const bool hasData = ( zdata.length() != 0 );
+     const char *value = hasData ? zdata.c_str() : NULL;
+     const int valueLen = hasData ? (int) zdata.length() : -1;
+
+     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+     {
+         trace_printf( "%s@%d zoo_create (%s : %s)\n"
+                     , method_name, __LINE__
+                     , zpath.c_str()
+                     , zdata.c_str());
+     }
+
+     int rc = zoo_create( ZHandle
+                        , zpath.c_str( )
+                        , value
+                        , valueLen
+                        , &ZOO_OPEN_ACL_UNSAFE
+                        , flags
+                        , createdPath
+                        , sizeof(createdPath)-1 );
+     if ( rc != ZOK )
+     {
+         char buf[MON_STRING_BUF_SIZE];
+         snprintf( buf, sizeof(buf)
+                 , "[%s], zoo_create(%s) failed with error %s\n"
+                 , method_name
+                 , zpath.c_str()
+                 , ZooErrorStr(rc) );
+         mon_log_write(MON_ZCLIENT_REGISTERZNODE_1, SQ_LOG_ERR, buf);
+     }
+
+     // Traced on failure as well; createdPath is then still the empty string
+     // it was initialized to, unless zoo_create() wrote to it.
+     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+     {
+         trace_printf("%s@%d realpath=%s\n", method_name, __LINE__, createdPath);
+     }
+
+     TRACE_EXIT;
+     return( rc );
+ }
+
+ // Sets the client state and appends znodePath to znodeQueue_, both under
+ // the object's lock.
+ void CZClient::SetState( ZClientState_t state, const char *znodePath )
+ {
+     // Presumably released when the guard goes out of scope - confirm
+     // against the CAutoLock definition.
+     CAutoLock guard( getLocker() );
+
+     state_ = state;
+     znodeQueue_.push_back( znodePath );
+ }
+
+ // Fills ts with an absolute time: the current CLOCK_REALTIME reading plus
+ // zcMonitoringRate_ seconds (the nanosecond part is left as read).
+ void CZClient::SetTimeToWakeUp( struct timespec &ts )
+ {
+     const char method_name[] = "CZClient::SetTimeToWakeUp";
+     TRACE_ENTRY;
+
+     // Start from "now" ...
+     clock_gettime(CLOCK_REALTIME, &ts);
+ #if 0
+     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+     {
+         trace_printf("%s@%d" " - Clock time %ld(secs):%ld(nsecs)(zcMonitoringRate_=%ld)\n"
+                     , method_name, __LINE__
+                     , ts.tv_sec, ts.tv_nsec, zcMonitoringRate_);
+     }
+ #endif
+
+     // ... and push the deadline out by the monitoring rate.
+     ts.tv_sec += zcMonitoringRate_;
+
+ #if 0
+     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+     {
+         trace_printf("%s@%d" " - Timeout time %ld(secs):%ld(nsecs)(zcMonitoringRate_=%ld)\n"
+                     , method_name, __LINE__
+                     , ts.tv_sec, ts.tv_nsec, zcMonitoringRate_);
+     }
+ #endif
+     TRACE_EXIT;
+ }
+
+ int CZClient::SetZNodeWatch( string &monZnode ) // Probes monZnode with zoo_exists() and, if present, reads it with zoo_get() passing watch=true. Returns the last ZooKeeper code.
+ {
+ const char method_name[] = "CZClient::SetZNodeWatch";
+ TRACE_ENTRY;
+
- char zkData[MAX_PROCESSOR_NAME];
++ char zkData[MPI_MAX_PROCESSOR_NAME];
+ int rc = -1;
+ int zkDataLen = sizeof(zkData); // in: capacity of zkData, passed by address to zoo_get()
+ Stat stat;
+
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+ {
+ trace_printf( "%s@%d monZnode=%s\n"
+ , method_name, __LINE__, monZnode.c_str() );
+ }
+ rc = zoo_exists( ZHandle, monZnode.c_str( ), 0, &stat ); // watch argument 0: this probe does not request a watch
+ if ( rc == ZNONODE )
+ {
+ // return the error
+ if (trace_settings & (TRACE_INIT | TRACE_RECOVERY)) // ZNONODE is only traced, not logged, before being returned
+ {
+ trace_printf( "%s@%d monZnode=%s does not exist (ZNONODE)\n"
+ , method_name, __LINE__, monZnode.c_str() );
+ }
+ }
+ else if ( rc == ZOK )
+ {
+ // Get the pnid from the data part of znode
+ rc = zoo_get( ZHandle, monZnode.c_str( ), true, zkData, &zkDataLen, &stat ); // the 'true' argument requests the watch; zkData is not read again in this method
+ if ( rc != ZOK )
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_get() for %s failed with error %s\n"
+ , method_name, monZnode.c_str( ), ZooErrorStr(rc));
+ mon_log_write(MON_ZCLIENT_SETZNODEWATCH_1, SQ_LOG_ERR, buf);
+ }
+ }
+ else
+ {
+ char buf[MON_STRING_BUF_SIZE];
+ snprintf( buf, sizeof(buf)
+ , "[%s], zoo_exists() for %s failed with error %s\n"
+ , method_name, monZnode.c_str( ), ZooErrorStr(rc));
+ mon_log_write(MON_ZCLIENT_SETZNODEWATCH_1, SQ_LOG_ERR, buf); // NOTE(review): same event id as the zoo_get() failure above - confirm whether a distinct id was intended
+ }
+
+ TRACE_EXIT;
+ return( rc );
+ }
+
+ // Enables monitoring if it is not already enabled: marks the client
+ // enabled, moves it to ZC_WATCH and wakes one waiter. No-op when already
+ // enabled.
+ void CZClient::StartClusterMonitoring( void )
+ {
+     const char method_name[] = "CZClient::StartClusterMonitoring";
+     TRACE_ENTRY;
+
+     const bool alreadyEnabled = IsEnabled();
+     if ( !alreadyEnabled )
+     {
+         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+         {
+             trace_printf( "%s@%d Cluster monitoring started!\n\n", method_name, __LINE__ );
+         }
+
+         SetEnabled( true );
+         SetState( ZC_WATCH );
+         CLock::wakeOne();
+     }
+
+     TRACE_EXIT;
+ }
+
+ // Disables monitoring if it is currently enabled: clears the CheckCluster
+ // flag, marks the client disabled, moves it to ZC_DISABLED and wakes one
+ // waiter. No-op when already disabled.
+ void CZClient::StopClusterMonitoring( void )
+ {
+     const char method_name[] = "CZClient::StopClusterMonitoring";
+     TRACE_ENTRY;
+
+     const bool currentlyEnabled = IsEnabled();
+     if ( currentlyEnabled )
+     {
+         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+         {
+             trace_printf( "\n%s@%d Cluster monitoring stopped!\n", method_name, __LINE__ );
+         }
+
+         SetCheckCluster( false );
+         SetEnabled( false );
+         SetState( ZC_DISABLED );
+         CLock::wakeOne();
+     }
+
+     TRACE_EXIT;
+ }
+
+ // Asks the ZClient thread to exit and waits for it.
+ // Sets the state to ZC_SHUTDOWN (the exit condition of MonitorZCluster()),
+ // wakes one waiter, then joins threadId_.
+ // Returns 0 on success, or the pthread_join() error number (also logged).
+ int CZClient::ShutdownWork(void)
+ {
+     const char method_name[] = "CZClient::ShutdownWork";
+     TRACE_ENTRY;
+
+     // Set the state that tells the ZClient thread to exit
+     SetState( ZC_SHUTDOWN );
+     CLock::wakeOne();
+
+     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+     {
+         trace_printf( "%s@%d waiting for ZClient thread %lx to exit.\n"
+                     , method_name, __LINE__, threadId_);
+     }
+
+     // Wait for the ZClient thread to exit
+     int rc = pthread_join( threadId_, NULL );
+     if (rc != 0)
+     {
+         // pthread_join() returns the error number directly (it does not set
+         // errno), so rc is what gets decoded. snprintf bounds the write,
+         // matching the other log sites in this file.
+         char buf[MON_STRING_BUF_SIZE];
+         snprintf( buf, sizeof(buf)
+                 , "[%s], Error= Can't join thread! - errno=%d (%s)\n"
+                 , method_name, rc, strerror(rc) );
+         mon_log_write(MON_ZCLIENT_SHUTDOWNWORK_1, SQ_LOG_ERR, buf);
+     }
+
+     TRACE_EXIT;
+     return(rc);
+ }
+
+ // Entry point of the ZClient thread (started by pthread_create()).
+ //   arg - the CZClient instance whose MonitorZCluster() loop this thread runs
+ // Always returns NULL.
+ static void *ZClientThread(void *arg)
+ {
+     const char method_name[] = "ZClientThread";
+     TRACE_ENTRY;
+
+     // The thread argument is the CZClient object that created this thread
+     CZClient *client = (CZClient *) arg;
+
+     // Block every signal in this thread except SIGPROF, which is left
+     // deliverable so profilers (e.g. google profiler) keep working
+     sigset_t blocked;
+     sigfillset(&blocked);
+     sigdelset(&blocked, SIGPROF);
+     int rc = pthread_sigmask(SIG_SETMASK, &blocked, NULL);
+     if (rc != 0)
+     {
+         // Logged only; the thread still enters its processing loop
+         char buf[MON_STRING_BUF_SIZE];
+         snprintf(buf, sizeof(buf), "[%s], pthread_sigmask error=%d\n",
+                  method_name, rc);
+         mon_log_write(MON_ZCLIENT_ZCLIENTTHREAD_1, SQ_LOG_ERR, buf);
+     }
+
+     // Run the processing loop until it returns
+     client->MonitorZCluster();
+
+     TRACE_EXIT;
+     return NULL;
+ }
+
+
+ // Creates the ZClient thread, running ZClientThread() with this object as
+ // its argument and storing the thread id in threadId_.
+ // Returns 0 on success, or the pthread_create() error number (also logged).
+ int CZClient::StartWork()
+ {
+     const char method_name[] = "CZClient::StartWork";
+     TRACE_ENTRY;
+
+     const int rc = pthread_create(&threadId_, NULL, ZClientThread, this);
+     if (rc != 0)
+     {
+         char buf[MON_STRING_BUF_SIZE];
+         snprintf(buf, sizeof(buf), "[%s], ZClientThread create error=%d\n",
+                  method_name, rc);
+         mon_log_write(MON_ZCLIENT_STARTWORK_1, SQ_LOG_ERR, buf);
+     }
+
+     TRACE_EXIT;
+     return(rc);
+ }
+
+ // Requests that monitoring start: when ZHandle is set, puts the global
+ // ZClient object into ZC_START and wakes one waiter. Does nothing when
+ // ZHandle is not set.
+ void CZClient::StartMonitoring( void )
+ {
+     const char method_name[] = "CZClient::StartMonitoring";
+     TRACE_ENTRY;
+
+     if (ZHandle)
+     {
+         // NOTE(review): acts on the global ZClient pointer rather than on
+         // 'this' - confirm the two are always the same object.
+         ZClient->SetState( CZClient::ZC_START );
+         ZClient->CLock::wakeOne();
+     }
+
+     TRACE_EXIT;
+ }
+
+ // Requests that monitoring stop: puts the global ZClient object into
+ // ZC_STOP and wakes one waiter. Unlike StartMonitoring(), ZHandle is not
+ // checked here.
+ void CZClient::StopMonitoring( void )
+ {
+     const char method_name[] = "CZClient::StopMonitoring";
+     TRACE_ENTRY;
+
+     ZClient->SetState( CZClient::ZC_STOP );
+     ZClient->CLock::wakeOne();
+
+     TRACE_EXIT;
+ }
+
+ // Translates a ZooKeeper watch event into a client state and wakes one
+ // waiter.
+ //   type      - ZooKeeper event type (ZOO_*_EVENT)
+ //   znodePath - path the event refers to; queued together with the state
+ // Mapping:
+ //   created / deleted / changed -> ZC_ZNODE   (path queued)
+ //   child                       -> ZC_CLUSTER (path queued)
+ //   not-watching                -> ZC_CLUSTER (no path)
+ //   any other type              -> state unchanged
+ // A waiter is woken in every case.
+ void CZClient::TriggerCheck( int type, const char *znodePath )
+ {
+     const char method_name[] = "CZClient::TriggerCheck";
+     TRACE_ENTRY;
+
+     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+     {
+         trace_printf( "%s@%d" " - state = %s\n"
+                     , method_name, __LINE__
+                     , ZooConnectionTypeStr( type ) );
+     }
+
+     const bool znodeEvent = ( type == ZOO_CREATED_EVENT )
+                          || ( type == ZOO_DELETED_EVENT )
+                          || ( type == ZOO_CHANGED_EVENT );
+
+     if ( znodeEvent )
+     {
+         SetState( ZC_ZNODE, znodePath );
+     }
+     else if ( type == ZOO_CHILD_EVENT )
+     {
+         SetState( ZC_CLUSTER, znodePath );
+     }
+     else if ( type == ZOO_NOTWATCHING_EVENT )
+     {
+         SetState( ZC_CLUSTER );
+     }
+
+     CLock::wakeOne();
+     TRACE_EXIT;
+ }
+
+ // Sets a watch on every child znode of the cluster znode.
+ //
+ // Does nothing (apart from a trace) when CheckCluster is already set.
+ // Otherwise:
+ //   - GetClusterZNodes() failure: logged, state set to ZC_STOP, one waiter
+ //     woken.
+ //   - SetZNodeWatch() failure on any child: logged; the method returns
+ //     without setting CheckCluster, so the state is left unchanged.
+ //   - all watches set (at least one child): CheckCluster is set and the
+ //     state moves to ZC_CLUSTER.
+ void CZClient::WatchCluster( void )
+ {
+     const char method_name[] = "CZClient::WatchCluster";
+     TRACE_ENTRY;
+
+     if ( IsCheckCluster() )
+     {
+         // Fixed: this trace used to say "CheckCluster is NOT set!", the
+         // opposite of the condition that reaches this branch.
+         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+         {
+             trace_printf( "%s@%d CheckCluster is already set!\n"
+                         , method_name, __LINE__ );
+         }
+         TRACE_EXIT;
+         return;
+     }
+
+     struct String_vector nodes;
+     int rc = GetClusterZNodes( &nodes );
+     if ( rc != ZOK )
+     {
+         char buf[MON_STRING_BUF_SIZE];
+         snprintf( buf, sizeof(buf)
+                 , "[%s], GetClusterZNodes() failed!\n"
+                 , method_name );
+         mon_log_write(MON_ZCLIENT_WATCHCLUSTER_1, SQ_LOG_ERR, buf);
+         SetState( CZClient::ZC_STOP );
+         CLock::wakeOne();
+         TRACE_EXIT;   // previously skipped on this early return
+         return;
+     }
+
+     if ( nodes.count > 0 )
+     {
+         stringstream newpath;
+
+         for (int i = 0; i < nodes.count ; i++ )
+         {
+             newpath.str( "" );
+             newpath << zkRootNode_.c_str()
+                     << zkRootNodeInstance_.c_str()
+                     << ZCLIENT_CLUSTER_ZNODE << "/"
+                     << nodes.data[i];
+             string monZnode = newpath.str( );
+
+             rc = SetZNodeWatch( monZnode );
+             if ( rc != ZOK )
+             {
+                 // Fixed: the message used to name GetZNodeData(), which is
+                 // not the call that failed.
+                 char buf[MON_STRING_BUF_SIZE];
+                 snprintf( buf, sizeof(buf)
+                         , "[%s], SetZNodeWatch() failed!\n"
+                         , method_name );
+                 mon_log_write(MON_ZCLIENT_WATCHCLUSTER_2, SQ_LOG_ERR, buf);
+
+                 FreeStringVector( &nodes );
+                 TRACE_EXIT;
+                 return;
+             }
+
+             if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+             {
+                 trace_printf( "%s@%d Watch set on monZnode=%s\n"
+                             , method_name, __LINE__
+                             , monZnode.c_str() );
+             }
+         }
+
+         SetCheckCluster( true );
+         SetState( ZC_CLUSTER );
+         FreeStringVector( &nodes );
+     }
+
+     TRACE_EXIT;
+ }
+
+ // Sets a watch on the znode of a single node.
+ //   nodeName - node whose znode, <root><instance><cluster>/<nodeName>, is
+ //              watched
+ // SetZNodeWatch() is called with the object's lock held.
+ // Returns the SetZNodeWatch() code; a non-ZOK result is also logged.
+ int CZClient::WatchNode( const char *nodeName )
+ {
+     const char method_name[] = "CZClient::WatchNode";
+     TRACE_ENTRY;
+
+     stringstream znodePath;
+     znodePath.str( "" );
+     znodePath << zkRootNode_.c_str()
+               << zkRootNodeInstance_.c_str()
+               << ZCLIENT_CLUSTER_ZNODE << "/"
+               << nodeName;
+     string monZnode = znodePath.str( );
+
+     lock();
+     const int rc = SetZNodeWatch( monZnode );
+     unlock();
+
+     if ( rc == ZOK )
+     {
+         if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+         {
+             trace_printf( "%s@%d Watch set on monZnode=%s\n"
+                         , method_name, __LINE__
+                         , monZnode.c_str() );
+         }
+     }
+     else
+     {
+         char buf[MON_STRING_BUF_SIZE];
+         snprintf( buf, sizeof(buf)
+                 , "[%s], SetZNodeWatch() failed!\n"
+                 , method_name );
+         mon_log_write(MON_ZCLIENT_WATCHNODE_1, SQ_LOG_ERR, buf);
+     }
+
+     TRACE_EXIT;
+     return(rc);
+ }
+
+ // Deletes the znode of a node, <root><instance><cluster>/<nodeName>,
+ // regardless of its version (zoo_delete() is called with version -1).
+ // Returns ZOK when the znode was deleted or did not exist (ZNONODE is
+ // mapped to ZOK); otherwise the zoo_delete() error code.
+ int CZClient::WatchNodeDelete( const char *nodeName )
+ {
+     const char method_name[] = "CZClient::WatchNodeDelete";
+     TRACE_ENTRY;
+
+     int rc = -1;
+
+     stringstream newpath;
+     newpath.str( "" );
+     newpath << zkRootNode_.c_str()
+             << zkRootNodeInstance_.c_str()
+             << ZCLIENT_CLUSTER_ZNODE << "/"
+             << nodeName;
+     string monZnode = newpath.str( );
+
+     if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+     {
+         trace_printf( "%s@%d zoo_delete(%s)\n"
+                     , method_name, __LINE__
+                     , monZnode.c_str() );
+     }
+     rc = zoo_delete( ZHandle
+                    , monZnode.c_str( )
+                    , -1 );
+     if ( rc == ZOK )
+     {
+         char buf[MON_STRING_BUF_SIZE];
+         snprintf( buf, sizeof(buf)
+                 , "[%s], znode (%s) deleted!\n"
+                 , method_name, nodeName );
+         mon_log_write(MON_ZCLIENT_WATCHNODEDELETE_1, SQ_LOG_INFO, buf);
+     }
+     else if ( rc == ZNONODE )
+     {
+         // Already gone: the desired end state, so report success.
+         rc = ZOK;
+         char buf[MON_STRING_BUF_SIZE];
+         snprintf( buf, sizeof(buf)
+                 , "[%s], znode (%s) already deleted!\n"
+                 , method_name, nodeName );
+         mon_log_write(MON_ZCLIENT_WATCHNODEDELETE_2, SQ_LOG_INFO, buf);
+     }
+     else
+     {
+         // Fixed: a real failure was logged at SQ_LOG_INFO. It is now logged
+         // at SQ_LOG_ERR like every other failure in this file.
+         char buf[MON_STRING_BUF_SIZE];
+         snprintf( buf, sizeof(buf)
+                 , "[%s], zoo_delete(%s) failed with error %s\n"
+                 , method_name, nodeName, ZooErrorStr(rc) );
+         mon_log_write(MON_ZCLIENT_WATCHNODEDELETE_3, SQ_LOG_ERR, buf);
+     }
+
+     TRACE_EXIT;
+     return( rc );
+ }
+
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1e294233/core/sqf/monitor/linux/zootest.cxx
----------------------------------------------------------------------
diff --cc core/sqf/monitor/linux/zootest.cxx
index 0000000,8a90299..1536e98
mode 000000,100644..100644
--- a/core/sqf/monitor/linux/zootest.cxx
+++ b/core/sqf/monitor/linux/zootest.cxx
@@@ -1,0 -1,283 +1,283 @@@
+ /**********************************************************************
+ // @@@ START COPYRIGHT @@@
+ //
+ // Licensed to the Apache Software Foundation (ASF) under one
+ // or more contributor license agreements. See the NOTICE file
+ // distributed with this work for additional information
+ // regarding copyright ownership. The ASF licenses this file
+ // to you under the Apache License, Version 2.0 (the
+ // "License"); you may not use this file except in compliance
+ // with the License. You may obtain a copy of the License at
+ //
+ // http://www.apache.org/licenses/LICENSE-2.0
+ //
+ // Unless required by applicable law or agreed to in writing,
+ // software distributed under the License is distributed on an
+ // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ // KIND, either express or implied. See the License for the
+ // specific language governing permissions and limitations
+ // under the License.
+ //
+ // @@@ END COPYRIGHT @@@
+ ********************************************************************/
+ #include <errno.h>
+ #include <sys/socket.h>
+ #include <signal.h>
+ #include <ctype.h>
+ #include <string.h>
+ #include <ifaddrs.h>
+ #include <netdb.h>
+ #include <new>
+ #include <stdio.h>
+ #include <list>
+ #include <string>
+
+ #include "msgdef.h"
+ #include "montrace.h"
+ #include "monlogging.h"
+ #include "zookeeper/zookeeper.h"
+ #include "zclient.h"
+ #include "zootest.h"
+
+ using namespace std;
+
+ bool debugFlag = true;
+
+ bool ZClientEnabled = true; // cleared by main() when SQ_MON_ZCLIENT_ENABLED is "0", and by CreateZookeeperClient() when the port or host list is missing
+ char Node_name[MPI_MAX_PROCESSOR_NAME] = {'\0'}; // filled by gethostname() in main()
+ char MyPNidStr[8];
+ int MyPNID = -1; // set from the -pnid command-line argument in main()
+ int MyNid = -1;
+ int MyPid = -1; // set to getpid() in main()
+
+ CZClient *ZClient = NULL; // allocated in CreateZookeeperClient()
+ CMonLog *MonLog = NULL; // allocated in main()
+ CMonLog *SnmpLog = NULL;
+
+ // Test-program reaction to an expired ZooKeeper session: stops monitoring,
+ // shuts the ZClient thread down and terminates the process with status 1.
+ // Does not return.
+ void HandleZSessionExpiration( void )
+ {
+     const char method_name[] = "HandleZSessionExpiration";
+     TRACE_ENTRY;
+
+     printf( "%s@%d ZSession expired!\n", method_name, __LINE__ );
+
+     ZClient->StopMonitoring();
+     ZClient->ShutdownWork();
+
+     printf( "%s@%d zootest exiting!\n", method_name, __LINE__ );
+
+     TRACE_EXIT;
+     exit( 1 );
+ }
+
+ // Test-program reaction to a node whose znode was deleted: only reports
+ // the node name on stdout.
+ void HandleNodeExpiration( const char *nodeName )
+ {
+     const char method_name[] = "HandleNodeExpiration";
+     TRACE_ENTRY;
+
+     printf( "%s@%d Node %s znode deleted!\n"
+           , method_name, __LINE__, nodeName );
+
+     TRACE_EXIT;
+ }
+
+ // Builds the ZooKeeper quorum string from the environment and allocates the
+ // global ZClient object.
+ //
+ //   ZOOKEEPER_PORT  - quorum port; must start with a digit
+ //   ZOOKEEPER_NODES - comma-separated host list; empty entries are skipped
+ //
+ // The quorum string has the form "host1:port,host2:port,...".
+ // When the port is missing/non-numeric, or the host list is set but empty,
+ // the problem is logged, ZClientEnabled is cleared and ZClient is left
+ // untouched. Does nothing when ZClientEnabled is already false.
+ void CreateZookeeperClient( void )
+ {
+     const char method_name[] = "CreateZookeeperClient";
+     TRACE_ENTRY;
+
+     if ( ZClientEnabled )
+     {
+         string zkQuorumHosts;
+         stringstream zkQuorumPort;
+         char *env;
+
+         int zport;
+         env = getenv("ZOOKEEPER_PORT");
+         // Cast: passing a negative char to isdigit() is undefined.
+         if ( env && isdigit( (unsigned char) *env ) )
+         {
+             zport = atoi(env);
+         }
+         else
+         {
+             char buf[MON_STRING_BUF_SIZE];
+             snprintf(buf, sizeof(buf),
+                      "[%s], Zookeeper quorum port is not defined!\n"
+                      , method_name);
+             mon_log_write(MON_ZOOCLIENT_MAIN_3, SQ_LOG_CRIT, buf);
+
+             ZClientEnabled = false;
+             TRACE_EXIT;
+             return;
+         }
+
+         env = getenv("ZOOKEEPER_NODES");
+         if ( env )
+         {
+             zkQuorumHosts = env;
+             if ( zkQuorumHosts.length() == 0 )
+             {
+                 char buf[MON_STRING_BUF_SIZE];
+                 snprintf(buf, sizeof(buf),
+                          "[%s], Zookeeper quorum hosts are not defined!\n"
+                          , method_name);
+                 mon_log_write(MON_ZOOCLIENT_MAIN_4, SQ_LOG_CRIT, buf);
+
+                 ZClientEnabled = false;
+                 TRACE_EXIT;
+                 return;
+             }
+
+             zkQuorumPort.str( "" );
+
+             // Tokenize on ',' directly in the std::string. The previous
+             // strcpy() into a fixed-size buffer overflowed when the
+             // environment value was longer than the buffer. As with
+             // strtok(), runs of commas produce no empty tokens.
+             string::size_type start = zkQuorumHosts.find_first_not_of( ',' );
+             while ( start != string::npos )
+             {
+                 string::size_type end = zkQuorumHosts.find( ',', start );
+                 string hostName = zkQuorumHosts.substr( start
+                                 , (end == string::npos) ? string::npos
+                                                         : end - start );
+                 zkQuorumPort << hostName.c_str()
+                              << ":"
+                              << zport;
+
+                 // Separator only when another host follows.
+                 start = zkQuorumHosts.find_first_not_of( ',', end );
+                 if ( start != string::npos )
+                 {
+                     zkQuorumPort << ",";
+                 }
+             }
+
+             if (trace_settings & (TRACE_INIT | TRACE_RECOVERY))
+             {
+                 trace_printf( "%s@%d zkQuorumPort=%s\n"
+                             , method_name, __LINE__
+                             , zkQuorumPort.str().c_str() );
+             }
+         }
+
+         ZClient = new CZClient( zkQuorumPort.str().c_str()
+                               , ZCLIENT_TRAFODION_ZNODE
+                               , ZCLIENT_INSTANCE_ZNODE );
+         if ( ZClient == NULL )
+         {
+             char buf[MON_STRING_BUF_SIZE];
+             snprintf(buf, sizeof(buf),
+                      "[%s], Failed to allocate ZClient object!\n"
+                      , method_name);
+             mon_log_write(MON_ZOOCLIENT_MAIN_5, SQ_LOG_CRIT, buf);
+             abort();
+         }
+     }
+
+     TRACE_EXIT;
+ }
+
+ /*
+ *
+ * The znode hierarchy is as follows:
+ * /trafodion/<instance-name>/cluster
+ * /trafodion/<instance-name>/cluster/<node-name-1>
+ * /trafodion/<instance-name>/cluster/<node-name-2>
+ * Each monitor will create an ephemeral node using its node name (hostname)
+ * followed by its <pnid>.
+ * The monitor processes will watch the cluster parent znode changes.
+ * When a change in the cluster znode occurs they will check the state of
+ * each child. A missing child znode is assumed to be a down node.
+ *
+ */
+ // zootest entry point.
+ //
+ // Command line:  -pnid <n>   sets MyPNID
+ // Environment:   SQ_MON_ZCLIENT_ENABLED  "0" disables the ZClient
+ //                MON_INIT_SLEEP          seconds to keep monitoring
+ //                                        (default 10)
+ //
+ // With the ZClient enabled: creates it, starts its thread, monitors for the
+ // configured time, then stops and joins the thread.
+ // Exits with 0 on a normal run (or when disabled), 1 on any failure.
+ int main( int argc, char *argv[], char *envp[] )
+ {
+     const char method_name[] = "main";
+     TRACE_ENTRY;
+
+     char *env;
+     char MyName[MPI_MAX_PROCESSOR_NAME];
+
+     trace_settings |= TRACE_INIT;
+
+     int count = 1;
+     while ( count < argc )
+     {
+         if ( strcmp( argv[count], "-pnid" ) == 0 )
+         {
+             if ( ++count < argc )
+             {
+                 MyPNID=atoi( argv[count] );
+             }
+         }
+         count++;
+     }
+
+
+     sigset_t newset, oldset;
+     sigemptyset( &newset );
+     sigaddset( &newset,SIGQUIT );
+     sigaddset( &newset,SIGTERM );
+     sigprocmask( SIG_BLOCK,&newset,&oldset );
+
+     gethostname(Node_name, MPI_MAX_PROCESSOR_NAME);
+     // gethostname() need not NUL-terminate a truncated name.
+     Node_name[MPI_MAX_PROCESSOR_NAME - 1] = '\0';
+
+     snprintf( MyName, sizeof(MyName), "zooclient" );
+     MyPid = getpid();
+
+     MonLog = new CMonLog( "log4cxx.monitor.wdg.config", "ZOO", "alt.wdg", MyPNID, MyNid, MyPid, MyName );
+
+     int rc;
+     env = getenv("SQ_MON_ZCLIENT_ENABLED");
+     if ( env && strcmp(env,"0")==0 )
+     {
+         ZClientEnabled = false;
+     }
+
+     if ( ZClientEnabled )
+     {
+         CreateZookeeperClient();
+
+         // CreateZookeeperClient() clears ZClientEnabled and leaves ZClient
+         // NULL when the quorum port or host list is missing. Without this
+         // check the calls below dereferenced a NULL pointer.
+         if ( !ZClientEnabled || ZClient == NULL )
+         {
+             printf( "%s@%d ZClient could not be created, exiting!\n"
+                   , method_name, __LINE__ );
+             TRACE_EXIT;
+             exit( 1 );
+         }
+
+         sleep( 3 ); // Wait for the other zclients to register
+
+         rc = ZClient->StartWork();
+         if (rc != 0)
+         {
+             TRACE_EXIT;
+             exit( 1 );
+         }
+
+         ZClient->StartMonitoring();
+
+         unsigned int sleepTime = 10; // 10 seconds
+         env = getenv("MON_INIT_SLEEP");
+         if ( env && isdigit( (unsigned char) *env ) )
+         {
+             sleepTime = atoi(env);
+         }
+         sleep( sleepTime ); // Til' quitting time!
+
+         ZClient->StopMonitoring();
+
+         sleep( 1 );
+
+         // Stop the ZClient thread
+         rc = ZClient->ShutdownWork();
+         if (rc != 0)
+         {
+             TRACE_EXIT;
+             exit( 1 );
+         }
+     }
+     else
+     {
+         printf( "%s@%d ZClient is disabled, exiting!\n"
+               , method_name, __LINE__ );
+     }
+
+     printf( "%s@%d zootest exiting!\n"
+           , method_name, __LINE__ );
+
+     TRACE_EXIT;
+     exit( 0 );
+ }
http://git-wip-us.apache.org/repos/asf/incubator-trafodion/blob/1e294233/core/sqf/sql/scripts/sqcheck
----------------------------------------------------------------------
diff --cc core/sqf/sql/scripts/sqcheck
index f5806c9,b5d79ae..2c10044
--- a/core/sqf/sql/scripts/sqcheck
+++ b/core/sqf/sql/scripts/sqcheck
@@@ -226,7 -251,14 +247,11 @@@ if [[ -z $SQSCRIPTS_DIR ]]; the
SQSCRIPTS_DIR=$MY_SQROOT/sql/scripts
fi
-SQSCRIPT_FILE="$SQSCRIPTS_DIR/gomon.cold"
-STARTSSMP_FILE="$SQSCRIPTS_DIR/ssmpstart"
-STARTSSCP_FILE="$SQSCRIPTS_DIR/sscpstart"
+ if [ $check_node '>' -1 ]; then
+ getNodeStatus
+ fi
+
### CONFIGURED NODEs
sq_tmp_node_info=`mktemp -t`
sqshell -c node info > $sq_tmp_node_info