You are viewing a plain text version of this content. The canonical link for it is here.
Posted to codereview@trafodion.apache.org by zcorrea <gi...@git.apache.org> on 2018/01/11 17:34:52 UTC
[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes
GitHub user zcorrea opened a pull request:
https://github.com/apache/trafodion/pull/1392
[TRAFODION-2881] HA fixes
Fixed multiple problems in monitor Allgather() socket reconnect logic.
- Separated node down detection logic from communication errors and timeouts
to better handle multiple failure scenarios
- Better handling network resets
- Additional trace information
- Fixed 'node up' hang in monitor shell due to TmSync race condition
You can merge this pull request into a Git repository by running:
$ git pull https://github.com/zcorrea/trafodion TRAFODION-2881
Alternatively you can review and apply these changes as the patch at:
https://github.com/apache/trafodion/pull/1392.patch
To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:
This closes #1392
----
commit e832d827507521998567d4cc5d92e4239007d19a
Author: Zalo Correa <za...@...>
Date: 2018-01-11T17:32:11Z
[TRAFODION-2881] HA fixes
Fixed multiple problems in monitor Allgather() socket reconnect logic.
- Separated node down detection logic from communication errors and timeouts
to better handle multiple failure scenarios
- Better handling network resets
- Additional trace information
- Fixed 'node up' hang in monitor shell due to TmSync race condition
----
---
[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes
Posted by zcorrea <gi...@git.apache.org>.
Github user zcorrea commented on a diff in the pull request:
https://github.com/apache/trafodion/pull/1392#discussion_r161296519
--- Diff: core/sqf/monitor/linux/cmsh.cxx ---
@@ -128,31 +168,97 @@ int CCmsh::GetClusterState( PhysicalNodeNameMap_t &physicalNodeMap )
if (it != physicalNodeMap.end())
{
// TEST_POINT and Exclude List : to force state down on node name
- const char *downNodeName = getenv( TP001_NODE_DOWN );
- const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
- string downNodeString = " ";
- if (downNodeList)
- {
- downNodeString += downNodeList;
- downNodeString += " ";
- }
- string downNodeToFind = " ";
- downNodeToFind += nodeName.c_str();
- downNodeToFind += " ";
- if (((downNodeList != NULL) &&
- strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
- ( (downNodeName != NULL) &&
- !strcmp( downNodeName, nodeName.c_str()) ))
- {
- nodeState = StateDown;
- }
-
+ const char *downNodeName = getenv( TP001_NODE_DOWN );
+ const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
+ string downNodeString = " ";
+ if (downNodeList)
+ {
+ downNodeString += downNodeList;
+ downNodeString += " ";
+ }
+ string downNodeToFind = " ";
+ downNodeToFind += nodeName.c_str();
+ downNodeToFind += " ";
+ if (((downNodeList != NULL) &&
+ strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
+ ((downNodeName != NULL) &&
+ !strcmp(downNodeName, nodeName.c_str())))
+ {
+ nodeState = StateDown;
+ }
+
// Set physical node state
physicalNode = it->second;
physicalNode->SetState( nodeState );
}
}
- }
+ }
+
+ TRACE_EXIT;
+ return( rc );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Function/Method: CCmsh::GetNodeState
+//
+// Description: Updates the state of the nodeName in the physicalNode passed in
+// as a parameter. Caller should ensure that the node names are already
+// present in the physicalNodeMap.
+//
+// Return:
+// 0 - success
+// -1 - failure
+//
+///////////////////////////////////////////////////////////////////////////////
+int CCmsh::GetNodeState( char *name ,CPhysicalNode *physicalNode )
+{
+ const char method_name[] = "CCmsh::GetNodeState";
+ TRACE_ENTRY;
+
+ int rc;
+
+ rc = PopulateNodeState( name );
+
+ if ( rc != -1 )
+ {
+ // Parse each line extracting name and state
+ string nodeName;
+ NodeState_t nodeState;
+ PhysicalNodeNameMap_t::iterator it;
+
+ StringList_t::iterator alit;
+ for ( alit = nodeStateList_.begin(); alit != nodeStateList_.end() ; alit++ )
+ {
+ ParseNodeStatus( *alit, nodeName, nodeState );
+
+ // TEST_POINT and Exclude List : to force state down on node name
+ const char *downNodeName = getenv( TP001_NODE_DOWN );
+ const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
--- End diff --
Created JIRA (TRAFODION-2907) to clean up the use of TRAF_EXCLUDE_LIST in the monitor code.
---
[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes
Posted by trinakrug <gi...@git.apache.org>.
Github user trinakrug commented on a diff in the pull request:
https://github.com/apache/trafodion/pull/1392#discussion_r161276162
--- Diff: core/sqf/monitor/linux/cmsh.cxx ---
@@ -128,31 +168,97 @@ int CCmsh::GetClusterState( PhysicalNodeNameMap_t &physicalNodeMap )
if (it != physicalNodeMap.end())
{
// TEST_POINT and Exclude List : to force state down on node name
- const char *downNodeName = getenv( TP001_NODE_DOWN );
- const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
- string downNodeString = " ";
- if (downNodeList)
- {
- downNodeString += downNodeList;
- downNodeString += " ";
- }
- string downNodeToFind = " ";
- downNodeToFind += nodeName.c_str();
- downNodeToFind += " ";
- if (((downNodeList != NULL) &&
- strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
- ( (downNodeName != NULL) &&
- !strcmp( downNodeName, nodeName.c_str()) ))
- {
- nodeState = StateDown;
- }
-
+ const char *downNodeName = getenv( TP001_NODE_DOWN );
+ const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
+ string downNodeString = " ";
+ if (downNodeList)
+ {
+ downNodeString += downNodeList;
+ downNodeString += " ";
+ }
+ string downNodeToFind = " ";
+ downNodeToFind += nodeName.c_str();
+ downNodeToFind += " ";
+ if (((downNodeList != NULL) &&
+ strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
+ ((downNodeName != NULL) &&
+ !strcmp(downNodeName, nodeName.c_str())))
+ {
+ nodeState = StateDown;
+ }
+
// Set physical node state
physicalNode = it->second;
physicalNode->SetState( nodeState );
}
}
- }
+ }
+
+ TRACE_EXIT;
+ return( rc );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Function/Method: CCmsh::GetNodeState
+//
+// Description: Updates the state of the nodeName in the physicalNode passed in
+// as a parameter. Caller should ensure that the node names are already
+// present in the physicalNodeMap.
+//
+// Return:
+// 0 - success
+// -1 - failure
+//
+///////////////////////////////////////////////////////////////////////////////
+int CCmsh::GetNodeState( char *name ,CPhysicalNode *physicalNode )
+{
+ const char method_name[] = "CCmsh::GetNodeState";
+ TRACE_ENTRY;
+
+ int rc;
+
+ rc = PopulateNodeState( name );
+
+ if ( rc != -1 )
+ {
+ // Parse each line extracting name and state
+ string nodeName;
+ NodeState_t nodeState;
+ PhysicalNodeNameMap_t::iterator it;
+
+ StringList_t::iterator alit;
+ for ( alit = nodeStateList_.begin(); alit != nodeStateList_.end() ; alit++ )
+ {
+ ParseNodeStatus( *alit, nodeName, nodeState );
+
+ // TEST_POINT and Exclude List : to force state down on node name
+ const char *downNodeName = getenv( TP001_NODE_DOWN );
+ const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
--- End diff --
Is TRAF_EXCLUDE_LIST obsolete?
---
[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes
Posted by zcorrea <gi...@git.apache.org>.
Github user zcorrea commented on a diff in the pull request:
https://github.com/apache/trafodion/pull/1392#discussion_r161294809
--- Diff: core/sqf/monitor/linux/cmsh.cxx ---
@@ -128,31 +168,97 @@ int CCmsh::GetClusterState( PhysicalNodeNameMap_t &physicalNodeMap )
if (it != physicalNodeMap.end())
{
// TEST_POINT and Exclude List : to force state down on node name
- const char *downNodeName = getenv( TP001_NODE_DOWN );
- const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
- string downNodeString = " ";
- if (downNodeList)
- {
- downNodeString += downNodeList;
- downNodeString += " ";
- }
- string downNodeToFind = " ";
- downNodeToFind += nodeName.c_str();
- downNodeToFind += " ";
- if (((downNodeList != NULL) &&
- strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
- ( (downNodeName != NULL) &&
- !strcmp( downNodeName, nodeName.c_str()) ))
- {
- nodeState = StateDown;
- }
-
+ const char *downNodeName = getenv( TP001_NODE_DOWN );
+ const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
+ string downNodeString = " ";
+ if (downNodeList)
+ {
+ downNodeString += downNodeList;
+ downNodeString += " ";
+ }
+ string downNodeToFind = " ";
+ downNodeToFind += nodeName.c_str();
+ downNodeToFind += " ";
+ if (((downNodeList != NULL) &&
+ strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
+ ((downNodeName != NULL) &&
+ !strcmp(downNodeName, nodeName.c_str())))
+ {
+ nodeState = StateDown;
+ }
+
// Set physical node state
physicalNode = it->second;
physicalNode->SetState( nodeState );
}
}
- }
+ }
+
+ TRACE_EXIT;
+ return( rc );
+}
+
+///////////////////////////////////////////////////////////////////////////////
+//
+// Function/Method: CCmsh::GetNodeState
+//
+// Description: Updates the state of the nodeName in the physicalNode passed in
+// as a parameter. Caller should ensure that the node names are already
+// present in the physicalNodeMap.
+//
+// Return:
+// 0 - success
+// -1 - failure
+//
+///////////////////////////////////////////////////////////////////////////////
+int CCmsh::GetNodeState( char *name ,CPhysicalNode *physicalNode )
+{
+ const char method_name[] = "CCmsh::GetNodeState";
+ TRACE_ENTRY;
+
+ int rc;
+
+ rc = PopulateNodeState( name );
+
+ if ( rc != -1 )
+ {
+ // Parse each line extracting name and state
+ string nodeName;
+ NodeState_t nodeState;
+ PhysicalNodeNameMap_t::iterator it;
+
+ StringList_t::iterator alit;
+ for ( alit = nodeStateList_.begin(); alit != nodeStateList_.end() ; alit++ )
+ {
+ ParseNodeStatus( *alit, nodeName, nodeState );
+
+ // TEST_POINT and Exclude List : to force state down on node name
+ const char *downNodeName = getenv( TP001_NODE_DOWN );
+ const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
--- End diff --
Yes, it is! Good catch!
---
[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes
Posted by asfgit <gi...@git.apache.org>.
Github user asfgit closed the pull request at:
https://github.com/apache/trafodion/pull/1392
---