You are viewing a plain text version of this content. The canonical link for it is here.
Posted to codereview@trafodion.apache.org by zcorrea <gi...@git.apache.org> on 2018/01/11 17:34:52 UTC

[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes

GitHub user zcorrea opened a pull request:

    https://github.com/apache/trafodion/pull/1392

    [TRAFODION-2881] HA fixes

    Fixed multiple problems in monitor Allgather() socket reconnect logic.
    - Separated node down detection logic from communication errors and timeouts
      to better handle multiple failure scenarios
    - Better handling network resets
    - Additional trace information
    - Fixed 'node up' hang in monitor shell due to TmSync race condition

You can merge this pull request into a Git repository by running:

    $ git pull https://github.com/zcorrea/trafodion TRAFODION-2881

Alternatively you can review and apply these changes as the patch at:

    https://github.com/apache/trafodion/pull/1392.patch

To close this pull request, make a commit to your master/trunk branch
with (at least) the following in the commit message:

    This closes #1392
    
----
commit e832d827507521998567d4cc5d92e4239007d19a
Author: Zalo Correa <za...@...>
Date:   2018-01-11T17:32:11Z

    [TRAFODION-2881] HA fixes
    Fixed multiple problems in monitor Allgather() socket reconnect logic.
    - Separated node down detection logic from communication errors and timeouts
      to better handle multiple failure scenarios
    - Better handling network resets
    - Additional trace information
    - Fixed 'node up' hang in monitor shell due to TmSync race condition

----


---

[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes

Posted by zcorrea <gi...@git.apache.org>.
Github user zcorrea commented on a diff in the pull request:

    https://github.com/apache/trafodion/pull/1392#discussion_r161296519
  
    --- Diff: core/sqf/monitor/linux/cmsh.cxx ---
    @@ -128,31 +168,97 @@ int CCmsh::GetClusterState( PhysicalNodeNameMap_t &physicalNodeMap )
                 if (it != physicalNodeMap.end())
                 {
                    // TEST_POINT and Exclude List : to force state down on node name 
    -               const char *downNodeName = getenv( TP001_NODE_DOWN );
    -               const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
    -	       string downNodeString = " ";
    -	       if (downNodeList)
    -	       {
    -		 downNodeString += downNodeList;
    -	         downNodeString += " ";
    -	       }
    -	       string downNodeToFind = " ";
    -	       downNodeToFind += nodeName.c_str();
    -	       downNodeToFind += " ";
    -               if (((downNodeList != NULL) && 
    -		      strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
    -                   ( (downNodeName != NULL) && 
    -                     !strcmp( downNodeName, nodeName.c_str()) ))
    -              {
    -                   nodeState = StateDown;
    -              }
    -	   
    +                const char *downNodeName = getenv( TP001_NODE_DOWN );
    +                const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
    +                string downNodeString = " ";
    +                if (downNodeList)
    +                {
    +                    downNodeString += downNodeList;
    +                    downNodeString += " ";
    +                }
    +                string downNodeToFind = " ";
    +                downNodeToFind += nodeName.c_str();
    +                downNodeToFind += " ";
    +                if (((downNodeList != NULL) && 
    +                      strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
    +                    ((downNodeName != NULL) && 
    +                     !strcmp(downNodeName, nodeName.c_str())))
    +                {
    +                    nodeState = StateDown;
    +                }
    +          
                     // Set physical node state
                     physicalNode = it->second;
                     physicalNode->SetState( nodeState );
                 }
             }
    -    }	
    +    }  
    +
    +    TRACE_EXIT;
    +    return( rc );
    +}
    +
    +///////////////////////////////////////////////////////////////////////////////
    +//
    +// Function/Method: CCmsh::GetNodeState
    +//
    +// Description: Updates the state of the nodeName in the physicalNode passed in
    +//              as a parameter. Caller should ensure that the node names are already
    +//              present in the physicalNodeMap. 
    +//
    +// Return:
    +//        0 - success
    +//       -1 - failure
    +//
    +///////////////////////////////////////////////////////////////////////////////
    +int CCmsh::GetNodeState( char *name ,CPhysicalNode  *physicalNode )
    +{
    +    const char method_name[] = "CCmsh::GetNodeState";
    +    TRACE_ENTRY;
    +
    +    int rc;
    +
    +    rc = PopulateNodeState( name );
    +
    +    if ( rc != -1 )
    +    {
    +        // Parse each line extracting name and state
    +        string nodeName;
    +        NodeState_t nodeState;
    +        PhysicalNodeNameMap_t::iterator it;
    +        
    +        StringList_t::iterator    alit;
    +        for ( alit = nodeStateList_.begin(); alit != nodeStateList_.end() ; alit++ )
    +        {
    +            ParseNodeStatus( *alit, nodeName, nodeState );
    +
    +            // TEST_POINT and Exclude List : to force state down on node name 
    +            const char *downNodeName = getenv( TP001_NODE_DOWN );
    +            const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
    --- End diff --
    
    Created JIRA (TRAFODION-2907) to clean up the use of TRAF_EXCLUDE_LIST in the monitor code.


---

[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes

Posted by trinakrug <gi...@git.apache.org>.
Github user trinakrug commented on a diff in the pull request:

    https://github.com/apache/trafodion/pull/1392#discussion_r161276162
  
    --- Diff: core/sqf/monitor/linux/cmsh.cxx ---
    @@ -128,31 +168,97 @@ int CCmsh::GetClusterState( PhysicalNodeNameMap_t &physicalNodeMap )
                 if (it != physicalNodeMap.end())
                 {
                    // TEST_POINT and Exclude List : to force state down on node name 
    -               const char *downNodeName = getenv( TP001_NODE_DOWN );
    -               const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
    -	       string downNodeString = " ";
    -	       if (downNodeList)
    -	       {
    -		 downNodeString += downNodeList;
    -	         downNodeString += " ";
    -	       }
    -	       string downNodeToFind = " ";
    -	       downNodeToFind += nodeName.c_str();
    -	       downNodeToFind += " ";
    -               if (((downNodeList != NULL) && 
    -		      strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
    -                   ( (downNodeName != NULL) && 
    -                     !strcmp( downNodeName, nodeName.c_str()) ))
    -              {
    -                   nodeState = StateDown;
    -              }
    -	   
    +                const char *downNodeName = getenv( TP001_NODE_DOWN );
    +                const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
    +                string downNodeString = " ";
    +                if (downNodeList)
    +                {
    +                    downNodeString += downNodeList;
    +                    downNodeString += " ";
    +                }
    +                string downNodeToFind = " ";
    +                downNodeToFind += nodeName.c_str();
    +                downNodeToFind += " ";
    +                if (((downNodeList != NULL) && 
    +                      strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
    +                    ((downNodeName != NULL) && 
    +                     !strcmp(downNodeName, nodeName.c_str())))
    +                {
    +                    nodeState = StateDown;
    +                }
    +          
                     // Set physical node state
                     physicalNode = it->second;
                     physicalNode->SetState( nodeState );
                 }
             }
    -    }	
    +    }  
    +
    +    TRACE_EXIT;
    +    return( rc );
    +}
    +
    +///////////////////////////////////////////////////////////////////////////////
    +//
    +// Function/Method: CCmsh::GetNodeState
    +//
    +// Description: Updates the state of the nodeName in the physicalNode passed in
    +//              as a parameter. Caller should ensure that the node names are already
    +//              present in the physicalNodeMap. 
    +//
    +// Return:
    +//        0 - success
    +//       -1 - failure
    +//
    +///////////////////////////////////////////////////////////////////////////////
    +int CCmsh::GetNodeState( char *name ,CPhysicalNode  *physicalNode )
    +{
    +    const char method_name[] = "CCmsh::GetNodeState";
    +    TRACE_ENTRY;
    +
    +    int rc;
    +
    +    rc = PopulateNodeState( name );
    +
    +    if ( rc != -1 )
    +    {
    +        // Parse each line extracting name and state
    +        string nodeName;
    +        NodeState_t nodeState;
    +        PhysicalNodeNameMap_t::iterator it;
    +        
    +        StringList_t::iterator    alit;
    +        for ( alit = nodeStateList_.begin(); alit != nodeStateList_.end() ; alit++ )
    +        {
    +            ParseNodeStatus( *alit, nodeName, nodeState );
    +
    +            // TEST_POINT and Exclude List : to force state down on node name 
    +            const char *downNodeName = getenv( TP001_NODE_DOWN );
    +            const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
    --- End diff --
    
    Is TRAF_EXCLUDE_LIST obsolete?


---

[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes

Posted by zcorrea <gi...@git.apache.org>.
Github user zcorrea commented on a diff in the pull request:

    https://github.com/apache/trafodion/pull/1392#discussion_r161294809
  
    --- Diff: core/sqf/monitor/linux/cmsh.cxx ---
    @@ -128,31 +168,97 @@ int CCmsh::GetClusterState( PhysicalNodeNameMap_t &physicalNodeMap )
                 if (it != physicalNodeMap.end())
                 {
                    // TEST_POINT and Exclude List : to force state down on node name 
    -               const char *downNodeName = getenv( TP001_NODE_DOWN );
    -               const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
    -	       string downNodeString = " ";
    -	       if (downNodeList)
    -	       {
    -		 downNodeString += downNodeList;
    -	         downNodeString += " ";
    -	       }
    -	       string downNodeToFind = " ";
    -	       downNodeToFind += nodeName.c_str();
    -	       downNodeToFind += " ";
    -               if (((downNodeList != NULL) && 
    -		      strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
    -                   ( (downNodeName != NULL) && 
    -                     !strcmp( downNodeName, nodeName.c_str()) ))
    -              {
    -                   nodeState = StateDown;
    -              }
    -	   
    +                const char *downNodeName = getenv( TP001_NODE_DOWN );
    +                const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
    +                string downNodeString = " ";
    +                if (downNodeList)
    +                {
    +                    downNodeString += downNodeList;
    +                    downNodeString += " ";
    +                }
    +                string downNodeToFind = " ";
    +                downNodeToFind += nodeName.c_str();
    +                downNodeToFind += " ";
    +                if (((downNodeList != NULL) && 
    +                      strstr(downNodeString.c_str(),downNodeToFind.c_str())) ||
    +                    ((downNodeName != NULL) && 
    +                     !strcmp(downNodeName, nodeName.c_str())))
    +                {
    +                    nodeState = StateDown;
    +                }
    +          
                     // Set physical node state
                     physicalNode = it->second;
                     physicalNode->SetState( nodeState );
                 }
             }
    -    }	
    +    }  
    +
    +    TRACE_EXIT;
    +    return( rc );
    +}
    +
    +///////////////////////////////////////////////////////////////////////////////
    +//
    +// Function/Method: CCmsh::GetNodeState
    +//
    +// Description: Updates the state of the nodeName in the physicalNode passed in
    +//              as a parameter. Caller should ensure that the node names are already
    +//              present in the physicalNodeMap. 
    +//
    +// Return:
    +//        0 - success
    +//       -1 - failure
    +//
    +///////////////////////////////////////////////////////////////////////////////
    +int CCmsh::GetNodeState( char *name ,CPhysicalNode  *physicalNode )
    +{
    +    const char method_name[] = "CCmsh::GetNodeState";
    +    TRACE_ENTRY;
    +
    +    int rc;
    +
    +    rc = PopulateNodeState( name );
    +
    +    if ( rc != -1 )
    +    {
    +        // Parse each line extracting name and state
    +        string nodeName;
    +        NodeState_t nodeState;
    +        PhysicalNodeNameMap_t::iterator it;
    +        
    +        StringList_t::iterator    alit;
    +        for ( alit = nodeStateList_.begin(); alit != nodeStateList_.end() ; alit++ )
    +        {
    +            ParseNodeStatus( *alit, nodeName, nodeState );
    +
    +            // TEST_POINT and Exclude List : to force state down on node name 
    +            const char *downNodeName = getenv( TP001_NODE_DOWN );
    +            const char *downNodeList = getenv( TRAF_EXCLUDE_LIST );
    --- End diff --
    
    Yes, it is! Good catch!


---

[GitHub] trafodion pull request #1392: [TRAFODION-2881] HA fixes

Posted by asfgit <gi...@git.apache.org>.
Github user asfgit closed the pull request at:

    https://github.com/apache/trafodion/pull/1392


---