You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@vcl.apache.org by ar...@apache.org on 2013/05/22 21:15:03 UTC
svn commit: r1485348 -
/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
Author: arkurth
Date: Wed May 22 19:15:03 2013
New Revision: 1485348
URL: http://svn.apache.org/r1485348
Log:
VCL-682
Fixed a problem in xCAT.pm::load where it would incorrectly detect that an image load was complete shortly after it began if nodestat returned 'ping' or 'sshd' after it had previously returned 'install'.
Added the _get_install_status subroutine, which uses a socket to connect directly to the node while it is being installed in order to retrieve the installation status. This is needed because, if the installer image is configured with a working SSH key, nodestat will return 'sshd' instead of the detailed status.
Modified:
vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
Modified: vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
URL: http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm?rev=1485348&r1=1485347&r2=1485348&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm Wed May 22 19:15:03 2013
@@ -66,6 +66,7 @@ use VCL::utils;
use Fcntl qw(:DEFAULT :flock);
use File::Copy;
use IO::Seekable;
+use Socket;
##############################################################################
@@ -282,7 +283,8 @@ sub load {
# Set to a short delay at the beginning of monitoring, this will be increased once installation start is detected
my $monitor_delay_seconds = 5;
- my $previous_status;
+ my $previous_nodestat_status;
+ my $previous_nodeset_status;
my $current_time;
my $install_started = 0;
MONITOR_LOADING: while (($current_time = time) < $nochange_timeout_time && $current_time < $overall_timeout_time) {
@@ -317,35 +319,48 @@ sub load {
}
# Get the current status of the node
- my $current_status = $self->_nodestat($computer_node_name);
-
# Set previous status to current status if this is the first iteration
- $previous_status = $current_status if !defined($previous_status);
+ my $current_nodestat_status = $self->_nodestat($computer_node_name);
+ $previous_nodestat_status = $current_nodestat_status if !defined($previous_nodestat_status);
- # Check if the installation has completed
- if ($install_started && $current_status =~ /^(boot|complete|sshd)$/i) {
- notify($ERRORS{'DEBUG'}, 0, "$computer_node_name is finished loading image, current status: $current_status");
- insertloadlog($reservation_id, $computer_id, "bootstate", "$computer_node_name image load complete: $current_status");
- last MONITOR_LOADING;
- }
+ my $current_nodeset_status = $self->_nodeset($computer_node_name, 'stat');
+ $previous_nodeset_status = $current_nodeset_status if !defined($previous_nodeset_status);
- # Set the install started flag if it hasn't been set already
- if (!$install_started && $current_status =~ /(install|partimage)/i) {
- $monitor_delay_seconds = 20;
- notify($ERRORS{'DEBUG'}, 0, "installation has started, increasing wait between monitoring checks to $monitor_delay_seconds seconds");
- $install_started = 1;
+ if (!$install_started) {
+ # Check if the installation has started
+ if ($current_nodestat_status =~ /(install|partimage)/i) {
+ # Slow down the monitor looping
+ $monitor_delay_seconds = 20;
+ notify($ERRORS{'DEBUG'}, 0, "installation has started, increasing wait between monitoring checks to $monitor_delay_seconds seconds");
+ $install_started = 1;
+ }
+ }
+ else {
+ # nodestat will return 'sshd' if the computer is responding to SSH while it is being installed instead of the more detailed information
+ # Try to get the installation status directly using a socket
+ if ($current_nodestat_status eq 'sshd') {
+ $current_nodestat_status = $self->_get_install_status($computer_node_name) || 'sshd';
+ }
+
+ # Check if the installation has completed
+ if ($current_nodestat_status =~ /^(boot|complete)$/i || $current_nodeset_status =~ 'boot') {
+ notify($ERRORS{'DEBUG'}, 0, "$computer_node_name is finished loading image, current nodestat status: $current_nodestat_status, nodeset status: $current_nodeset_status");
+ insertloadlog($reservation_id, $computer_id, "bootstate", "$computer_node_name image load complete: $current_nodestat_status, $current_nodeset_status");
+ last MONITOR_LOADING;
+ }
}
# Check if the nodestat status changed from previous iteration
- if ($current_status ne $previous_status) {
+ if ($current_nodestat_status ne $previous_nodestat_status || $current_nodeset_status ne $previous_nodeset_status) {
$reset_timeout = 1;
notify($ERRORS{'DEBUG'}, 0, "status of $computer_node_name changed");
# Set previous status to the current status
- $previous_status = $current_status;
+ $previous_nodestat_status = $current_nodestat_status;
+ $previous_nodeset_status = $current_nodeset_status;
}
else {
- notify($ERRORS{'DEBUG'}, 0, "status of $computer_node_name has not changed: $current_status");
+ notify($ERRORS{'DEBUG'}, 0, "status of $computer_node_name has not changed: $current_nodestat_status");
}
# If any changes were detected, reset the nochange timeout
@@ -1334,7 +1349,7 @@ sub _edit_nodelist {
# For HPC, use image project = vclhpc. There should be an xCAT postscript group named 'vclhpc' configured with specific HPC postscripts
my $groups;
- if ($request_state_name eq 'image' || $image_os_install_type =~ /image/i) {
+ if ($request_state_name eq 'image') {
# Image-based install or capture
$groups = "all,blade,image";
}
@@ -1589,7 +1604,7 @@ sub _nodeset {
my ($status) = $line =~ /^$computer_node_name:\s+(.+)$/;
if ($status) {
if ($nodeset_option eq 'stat') {
- notify($ERRORS{'DEBUG'}, 0, "retrieved nodeset status of $computer_node_name: $status");
+ notify($ERRORS{'DEBUG'}, 0, "retrieved nodeset status of $computer_node_name: '$status'");
return $status;
}
else {
@@ -2221,6 +2236,66 @@ sub _is_throttle_limit_reached {
#/////////////////////////////////////////////////////////////////////////////
+=head2 _get_install_status
+
+ Parameters : $computer_node_name
+ Returns : string
+ Description : Attempts to connect to TCP port 3001 on a node to retrieve the
+ installation status. This is done to overcome a problem which
+ occurs if the node is responding to SSH while it is being
+ installed and nodestat returns 'sshd' instead of the more
+ detailed status.
+
+=cut
+
+sub _get_install_status {
+ my $self = shift;
+ if (ref($self) !~ /xCAT/i) {
+ notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method");
+ return;
+ }
+
+ # Get the computer name argument
+ my $computer_node_name = shift;
+ if (!$computer_node_name) {
+ notify($ERRORS{'WARNING'}, 0, "computer name argument was not specified");
+ return;
+ }
+
+ my $protocol = 'tcp';
+ my $port = 3001;
+
+ my $socket;
+ if (!socket($socket, PF_INET, SOCK_STREAM, getprotobyname($protocol))) {
+ return;
+ }
+
+ my $host_by_name = gethostbyname($computer_node_name);
+ my $sockaddr_in = sockaddr_in($port, $host_by_name);
+ if (!connect($socket, $sockaddr_in)) {
+ return;
+ }
+
+ print $socket "stat \n";
+ $socket->flush;
+
+ my $status;
+ while (<$socket>) {
+ $status .= $_;
+ }
+ close($socket);
+
+ if ($status =~ /\w/) {
+ notify($ERRORS{'DEBUG'}, 0, "retrieved install status from $computer_node_name: '$status'");
+ return $status;
+ }
+ else {
+ return;
+ }
+}
+
+#/////////////////////////////////////////////////////////////////////////////
+
=head2 DESTROY
Parameters : none
@@ -2231,18 +2306,39 @@ sub _is_throttle_limit_reached {
sub DESTROY {
my $self = shift;
+ if (!defined($self)) {
+ notify($ERRORS{'DEBUG'}, 0, "skipping xCAT DESTROY tasks, \$self is not defined");
+ return;
+ }
+
+ my $address = sprintf('%x', $self);
my $type = ref($self);
- if ($type =~ /xCAT/ && $self->data) {
+ notify($ERRORS{'DEBUG'}, 0, "destroying $type object, address: $address");
+
+ if (!$self->data(0)) {
+ notify($ERRORS{'DEBUG'}, 0, "skipping xCAT DESTROY tasks, \$self->data is not defined");
+ }
+ elsif (!$self->mn_os(0)) {
+ notify($ERRORS{'DEBUG'}, 0, "skipping xCAT DESTROY tasks, \$self->mn_os is not defined");
+ }
+ else {
my $node = $self->data->get_computer_node_name(0);
my $request_state_name = $self->data->get_request_state_name(0);
- if ($request_state_name && $node && $request_state_name =~ /^(new|reload|image)$/) {
+ if (!defined($node) || !defined($request_state_name)) {
+ notify($ERRORS{'WARNING'}, 0, "skipping xCAT DESTROY tasks, unable to retrieve node name and request state name from DataStructure");
+ }
+ elsif ($request_state_name =~ /^(new|reload|image)$/) {
+ notify($ERRORS{'DEBUG'}, 0, "request state is '$request_state_name', attempting to set nodeset state of $node to 'boot'");
$self->_nodeset($node, 'boot');
}
-
- # Check for an overridden destructor
- $self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
+ else {
+ notify($ERRORS{'DEBUG'}, 0, "request state is '$request_state_name', skipping setting nodeset state of $node to 'boot'");
+ }
}
+
+ # Check for an overridden destructor
+ $self->SUPER::DESTROY if $self->can("SUPER::DESTROY");
} ## end sub DESTROY
#/////////////////////////////////////////////////////////////////////////////