You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@vcl.apache.org by ar...@apache.org on 2013/04/05 18:49:44 UTC
svn commit: r1465040 [2/2] -
/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
Modified: vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm
URL: http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm?rev=1465040&r1=1465039&r2=1465040&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/Module/Provisioning/xCAT.pm Fri Apr 5 16:49:44 2013
@@ -24,15 +24,20 @@ VCL::Provisioning::xCAT - VCL module to
=head1 SYNOPSIS
- Needs to be written
+ From another VCL module instantiated normally for a reservation:
+ $self->provisioner->load();
+ my $status = $self->provisioner->node_status();
+
+ From a script:
+ my $xcat = new VCL::Module::Provisioning::xCAT();
+ my $status = $xcat->node_status('node1a2-3');
=head1 DESCRIPTION
This module provides VCL support for xCAT (Extreme Cluster Administration
- Toolkit). xCAT is a scalable distributed computing management and
+ Toolkit) version 2.x. xCAT is a scalable distributed computing management and
provisioning tool that provides a unified interface for hardware control,
- discovery, and OS diskful/diskfree deployment.
- http://xcat.sourceforge.net
+ discovery, and OS diskful/diskfree deployment. http://xcat.sourceforge.net
=cut
@@ -60,6 +65,7 @@ use English qw( -no_match_vars );
use VCL::utils;
use Fcntl qw(:DEFAULT :flock);
use File::Copy;
+use IO::Seekable;
##############################################################################
@@ -72,7 +78,7 @@ use File::Copy;
Data type : scalar
Description : $XCAT_ROOT stores the location of the xCAT binary files. xCAT
should set the XCATROOT environment variable. This is used if
- it is set. If XCATROOT is not set, /opt/xcat is used.
+ it is set. If XCATROOT is not set, /opt/xcat is used.
=cut
@@ -89,9 +95,10 @@ my $XCAT_ROOT;
=head2 initialize
- Parameters :
- Returns :
- Description :
+ Parameters : none
+ Returns : boolean
+ Description : Checks to make sure xCAT appears to be installed on the
+ management node.
=cut
@@ -119,14 +126,19 @@ sub initialize {
notify($ERRORS{'WARNING'}, 0, "unable to initialize xCAT module, $XCAT_ROOT directory does not exist");
return;
}
-
+
# Check to make sure one of the expected executables is where it should be
if (!-x "$XCAT_ROOT/bin/rpower") {
notify($ERRORS{'WARNING'}, 0, "unable to initialize xCAT module, expected executable was not found: $XCAT_ROOT/bin/rpower");
return;
}
- notify($ERRORS{'DEBUG'}, 0, "xCAT root path found: $XCAT_ROOT");
-
+
+ # Check to make sure one of the xCAT 2.x executables not included in xCAT 1.x exists
+ if (!-x "$XCAT_ROOT/bin/lsdef") {
+ notify($ERRORS{'WARNING'}, 0, "unable to initialize xCAT module, xCAT version is not supported, expected xCAT 2.x+ executable was not found: $XCAT_ROOT/bin/lsdef");
+ return;
+ }
+
notify($ERRORS{'DEBUG'}, 0, "xCAT module initialized");
return 1;
} ## end sub initialize
@@ -135,9 +147,9 @@ sub initialize {
=head2 load
- Parameters : hash
- Returns : 1(success) or 0(failure)
- Description : loads node with provided image
+ Parameters : none
+ Returns : boolean
+ Description : Loads a computer with the image defined in the reservation data.
=cut
@@ -145,601 +157,228 @@ sub load {
my $self = shift;
if (ref($self) !~ /xCAT/i) {
notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method");
- return 0;
+ return;
}
# Get the data
- my $reservation_id = $self->data->get_reservation_id();
- my $image_name = $self->data->get_image_name();
- my $image_os_name = $self->data->get_image_os_name();
- my $image_os_type = $self->data->get_image_os_type();
- my $image_project = $self->data->get_image_project();
- my $image_reload_time = $self->data->get_image_reload_time();
- my $imagemeta_postoption = $self->data->get_imagemeta_postoption();
- my $image_architecture = $self->data->get_image_architecture();
- my $computer_id = $self->data->get_computer_id();
- my $computer_node_name = $self->data->get_computer_node_name();
- my $computer_ip_address = $self->data->get_computer_ip_address();
- my $ip_configuration = $self->data->get_management_node_public_ip_configuration();
-
- notify($ERRORS{'OK'}, 0, "nodename not set")
- if (!defined($computer_node_name));
- notify($ERRORS{'OK'}, 0, "imagename not set")
- if (!defined($image_name));
- notify($ERRORS{'OK'}, 0, "project not set")
- if (!defined($image_project));
- notify($ERRORS{'OK'}, 0, "estimated reload time not set")
- if (!defined($image_reload_time));
- notify($ERRORS{'OK'}, 0, "osname not set")
- if (!defined($image_os_name));
- notify($ERRORS{'OK'}, 0, "computerid not set")
- if (!defined($computer_id));
- notify($ERRORS{'OK'}, 0, "reservationid not set")
- if (!defined($reservation_id));
- notify($ERRORS{'OK'}, 0, "architecture not set")
- if (!defined($image_architecture));
-
- # Initialize some timer variables
- # Do this here in case goto passes over the declaration
- my $sshd_start_time;
- my $sshd_end_time;
-
- insertloadlog($reservation_id, $computer_id, "startload", "$computer_node_name $image_name");
-
- #make sure the following services are running on management node
- # dhcpd named xcatd
- # start them if they are not actively running
- $image_project = "vcl" if (!defined($image_project));
-
- $image_architecture = "x86" if (!defined($image_architecture));
-
- # Run xCAT's assign2project utility
- if (_assign2project($computer_node_name, $image_project)) {
- notify($ERRORS{'OK'}, 0, "$computer_node_name _assign2project return successful");
- }
- else {
- notify($ERRORS{'CRITICAL'}, 0, "$computer_node_name could not _assign2project to $image_project");
- return 0;
- }
-
- # Make sure dhcpd is started on management node
- if (!(_checknstartservice("dhcpd"))) {
- notify($ERRORS{'CRITICAL'}, 0, "dhcpd is not running or failed to restart");
- return 0;
- }
-
- # Make sure named is started on management node
- if (!(_checknstartservice("named"))) {
- notify($ERRORS{'CRITICAL'}, 0, "named is not running or failed to restart");
- return 0;
- }
-
- # Make sure xcatd is started on management node
- if (!(_checknstartservice("xcatd"))) {
- notify($ERRORS{'CRITICAL'}, 0, "xcatd is not running or failed to restart");
- return 0;
- }
-
-
- # Make sure the image repository path can be retrieved and directory exists
- my $image_repository_path = $self->_get_image_repository_path();
- if (!$image_repository_path) {
- notify($ERRORS{'CRITICAL'}, 0, "unable to determine image repository path");
- return 0;
- }
- if (-d $image_repository_path) {
- notify($ERRORS{'DEBUG'}, 0, "confirmed image repository directory exists: $image_repository_path");
- }
- else {
- notify($ERRORS{'CRITICAL'}, 0, "unable to confirm image repository directory exists: $image_repository_path, no output returned from ls command");
- return 0;
- }
+ my $reservation_id = $self->data->get_reservation_id();
+ my $image_name = $self->data->get_image_name();
+ my $image_reload_time_minutes = $self->data->get_image_reload_time() || 10;
+ my $computer_id = $self->data->get_computer_id();
+ my $computer_node_name = $self->data->get_computer_node_name();
- # Insert a computerloadlog record and edit nodetype.tab
- insertloadlog($reservation_id, $computer_id, "editnodetype", "updating nodetype file");
- if ($self->_edit_nodetype($computer_node_name, $image_name, $image_os_name, $image_architecture)) {
- notify($ERRORS{'OK'}, 0, "nodetype updated for $computer_node_name with $image_name");
- }
- else {
- notify($ERRORS{'CRITICAL'}, 0, "could not edit nodetype for $computer_node_name with $image_name");
- return 0;
- }
-
- # Begin reinstallation using xCAT's rinstall
- # Loop and continue checking
-
- # Set flags and counters
- my $rinstall_attempts = 0;
- my $rpower_fixes = 0;
- my $bootstatus = 0;
- my $wait_loops = 0;
- my @status;
- my $xcat_throttle = 0;
-
- #get Throttle value from database if set
- my $variable_name = $self->data->get_management_node_hostname() . "|xcat|throttle";
- if($self->data->is_variable_set($variable_name)){
- notify($ERRORS{'DEBUG'}, 0, "throttle is set for $variable_name");
- #fetch variable
- $xcat_throttle = $self->data->get_variable($variable_name);
+ insertloadlog($reservation_id, $computer_id, "startload", "$computer_node_name $image_name");
- }
- else{
- notify($ERRORS{'DEBUG'}, 0, "throttle is not set for $variable_name");
- $xcat_throttle = 0;
- }
+ # Insert a computerloadlog record and edit nodetype table to set the image information for the computer
+ insertloadlog($reservation_id, $computer_id, "editnodetype", "updating nodetype table");
+ $self->_edit_nodetype($computer_node_name, $image_name) || return;
+ # Edit nodelist table to set the xCAT groups for the computer
+ $self->_edit_nodelist($computer_node_name, $image_name) || return;
-
# Check to see if management node throttle is configured
- if ($xcat_throttle) {
- notify($ERRORS{'DEBUG'}, 0, "throttle is set to $xcat_throttle");
-
- my $lckloadfile = "/tmp/nodeloading.lockfile";
- notify($ERRORS{'DEBUG'}, 0, "attempting to open node loading lockfile for throttling: $lckloadfile");
- if (sysopen(SEM, $lckloadfile, O_RDONLY | O_CREAT)) {
- notify($ERRORS{'DEBUG'}, 0, "opened lockfile, attempting to obtain lock");
-
- if (flock(SEM, LOCK_EX)) {
- notify($ERRORS{'DEBUG'}, 0, "obtained exclusive lock on $lckloadfile, checking for concurrent loads");
- my $maxload = 1;
- while ($maxload) {
- notify($ERRORS{'DEBUG'}, 0, "running 'nodeset all stat' to determine number of nodes currently being loaded");
- if (open(NODESET, "$XCAT_ROOT/bin/nodeset all stat \| egrep \'install\|image\' 2>&1 | ")) {
- my @nodesetout = <NODESET>;
- close(NODESET);
- my $ld = @nodesetout;
- notify($ERRORS{'DEBUG'}, 0, "current number of nodes loading: $ld");
-
- if ($ld < $xcat_throttle) {
- notify($ERRORS{'OK'}, 0, "current nodes loading is less than throttle, ok to proceed");
- $maxload = 0;
- }
- else {
- notify($ERRORS{'OK'}, 0, "current nodes loading=$ld, throttle=$xcat_throttle, must wait, sleeping for 10 seconds");
- sleep 10;
- }
- } ## end if (open(NODESET, "$XCAT_ROOT/bin/nodeset all stat \| grep install 2>&1 | "...
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to run 'nodeset all stat' to determine number of nodes currently being loaded");
- }
- } ## end while ($maxload)
- } ## end if (flock(SEM, LOCK_EX))
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to obtain exclusive lock on $lckloadfile");
- }
-
- notify($ERRORS{'OK'}, 0, "releasing exclusive lock on $lckloadfile, proceeding to install");
- close(SEM);
-
- } ## end if (sysopen(SEM, $lckloadfile, O_RDONLY | ...
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to open node loading lockfile");
- }
-
- } ## end if ($xcat_throttle)
- else {
- notify($ERRORS{'DEBUG'}, 0, "throttle is NOT set");
- }
-
- XCATRINSTALL:
-
- # Reset sshd wait start time, used only for diagnostic purposes
- $sshd_start_time = 0;
-
- # Make use of semaphore files to control the flow
- # xCAT's rinstall does not handle locking of files
- my $lckfile = "/tmp/rinstall.lockfile";
- notify($ERRORS{'DEBUG'}, 0, "attempting to open rinstall lockfile: $lckfile");
- if (sysopen(SEM, $lckfile, O_RDONLY | O_CREAT)) {
- notify($ERRORS{'DEBUG'}, 0, "opened lockfile, attempting to obtain lock");
-
- if (flock(SEM, LOCK_EX)) {
- notify($ERRORS{'DEBUG'}, 0, "obtained exclusive lock on $lckfile");
-
- # Safe to run rinstall command
- insertloadlog($reservation_id, $computer_id, "rinstall", "starting install process");
- notify($ERRORS{'OK'}, 0, "executing rinstall $computer_node_name");
- if (open(RINSTALL, "$XCAT_ROOT/bin/rinstall $computer_node_name 2>&1 |")) {
- $rinstall_attempts++;
- notify($ERRORS{'OK'}, 0, "beginning rinstall attempt $rinstall_attempts");
- while (<RINSTALL>) {
- chomp($_);
-
- #notify($ERRORS{'OK'},0,"$_");
- if ($_ =~ /not in bay/) {
- notify($ERRORS{'WARNING'}, 0, "rpower not in bay issue, will attempt to correct, calling rinv");
- if (_fix_rpower($computer_node_name)) {
-
- #try xcatrinstall again
- close(RINSTALL);
- close(SEM); # remove lock
- # loop control
- if ($rpower_fixes < 10) {
- $rpower_fixes++;
- sleep 1;
- goto XCATRINSTALL;
- }
- else {
- notify($ERRORS{'CRITICAL'}, 0, "rpower failed $rpower_fixes times on $computer_node_name");
- return 0;
- }
- } ## end if (_fix_rpower($computer_node_name))
- } ## end if ($_ =~ /not in bay/)
- if ($_ =~ /Invalid login|does not exist/) {
- notify($ERRORS{'CRITICAL'}, 0, "failed to initate rinstall on $computer_node_name - $_");
- close(RINSTALL);
- close(SEM);
- insertloadlog($reservation_id, $computer_id, "failed", "failed to start load process on $computer_node_name");
- return 0;
- }
-
- } #while RINSTALL
- close(RINSTALL);
-
- notify($ERRORS{'OK'}, 0, "releasing exclusive lock on $lckfile");
- close(SEM);
- } ## end if (open(RINSTALL, "$XCAT_ROOT/bin/rinstall $computer_node_name 2>&1 |"...
- else {
- notify($ERRORS{'CRITICAL'}, 0, "could not execute $XCAT_ROOT/bin/rinstall $computer_node_name $!");
- close(SEM);
- return 0;
- }
- } ## end if (flock(SEM, LOCK_EX))
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to obtain exclusive lock on $lckfile, error: $!, returning");
+ my $throttle_limit;
+ my $variable_name = $self->data->get_management_node_hostname() . "|xcat|throttle";
+ if ($self->data->is_variable_set($variable_name) && ($throttle_limit = $self->data->get_variable($variable_name))) {
+ notify($ERRORS{'DEBUG'}, 0, "'$variable_name' xCAT load throttle limit variable is set in database: $throttle_limit");
+
+ my $throttle_limit_wait_seconds = (30 * 60);
+ if (!$self->code_loop_timeout(sub{!$self->_is_throttle_limit_reached(@_)}, [$throttle_limit], 'checking throttle limit', $throttle_limit_wait_seconds, 1, 10)) {
+ notify($ERRORS{'WARNING'}, 0, "failed to load image due to throttle limit, waited $throttle_limit_wait_seconds seconds");
return;
}
- } ## end if (sysopen(SEM, $lckfile, O_RDONLY | O_CREAT...
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to open node loading lockfile, error: $!, returning");
- return;
- }
-
- # Check progress, locate MAC and IP address for this node, monitor /var/log/messages for communication from node
- # dhcp req/ack, xcat calls, etc
- my ($eth0MACaddress, $privateIP);
- if (open(MACTAB, "$XCAT_ROOT/etc/mac.tab")) {
- my @mactab = <MACTAB>;
- close(MACTAB);
- foreach my $line (@mactab) {
- if ($line =~ /(^$computer_node_name(-eth[0-9])?)(\s+)([0-9:a-f]*)/i) {
- $eth0MACaddress = $4;
- notify($ERRORS{'OK'}, 0, "MAC address for $computer_node_name collected $eth0MACaddress");
- }
- }
- } ## end if (open(MACTAB, "$XCAT_ROOT/etc/mac.tab"))
- if (!defined($eth0MACaddress)) {
- notify($ERRORS{'WARNING'}, 0, "MAC address not found for $computer_node_name , possible issue with regex");
- }
-
- #should also store/pull private address from the database
- if (open(HOSTS, "/etc/hosts")) {
- my @hosts = <HOSTS>;
- close(HOSTS);
- foreach my $line (@hosts) {
- if ($line =~ /([0-9]*.[0-9]*.[0-9]*.[0-9]*)\s+($computer_node_name)/) {
- $privateIP = $1;
- notify($ERRORS{'OK'}, 0, "PrivateIP address for $computer_node_name collected $privateIP");
- last;
- }
- }
- } ## end if (open(HOSTS, "/etc/hosts"))
- if (!defined($privateIP)) {
- notify($ERRORS{'WARNING'}, 0, "private IP address not found for $computer_node_name, possible issue with regex");
- }
- my ($s1, $s2, $s3, $s4) = 0;
- my $sloop = 0;
-
- #insertloadlog($reservation_id,$computer_id,"info","SUCCESS initiated install process");
- #sleep for boot process to happen takes anywhere from 60-90 seconds
- notify($ERRORS{'OK'}, 0, "sleeping 65 to allow bootstrap of $computer_node_name");
- sleep 65;
- my @TAILLOG;
- my $t;
-
- if ($eth0MACaddress && $privateIP) {
- @TAILLOG = 0;
- $t = 0;
- if (open(TAIL, "</var/log/messages")) {
- seek TAIL, -1, 2; #
- for (;;) {
- notify($ERRORS{'OK'}, 0, "$computer_node_name ROUND 1 checks loop $sloop of 45");
- while (<TAIL>) {
- if (!$s1) {
- if ($_ =~ /dhcpd: DHCPDISCOVER from $eth0MACaddress/i) {
- $s1 = 1;
- notify($ERRORS{'OK'}, 0, "$computer_node_name STAGE 1 set DHCPDISCOVER from $eth0MACaddress");
- insertloadlog($reservation_id, $computer_id, "xcatstage1", "SUCCESS stage1 detected dhcp request for node");
- }
- }
- if (!$s2) {
- if ($_ =~ /dhcpd: DHCPACK on $privateIP to $eth0MACaddress/i) {
- $s2 = 1;
- notify($ERRORS{'OK'}, 0, "$computer_node_name STAGE 2 set DHCPACK on $privateIP to $eth0MACaddress");
- insertloadlog($reservation_id, $computer_id, "xcatstage2", "SUCCESS stage2 detected dhcp ack for node");
- }
- }
- } #while
- #either stages are set or we loop or we rinstall again
- if ($s2) {
- notify($ERRORS{'OK'}, 0, "$computer_node_name ROUND1 stages are set proceeding to next round");
- close(TAIL);
- #Pause here
- sleep 30;
- goto ROUND2;
- }
- elsif ($sloop > 45) {
- insertloadlog($reservation_id, $computer_id, "WARNING", "potential problem started $rinstall_attempts install attempt");
-
- #hrmm this is taking too long
- #have we been here before? if less than 3 attempts continue on the 3rd try fail
- #whats the problem, chck known locations
- # /tftpboot/xcat/image/x86
- # look for tmpl file (in does_image_exist routine)
- # does the machine need to reboot, premission to reboot issue
- if (_check_pxe_grub_files($image_name)) {
- notify($ERRORS{'OK'}, 0, "checkpxe_grub_file checked");
- }
-
- if ($rinstall_attempts < 3) {
- close(TAIL);
- insertloadlog($reservation_id, $computer_id, "repeat", "starting install process");
- goto XCATRINSTALL;
- }
- else {
-
- #fail this one and let whoever called me get another machine
- notify($ERRORS{'CRITICAL'}, 0, "rinstall made $rinstall_attempts in ROUND1 on $computer_node_name with no success, admin needs to check it out");
- insertloadlog($reservation_id, $computer_id, "failed", "FAILED problem made $rinstall_attempts install attempts failing reservation");
- if (_nodeset_option($computer_node_name, "boot")) {
- notify($ERRORS{'OK'}, 0, "due to failure reseting state of blade to boot");
- }
- close(TAIL);
- return 0;
- } ## end else [ if ($rinstall_attempts < 3)
- } ## end elsif ($sloop > 45)
- else {
-
- #keep checking the messages log
- $sloop++;
- sleep 7;
- seek TAIL, 0, 1;
- }
- } #for loop
- } #if Tail
- else {
- notify($ERRORS{'CRITICAL'}, 0, "could open /var/log/messages to $!");
- }
- } ## end if ($eth0MACaddress && $privateIP)
- else {
- notify($ERRORS{'CRITICAL'}, 0, "eth0MACaddress $eth0MACaddress && privateIP $privateIP are not set not able to use these checks");
- insertloadlog($reservation_id, $computer_id, "failed", "FAILED could not locate private IP and MAC addresses in XCAT files failing reservation");
- return 0;
}
-
- ROUND2:
-
- #begin second round of checks reset $sX
- ($s1, $s2, $s3, $s4) = 0;
- $sloop = 0;
-
- # start time for loading
- my $R2starttime = convert_to_epoch_seconds();
-
- #during loading we need to wait based on some precentage of the estimated reload time (50%?)
- #times range from 4-10 minutes perhaps longer for a large image
- my $TM2waittime = int($image_reload_time / 2);
- insertloadlog($reservation_id, $computer_id, "xcatround2", "starting ROUND2 checks - waiting for boot flag");
-
- notify($ERRORS{'OK'}, 0, "Round 2 TM2waittime set to $TM2waittime on $computer_node_name");
- if (open(TAIL, "</var/log/messages")) {
- seek TAIL, -1, 2;
- my $gettingclose = 0;
- for (;;) {
- notify($ERRORS{'OK'}, 0, "$computer_node_name round2 log checks 30sec loop count is $sloop of $image_reload_time TM2waittime= $TM2waittime");
- while (<TAIL>) {
- if (!$s1) {
- if ($_ =~ /xcat: xcatd: set boot request from $computer_node_name/) {
-
- insertloadlog($reservation_id, $computer_id, "bootstate", "node in boot state completed imaging process - proceeding to next round");
- $s1 = 1;
- notify($ERRORS{'OK'}, 0, "Round 2 STAGE 1 set $computer_node_name in boot state");
- }
-
- #is it even near completion only checking rhel installs
- #not really useful for linux_images
- if ($image_os_type =~ /linux/i) {
- if (!$gettingclose) {
- if ($_ =~ /rpc.mountd: authenticated mount request from $computer_node_name:(\d+) for \/install\/post/) {
- $gettingclose = 1;
- notify($ERRORS{'OK'}, 0, "Round 2 STAGE 1 install nearing completion on node $computer_node_name");
- }
- }
- else {
- if (!$s4) {
- if ($sloop == $image_reload_time) {
- notify($ERRORS{'OK'}, 0, "$computer_node_name Round 2 getting close, loop eq $image_reload_time, substracting 6 from loop count");
- $sloop = ($sloop - 8);
- $s4 = 1; #loop control, don't set this we loop forever
- notify($ERRORS{'WARNING'}, 0, "ert estimated reload time may be too low\n $computer_node_name\nimagename $image_name\n current ert = $image_reload_time");
- }
- }
- } ## end else [ if (!$gettingclose)
- } ## end if ($image_os_type =~ /linux/i)
- } ## end if (!$s1)
- } #while
- if ($s1) {
-
- #good, move on
- close(TAIL);
- goto ROUND3;
- }
- else {
- if ($sloop > $image_reload_time) {
- notify($ERRORS{'OK'}, 0, "exceeded TM2waittime of $TM2waittime minutes sloop= $sloop ert= $image_reload_time");
-
- # check delta from when we started actual loading till now
- my $rtime = convert_to_epoch_seconds();
- my $delta = $rtime - $R2starttime;
- if ($delta < ($image_reload_time * 60)) {
-
- #ok delta is actually less then ert, we don't need to stop it yet.
- notify($ERRORS{'OK'}, 0, "loading delta is less than ert, not stopping yet delta is $delta/60 ");
- sleep 35;
- $sloop = ($sloop - 8); #decrement loop control
- seek TAIL, 0, 1;
-
- }
- elsif ($rinstall_attempts < 2) {
- notify($ERRORS{'WARNING'}, 0, "starting rinstall again");
- insertloadlog($reservation_id, $computer_id, "WARNING", "potential problem restarting rinstall current attemp $rinstall_attempts");
- close(TAIL);
- insertloadlog($reservation_id, $computer_id, "repeat", "starting install process");
- goto XCATRINSTALL;
- }
- else {
-
- #fail this one and let whoever called me get another machine
- notify($ERRORS{'CRITICAL'}, 0, "rinstall made $rinstall_attempts in ROUND2 on $computer_node_name with no success, admin needs to check it out");
- insertloadlog($reservation_id, $computer_id, "failed", "rinstall made $rinstall_attempts failing request");
- close(TAIL);
- return 0;
- }
- } ## end if ($sloop > $image_reload_time)
- else {
- sleep 35;
- $sloop++; #loop control
- insertloadlog($reservation_id, $computer_id, "info", "node in load process waiting for signal");
- seek TAIL, 0, 1;
-
- #goto TAILMESSAGES2;
- }
- } ## end else [ if ($s1)
- } #for
- } ## end if (open(TAIL, "</var/log/messages"))
else {
- notify($ERRORS{'CRITICAL'}, 0, "could open /var/log/messages to $!");
- return 0;
+ notify($ERRORS{'DEBUG'}, 0, "'$variable_name' xCAT load throttle limit variable is NOT set in database");
}
-
- ROUND3:
- insertloadlog($reservation_id, $computer_id, "xcatround3", "starting round 3 checks - finishing post configuration");
+ # Run rinstall to initiate the installation
+ $self->_rinstall($computer_node_name) || return;
- if ($self->os->can("post_load")) {
- notify($ERRORS{'DEBUG'}, 0, "calling " . ref($self->os) . "->post_load()");
- my $post_load_result = $self->os->post_load($rinstall_attempts);
+ # Run lsdef to retrieve the node's configuration including its MAC address
+ my $node_info = $self->_lsdef($computer_node_name);
+ if (!$node_info) {
+ notify($ERRORS{'WARNING'}, 0, "unable to monitor loading of $computer_node_name, failed to retrieve node info");
+ return;
+ }
+ my $mac_address = $node_info->{mac};
+ if ($mac_address) {
+ notify($ERRORS{'DEBUG'}, 0, "retrieved MAC address of $computer_node_name: $mac_address");
+ }
+ else {
+ notify($ERRORS{'WARNING'}, 0, "unable to monitor loading of $computer_node_name, node info does not contain the MAC address:\n" . format_data($node_info));
+ return;
+ }
+
+ # rinstall initiated
+ # nodeset changes xCAT state to 'install'
+ # node is power cycled or powered on (nodeset/nodestat status: install/noping)
+ # Wait for node to boot from network (may take from 30 seconds to several minutes if node is using UEFI)
+ # In /var/log/messages, the node makes a DHCP request and requests PXE boot information from the DHCP server running on the management node:
+ # Apr 1 09:36:39 vclmgt dhcpd: DHCPDISCOVER from xx:xx:xx:xx:xx:xx via ethX
+ # Apr 1 09:36:39 vclmgt dhcpd: DHCPOFFER on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
+ # Apr 1 09:36:43 vclmgt dhcpd: DHCPREQUEST for 10.yy.yy.yy (10.mn.mn.mn) from xx:xx:xx:xx:xx:xx via ethX
+ # Apr 1 09:36:43 vclmgt dhcpd: DHCPACK on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
+ #
+ # Node requests PXE boot files from TFTP server running on management node:
+ # Apr 1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.0 to 10.yy.yy.yy:2070
+ # Apr 1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.0 to 10.yy.yy.yy:2071
+ # Apr 1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.cfg/xx-xx-xx-xx-xx-xx to 10.yy.yy.yy:57089
+ # Apr 1 09:36:43 vclmgt atftpd[27522]: Serving pxelinux.cfg/0A0A0132 to 10.yy.yy.yy:57090
+ # Apr 1 09:36:43 vclmgt atftpd[27522]: Serving xcat/rhel6/x86_64/vmlinuz to 10.yy.yy.yy:57091
+ # Apr 1 09:36:43 vclmgt atftpd[27522]: Serving xcat/rhel6/x86_64/initrd.img to 10.yy.yy.yy:57092
+ #
+ # Node boots using files downloaded from TFTP/PXE server, makes another DHCP request:
+ # Apr 1 09:37:15 vclmgt dhcpd: DHCPDISCOVER from xx:xx:xx:xx:xx:xx via ethX
+ # Apr 1 09:37:15 vclmgt dhcpd: DHCPOFFER on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
+ # Apr 1 09:37:15 vclmgt dhcpd: DHCPREQUEST for 10.yy.yy.yy (10.mn.mn.mn) from xx:xx:xx:xx:xx:xx via ethX
+ # Apr 1 09:37:15 vclmgt dhcpd: DHCPACK on 10.yy.yy.yy to xx:xx:xx:xx:xx:xx via ethX
+ # OS installation begins (nodeset/nodestat status: install/installing prep)
+ # If Kickstart, Linux packages are installed (nodestat status: 'installing <package> (x%)')
+ # If Kickstart, postscripts are installed (nodestat status: 'installing post scripts')
+ # When installation is complete, xCAT status is changed to 'boot' and node is restarted (nodeset/nodestat status: boot/noping)
+ # Node boots from hard drive (nodeset/nodestat status: boot/boot)
+
+ # Open the /var/log/messages file for reading
+ my $messages_file_path = '/var/log/messages';
+ my $log = IO::File->new($messages_file_path, "r");
+ if (!$log) {
+ my $error = $! || 'none';
+ notify($ERRORS{'WARNING'}, 0, "failed to open $messages_file_path for reading, error: $error");
+ return;
+ }
+ # Go to the end of the messages file
+ if (!$log->seek(0, SEEK_END)) {
+ my $error = $! || 'none';
+ notify($ERRORS{'CRITICAL'}, 0, "failed to seek end of $messages_file_path, error: $error");
+ }
+
+ insertloadlog($reservation_id, $computer_id, "xcatstage5", "loading image $image_name");
+
+ if ($image_reload_time_minutes < 10) {
+ $image_reload_time_minutes = 10;
+ }
+ my $nochange_timeout_seconds = ($image_reload_time_minutes * 60);
+ my $monitor_delay_seconds = 20;
+
+ my $monitor_start_time = time;
+ my $last_change_time = $monitor_start_time;
+ my $nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
+
+ # Sanity check, timeout the load monitoring after a set amount of time
+ # This is done in case there is an endless loop which causes the node status to change over and over again
+ # Overall timeout is the greater of 60 minutes or 2x image reload time
+ my $overall_timeout_minutes;
+ if ($image_reload_time_minutes < 30) {
+ $overall_timeout_minutes = 60;
+ }
+ else {
+ $overall_timeout_minutes = ($image_reload_time_minutes * 2);
+ }
+ my $overall_timeout_time = ($monitor_start_time + $overall_timeout_minutes * 60);
+
+ my $previous_status;
+ my $current_time;
+ MONITOR_LOADING: while (($current_time = time) < $nochange_timeout_time && $current_time < $overall_timeout_time) {
+ my $total_elapsed_seconds = ($current_time - $monitor_start_time);
+ my $nochange_elapsed_seconds = ($current_time - $last_change_time);
+ my $nochange_remaining_seconds = ($nochange_timeout_time - $current_time);
+ my $overall_remaining_seconds = ($overall_timeout_time - $current_time);
+ notify($ERRORS{'DEBUG'}, 0, "monitoring $image_name loading on $computer_node_name/$overall_remaining_seconds\n" .
+ "seconds since monitor start/until unconditional timeout: $total_elapsed_seconds/$overall_remaining_seconds\n" .
+ "seconds since last change/until no change timeout: $nochange_elapsed_seconds/$nochange_remaining_seconds"
+ );
- if (!defined $post_load_result) {
- notify($ERRORS{'WARNING'}, 0, "post_load returned undefined");
- return;
+
+ # Check if any lines have shown up in /var/log/messages for the node
+ my @lines = $log->getlines;
+ my @dhcp_lines = grep(/dhcpd:.+DHCP.+\s$mac_address\s/i, @lines);
+ if (@dhcp_lines) {
+ if (grep(/DHCPREQUEST/i, @dhcp_lines)) {
+ insertloadlog($reservation_id, $computer_id, "xcatstage1", "requested DHCP lease");
+ }
+
+ if (my ($dhcpack_line) = grep(/DHCPACK/i, @dhcp_lines)) {
+ notify($ERRORS{'DEBUG'}, 0, "$computer_node_name acquired DHCP lease: '$dhcpack_line'");
+ insertloadlog($reservation_id, $computer_id, "xcatstage2", "acquired DHCP lease");
+ insertloadlog($reservation_id, $computer_id, "xcatround2", "waiting for boot flag");
+ }
+
+ notify($ERRORS{'DEBUG'}, 0, "reset no change timeout, DHCP activity detected in $messages_file_path:\n" . join("\n", @dhcp_lines));
+
+ # Reset the nochange timeout
+ $last_change_time = $current_time;
+ $nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
}
- elsif (!$post_load_result) {
- notify($ERRORS{'WARNING'}, 0, "post_load subroutine returned $post_load_result");
+ else {
+ # Get the current status of the node
+ my $current_status = $self->_nodestat($computer_node_name);
+
+ # Set previous status to current status if this is the first iteration
+ $previous_status = $current_status if !defined($previous_status);
- if ($rinstall_attempts < 2) {
- my $debugging_message = "*reservation has NOT failed yet*\n";
- $debugging_message .= "this notice is for debugging purposes so that node can be watched during 2nd rinstall attempt\n";
- $debugging_message .= "sshd did not become active on $computer_node_name after first rinstall attempt\n\n";
- $debugging_message .= $self->data->get_reservation_info_string();
- notify($ERRORS{'CRITICAL'}, 0, "$debugging_message");
+ if ($current_status =~ /(boot|complete)/) {
+ notify($ERRORS{'DEBUG'}, 0, "$computer_node_name is finished loading image, current status: $current_status");
+ insertloadlog($reservation_id, $computer_id, "bootstate", "$computer_node_name image load complete: $current_status");
+ last MONITOR_LOADING;
+ }
+
+ if ($current_status ne $previous_status) {
+ notify($ERRORS{'DEBUG'}, 0, "reset no change timeout, status of $computer_node_name changed: $previous_status --> $current_status");
- goto XCATRINSTALL;
+ # Set previous status to the current status
+ $previous_status = $current_status;
+
+ # Reset the nochange timeout
+ $last_change_time = $current_time;
+ $nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
}
else {
- return;
+ notify($ERRORS{'DEBUG'}, 0, "status of $computer_node_name has not changed: $current_status");
}
}
- else {
- notify($ERRORS{'OK'}, 0, "post_load subroutine returned $post_load_result");
- }
- }
- else {
- notify($ERRORS{'DEBUG'}, 0, ref($self->os) . "::post_load() has not been implemented");
+
+ #notify($ERRORS{'DEBUG'}, 0, "sleeping for $monitor_delay_seconds seconds");
+ sleep $monitor_delay_seconds;
}
+ $log->close;
- # Clear ssh public keys from /root/.ssh/known_hosts
- my $known_hosts = "/root/.ssh/known_hosts";
- my @file;
- if (open(FILE, $known_hosts)) {
- @file = <FILE>;
- close FILE;
-
- foreach my $line (@file) {
- if ($line =~ s/$computer_node_name.*\n//) {
- notify($ERRORS{'OK'}, 0, "removing $computer_node_name ssh public key from $known_hosts");
- }
+ # Check if timeout was reached
+ if ($current_time >= $nochange_timeout_time) {
+ notify($ERRORS{'WARNING'}, 0, "failed to load $image_name on $computer_node_name, timed out because no progress was detected for $nochange_timeout_seconds seconds");
+ return;
+ }
+ elsif ($current_time >= $overall_timeout_time) {
+ notify($ERRORS{'CRITICAL'}, 0, "failed to load $image_name on $computer_node_name, timed out because loading took longer than $overall_timeout_minutes minutes");
+ return;
+ }
+
+ # Call the OS module's post_load() subroutine if implemented
+ insertloadlog($reservation_id, $computer_id, "xcatround3", "initiating OS post-load configuration");
+ if ($self->os->can("post_load")) {
+ if ($self->os->post_load()) {
+ insertloadlog($reservation_id, $computer_id, "loadimagecomplete", "performed OS post-load tasks on $computer_node_name");
}
-
- if (open(FILE, ">$known_hosts")) {
- print FILE @file;
- close FILE;
- }
- } ## end if (open(FILE, $known_hosts))
- else {
- notify($ERRORS{'OK'}, 0, "could not open $known_hosts for editing the $computer_node_name public ssh key");
- }
-
- # Synchronize ssh keys using xCAT's makesshgkh
- my $makessygkh_attempts = 0;
- MAKESSH:
- notify($ERRORS{'OK'}, 0, " resting 1sec before executing makesshgkh");
- sleep 1;
- if (open(MAKESSHGKH, "$XCAT_ROOT/sbin/makesshgkh $computer_node_name |")) {
- $makessygkh_attempts++;
- notify($ERRORS{'OK'}, 0, " makesshgkh attempt $makessygkh_attempts ");
- while (<MAKESSHGKH>) {
- chomp($_);
- if ($_ =~ /Scanning keys/) {
- notify($ERRORS{'OK'}, 0, "$_");
- }
+ else {
+ notify($ERRORS{'WARNING'}, 0, "failed to perform OS post-load tasks on VM $computer_node_name");
+ return;
}
- close MAKESSHGKH;
- my $keysync = 0;
- my $keysynccheck = 0;
-
- while (!$keysync) {
- $keysynccheck++;
- my $sshd = _sshd_status($computer_node_name, $image_name, $image_os_type);
- if ($sshd =~ /on/) {
- $keysync = 1;
- notify($ERRORS{'OK'}, 0, "keys synced");
- insertloadlog($reservation_id, $computer_id, "info", "SUCCESS keys synchronized");
- last;
- }
- if ($keysynccheck > 3) {
- if ($makessygkh_attempts < 1) {
- notify($ERRORS{'OK'}, 0, "keysynccheck exceeded 5 minutes, there might be a problem running makesshgkh again");
- goto MAKESSH;
- }
- else {
- notify($ERRORS{'WARNING'}, 0, "makesshgkh exceeded 2 attempts to create new ssh keys there appears to be a problem with $computer_node_name moving on");
-
- #move on-
- $keysync = 1;
- last;
- }
- } ## end if ($keysynccheck > 3)
- notify($ERRORS{'OK'}, 0, "waiting for ssh keys to be updated");
- sleep 5;
- } ## end while (!$keysync)
- } ## end if (open(MAKESSHGKH, "$XCAT_ROOT/sbin/makesshgkh $computer_node_name |"...
+ }
else {
- notify($ERRORS{'CRITICAL'}, 0, "could not execute $XCAT_ROOT/sbin/makesshgkh $computer_node_name $!");
+ insertloadlog($reservation_id, $computer_id, "loadimagecomplete", "OS post-load tasks not necessary on $computer_node_name");
}
-
+
return 1;
-} ## end sub load
+}
#/////////////////////////////////////////////////////////////////////////////
=head2 capture
- Parameters :
- Returns : 1 if successful, 0 if failed
- Description :
+ Parameters : none
+ Returns : boolean
+ Description : Captures the image which is currently loaded on the computer.
=cut
@@ -747,2399 +386,1735 @@ sub capture {
my $self = shift;
if (ref($self) !~ /xCAT/i) {
notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method");
- return 0;
+ return;
}
-
- # Get required data
+
my $image_name = $self->data->get_image_name();
- my $computer_short_name = $self->data->get_computer_short_name();
my $computer_node_name = $self->data->get_computer_node_name();
-
+
+ # Get the image repository path
+ my $image_repository_path = $self->get_image_repository_directory_path($image_name);
+ if (!$image_repository_path) {
+ notify($ERRORS{'CRITICAL'}, 0, "xCAT image repository information could not be determined");
+ return;
+ }
+ my $capture_done_file_path = "$image_repository_path/$image_name.img.capturedone";
+ my $capture_failed_file_path = "$image_repository_path/$image_name.img.capturefailed";
+
# Print some preliminary information
- notify($ERRORS{'OK'}, 0, "xCAT capture beginning: image=$image_name, computer=$computer_short_name");
+ notify($ERRORS{'OK'}, 0, "attempting to capture image '$image_name' on $computer_node_name");
- # Create currentimage.txt on the node containing information about the new image revision
- if (write_currentimage_txt($self->data)) {
- notify($ERRORS{'OK'}, 0, "currentimage.txt updated on $computer_short_name");
+ # Make sure the computer is powered on
+ my $power_status = $self->power_status();
+ if (!$power_status || $power_status !~ /on/i) {
+ if (!$self->power_on()) {
+ notify($ERRORS{'WARNING'}, 0, "failed to power on computer before monitoring image capture");
+ return;
+ }
}
- else {
- notify($ERRORS{'WARNING'}, 0, "unable to update currentimage.txt on $computer_short_name");
- return 0;
+
+ # Modify currentimage.txt
+ if (!write_currentimage_txt($self->data)) {
+ notify($ERRORS{'WARNING'}, 0, "unable to update currentimage.txt on $computer_node_name");
+ return;
}
-
+
# Check if pre_capture() subroutine has been implemented by the OS module
if ($self->os->can("pre_capture")) {
# Call OS pre_capture() - it should perform all OS steps necessary to capture an image
# pre_capture() should shut down the computer when it is done
- notify($ERRORS{'OK'}, 0, "calling OS module's pre_capture() subroutine");
if (!$self->os->pre_capture({end_state => 'off'})) {
notify($ERRORS{'WARNING'}, 0, "OS module pre_capture() failed");
- return 0;
+ return;
}
-
+
# The OS module should turn the computer power off
# Wait up to 2 minutes for the computer's power status to be off
- if ($self->wait_for_off(2)) {
+ if ($self->_wait_for_off($computer_node_name, 120)) {
notify($ERRORS{'OK'}, 0, "computer $computer_node_name power is off");
}
else {
notify($ERRORS{'WARNING'}, 0, "$computer_node_name power is still on, turning computer off");
-
+
# Attempt to power off computer
if ($self->power_off()) {
notify($ERRORS{'OK'}, 0, "$computer_node_name was powered off");
}
else {
notify($ERRORS{'WARNING'}, 0, "failed to power off $computer_node_name");
- return 0;
+ return;
}
}
- } ## end if ($self->os->can("pre_capture"))
- elsif ($self->os->can("capture_prepare")) {
- notify($ERRORS{'OK'}, 0, "calling OS module's capture_prepare() subroutine");
- if (!$self->os->capture_prepare()) {
- notify($ERRORS{'WARNING'}, 0, "OS module capture_prepare() failed");
- return 0;
- }
}
else {
- notify($ERRORS{'WARNING'}, 0, "OS module does not have either a pre_capture() or capture_prepare() subroutine");
- return 0;
+ notify($ERRORS{'WARNING'}, 0, "OS module does implement a pre_capture() subroutine");
+ return;
}
+
+ # Set the xCAT nodetype to the new image for the node
+ $self->_edit_nodetype($computer_node_name, $image_name) || return;
+ # Create the .tmpl file for the image
+ $self->_create_template($image_name) || return;
- # Create the tmpl file for the image
- if ($self->_create_template()) {
- notify($ERRORS{'OK'}, 0, "created .tmpl file for $image_name");
- }
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to create .tmpl file for $image_name");
- return 0;
- }
-
- # Edit the nodetype.tab file to set the node with the new image name
- if ($self->_edit_nodetype($computer_node_name, $image_name)) {
- notify($ERRORS{'OK'}, 0, "nodetype modified, node $computer_node_name, image name $image_name");
- }
- else {
- notify($ERRORS{'WARNING'}, 0, "could not edit nodetype, node $computer_node_name, image name $image_name");
- return 0;
- }
+ # Edit xCAT's nodelist table to set the correct node groups
+ $self->_edit_nodelist($computer_node_name, $image_name) || return;
- # Call xCAT's 'nodeset <nodename> image', configures xCAT to save image on next reboot
- if (_nodeset_option($computer_node_name, "image")) {
- notify($ERRORS{'OK'}, 0, "$computer_node_name set to capture image on next reboot");
- }
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to set $computer_node_name to capture image on next reboot");
- return 0;
+ # Call xCAT's nodeset to configure xCAT to save image on next reboot
+ $self->_nodeset($computer_node_name, 'image') || return;
+
+ # Power on the node in order to capture the image
+ if (!$self->power_on()) {
+ notify($ERRORS{'WARNING'}, 0, "failed to power on computer before monitoring image capture");
+ return;
}
-
- # Check if pre_capture() subroutine has been implemented by the OS module
- # If so, all that needs to happen is for the computer to be powered on
- if ($self->os->can("pre_capture")) {
- # Turn the computer on
- if ($self->power_on()) {
- notify($ERRORS{'OK'}, 0, "$computer_node_name was powered on");
+
+ my $nochange_timeout_minutes = 20;
+ my $nochange_timeout_seconds = ($nochange_timeout_minutes * 60);
+ my $monitor_delay_seconds = 30;
+
+ my $monitor_start_time = time;
+ my $last_change_time = $monitor_start_time;
+ my $nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
+
+ # Sanity check, timeout the monitoring after 6 hours
+ my $overall_timeout_hours = 6;
+ my $overall_timeout_minutes = ($overall_timeout_hours * 60);
+ my $overall_timeout_time = ($monitor_start_time + $overall_timeout_minutes * 60);
+
+ my $previous_status;
+ my $previous_image_size = 0;
+ my $current_time;
+ MONITOR_CAPTURE: while (($current_time = time) < $nochange_timeout_time && $current_time < $overall_timeout_time) {
+ my $total_elapsed_seconds = ($current_time - $monitor_start_time);
+ my $nochange_elapsed_seconds = ($current_time - $last_change_time);
+ my $nochange_remaining_seconds = ($nochange_timeout_time - $current_time);
+ my $overall_remaining_seconds = ($overall_timeout_time - $current_time);
+ notify($ERRORS{'DEBUG'}, 0, "monitoring capture of $image_name on $computer_node_name:\n" .
+ "seconds since monitor start/until unconditional timeout: $total_elapsed_seconds/$overall_remaining_seconds\n" .
+ "seconds since last change/until no change timeout: $nochange_elapsed_seconds/$nochange_remaining_seconds"
+ );
+
+ if ($self->mn_os->file_exists($capture_done_file_path)) {
+ notify($ERRORS{'OK'}, 0, "capture of $image_name on $computer_node_name complete, file exists: $capture_done_file_path");
+ $self->mn_os->delete_file($capture_done_file_path);
+ last MONITOR_CAPTURE;
+ }
+ elsif ($self->mn_os->file_exists($capture_failed_file_path)) {
+ notify($ERRORS{'WARNING'}, 0, "failed to capture $image_name on $computer_node_name, file exists: $capture_failed_file_path");
+ $self->mn_os->delete_file($capture_failed_file_path);
+ return;
}
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to turn computer on before monitoring image capture");
- return 0;
+
+ # Check if the image size has changed
+ my $current_image_size = $self->get_image_size($image_name);
+ if ($current_image_size ne $previous_image_size) {
+ notify($ERRORS{'DEBUG'}, 0, "size of $image_name changed: $previous_image_size --> $current_image_size, reset monitoring timeout to $nochange_timeout_seconds seconds");
+
+ # Set previous image size to the current image size
+ $previous_image_size = $current_image_size;
+
+ $last_change_time = $current_time;
+ $nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
}
- } ## end if ($self->os->can("pre_capture"))
- # If capture_start() is implemented, call it, it will initiate a reboot
- elsif ($self->os->can("capture_start")) {
- notify($ERRORS{'OK'}, 0, "calling OS module's capture_start() subroutine");
- if (!$self->os->capture_start()) {
- notify($ERRORS{'WARNING'}, 0, "OS module capture_start() failed");
- return 0;
+ else {
+ # Get the current status of the node
+ my $current_status = $self->_nodestat($computer_node_name);
+ # Set previous status to current status if this is the first iteration
+ $previous_status = $current_status if !defined($previous_status);
+ if ($current_status ne $previous_status) {
+
+ # If the node status changed to 'boot' and the image size > 0, assume image capture complete
+ if ($current_status =~ /boot/ && $current_image_size > 0) {
+ notify($ERRORS{'DEBUG'}, 0, "image capture appears to be complete, node status changed: $previous_status --> $current_status, image size > 0: $current_image_size");
+ last MONITOR_CAPTURE;
+ }
+
+ notify($ERRORS{'DEBUG'}, 0, "status of $computer_node_name changed: $previous_status --> $current_status, reset monitoring timeout to $nochange_timeout_seconds seconds");
+
+ # Set previous status to the current status
+ $previous_status = $current_status;
+
+ $last_change_time = $current_time;
+ $nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
+ }
}
+
+ notify($ERRORS{'DEBUG'}, 0, "sleeping for $monitor_delay_seconds seconds");
+ sleep $monitor_delay_seconds;
}
- else {
- notify($ERRORS{'WARNING'}, 0, "OS module does not have either a pre_capture() or capture_start() subroutine");
- return 0;
- }
-
-
- # Monitor the image capture
- if ($self->capture_monitor()) {
- notify($ERRORS{'OK'}, 0, "image capture monitoring is complete");
+
+ # Check if timeout was reached
+ if ($current_time >= $nochange_timeout_time) {
+ notify($ERRORS{'WARNING'}, 0, "failed to capture $image_name on $computer_node_name, timed out because no progress was detected for $nochange_timeout_minutes minutes");
+ return;
}
- else {
- notify($ERRORS{'WARNING'}, 0, "problem occurred while monitoring image capture");
- return 0;
+ elsif ($current_time >= $overall_timeout_time) {
+ notify($ERRORS{'CRITICAL'}, 0, "failed to capture $image_name on $computer_node_name, timed out because capture took longer than $overall_timeout_hours hours");
+ return;
}
-
- notify($ERRORS{'OK'}, 0, "image was successfully captured, returning 1");
+
+ # Set the permissions on the captured image files
+ $self->mn_os->set_file_permissions("$image_repository_path/$image_name\*", 644, 1);
+
+ notify($ERRORS{'OK'}, 0, "successfully captured $image_name on $computer_node_name");
return 1;
-} ## end sub capture
+}
#/////////////////////////////////////////////////////////////////////////////
-=head2 capture_monitor
+=head2 node_status
- Parameters :
- Returns :
- Description :
+ Parameters : $computer_node_name (optional)
+ Returns : string
+ Description : Checks the status of an xCAT-provisioned machine. If no
+ arguments are supplied, the node and image for the current
+ reservation will be used. The return value will be one of the
+ following:
+
+ READY
+ If $self->data contains image information:
+ - The computer is responding to SSH
+ - nodetype.profile is set to the image defined in $self->data
+ - Current image retrieved from computer's OS matches $self->data
+ If $self->data does not contain image:
+ - The computer is responding to SSH
+ - Current image retrieved from computer's OS matches
+ nodetype.profile
+
+ RELOAD
+ - Only returned if $self->data contains image information
+ - Either nodetype.profile does not match $self->data or the
+ current image retrieved from computer's OS does not match
+ $self->data
+
+ UNRESPONSIVE
+ - The computer is not responding to SSH
+
+ INCONSISTENT
+ - nodetype.profile does not match the current image retrieved
+ from computer's OS
=cut
-sub capture_monitor {
+sub node_status {
my $self = shift;
if (ref($self) !~ /xCAT/i) {
notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method");
- return 0;
+ return;
}
-
- # Get the required data
- my $computer_node_name = $self->data->get_computer_node_name();
- my $image_name = $self->data->get_image_name();
-
- # Get the image repository path
- my $image_repository_path = $self->_get_image_repository_path();
- if (!$image_repository_path) {
- notify($ERRORS{'CRITICAL'}, 0, "xCAT image repository information could not be determined");
- return 0;
+
+ # Get the computer name argument
+ my $computer_node_name = shift || $self->data->get_computer_node_name();
+ if (!$computer_node_name) {
+ notify($ERRORS{'WARNING'}, 0, "computer name argument was not specified");
+ return;
}
-
- # Wait for node to reboot
- notify($ERRORS{'OK'}, 0, "sleeping for 120 seconds before beginning to monitor image copy process");
- sleep 120;
-
- # Set variables to control how may attempts are made to wait for capture to finish
- my $capture_loop_attempts = 40;
- my $capture_loop_wait = 30;
-
- # Figure out and print how long will wait before timing out
- my $maximum_wait_minutes = ($capture_loop_attempts * $capture_loop_wait) / 60;
- notify($ERRORS{'OK'}, 0, "beginning to wait for image capture to complete, maximum wait time: $maximum_wait_minutes minutes");
-
- my $image_size = 0;
- my $nodeset_status;
- CAPTURE_LOOP: for (my $capture_loop_count = 0; $capture_loop_count < $capture_loop_attempts; $capture_loop_count++) {
- notify($ERRORS{'OK'}, 0, "image copy not complete, sleeping for $capture_loop_wait seconds");
- if ($capture_loop_attempts > 1) {
- notify($ERRORS{'OK'}, 0, "attempt $capture_loop_count/$capture_loop_attempts: image copy not complete, sleeping for $capture_loop_wait seconds");
- }
- sleep $capture_loop_wait;
-
- # Get the nodeset status for the node being captured
- $nodeset_status = _nodeset_option($computer_node_name, "stat");
- notify($ERRORS{'DEBUG'}, 0, "nodeset status for $computer_node_name: $nodeset_status");
-
- # nodeset stat will return 'boot' when image capture (Partimage) is complete
- if ($nodeset_status eq "boot") {
- last CAPTURE_LOOP;
+ notify($ERRORS{'DEBUG'}, 0, "checking status of node: $computer_node_name");
+
+ my $image_name = $self->data->get_image_name(0);
+
+ # Check if the node is powered on
+ my $power_status = $self->power_status($computer_node_name);
+ if (!defined($power_status)) {
+ notify($ERRORS{'WARNING'}, 0, "unable to determine status of $computer_node_name, failed to retrieve power status");
+ return;
+ }
+ elsif ($power_status !~ /on/) {
+ my $return_value = uc($power_status);
+ notify($ERRORS{'DEBUG'}, 0, "power status of $computer_node_name is '$power_status', returning '$return_value'");
+ return $return_value;
+ }
+
+ # Get the xCAT definition for the node
+ my $node_info = $self->_lsdef($computer_node_name);
+ if (!$node_info) {
+ notify($ERRORS{'WARNING'}, 0, "unable to determine status of $computer_node_name, failed to retrieve xCAT object definition using lsdef utility");
+ return;
+ }
+
+ # Make sure node.profile is configured
+ my $node_profile = $node_info->{profile};
+ if (!$node_info) {
+ notify($ERRORS{'WARNING'}, 0, "unable to determine status of $computer_node_name, node.profile is not configured:\n" . format_data($node_info));
+ return;
+ }
+
+ # Check if node.profile matches the reservation image name
+ if ($image_name) {
+ if ($node_profile eq $image_name) {
+ notify($ERRORS{'DEBUG'}, 0, "nodetype.profile matches the reservation image name: $image_name");
}
-
- # Check the image size to see if it's growing
- notify($ERRORS{'OK'}, 0, "checking size of $image_name");
- my $current_image_size = $self->get_image_size($image_name);
-
- # Check if image size is larger than the last time it was checked
- if (defined $current_image_size && $current_image_size > $image_size) {
- notify($ERRORS{'OK'}, 0, "image size has increased: $image_size -> $current_image_size, still copying");
- $image_size = $current_image_size;
- #reset capture_loop_count
- $capture_loop_count = 0;
+ else {
+ my $return_value = 'RELOAD';
+ notify($ERRORS{'DEBUG'}, 0, "nodetype.profile '$node_profile' does NOT match the reservation image name: '$image_name', returning '$return_value'");
+ return $return_value;
}
- elsif (defined $current_image_size) {
- notify($ERRORS{'OK'}, 0, "image size is the same: $image_size=$current_image_size, copy may be complete");
+ }
+
+ # Check if $self->os is defined, it may not be if xCAT.pm object is created from a monitoring script
+ my $os = $self->os(0);
+ if (!$os) {
+ my $data;
+ eval { $data = new VCL::DataStructure({computer_identifier => $computer_node_name, image_identifier => $node_profile}) };
+ if ($EVAL_ERROR) {
+ notify($ERRORS{'WARNING'}, 0, "unable to determine status of $computer_node_name, failed to create DataStructure object for image set as nodetype.profile: '$node_profile', error:\n$EVAL_ERROR");
+ return;
}
- else {
- notify($ERRORS{'WARNING'}, 0, "unable to determine current image size");
+ elsif (!$data) {
+ notify($ERRORS{'WARNING'}, 0, "unable to determine status of $computer_node_name, \$self->os is not defined, failed to create DataStructure object for image set as nodetype.profile: '$node_profile'");
+ return;
+ }
+
+ # Set the data, create_os_object copies the data from the calling object to the new OS object
+ $self->set_data($data);
+
+ my $image_os_module_perl_package = $data->get_image_os_module_perl_package();
+
+ $os = $self->create_os_object($image_os_module_perl_package);
+ if (!$os) {
+ notify($ERRORS{'WARNING'}, 0, "unable to determine status of $computer_node_name, failed to create OS object for image set as nodetype.profile: '$node_profile'");
+ return;
}
- } ## end for (my $capture_loop_count = 0; $capture_loop_count...
-
- # Exiting waiting loop, nodeset status should be boot if successful
- if ($nodeset_status eq "boot") {
- # Nodeset 'boot' flag has been set, image copy process is complete
- notify($ERRORS{'OK'}, 0, "image copy complete, nodeset status was set to 'boot' for $computer_node_name");
}
- else {
- notify($ERRORS{'WARNING'}, 0, "image copy timed out, waited $maximum_wait_minutes minutes, nodeset status for $computer_node_name never changed to boot: $nodeset_status");
- return 0;
+
+ # Check if the node is responding to SSH
+ my $ssh_responding = $os->is_ssh_responding();
+ if (!$ssh_responding) {
+ my $return_value = 'UNRESPONSIVE';
+ notify($ERRORS{'DEBUG'}, 0, "$computer_node_name is NOT responding to SSH, returning '$return_value'");
+ return $return_value;
}
-
- # Create mbr and sfdisk files
- if (open(LS, "/bin/ls -1s $image_repository_path |")) {
- my @LS = <LS>;
- close(LS);
- foreach my $l (@LS) {
- if ($l =~ /$image_name-hda/) {
-
- #create hda.mbr and hda.sfdisk
- if (open(CP, "/bin/cp $image_repository_path/$image_name-hda.mbr $image_repository_path/$image_name-sda.mbr |")) {
- close(CP);
- notify($ERRORS{'OK'}, 0, "copied $image_name-hda.mbr to $image_repository_path/$image_name-sda.mbr");
-
- #create sfdisk modify hardrive type
- if (open(CP, "/bin/cp $image_repository_path/$image_name-hda.sfdisk $image_repository_path/$image_name-sda.sfdisk |")) {
- close(CP);
- notify($ERRORS{'OK'}, 0, "copied $image_name-hda.sfdisk to $image_repository_path/$image_name-sda.sfdisk");
-
- #read in file
- if (open(FILE, "$image_repository_path/$image_name-sda.sfdisk")) {
- my @lines = <FILE>;
- close(FILE);
- foreach my $l (@lines) {
- if ($l =~ s/hda/sda/g) {
-
- #editing file
- }
- }
-
- #print array to file
- if (open(OUTFILE, ">$image_repository_path/$image_name-sda.sfdisk")) {
- print OUTFILE @lines;
- close(OUTFILE);
- notify($ERRORS{'OK'}, 0, "modified drivetype of $image_name-sda.sfdisk");
- }
- } ## end if (open(FILE, "$image_repository_path/$image_name-sda.sfdisk"...
- else {
- notify($ERRORS{'CRITICAL'}, 0, "could not open $image_repository_path/$image_name-sda.mbr for editing $!");
- }
- } # Close if copy hda.sfdisk command
- else {
- notify($ERRORS{'CRITICAL'}, 0, "could not copy $image_name-hda.sfdisk to $image_repository_path/$image_name-sda.sfdisk $!");
- }
- } # Close if copy mbr file command
- else {
- notify($ERRORS{'CRITICAL'}, 0, "could not copy $image_name-hda.mbr to $image_repository_path/$image_name-sda.mbr $!");
- }
- } # Close if imagename-hda
-
- elsif ($l =~ /$image_name-sda/) {
-
- #create sda.mbr and sda.sfdisk
- if (open(CP, "/bin/cp $image_repository_path/$image_name-sda.mbr $image_repository_path/$image_name-hda.mbr |")) {
- close(CP);
- notify($ERRORS{'OK'}, 0, "copied $image_name-sda.mbr to $image_repository_path/$image_name-hda.mbr");
-
- #create sfdisk
- if (open(CP, "/bin/cp $image_repository_path/$image_name-sda.sfdisk $image_repository_path/$image_name-hda.sfdisk |")) {
- close(CP);
- notify($ERRORS{'OK'}, 0, "copied $image_name-sda.sfdisk to $image_repository_path/$image_name-hda.sfdisk");
-
- #read in file
- if (open(FILE, "$image_repository_path/$image_name-hda.sfdisk")) {
- my @lines = <FILE>;
- close(FILE);
- foreach my $l (@lines) {
- if ($l =~ s/sda/hda/g) {
-
- #editing file
- }
- }
-
- #print array to file
- if (open(OUTFILE, ">$image_repository_path/$image_name-hda.sfdisk")) {
- print OUTFILE @lines;
- close(OUTFILE);
- notify($ERRORS{'OK'}, 0, "modified drivetype of $image_name-hda.sfdisk");
- }
- } ## end if (open(FILE, "$image_repository_path/$image_name-hda.sfdisk"...
- else {
- notify($ERRORS{'CRITICAL'}, 0, "could not open $image_repository_path/$image_name-hda.sfdisk for editing $!");
- }
- } ## end if (open(CP, "/bin/cp $image_repository_path/$image_name-sda.sfdisk $image_repository_path/$image_name-hda.sfdisk |"...
- else {
- notify($ERRORS{'OK'}, 0, "could not copy $image_repository_path/$image_name-sda.sfdisk to $image_repository_path/$image_name-hda.sfdisk $!");
- }
- } ## end if (open(CP, "/bin/cp $image_repository_path/$image_name-sda.mbr $image_repository_path/$image_name-hda.mbr |"...
- else {
- notify($ERRORS{'OK'}, 0, "could not copy $image_repository_path/$image_name-sda.mbr to $image_repository_path/$image_name-hda.mbr $!");
- }
- } # Close if image_name-sda
-
- } # Close foreach line returned from the ls imagerepository command
- } # Close if ls imagerepository
-
- # Set file premissions on image files to 644
- # Allows other management nodes to retrieve the image if neccessary
- if (open(CHMOD, "/bin/chmod -R 644 $image_repository_path/$image_name\* 2>&1 |")) {
- close(CHMOD);
- notify($ERRORS{'DEBUG'}, 0, "recursive update file permissions 644 on $image_repository_path/$image_name");
+
+ # Check image name reported from OS
+ my $current_image_name = $os->get_current_image_info('current_image_name');
+ if (!defined($current_image_name)) {
+ notify($ERRORS{'WARNING'}, 0, "unable to determine status of $computer_node_name, failed to retrieve current image name from OS");
+ return;
}
-
- # Image capture complete, return 1
- notify($ERRORS{'OK'}, 0, "image capture complete");
- return 1;
-
-} ## end sub capture_monitor
-
-#/////////////////////////////////////////////////////////////////////////////
-
-=head2 _edit_template
-
- Parameters : imagename,drivetype
- Returns : 0 failed or 1 success
- Description : general routine to edit /opt/xcat/install/image/x86/imagename.tmpl
- used in imaging process
-
-=cut
-
-sub _edit_template {
- my ($imagename, $drivetype) = @_;
- my ($package, $filename, $line, $sub) = caller(0);
- notify($ERRORS{'CRITICAL'}, 0, "drivetype is not defined")
- if (!(defined($drivetype)));
- notify($ERRORS{'CRITICAL'}, 0, "imagename is not defined")
- if (!(defined($imagename)));
-
- my $template = "$XCAT_ROOT/install/image/x86/$imagename.tmpl";
- my @lines;
- if (open(FILE, $template)) {
- @lines = <FILE>;
- close FILE;
- my $line;
- for $line (@lines) {
- if ($line =~ /^export DISKS=/) {
- $line = "export DISKS=\"$drivetype\"\n";
- last;
- }
- }
-
- #dump back to template file
- if (open(FILE, ">$template")) {
- print FILE @lines;
- close FILE;
- return 1;
+
+ # Check if OS's current image matches the reservation image name
+ if ($image_name) {
+ if ($current_image_name eq $image_name) {
+ notify($ERRORS{'DEBUG'}, 0, "current image reported by OS matches the reservation image name: $image_name");
}
else {
-
- # could not open nodetype file for editing
- notify($ERRORS{'CRITICAL'}, 0, "could not open $template for writing\nerror message: $!");
- return 0;
+ my $return_value = 'RELOAD';
+ notify($ERRORS{'DEBUG'}, 0, "current image reported by OS '$current_image_name' does NOT match the reservation image name: '$image_name', returning '$return_value'");
+ return $return_value;
}
- } ## end if (open(FILE, $template))
+ }
+
+ # Check if the OS matches xCAT
+ if ($current_image_name eq $node_profile) {
+ notify($ERRORS{'DEBUG'}, 0, "nodetype.profile matches current image reported by OS: '$current_image_name'");
+ }
else {
-
- # could not open nodetype file for editing
- notify($ERRORS{'CRITICAL'}, 0, "could not open $template for reading\nerror message: $!");
- return 0;
+ my $return_value = 'INCONSISTENT';
+ notify($ERRORS{'DEBUG'}, 0, "nodetype.profile '$node_profile' does NOT match current image reported by OS: '$current_image_name', returning '$return_value'");
+ return $return_value;
}
-} ## end sub _edit_template
+
+ my $return_value = 'READY';
+ notify($ERRORS{'DEBUG'}, 0, "$computer_node_name is loaded with the correct image: $current_image_name, returning '$return_value'");
+ return $return_value;
+} ## end sub node_status
#/////////////////////////////////////////////////////////////////////////////
-=head2 _edit_nodetype
+=head2 does_image_exist
- Parameters : node, imagename, osname
- Returns : 0 failed or 1 success
- Description : xCAT specific edits xcat's nodetype file with requested image name
+ Parameters : $image_name (optional)
+ Returns : boolean
+ Description : Checks the management node's local image repository for the
+ existence of the requested image and xCAT template (.tmpl) file.
+ If the image files exist but the .tmpl file does not, it creates
+ the .tmpl file. If a .tmpl file exists but the image files do
+ not, it deletes the orphaned .tmpl file.
+
+ This subroutine does not attempt to copy the image from another
+ management node. The retrieve_image() subroutine does this.
+ Callers of does_image_exist must also call retrieve_image if
+ image library retrieval functionality is desired.
=cut
-sub _edit_nodetype {
+sub does_image_exist {
my $self = shift;
- if (ref($self) !~ /xCAT/i) {
- notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method");
- return 0;
+ unless (ref($self) && $self->isa('VCL::Module')) {
+ notify($ERRORS{'CRITICAL'}, 0, "subroutine can only be called as a VCL::Module module object method");
+ return;
}
- # Use arguments for computer and image if they were passed
- my $computer_node_name = shift;
- my $image_name = shift;
-
- # Use the new image name if it is set
- $image_name = $self->data->get_image_name() if !$image_name;
-
- # Get the rest of the variables
- $computer_node_name = $self->data->get_computer_node_name() if !$computer_node_name;
- my $image_os_name = $self->data->get_image_os_name();
- my $image_architecture = $self->data->get_image_architecture();
- my $image_os_source_path = $self->data->get_image_os_source_path();
- my $image_repository_path = $self->_get_image_repository_path();
+ # Get the image name, first try passed argument, then data
+ my $image_name = shift || $self->data->get_image_name();
+ if (!$image_name) {
+ notify($ERRORS{'WARNING'}, 0, "unable to determine image name");
+ return;
+ }
- # Fix for Linux images using linux_image repository path
- if ($image_os_source_path eq 'image' && $image_repository_path =~ /linux_image/) {
- $image_os_source_path = 'linux_image';
- notify($ERRORS{'DEBUG'}, 0, "fixed Linux image path: image --> linux_image");
+ # Get the image install type
+ my $image_os_install_type = $self->data->get_image_os_install_type();
+ if (!$image_os_install_type) {
+ notify($ERRORS{'WARNING'}, 0, "image OS install type could not be determined");
+ return;
+ }
+ else {
+ notify($ERRORS{'DEBUG'}, 0, "image OS install type: $image_os_install_type");
}
- # Check to make sure the variables are populated
- if (!$computer_node_name) {
- notify($ERRORS{'CRITICAL'}, 0, "computer node name is not defined");
- return 0;
+ # Get the image repository path
+ my $image_repository_path = $self->get_image_repository_directory_path($image_name);
+ if (!$image_repository_path) {
+ notify($ERRORS{'WARNING'}, 0, "image repository path could not be determined");
+ return;
}
- if (!$image_name) {
- notify($ERRORS{'CRITICAL'}, 0, "image name is not defined");
- return 0;
+ else {
+ notify($ERRORS{'DEBUG'}, 0, "image repository path: $image_repository_path");
}
- if (!$image_os_name) {
- notify($ERRORS{'CRITICAL'}, 0, "image OS name is not defined");
- return 0;
+
+ # Run du to get the size of the image files if the image exists
+ my $du_command;
+ if ($image_os_install_type eq 'kickstart') {
+ $du_command = "du -c $image_repository_path 2>&1 | grep total 2>&1"
}
- if (!$image_architecture) {
- notify($ERRORS{'CRITICAL'}, 0, "image architecture is not defined");
- return 0;
+ else {
+ $du_command = "du -c $image_repository_path/*$image_name* 2>&1 | grep total 2>&1"
}
- if (!$image_os_source_path) {
- notify($ERRORS{'CRITICAL'}, 0, "image OS source path is not defined");
- return 0;
+ my ($du_exit_status, $du_output) = run_command($du_command);
+
+ # If the image files do not exist in the local repository, a "no such file" error should be displayed
+ my $image_files_exist;
+ if (defined(@$du_output) && grep(/no such file/i, @$du_output)) {
+ notify($ERRORS{'OK'}, 0, "$image_name does NOT exist");
+ $image_files_exist = 0;
}
-
- notify($ERRORS{'DEBUG'}, 0, "$computer_node_name, image=$image_name, os=$image_os_name, arch=$image_architecture, path=$image_os_source_path");
-
- # Assemble the nodetype.tab and lock file paths
- my $nodetype_file_path = "$XCAT_ROOT/etc/nodetype.tab";
- my $lock_file_path = "$nodetype_file_path.lockfile";
-
- # Open the lock file
- if (sysopen(LOCKFILE, $lock_file_path, O_RDONLY | O_CREAT)) {
- notify($ERRORS{'DEBUG'}, 0, "opened $lock_file_path");
-
- # Set exclusive lock on lock file
- if (flock(LOCKFILE, LOCK_EX)) {
- notify($ERRORS{'DEBUG'}, 0, "set exclusive lock on $lock_file_path");
-
- if (open(NODETYPE, $nodetype_file_path)) { #read file
- notify($ERRORS{'DEBUG'}, 0, "opened $nodetype_file_path");
-
- # Get the nodetype.tab lines and close the file
- my @nodetype_lines = <NODETYPE>;
- notify($ERRORS{'DEBUG'}, 0, "lines found in nodetype.tab: " . scalar @nodetype_lines);
-
- # Close the nodetype.tab file
- close(NODETYPE);
- notify($ERRORS{'DEBUG'}, 0, "closed $nodetype_file_path");
-
- # Loop through the nodetype.tab lines
- for my $line (@nodetype_lines) {
-
- # Skip over non-matching lines
- next if ($line !~ /^$computer_node_name\s+([,\w]*)/);
- notify($ERRORS{'OK'}, 0, "matching line found: $line");
-
- # Replace line matching $computer_node_name
- $line = "$computer_node_name\t\t$image_os_source_path,$image_architecture,$image_name\n";
- notify($ERRORS{'OK'}, 0, "line modified: $line");
- } ## end for my $line (@nodetype_lines)
-
- # Dump modified array to nodetype.tab file
- if (open(NODETYPE, ">$nodetype_file_path")) {
- notify($ERRORS{'OK'}, 0, "nodetype.tab opened");
- print NODETYPE @nodetype_lines;
- notify($ERRORS{'OK'}, 0, "nodetype.tab contents replaced");
- close(NODETYPE);
- notify($ERRORS{'OK'}, 0, "nodetype.tab saved");
- close(LOCKFILE);
- notify($ERRORS{'DEBUG'}, 0, "lock file closed");
- return 1;
- } ## end if (open(NODETYPE, ">$nodetype_file_path"))
- else {
-
- # Could not open nodetype.tab file for editing
- notify($ERRORS{'CRITICAL'}, 0, "could not open file for writing: $nodetype_file_path, $!");
- close(LOCKFILE);
- notify($ERRORS{'DEBUG'}, 0, "lock file closed");
- return 0;
- }
- } ## end if (open(NODETYPE, $nodetype_file_path))
- else {
-
- # could not open nodetype file for reading
- notify($ERRORS{'CRITICAL'}, 0, "could not open file for reading: $nodetype_file_path, $!");
- close(LOCKFILE);
- notify($ERRORS{'DEBUG'}, 0, "lock file closed");
- return 0;
- }
- } ## end if (flock(LOCKFILE, LOCK_EX))
- else {
-
- # Could not open lock
- notify($ERRORS{'CRITICAL'}, 0, "unable to get exclusive lock on $lock_file_path to edit nodetype.tab, $!");
- close(LOCKFILE);
- notify($ERRORS{'DEBUG'}, 0, "lock file closed");
- return 0;
- }
- } ## end if (sysopen(LOCKFILE, $lock_file_path, O_RDONLY...
-
- else {
-
- # Could not open lock file
- notify($ERRORS{'CRITICAL'}, 0, "unable to open $lock_file_path to edit nodetype.tab, $!");
- return 0;
+ elsif (defined(@$du_output) && !grep(/\d+\s+total/i, @$du_output)) {
+ notify($ERRORS{'WARNING'}, 0, "du output does not contain a total line:\n" . join("\n", @$du_output));
+ return;
}
-
-} ## end sub _edit_nodetype
-
-#/////////////////////////////////////////////////////////////////////////////
-
-=head2 _pping
-
- Parameters : $node
- Returns : 1 or 0
- Description : using xcat pping cmd to ping blade, xcat specific
-
-=cut
-
-sub _pping {
- my $node = $_[0];
- my ($package, $filename, $line, $sub) = caller(0);
- notify($ERRORS{'WARNING'}, 0, "_pping: node is not defined")
- if (!(defined($node)));
- if (open(PPING, "$XCAT_ROOT/bin/pping $node 2>&1 |")) {
- my @file = <PPING>;
- close(PPING);
- foreach my $l (@file) {
- chomp $l;
- notify($ERRORS{'OK'}, 0, "pinging $l");
- if ($l =~ /noping/) {
- return 0;
- }
- if ($l =~ /$node: ping/) {
- return 1;
- }
- } ## end foreach my $l (@file)
- return 1;
- } ## end if (open(PPING, "$XCAT_ROOT/bin/pping $node 2>&1 |"...
- else {
- notify($ERRORS{'WARNING'}, 0, "could not execute $XCAT_ROOT/bin/pping $node");
- return 0;
+ elsif (!defined($du_exit_status)) {
+ notify($ERRORS{'WARNING'}, 0, "failed to run ssh command to determine if image $image_name exists");
+ return;
}
-} ## end sub _pping
-
-#/////////////////////////////////////////////////////////////////////////////
-
-=head2 _nodeset
-
- Parameters : $node
- Returns : xcat state of node or 0
- Description : using xcat nodeset cmd to retrieve state of blade, xcat specific
-
-=cut
-
-sub _nodeset {
- my $node = $_[0];
- my ($package, $filename, $line, $sub) = caller(0);
- notify($ERRORS{'WARNING'}, 0, "_nodeset: node is not defined")
- if (!(defined($node)));
- return 0 if (!(defined($node)));
-
- my ($blah, $case);
- my @file;
- my $l;
- if (open(NODESET, "$XCAT_ROOT/bin/nodeset $node stat |")) {
-
- #notify($ERRORS{'OK'},0,"executing $XCAT_ROOT/bin/nodeset $node stat ");
- @file = <NODESET>;
- close NODESET;
- foreach $l (@file) {
- chomp($l);
- ($blah, $case) = split(/:\s/, $l);
- }
- if ($case) {
-
- #notify($ERRORS{'OK'},0,"$node in $case state ");
- return $case;
- }
- else {
- notify($ERRORS{'WARNING'}, 0, "case for $node is empty");
- return 0;
- }
- } ## end if (open(NODESET, "$XCAT_ROOT/bin/nodeset $node stat |"...
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to execute $XCAT_ROOT/bin/nodeset $node stat");
- return 0;
+
+ # Return 1 if the image size > 0
+ my ($image_size) = (@$du_output[0] =~ /(\d+)\s+total/);
+ if ($image_size && $image_size > 0) {
+ my $image_size_mb = int($image_size / 1024);
+ notify($ERRORS{'DEBUG'}, 0, "$image_name exists in $image_repository_path, size: $image_size_mb MB");
+ $image_files_exist = 1;
}
-} ## end sub _nodeset
-
-#/////////////////////////////////////////////////////////////////////////////
-
-=head2 _nodeset
-
- Parameters : $node $option
- Returns : xcat state of node or 0
- Description : using xcat nodeset cmd to use the input option of blade, xcat specific
-
-=cut
-
-sub _nodeset_option {
- my ($node, $option) = @_;
- my ($package, $filename, $line, $sub) = caller(0);
- notify($ERRORS{'WARNING'}, 0, "_nodeset_option: node is not defined")
- if (!(defined($node)));
- notify($ERRORS{'WARNING'}, 0, "_nodeset_option: option is not defined")
- if (!(defined($option)));
- my ($blah, $case);
- my @file;
- my $l;
- if (open(NODESET, "$XCAT_ROOT/bin/nodeset $node $option |")) {
-
- #notify($ERRORS{'OK'},0,"executing $XCAT_ROOT/bin/nodeset $node $option");
- @file = <NODESET>;
- close NODESET;
- foreach $l (@file) {
- chomp($l);
- ($blah, $case) = split(/:\s/, $l);
- }
- if ($case) {
- notify($ERRORS{'OK'}, 0, "$node in $case state ");
- return $case;
- }
- else {
- notify($ERRORS{'WARNING'}, 0, "case for $node is empty");
- return 0;
- }
- } ## end if (open(NODESET, "$XCAT_ROOT/bin/nodeset $node $option |"...
else {
- notify($ERRORS{'WARNING'}, 0, "failed to execute $XCAT_ROOT/bin/nodeset $node $option");
- return 0;
+ notify($ERRORS{'DEBUG'}, 0, "image does NOT exist: $image_name");
+ $image_files_exist = 0;
}
-} ## end sub _nodeset_option
-
-#/////////////////////////////////////////////////////////////////////////////
-
-=head2 power_reset
-
- Parameters : $computer_node_name (optional)
- Returns :
- Description :
-
-=cut
-
-sub power_reset {
- my $argument_1 = shift;
- my $argument_2 = shift;
-
- my $computer_node_name;
-
- # Check if subroutine was called as an object method
- if (ref($argument_1) =~ /xcat/i) {
- my $self = $argument_1;
-
- $computer_node_name = $argument_2;
-
- # Check if computer argument was specified
- # If not, use computer node name in the data object
- if (!$computer_node_name) {
- $computer_node_name = $self->data->get_computer_node_name();
- }
- } ## end if (ref($argument_1) =~ /xcat/i)
+ # Image files exist, make sure template (.tmpl) file exists
+ # Get the tmpl repository path
+ my $tmpl_repository_path = $self->_get_tmpl_directory_path($image_name);
+ if (!$tmpl_repository_path) {
+ notify($ERRORS{'WARNING'}, 0, "image template path could not be determined for $image_name");
+ return;
+ }
else {
- # Subroutine was not called as an object method, 2 arguments must be specified
- $computer_node_name = $argument_1;
+ notify($ERRORS{'DEBUG'}, 0, "template repository path for $image_name: $tmpl_repository_path");
}
-
- # Check if computer was determined
- if (!$computer_node_name) {
- notify($ERRORS{'WARNING'}, 0, "computer could not be determined from arguments");
- return;
+
+ # Check if template file exists for the image
+ # -s File has nonzero size
+ my $tmpl_file_exists;
+ if (-s "$tmpl_repository_path/$image_name.tmpl") {
+ $tmpl_file_exists = 1;
+ notify($ERRORS{'DEBUG'}, 0, "template file exists: $image_name.tmpl");
}
+ else {
+ $tmpl_file_exists = 0;
+ notify($ERRORS{'DEBUG'}, 0, "template file does not exist: $tmpl_repository_path/$image_name.tmpl");
+ }
+
+ # Check if either tmpl file or image files exist, but not both
+ # Attempt to correct the situation:
+ # tmpl file exists but not image files: delete tmpl file
+ # image files exist but not tmpl file: create tmpl file
+ if ($tmpl_file_exists && !$image_files_exist && $image_os_install_type ne 'kickstart') {
+ notify($ERRORS{'WARNING'}, 0, "template file exists but image files do not for $image_name");
- # Turn computer off
- my $off_attempts = 0;
- while (!power_off($computer_node_name)) {
- $off_attempts++;
-
- if ($off_attempts == 3) {
- notify($ERRORS{'WARNING'}, 0, "failed to turn $computer_node_name off, rpower status not is off after 3 attempts");
- return;
+ # Attempt to delete the orphaned tmpl file for the image
+ if ($self->_delete_template($image_name)) {
+ notify($ERRORS{'OK'}, 0, "deleted orphaned template file for image $image_name");
+ $tmpl_file_exists = 0;
}
-
- sleep 2;
- } ## end while (!power_off($computer_node_name))
-
- # Turn computer on
- my $on_attempts = 0;
- while (!power_on($computer_node_name)) {
- $on_attempts++;
-
- if ($on_attempts == 3) {
- notify($ERRORS{'WARNING'}, 0, "failed to turn $computer_node_name on, rpower status not is on after 3 attempts");
+ else {
+ notify($ERRORS{'WARNING'}, 0, "failed to delete orphaned template file for image $image_name, returning undefined");
return;
}
+ } ## end if ($tmpl_file_exists && !$image_files_exist)
+ elsif (!$tmpl_file_exists && $image_files_exist && $image_os_install_type ne 'kickstart') {
+ notify($ERRORS{'WARNING'}, 0, "image files exist but template file does not for $image_name");
- sleep 2;
- } ## end while (!power_on($computer_node_name))
-
- notify($ERRORS{'OK'}, 0, "successfully reset power on $computer_node_name");
- return 1;
-} ## end sub power_reset
-
-#/////////////////////////////////////////////////////////////////////////////
-
-=head2 power_on
-
- Parameters : $computer_node_name (optional)
- Returns :
- Description :
-
-=cut
-
-sub power_on {
- my $argument_1 = shift;
- my $argument_2 = shift;
-
- my $computer_node_name;
-
- # Check if subroutine was called as an object method
- if (ref($argument_1) =~ /xcat/i) {
- my $self = $argument_1;
-
- $computer_node_name = $argument_2;
-
- # Check if computer argument was specified
- # If not, use computer node name in the data object
- if (!$computer_node_name) {
- $computer_node_name = $self->data->get_computer_node_name();
+ # Attempt to create the missing tmpl file for the image
+ if ($self->_create_template($image_name)) {
+ notify($ERRORS{'OK'}, 0, "created missing template file for image $image_name");
+ $tmpl_file_exists = 1;
}
- } ## end if (ref($argument_1) =~ /xcat/i)
- else {
- # Subroutine was not called as an object method, 2 arguments must be specified
- $computer_node_name = $argument_1;
- }
-
- # Check if computer was determined
- if (!$computer_node_name) {
- notify($ERRORS{'WARNING'}, 0, "computer could not be determined from arguments");
- return;
- }
-
- # Turn computer on
- my $on_attempts = 0;
- my $power_status = 'unknown';
- while ($power_status !~ /on/) {
- $on_attempts++;
-
- if ($on_attempts == 3) {
- notify($ERRORS{'WARNING'}, 0, "failed to turn $computer_node_name on, rpower status not is on after 3 attempts");
+ else {
+ notify($ERRORS{'WARNING'}, 0, "failed to create missing template file for image $image_name, returning undefined");
return;
}
+ } ## end elsif (!$tmpl_file_exists && $image_files_exist) [ if ($tmpl_file_exists && !$image_files_exist)
- _rpower($computer_node_name, 'on');
-
- # Wait up to 1 minute for the computer power status to be on
- if (wait_for_on($computer_node_name, 1)) {
- last;
- }
-
- $power_status = power_status($computer_node_name);
- } ## end while ($power_status !~ /on/)
-
- notify($ERRORS{'OK'}, 0, "successfully powered on $computer_node_name");
- return 1;
-} ## end sub power_on
-
-#/////////////////////////////////////////////////////////////////////////////
-
-=head2 power_off
-
- Parameters : $computer_node_name (optional)
- Returns :
- Description :
-
-=cut
-
-sub power_off {
- my $argument_1 = shift;
- my $argument_2 = shift;
-
- my $computer_node_name;
-
- # Check if subroutine was called as an object method
- if (ref($argument_1) =~ /xcat/i) {
- my $self = $argument_1;
-
- $computer_node_name = $argument_2;
-
- # Check if computer argument was specified
- # If not, use computer node name in the data object
- if (!$computer_node_name) {
- $computer_node_name = $self->data->get_computer_node_name();
- }
- } ## end if (ref($argument_1) =~ /xcat/i)
- else {
- # Subroutine was not called as an object method, 2 arguments must be specified
- $computer_node_name = $argument_1;
+ # Check if both image files and tmpl file were found and return
+ if ($tmpl_file_exists && $image_files_exist) {
+ notify($ERRORS{'DEBUG'}, 0, "image $image_name exists on this management node");
+ return 1;
}
-
- # Check if computer was determined
- if (!$computer_node_name) {
- notify($ERRORS{'WARNING'}, 0, "computer could not be determined from arguments");
- return;
+ else {
+ notify($ERRORS{'DEBUG'}, 0, "image $image_name does NOT exist on this management node");
+ return 0;
}
- # Turn computer off
- my $power_status = 'unknown';
- my $off_attempts = 0;
- while ($power_status !~ /off/) {
- $off_attempts++;
-
- if ($off_attempts == 3) {
- notify($ERRORS{'WARNING'}, 0, "failed to turn $computer_node_name off, rpower status not is off after 3 attempts");
- return;
- }
-
- # Attempt to run rpower <node> off
- _rpower($computer_node_name, 'off');
-
- # Wait up to 1 minute for the computer power status to be off
- if (wait_for_off($computer_node_name, 1)) {
- last;
- }
-
- $power_status = power_status($computer_node_name);
- } ## end while ($power_status !~ /off/)
-
- notify($ERRORS{'OK'}, 0, "successfully powered off $computer_node_name");
- return 1;
-} ## end sub power_off
+} ## end sub does_image_exist
#/////////////////////////////////////////////////////////////////////////////
-=head2 power_status
+=head2 get_image_size
- Parameters : $computer_node_name (optional)
- Returns :
- Description :
+ Parameters : $image_name (optional)
+ Returns : integer
+ Description : Retrieves the image size in megabytes.
=cut
-sub power_status {
- my $argument_1 = shift;
- my $argument_2 = shift;
-
- my $computer_node_name;
-
- # Check if subroutine was called as an object method
- if (ref($argument_1) =~ /xcat/i) {
- my $self = $argument_1;
-
- $computer_node_name = $argument_2;
-
- # Check if computer argument was specified
- # If not, use computer node name in the data object
- if (!$computer_node_name) {
- $computer_node_name = $self->data->get_computer_node_name();
- }
- } ## end if (ref($argument_1) =~ /xcat/i)
- else {
- # Subroutine was not called as an object method, 2 arguments must be specified
- $computer_node_name = $argument_1;
- }
-
- # Check if computer was determined
- if (!$computer_node_name) {
- notify($ERRORS{'WARNING'}, 0, "computer could not be determined from arguments");
+sub get_image_size {
+ my $self = shift;
+ if (ref($self) !~ /xCAT/i) {
+ notify($ERRORS{'CRITICAL'}, 0, "subroutine was called as a function, it must be called as a class method");
return;
}
- # Call rpower to determine power status
- my $rpower_stat = _rpower($computer_node_name, 'stat');
- notify($ERRORS{'DEBUG'}, 0, "retrieved power status of $computer_node_name: $rpower_stat");
-
- if (!$rpower_stat) {
- notify($ERRORS{'WARNING'}, 0, "failed to determine power status, rpower subroutine returned $rpower_stat");
+ # Either use a passed parameter as the image name or use the one stored in this object's DataStructure
+ my $image_name = shift || $self->data->get_image_name();
+ if (!$image_name) {
+ notify($ERRORS{'CRITICAL'}, 0, "image name could not be determined");
return;
}
- elsif ($rpower_stat =~ /^(on|off)$/i) {
- return lc($1);
- }
- else {
- notify($ERRORS{'WARNING'}, 0, "failed to determine power status, unexpected output returned from rpower: $rpower_stat");
+
+ my $image_repository_path = $self->get_image_repository_directory_path($image_name);
+ if (!$image_repository_path) {
+ notify($ERRORS{'CRITICAL'}, 0, "unable to determine image repository location, returning 0");
return;
}
-} ## end sub power_status
-
-#/////////////////////////////////////////////////////////////////////////////
-
-=head2 wait_for_on
-
- Parameters : Maximum number of minutes to wait (optional)
- Returns : 1 if computer is on, 0 otherwise
- Description :
-
-=cut
-
-sub wait_for_on {
- my $argument_1 = shift;
- my $argument_2 = shift;
- my $argument_3 = shift;
-
- my $self;
- my $computer_node_name;
- my $total_wait_minutes;
-
- # Check if subroutine was called as an object method
- if (ref($argument_1) =~ /xcat/i) {
- $self = $argument_1;
-
- if (defined $argument_3) {
- $computer_node_name = $argument_2;
- $total_wait_minutes = $argument_3;
- }
- else {
- $computer_node_name = $self->data->get_computer_node_name();
- $total_wait_minutes = $argument_2;
- }
- } ## end if (ref($argument_1) =~ /xcat/i)
- else {
- # Subroutine was not called as an object method, 2 arguments must be specified
- $computer_node_name = $argument_1;
- $total_wait_minutes = $argument_2;
- }
+ # Execute the command
+ my $du_command = "du -c $image_repository_path/$image_name* 2>&1";
+ #notify($ERRORS{'DEBUG'}, 0, "du command: $du_command");
+ my $du_output = `$du_command`;
- # Check if computer was determined
- if (!$computer_node_name) {
- notify($ERRORS{'WARNING'}, 0, "computer could not be determined from arguments");
+ # Save the exit status
+ my $du_exit_status = $? >> 8;
+
+ # Make sure du produced output
+ if (!defined($du_output) || length($du_output) == 0) {
+ notify($ERRORS{'WARNING'}, 0, "du did not product any output, du exit status: $du_exit_status");
return;
}
-
- # Make sure total wait minutes was determined
- if (!defined($total_wait_minutes) || $total_wait_minutes !~ /^\d+$/) {
- notify($ERRORS{'DEBUG'}, 0, "total wait minutes argument not specified, using default of 5 minutes");
- $total_wait_minutes = 5;
+
+ # Check if image doesn't exist
+ if ($du_output && $du_output =~ /No such file.*0\s+total/is) {
+ notify($ERRORS{'OK'}, 0, "image does not exist: $image_repository_path/$image_name.*, returning 0");
+ return 0;
+ }
+
+ # Check the du command output
+ my ($size_bytes) = $du_output =~ /(\d+)\s+total/s;
+ if (!defined $size_bytes) {
+ notify($ERRORS{'WARNING'}, 0, "du command did not produce expected output, du exit staus: $du_exit_status, output:\n$du_output");
+ return;
}
- # Looping configuration variables
- # Seconds to wait in between loop attempts
- my $attempt_delay = 15;
- # Total loop attempts made
- # Add 1 to the number of attempts because if you're waiting for x intervals, you check x+1 times including at 0
- my $attempts = ($total_wait_minutes * 4) + 1;
-
- notify($ERRORS{'OK'}, 0, "waiting for $computer_node_name to turn on, maximum of $total_wait_minutes minutes");
-
- # Loop until computer is on
- for (my $attempt = 1; $attempt <= $attempts; $attempt++) {
- if ($attempt > 1) {
- notify($ERRORS{'OK'}, 0, "attempt " . ($attempt - 1) . "/" . ($attempts - 1) . ": $computer_node_name is not on, sleeping for $attempt_delay seconds");
- sleep $attempt_delay;
- }
+ # Calculate the size in MB
+ my $size_mb = int($size_bytes / 1024);
+ notify($ERRORS{'DEBUG'}, 0, "returning image size: $size_mb MB ($size_bytes bytes)");
+ return $size_mb;
- if (power_status($computer_node_name) =~ /on/i) {
- notify($ERRORS{'OK'}, 0, "$computer_node_name is on");
- return 1;
- }
- } ## end for (my $attempt = 1; $attempt <= $attempts...
+} ## end sub get_image_size
- # Calculate how long this waited
- my $total_wait = ($attempts * $attempt_delay);
- notify($ERRORS{'WARNING'}, 0, "$computer_node_name is NOT on after waiting for $total_wait seconds");
- return 0;
-} ## end sub wait_for_on
#/////////////////////////////////////////////////////////////////////////////
-=head2 wait_for_off
+=head2 get_image_repository_directory_path
- Parameters : Maximum number of minutes to wait (optional)
- Returns : 1 if computer is off, 0 otherwise
- Description :
+ Parameters : $image_name, $management_node_identifier (optional)
+ Returns : string
+ Description : Determines the path where the image resides on the management
+ node. Examples:
+ Partimage image: /install/image/x86
+ Kickstart image: /install/centos5/x86_64
=cut
-sub wait_for_off {
- my $argument_1 = shift;
- my $argument_2 = shift;
- my $argument_3 = shift;
-
- my $self;
- my $computer_node_name;
[... 2464 lines stripped ...]