You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@vcl.apache.org by ar...@apache.org on 2013/09/18 17:13:37 UTC

svn commit: r1524445 - in /vcl/trunk/managementnode/lib/VCL: Module/OS/Linux/init/Upstart.pm new.pm utils.pm

Author: arkurth
Date: Wed Sep 18 15:13:37 2013
New Revision: 1524445

URL: http://svn.apache.org/r1524445
Log:
VCL-16
Updated new.pm::wait_for_child_reservations. The parent reservation was not waiting long enough for child reservations to load under some conditions. Added checks in waiting loop to detect any change in the child reservation.lastcheck values or child reservation computerloadlog entires.

Updated utils.pm::get_current_reservation_lastcheck to accept either a single or multiple reservation ID arguments. This is called from new.pm::wait_for_child_reservations in order to detect changes from child reservations.

VCL-681
Updated utils.pm::run_ssh_command to detect 'Connection timed out during banner exchange' errors. These occasionally occur.

Other
Added undefined variable check to Upstart.pm::get_service_names.

Modified:
    vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm
    vcl/trunk/managementnode/lib/VCL/new.pm
    vcl/trunk/managementnode/lib/VCL/utils.pm

Modified: vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm
URL: http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm?rev=1524445&r1=1524444&r2=1524445&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/Module/OS/Linux/init/Upstart.pm Wed Sep 18 15:13:37 2013
@@ -136,6 +136,7 @@ sub get_service_names {
 	my %service_name_mappings_reversed = reverse %$SERVICE_NAME_MAPPINGS;
 	for my $line (@$output) {
 		my ($service_name) = $line =~ /^([^\s\t]+)/;
+		next unless $service_name;
 		$service_name_hash{$service_name} = 1 if $service_name;
 		if (my $service_name_mapping = $service_name_mappings_reversed{$service_name}) {
 			$service_name_hash{$service_name_mapping} = 1;

Modified: vcl/trunk/managementnode/lib/VCL/new.pm
URL: http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/new.pm?rev=1524445&r1=1524444&r2=1524445&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/new.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/new.pm Wed Sep 18 15:13:37 2013
@@ -1170,78 +1170,145 @@ EOF
 
 =head2 wait_for_child_reservations
 
- Parameters  :
- Returns     :
- Description :
+ Parameters  : none
+ Returns     : boolean
+ Description : Called by the parent reservation if a cluster request is being
+               processed. The parent waits for the child reservations to finish
+               loading before proceeding to change the request state to
+               reserved. Child reservations indicate they are finished loading
+               by inserting a 'nodeready' computerloadlog entry.
+               
+               The parent attempts to detect any changes in child reservations
+               by retrieving the reservation.lastupdate values for all children
+               as well was all computerloadlog entries. The parent will wait up
+               to 20 minutes if no change is detected for any children.
 
 =cut
 
 sub wait_for_child_reservations {
 	my $self              = shift;
-	
 	my $request_data      = $self->data->get_request_data();
 	my $request_id        = $self->data->get_request_id();
-	my $reservation_id    = $self->data->get_reservation_id();
 	my @reservation_ids   = $self->data->get_reservation_ids();
+	my $reservation_count = $self->data->get_reservation_count();
 
-	# Set limits on how many attempts to make and how long to wait between attempts
-	# Wait a long time - 20 minutes
-	my $loop_iteration_limit = 40;
-	my $loop_iteration_delay = 30;
-
-	WAITING_LOOP: for (my $loop_iteration = 1; $loop_iteration <= $loop_iteration_limit; $loop_iteration++) {
-		if ($loop_iteration > 1) {
-			notify($ERRORS{'OK'}, 0, "waiting for $loop_iteration_delay seconds");
-			sleep $loop_iteration_delay;
-		}
+	# Set limits on how long to wait
+	my $overall_timeout_minutes = 60;
+	my $nochange_timeout_minutes = 20;
+	my $monitor_delay_seconds = 3;
+	
+	my $overall_timeout_seconds = ($overall_timeout_minutes * 60);
+	my $nochange_timeout_seconds = ($nochange_timeout_minutes * 60);
+	
+	my $monitor_start_time = time;
+	my $last_change_time = $monitor_start_time;
+	my $current_time;
+	my $nochange_timeout_time = ($monitor_start_time + $nochange_timeout_seconds);
+	my $overall_timeout_time = ($monitor_start_time + $overall_timeout_seconds);
+	
+	my $previous_lastcheck_info;
+	my $previous_request_loadstate_names;
+	
+	MONITOR_LOADING: while (($current_time = time) < $nochange_timeout_time && $current_time < $overall_timeout_time) {
+		my $total_elapsed_seconds      = ($current_time - $monitor_start_time);
+		my $nochange_elapsed_seconds   = ($current_time - $last_change_time);
+		my $nochange_remaining_seconds = ($nochange_timeout_time - $current_time);
+		my $overall_remaining_seconds  = ($overall_timeout_time - $current_time);
+		
+		notify($ERRORS{'DEBUG'}, 0, "waiting for child reservations seconds elapsed/until no change timeout: $nochange_elapsed_seconds/$nochange_remaining_seconds, unconditional timeout: $total_elapsed_seconds/$overall_remaining_seconds");
 		
 		# Check if request has been deleted
 		if (is_request_deleted($request_id)) {
 			notify($ERRORS{'OK'}, 0, "request has been deleted, setting computer state to 'available' and exiting");
-			
-			# Update state of computer and exit
-			switch_state($request_data, '', 'available', '', '1');
+			$self->state_exit('', 'available');
 		}
 		
-		# Check if all of the reservations are ready according to the computerloadlog table
-		my $computerloadlog_reservations_ready = reservations_ready($request_id);
-		if ($computerloadlog_reservations_ready) {
-			notify($ERRORS{'OK'}, 0, "ready: all child reservations are ready according to computerloadlog, returning 1");
-			return 1;
-		}
-		elsif (defined $computerloadlog_reservations_ready) {
-			notify($ERRORS{'OK'}, 0, "not ready: all child reservations are NOT ready according to computerloadlog");
-		}
-		else {
-			notify($ERRORS{'WARNING'}, 0, "error occurred checking if child reservations are ready according to computerloadlog");
-		}
-
-		notify($ERRORS{'OK'}, 0, "attempt $loop_iteration/$loop_iteration_limit: waiting for child reservations to become ready");
-
-		RESERVATION_LOOP: foreach my $child_reservation_id (@reservation_ids) {
-			# Don't bother checking this reservation
-			if ($child_reservation_id == $reservation_id) {
-				next RESERVATION_LOOP;
+		my @reservations_ready;
+		my @reservations_not_ready;
+		my @reservations_failed;
+		my @reservations_lastcheck_changed;
+		my @reservations_loadstate_changed;
+		my @reservations_unknown;
+		
+		# Get reservation.lastcheck value for all reservations
+		my $current_lastcheck_info = get_current_reservation_lastcheck(@reservation_ids);
+		$previous_lastcheck_info = $current_lastcheck_info if !$previous_lastcheck_info;
+		
+		# Get computerloadlog info for all reservations
+		my $current_request_loadstate_names = get_request_loadstate_names($request_id);
+		$previous_request_loadstate_names = $current_request_loadstate_names if !$previous_request_loadstate_names;
+		
+		RESERVATION_ID: for my $reservation_id (@reservation_ids) {
+			if (!defined($current_request_loadstate_names->{$reservation_id})) {
+				notify($ERRORS{'WARNING'}, 0, "request loadstate info does not contain a key for reservation $reservation_id:\n" . format_data($current_request_loadstate_names));
+				next RESERVATION_ID;
 			}
 			
-			# Get the computer ID of the child reservation
-			my $child_computer_id = $request_data->{reservation}{$child_reservation_id}{computer}{id};
-			notify($ERRORS{'DEBUG'}, 0, "checking reservation $child_reservation_id: computer ID=$child_computer_id");
+			my @previous_reservation_loadstate_names = @{$previous_request_loadstate_names->{$reservation_id}};
+			my @current_reservation_loadstate_names = @{$current_request_loadstate_names->{$reservation_id}};
 			
-			# Get the child reservation's current computer state
-			my $child_computer_state = get_computer_current_state_name($child_computer_id);
-			notify($ERRORS{'DEBUG'}, 0, "reservation $child_reservation_id: computer state=$child_computer_state");
+			if (grep {$_ eq 'failed'} @current_reservation_loadstate_names) {
+				push @reservations_failed, $reservation_id;
+				next RESERVATION_ID;
+			}
+			elsif (grep {$_ eq 'nodeready'} @current_reservation_loadstate_names) {
+				push @reservations_ready, $reservation_id;
+				next RESERVATION_ID;
+			}
+			elsif (grep {$_ eq 'begin'} @current_reservation_loadstate_names) {
+				push @reservations_not_ready, $reservation_id;
+			}
+			else {
+				push @reservations_unknown, $reservation_id;
+			}
 			
-			if ($child_computer_state =~ /^(failed|maintenance)$/) {
-				notify($ERRORS{'WARNING'}, 0, "aborting, child reservation $child_reservation_id computer state: $child_computer_state");
-				return;
+			if ($previous_lastcheck_info->{$reservation_id} ne $current_lastcheck_info->{$reservation_id}) {
+				push @reservations_lastcheck_changed, $reservation_id;
 			}
-		} ## end foreach my $child_reservation_id (@reservation_ids)
-	} ## end for (my $loop_iteration = 1; $loop_iteration...
+			if (scalar(@previous_reservation_loadstate_names) != scalar(@current_reservation_loadstate_names)) {
+				push @reservations_loadstate_changed, $reservation_id;
+			}
+		}
+		
+		my $ready_count     = scalar @reservations_ready;
+		my $not_ready_count = scalar @reservations_not_ready;
+		my $unknown_count   = scalar @reservations_unknown;
+		my $failed_count    = scalar @reservations_failed;
+		
+		notify($ERRORS{'DEBUG'}, 0, "current status of reservations:\n" .
+			"ready     : $ready_count (" . join(', ', @reservations_ready) . ")\n" .
+			"not ready : $not_ready_count (" . join(', ', @reservations_not_ready) . ")\n" .
+			"unknown   : $unknown_count (" . join(', ', @reservations_unknown) . ")\n" .
+			"failed    : $failed_count (" . join(', ', @reservations_failed) . ")"
+		);
+		
+		if ($failed_count) {
+			$self->state_exit('failed', 'available');
+		}
+		elsif ($ready_count == $reservation_count) {
+			notify($ERRORS{'OK'}, 0, "all reservations are ready");
+			return 1;
+		}
+		
+		
+		# If any changes were detected, reset the nochange timeout
+		if (@reservations_lastcheck_changed || @reservations_loadstate_changed) {
+			notify($ERRORS{'DEBUG'}, 0, "resetting no change timeout, detected reservation change:\n" .
+				"reservation lastcheck changed: (" . join(', ', @reservations_lastcheck_changed) . ")\n" .
+				"reservation loadstate changed: (" . join(', ', @reservations_loadstate_changed) . ")"
+			);
+			$last_change_time = $current_time;
+			$nochange_timeout_time = ($last_change_time + $nochange_timeout_seconds);
+		}
+		
+		$previous_request_loadstate_names = $current_request_loadstate_names;
+		$previous_lastcheck_info = $current_lastcheck_info;
+		sleep $monitor_delay_seconds;
+	}
 
 	# If out of main loop, waited maximum amount of time
-	notify($ERRORS{'WARNING'}, 0, "waited maximum amount of time for child reservations to become ready, returning 0");
-	return 0;
+	notify($ERRORS{'WARNING'}, 0, "waited maximum amount of time for all reservations to become ready");
+	return;
 
 } ## end sub wait_for_child_reservations
 

Modified: vcl/trunk/managementnode/lib/VCL/utils.pm
URL: http://svn.apache.org/viewvc/vcl/trunk/managementnode/lib/VCL/utils.pm?rev=1524445&r1=1524444&r2=1524445&view=diff
==============================================================================
--- vcl/trunk/managementnode/lib/VCL/utils.pm (original)
+++ vcl/trunk/managementnode/lib/VCL/utils.pm Wed Sep 18 15:13:37 2013
@@ -1577,7 +1577,7 @@ EOF
 				return 1;
 			}
 			else {
-				notify($ERRORS{'WARNING'}, $LOGFILE, "failed to update request $request_id state to: $state_name/$laststate_name, current state: $current_state_name/$current_laststate_name");
+				notify($ERRORS{'WARNING'}, $LOGFILE, "unable to update request $request_id state to: $state_name/$laststate_name, current state: $current_state_name/$current_laststate_name");
 				return;
 			}
 		}
@@ -2091,42 +2091,57 @@ sub get_reservation_accounts {
 
 =head2 get_current_reservation_lastcheck
 
- Parameters  : $reservation_id
+ Parameters  : @reservation_ids
  Returns     : string
  Description : Retrieves the current value of reservation.lastcheck from the
-               database.
+               database. Either a single reservation ID or multiple reservation
+               IDs may be passed as the argument. If a single reservation ID is
+               passed, a string is returned containing the reservation.lastcheck
+               value. If multiple reservation IDs are passed, a hash reference
+               is returned with the keys set to the reservation IDs.
 
 =cut
 
 sub get_current_reservation_lastcheck {
-	my ($reservation_id) = @_;
+	my @reservation_ids = @_;
 
 	# Check the passed parameter
-	if (!defined($reservation_id)) {
-		notify($ERRORS{'WARNING'}, 0, "reservation ID was not specified");
+	if (!@reservation_ids) {
+		notify($ERRORS{'WARNING'}, 0, "reservation ID argument was not specified");
 		return;
 	}
+	
+	my $reservation_id_string = join(', ', @reservation_ids);
 
 	# Create the select statement
 	my $select_statement = <<EOF;
 SELECT
+reservation.id,
 reservation.lastcheck
 FROM
 reservation
 WHERE
-reservation.id = $reservation_id
+reservation.id IN ($reservation_id_string)
 EOF
 
 	my @selected_rows = database_select($select_statement);
 
 	# Check to make sure 1 row was returned
 	if (!@selected_rows) {
-		notify($ERRORS{'WARNING'}, 0, "failed to retrieve current reservation lastcheck for reservation $reservation_id");
+		notify($ERRORS{'WARNING'}, 0, "failed to retrieve current reservation lastcheck for reservations: $reservation_id_string");
 		return;
 	}
-	
-	my $row = $selected_rows[0];
-	return $row->{lastcheck};
+	elsif (scalar(@selected_rows) == 1) {
+		my $row = $selected_rows[0];
+		return $row->{lastcheck};
+	}
+	else {
+		my $reservation_lastcheck_info = {};
+		for my $row (@selected_rows) {
+			$reservation_lastcheck_info->{$row->{id}} = $row->{lastcheck};
+		}
+		return $reservation_lastcheck_info;
+	}
 }
 
 #/////////////////////////////////////////////////////////////////////////////
@@ -4840,7 +4855,10 @@ sub run_ssh_command {
 	my $ssh_output_formatted = '';
 	my $attempts = 0;
 	my $exit_status = 255;
-
+	
+	my $banner_exchange_error_count = 0;
+	my $banner_exchange_error_limit = 3;
+	
 	# Make multiple attempts if failure occurs
 	while ($attempts < $max_attempts) {
 		$attempts++;
@@ -4851,10 +4869,10 @@ sub run_ssh_command {
 			notify($ERRORS{'DEBUG'}, 0, "sleeping for $delay_seconds seconds before making next SSH attempt") if $output_level;
 			sleep $delay_seconds;
 		}
-
+		
 		## Add -v (verbose) argument to command if this is the 2nd attempt
 		#$ssh_command =~ s/$ssh_path/$ssh_path -v/ if $attempts == 2;
-
+		
 		# Print the SSH command, only display the attempt # if > 1
 		if ($attempts == 1) {
 			notify($ERRORS{'DEBUG'}, 0, "executing SSH command on $node: '$command'") if $output_level;
@@ -4963,6 +4981,21 @@ sub run_ssh_command {
 			notify($ERRORS{'WARNING'}, 0, "attempt $attempts/$max_attempts: failed to execute SSH command on $node: '$command', exit status: $exit_status, output:\n$ssh_output_formatted") if $output_level;
 			next;
 		}
+		elsif ($ssh_output_formatted =~ /(Connection timed out during banner exchange)/i) {
+			$banner_exchange_error_count++;
+			if ($banner_exchange_error_count >= $banner_exchange_error_limit) {
+				notify($ERRORS{'WARNING'}, 0, "failed to execute SSH command on $node, encountered $banner_exchange_error_count banner exchange errors");
+				return ();
+			}
+			else {
+				# Don't count against attempt limit
+				$attempts--;
+				my $banner_exchange_delay_seconds = ($banner_exchange_error_count * 2);
+				notify($ERRORS{'DEBUG'}, 0, "encountered banner exchange error on $node, sleeping for $banner_exchange_delay_seconds seconds, command:\n$command\noutput:\n$ssh_output") if $output_level;
+				sleep $banner_exchange_delay_seconds;
+				next;
+			}
+		}
 		elsif ($exit_status == 255 && $ssh_command !~ /(vmware-cmd|vim-cmd|vmkfstools|vmrun)/i) {
 			notify($ERRORS{'WARNING'}, 0, "attempt $attempts/$max_attempts: failed to execute SSH command on $node: '$command', exit status: $exit_status, SSH exits with the exit status of the remote command or with 255 if an error occurred, output:\n$ssh_output_formatted") if $output_level;
 			next;
@@ -8729,7 +8762,7 @@ EOF
 		$computerloadlog_string .= "\n";
 	}
 	
-	notify($ERRORS{'DEBUG'}, 0, "retrieved computerloadstate names for request $request_id:\n$computerloadlog_string");
+	#notify($ERRORS{'DEBUG'}, 0, "retrieved computerloadstate names for request $request_id:\n$computerloadlog_string");
 	return $computerloadlog_info;
 }