You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@httpd.apache.org by ha...@hyperreal.org on 1998/01/07 18:29:02 UTC
cvs commit: apachen/src/support logresolve.pl

hartill     98/01/07 09:29:01

  Added:       src/support logresolve.pl
  Log:
  Adding logresolve.pl  -  a Perl version of logresolve.c that spawns lots
  of low overhead processes to map IP numbers into hostnames concurrently
  instead of one by one at a snails pace (logresolve.c).
  
  Nobody said it was shite so here it is.
  
  Submitted by: Rob Hartill
  
  Revision  Changes    Path
  1.1                  apachen/src/support/logresolve.pl
  
  Index: logresolve.pl
  ===================================================================
  #!/usr/local/bin/perl
  
  # ====================================================================
  # Copyright (c) 1995-1998 The Apache Group.  All rights reserved.
  #
  # Redistribution and use in source and binary forms, with or without
  # modification, are permitted provided that the following conditions
  # are met:
  #
  # 1. Redistributions of source code must retain the above copyright
  #    notice, this list of conditions and the following disclaimer. 
  #
  # 2. Redistributions in binary form must reproduce the above copyright
  #    notice, this list of conditions and the following disclaimer in
  #    the documentation and/or other materials provided with the
  #    distribution.
  #
  # 3. All advertising materials mentioning features or use of this
  #    software must display the following acknowledgment:
  #    "This product includes software developed by the Apache Group
  #    for use in the Apache HTTP server project (http://www.apache.org/)."
  #
  # 4. The names "Apache Server" and "Apache Group" must not be used to
  #    endorse or promote products derived from this software without
  #    prior written permission. For written permission, please contact
  #    apache@apache.org.
  #
  # 5. Redistributions of any form whatsoever must retain the following
  #    acknowledgment:
  #    "This product includes software developed by the Apache Group
  #    for use in the Apache HTTP server project (http://www.apache.org/)."
  #
  # THIS SOFTWARE IS PROVIDED BY THE APACHE GROUP ``AS IS'' AND ANY
  # EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
  # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
  # PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE APACHE GROUP OR
  # ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  # SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
  # NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
  # LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  # HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
  # STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
  # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
  # OF THE POSSIBILITY OF SUCH DAMAGE.
  # ====================================================================
  #
  # This software consists of voluntary contributions made by many
  # individuals on behalf of the Apache Group and was originally based
  # on public domain software written at the National Center for
  # Supercomputing Applications, University of Illinois, Urbana-Champaign.
  # For more information on the Apache Group and the Apache HTTP server
  # project, please see <http://www.apache.org/>.
  
  # logresolve.pl
  #
  # v 1.0 by robh @ imdb.com
  # 
  # usage: logresolve.pl <infile >outfile
  #
  # input = Apache/NCSA/.. logfile with IP numbers at start of lines
  # output = same logfile with IP addresses resolved to hostnames where
  #  name lookups succeeded.
  #
  # this differs from the C based 'logresolve' in that this script
  # spawns a number ($CHILDREN) of subprocesses to resolve addresses
  # concurrently and sets a short timeout ($TIMEOUT) for each lookup in
  # order to keep things moving quickly.
  #
  # the parent process handles caching of IP->hostnames using a Perl hash
  # it also avoids sending the same IP to multiple child processes to be
  # resolved multiple times concurrently.
  #
  # Depending on the settings of $CHILDREN and $TIMEOUT you should see
  # significant reductions in the overall time taken to resolve your
  # logfiles. With $CHILDREN=40 and $TIMEOUT=5 I've seen 200,000 - 300,000
  # logfile lines processed per hour compared to ~45,000 per hour
  # with 'logresolve'.
  #
  # I haven't yet seen any noticable reduction in the percentage of IPs
  # that fail to get resolved. Your mileage will no doubt vary. 5s is long
  # enough to wait IMO.
  
  $|=1;
  
  use FileHandle;
  use Socket;
  
  use strict;
  no strict 'refs';
  
  use vars qw($AF_UNIX $SOCK_STREAM $PROTOCOL);
  ($AF_UNIX, $SOCK_STREAM, $PROTOCOL) = (1, 1, 0);
  
  my $CHILDREN = 40;
  my $TIMEOUT  = 5;
  
  my $filename;
  my %hash = ();
  my $parent = $$;
  
  my @children = ();
  for (my $child = 1; $child <=$CHILDREN; $child++) {
  	my $f = fork();	
  	if (!$f) {
  		$filename = "./.socket.$parent.$child";
  		if (-e $filename) { unlink($filename) || warn "$filename .. $!\n";}
  		&child($child);
  		exit(0);
  	}
  	push(@children, $f);
  }
  
  &parent;
  &cleanup;
  
  ## remove all temporary files before shutting down
  sub cleanup {
  	 # die kiddies, die
  	kill(15, @children);
  	for (my $child = 1; $child <=$CHILDREN; $child++) {
  		if (-e "./.socket.$parent.$child") {
  			unlink("./.socket.$parent.$child")
  				|| warn ".socket.$parent.$child $!";
  		}
  	}
  }
  	
  sub parent {
  	# Trap some possible signals to trigger temp file cleanup
  	$SIG{'KILL'} = $SIG{'INT'} = $SIG{'PIPE'} = \&cleanup;
  
  	my %CHILDSOCK;
  	my $filename;
   
  	 ## fork child processes. Each child will create a socket connection
  	 ## to this parent and use an unique temp filename to do so.
  	for (my $child = 1; $child <=$CHILDREN; $child++) {
  		$CHILDSOCK{$child}= FileHandle->new;
  
  		if (!socket($CHILDSOCK{$child}, $AF_UNIX, $SOCK_STREAM, $PROTOCOL)) {
  			warn "parent socket to child failed $!";
  		}
  		$filename = "./.socket.$parent.$child";
  		my $response;
  		do {
  			$response = connect($CHILDSOCK{$child}, "$filename");
  			if ($response != 1) {
  				sleep(1);
  			}                       
  		} while ($response != 1);
  		$CHILDSOCK{$child}->autoflush;
  	}
  	## All child processes should now be ready or at worst warming up 
  
  	my (@buffer, $child, $ip, $rest, $hostname, $response);
  	 ## read the logfile lines from STDIN
  	while(<STDIN>) {
  		@buffer = ();	# empty the logfile line buffer array.
  		$child = 1;		# children are numbered 1..N, start with #1
  
  		# while we have a child to talk to and data to give it..
  		do {
  			push(@buffer, $_);					# buffer the line
  			($ip, $rest) = split(/ /, $_, 2);	# separate IP form rest
  
  			unless ($hash{$ip}) {				# resolve if unseen IP
  				$CHILDSOCK{$child}->print("$ip\n"); # pass IP to next child
  				$hash{$ip} = $ip;				# don't look it up again.
  				$child++;
  			}
  		} while (($child < ($CHILDREN-1)) and ($_ = <STDIN>));
  
  		 ## now poll each child for a response
  		while (--$child > 0) { 
  			$response = $CHILDSOCK{$child}->getline;
  			chomp($response);
  			 # child sends us back both the IP and HOSTNAME, no need for us
  			 # to remember what child received any given IP, and no worries
  			 # what order we talk to the children
  			($ip, $hostname) = split(/\|/, $response, 2);
  			$hash{$ip} = $hostname;
  		}
  
  		 # resolve all the logfiles lines held in the log buffer array..
  		for (my $line = 0; $line <=$#buffer; $line++) {
  			 # get next buffered line
  			($ip, $rest) = split(/ /, $buffer[$line], 2);
  			 # separate IP from rest and replace with cached hostname
  			printf STDOUT ("%s %s", $hash{$ip}, $rest);
  		}
  	}
  }
  
  ########################################
  
  sub child {
  	 # arg = numeric ID - how the parent refers to me
  	my $me = shift;
  
  	 # add trap for alarm signals.
  	$SIG{'ALRM'} = sub { die "alarmed"; };
  
  	 # create a socket to communicate with parent
  	socket(INBOUND, $AF_UNIX, $SOCK_STREAM, $PROTOCOL)
  		|| die "Error with Socket: !$\n";
  	$filename = "./.socket.$parent.$me";
  	bind(INBOUND, $filename) || die "Error Binding $filename: $!\n";
  	listen(INBOUND, 5) || die "Error Listening: $!\n";
  
  	my ($ip, $send_back);
  	my $talk = FileHandle->new;
  
  	 # accept a connection from the parent process. We only ever have
  	 # have one connection where we exchange 1 line of info with the
  	 # parent.. 1 line in (IP address), 1 line out (IP + hostname).
  	accept($talk, INBOUND) || die "Error Accepting: $!\n";
  	 # disable I/O buffering just in case
  	$talk->autoflush;
  	 # while the parent keeps sending data, we keep responding..
  	while(($ip = $talk->getline)) {
  		chomp($ip);
  		 # resolve the IP if time permits and send back what we found..
  		$send_back = sprintf("%s|%s", $ip, &nslookup($ip));
  		$talk->print($send_back."\n");
  	}
  }
  
  # perform a time restricted hostname lookup.
  sub nslookup {
  	 # get the IP as an arg
  	my $ip = shift;
  	my $hostname = undef;
  
  	 # do the hostname lookup inside an eval. The eval will use the
  	 # already configured SIGnal handler and drop out of the {} block
  	 # regardless of whether the alarm occured or not.
  	eval {
  		alarm($TIMEOUT);
  		$hostname = gethostbyaddr(gethostbyname($ip), AF_INET);
  		alarm(0);
  	};
  	if ($@ =~ /alarm/) {
  		 # useful for debugging perhaps..
  		# print "alarming, isn't it? ($ip)";
  	}
  
  	 # return the hostname or the IP address itself if there is no hostname
  	$hostname ne "" ? $hostname : $ip;
  }