You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@httpd.apache.org by Dirk-Willem van Gulik <di...@covalent.net> on 2001/01/19 07:20:00 UTC

cvs commit: apache-search-site index.cgi search.html index.new.cgi

Return-Path: <di...@apache.org>
Received: (qmail 27144 invoked by uid 1011); 19 Jan 2001 05:52:11 -0000
Date: 19 Jan 2001 05:52:11 -0000
Message-ID: <20...@apache.org>
From: dirkx@apache.org
To: apache-search-site-cvs@apache.org
Subject: cvs commit: apache-search-site index.cgi search.html index.new.cgi

dirkx       01/01/18 21:52:11

  Modified:    .        index.cgi search.html
  Removed:     .        index.new.cgi
  Log:
  Making the new search interface visible; whilst making
  sure that users of the old one are forwarded to a query
  page during the period when the mirrors are not in
  sync with the main apache site.
  
  Revision  Changes    Path
  1.2       +335 -1    apache-search-site/index.cgi
  
  Index: index.cgi
  ===================================================================
  RCS file: /home/cvs/apache-search-site/index.cgi,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -u -r1.1 -r1.2
  --- index.cgi	2001/01/19 05:40:49	1.1
  +++ index.cgi	2001/01/19 05:52:11	1.2
  @@ -1,8 +1,342 @@
   #!/usr/local/bin/perl
   #
  -print "Status: 301 Has moved
  +# Simple Searches using swish-e.
  +#
  +# (c) 1993 dirkx@webweaving.org 
  +#     Under the Apache licence. 
  +#
  +# Simple CGI script to do searching.
  +#
  +# When run from the command line; some simple
  +# interactive options allow for easy creating of
  +# a small config file and an index file.
  +#
  +# In a production environment you might want
  +# to do better.
  +#
  +# Guess my location (_dir). Of course this is best
  +# set manually by any sensible admin.
  +#
  +# ($_dir=$ENV{SCRIPT_FILENAME} or 
  +# 	($_=`pwd`,chop,($_dir=$_.'/'.$0)=~s!/\./!/!g));
  +# $_dir =~ s![^/]+$!!; # Unix Centric ! 
  +#
  +$_dir='/da1/www/search.apache.org';
  +
  +# Admin's email address quoted at the bottom of
  +# all pages.
  +#
  +# $admin=$ENV{ SERVER_ADMIN } or 'webmaster@this.host';
  +$admin='webmaster@apache.org';
  +
  +# Location of the swish binary. Again a sensible admin
  +# would nail these values down.
  +#
  +$swish = '/usr/local/bin/swish-e';
  +
  +# # No user serviceable parts beyond here.
  +#
  +# Work out whether the request came from a mirror's search form.
  +#
  +# The Referer header is supplied by the client and $mirror_root is
  +# interpolated into the HTML of every page we print, so a root is
  +# only accepted when it looks like a plain http(s) URL. Anything
  +# else keeps the defaults below (i.e. we behave as on the master).
  +#
  +$on_master=1;
  +$mirror_root = 'http://www.apache.org';
  +
  +if (defined $ENV{HTTP_REFERER}
  +	and $ENV{HTTP_REFERER} =~ m!^(https?://[\w.:/~%+-]+)/search\.html$!) {
  +	$mirror_root = $1;
  +	# Master only when the host itself is apache.org or below it;
  +	# the dots are literal and the match is tied to the host part.
  +	$on_master = ($mirror_root =~ m!^https?://(?:[\w-]+\.)*apache\.org(?:[:/]|$)!i) ? 1 : 0;
  +	};
  +
  +# Searchable collections: name of the collection (its index lives
  +# in <name>.idx and the search form sends it as a 'what' value)
  +# => label shown to the user.
  +#
  +%db = (
  +	'apr.apache.org' => 'Apache Runtime portability',
  +	'bugs.apache.org' => 'Apache bugs database',
  +	'dev.apache.org' => 'Developers site',
  +	'httpd.apache.org' => 'The apache web server',
  +	'jakarta.apache.org' => 'Jakarta',
  +	'java.apache.org' => 'Apache/Java',
  +	'perl.apache.org' => 'Apache/Perl',
  +	'tcl.apache.org' => 'Apache/Tcl',
  +	'www.apache.org' => 'General Foundation site',
  +	'xml.apache.org' => 'XML'
  +);
  +
  +# URL rewrite rules, keyed like %db. A rule is handed the URL found
  +# in the index and returns the pair (URL to link to, URL at the
  +# origin site); an empty second element means there is no separate
  +# origin link. '_' is the fallback rule.
  +#
  +%map = (
  +	_ => sub {
  +		my $x = $_[0];
  +		$x =~ s!http://www\.apache\.org/!$mirror_root/!;
  +		return ($x, $_[0]);
  +		},
  +
  +	'perl.apache.org' => sub {
  +		my $x = $_[0];
  +		$x =~ s!http://perl\.apache\.org/!$mirror_root/perl/!
  +			unless $on_master;
  +		return ($x, $_[0]);
  +		},
  +
  +	# Always link to the full report on bugs.apache.org, using the
  +	# number the indexed URL ends in. A URL without such a number
  +	# is passed through untouched instead of reusing a stale $1.
  +	'bugs.apache.org' => sub {
  +		return ($_[0], '') unless $_[0] =~ m/(\d+)$/;
  +		return ("http://bugs.apache.org/index/full/".$1, '');
  +		}
  +	);
  +
  +# This script is only meant to be run by a web server. Note that
  +# search_bang does not return: the page trailer it prints ends
  +# with an exit.
  +#
  +&search_bang("Can only do CGI")
  +	unless $ENV{GATEWAY_INTERFACE} =~ m!CGI/\d+\.\d+!i;
  +
  +# Fetch the raw, still URL-encoded, form data.
  +#
  +$request_method = $ENV{'REQUEST_METHOD'};
  +
  +if ($request_method eq 'GET') {
  +   $query_string = $ENV{'QUERY_STRING'};
  +} elsif ($request_method eq "POST") {
  +   read (STDIN, $query_string, $ENV{'CONTENT_LENGTH'});
  +} else {
  +   &search_bang("Forms must use either GET or POST.");
  +   exit 1;
  +}
  +
  +&search_bang("Hmm, I am just a bit confused as to how you got here. There seems to be \
  +	no form input information telling me what to search for.")
  +	unless length($query_string)>0;
  +
  +# Decode the key=value pairs into %form. Values of a key that occurs
  +# more than once (several ticked checkboxes) end up in one string,
  +# each value followed by a space.
  +#
  +foreach (split(/&/,$query_string)) {
  +   # Split on the first '=' only: '=' is one of the characters let
  +   # through below, so it may be part of the value and must not
  +   # cut the value short.
  +   ($key,  $val) = map {
  +	tr/+/ /;
  +	s/%([\dA-F]{2})/pack("C",hex($1))/iegs;
  +	# Zap anything which is not 'normal' and might
  +	# cause havoc when we pass it on to swish later.
  +	s/[^a-zA-Z0-9\_\*\.\@\(\)\=\" \-]+/ /gs;
  +	$_;
  +	} split /=/, $_, 2;
  +   $form{$key} .= $val.' ';
  +   }
  +
  +if ($form{version} < 2) {	# no 'version' field, or one below 2: send the client to the search form
  +	print "Status: 301 Has moved
   Content-type: text/html
   Location: http://search.apache.org/search.html
   
   This page has moved as part of the SE upgrade
   ";
  +	exit 0;	# the redirect is the complete response
  +};
  +
  +
  +# Normalise the search phrase (every run of whitespace becomes one
  +# space) and the requested maximum number of hits.
  +#
  +$query = $form{'keyword'};
  +$query =~ s/\s+/ /g;
  +$max = int($form{'results'}) + 0;
  +
  +unless ($query =~ m/\w+/) {
  +	&search_bang("You must enter a keyword or phrase in one or more of the text boxes".$query);
  +	exit 0;
  +}
  +
  +# Of the requested collections keep the ones we know about; when
  +# that leaves nothing, search all of them.
  +#
  +@dbs = grep { $db{$_} } split(/\s+/, lc $form{what});
  +@dbs = keys %db
  +	unless @dbs;
  +
  +# Filled in while the index files are looked up.
  +$age  = 0;
  +$list = '';
  +
  +# Build the ' -f <index file>' options for swish and add up the
  +# ages (in days, as given by -M) of the index files used.
  +# Collections whose index file cannot be read are left out and
  +# mentioned in $errors.
  +#
  +foreach(@dbs) {
  +	my $file = $_dir . '/' . $_.'.idx';
  +
  +	if (! -r $file) {
  +		# Name the collection being skipped ($_); $index is not
  +		# set until the swish output is read further down.
  +		$errors .= "<p align='CENTER'> Skipping $_ - temporarily not available</p>";
  +		next;
  +	};
  +
  +	$list .= ' -f '.$file;
  +	$age += -M $file;
  +};
  +
  +# Run swish on the user's query and read its output through SWISH.
  +#
  +# The param decoding loop lets '"', '*', '(' and ')' through, all of
  +# which mean something to a shell, so no shell is involved here:
  +# fork with an implicit pipe and exec swish with an argument list.
  +# The query then always reaches swish as the single -w argument.
  +#
  +my $swish_pid = open(SWISH, "-|");
  +if (!defined $swish_pid) {
  +	&search_error("Configuration/Resource problem");
  +	exit 1;
  +	};
  +if ($swish_pid == 0) {
  +	# Child: STDOUT is the pipe to the parent. $list only holds
  +	# ' -f <file>' pairs built from our own settings, so it can be
  +	# split on whitespace.
  +	exec($swish, '-w', $query, '-m', $max, split(' ', $list));
  +	exit 127;	# exec failed; shows up as $? when the parent closes SWISH
  +	};
  +
  +while (<SWISH>) {	# turn the swish output, line by line, into @results and $errors
  +   chop;	# drop the last character (the newline)
  +   $index = $1 if m/^#\s*Name:\s*(.*)\s*$/i;	# a '# Name:' line: remember it in $index for the lines read after it
  +   next if m/^#/;	# no other '#' line is used
  +
  +   if ($_ eq "err: no results") {
  +	next; # nothing to report for this one
  +	}
  +   elsif ($_ eq "err: a word is too common") {
  +	$errors .= "<p align='CENTER'>One of your search terms accurs too often in $index to be useful; skipping.</p>";
  +	next;
  +	}
  +   elsif ($_ eq "err: all search words too common to be useful") { # reported on the page, not fatal
  +	$errors .= "<p align='CENTER'>All of your search term(s) accurs too often in $index to be useful; skipping.</p>";
  +	next;
  +	}
  +   elsif ($_ eq "err: could not open index file") {
  +	&search_error("Could not open SWISH Index File $list/$index ");
  +	exit 1;
  +	}
  +   elsif (m/^\s*err:\s+(.*)\s*$/) {	# any other 'err:' line ends the search with that message
  +	&search_bang("Search failed: $1");
  +	exit 0;
  +	}
  +
  +   next if /^\D/;	# only lines starting with a digit are treated as hits
  +
  +   ($stringone, $title, $filesize) = split /\"/ or next;	# text before / inside / after the first pair of double quotes
  +   ($rank, $url) = split(/ /, $stringone) or next;	# first two space separated words before the title
  +   $title=~s/^[:\s]+//;	# strip leading colons and whitespace
  +   if ($title =~ m/^\s*$/) {
  +	$title ='no title';
  +	$title = $1 if $url =~ m/([^\/]+)\s*$/;	# prefer the last path component of the URL
  +	};
  +   push @results, {
  +	rank => $rank,
  +	db =>  $db{ $index },	# the label from %db, not the name in $index
  +	title => $title,
  +	url => $url
  +	};
  +}
  +close(SWISH);
  +
  +if ($?) {	# status left by close(SWISH) is non-zero
  +	&search_error("Configuration/Resource problem");
  +        exit 1;
  +        };
  +
  +# Turn the summed ages of the index files (in days) into an average
  +# in hours and word it for the page trailer.
  +#
  +# The average is taken over the index files actually handed to
  +# swish (one ' -f ' per file in $list), not over the number of
  +# requested collections plus one. The '|| 1' only guards against
  +# dividing by zero when no index file was readable.
  +#
  +my $indexed = () = $list =~ m/ -f /g;
  +$age *= 24 / ($indexed || 1);
  +if ($age<2) {
  +	$age='about an hour';
  +	}
  +elsif ($age<48) {
  +	$age=int(0.5+$age).' hours';
  +	}
  +else {
  +	$age = int(0.5+$age/24) . ' days';
  +	};
  +$age="The databases are about $age old.";
  +
  +unless (@results) {
  +	&search_bang("There were no items that matched your search request.".$errors);
  +	exit 1;
  +}
  +
  +# Number of hits to list: all of them, or the requested maximum
  +# when one was given and we found at least that many.
  +#
  +$count = scalar @results;
  +if ($max > 0 and $max <= $count) {
  +	$count = $max;
  +}
  +
  +# The hits of the individual indexes are collated here, so they
  +# have to be ordered again (best rank first). There can be up to
  +# N x $max of them, which is why the listing below is cut off at
  +# $count once more.
  +#
  +@results = sort { $b->{rank} <=> $a->{rank} } @results;
  +
  +&html_header(200,"Apache Sites: Search Results");
  +
  +if ($count == $max) {
  +	$upto = '(up to the maximum) ';
  +}
  +
  +print <<"END_OF_HEADING";
  +<P ALIGN=CENTER>
  +Your Search for <strong>$query</strong>, returned $upto$count Items;
  +listed in order of computed relevance<BR>
  +</P>
  +<HR>
  +<UL>
  +END_OF_HEADING
  +# List the first $count hits. Each URL goes through the rewrite
  +# rule of the collection it was found in ('_' when the collection
  +# has no rule of its own); the rule yields the URL to link to and
  +# the URL at the origin site.
  +#
  +# A hit only carries the label of its collection, while %map is
  +# keyed by collection name, so map the label back to the name.
  +#
  +my %site_of = reverse %db;
  +
  +foreach $r (@results[ 0 .. $count-1 ]) {
  +   my $site = $site_of{ $r->{db} };
  +   $rule = (defined $site and exists $map{ $site }) ? $site : '_';
  +   ($url,$master) = &{ $map{ $rule } }($r->{url});
  +   print qq|<li><A HREF="$url">$r->{title}</A>|;
  +   print " (on your Mirror)" unless $master eq '' or $on_master;
  +   print "<br><dir>";
  +   # $#dbs is 0 (false) when exactly one collection was searched.
  +   print "From the <b>$r->{db}</b> collection: " if $#dbs;
  +   print $url;
  +   print qq| <a href="$master">(or at the origin site)</a>|
  +	unless $master eq '' or $on_master;
  +   # The rank of this hit; the global $rank still holds the rank
  +   # of the last line parsed from the swish output.
  +   print " <i>Ranking $r->{rank}</i></dir>\n";
  +}
  +
  +print "</UL>$errors\n";
  +&html_trailer;
  +
  +exit 0;
  +
  +
  +# html_header(STATUS, TITLE)
  +#
  +# Prints the CGI response headers (Status and Content-type) and the
  +# opening part of the page. TITLE is used for the <TITLE> as well
  +# as for the <H1> heading; the logo is taken from $mirror_root.
  +
  +sub html_header {
  +	my $code    = shift;
  +	my $heading = shift;
  +
  +	my @page_top = (
  +		"Status: $code",
  +		"Content-type: text/html",
  +		"",
  +		"<HTML>",
  +		"<HEAD>",
  +		"<TITLE>$heading</TITLE>",
  +		"</HEAD>",
  +		"<BODY BGCOLOR=WHITE>",
  +		"<CENTER>",
  +		qq|<IMG SRC="$mirror_root/images/apache_sub.gif" ALT="">|,
  +		"<H1 ALIGN=CENTER>$heading</H1>",
  +		"</CENTER>",
  +	);
  +
  +	# Debugging aid, normally left disabled:
  +	# foreach (keys(%form)) { print "<li>$_ $form{$_}"; };
  +
  +	print join("\n", @page_top) . "\n";
  +}
  +
  +# html_trailer()
  +#
  +# Prints the closing part of every page: links back to the main
  +# site and to the search form, the $age sentence and the swish
  +# credit. Does not return; the script ends here.
  +
  +sub html_trailer {
  +	my $footer =
  +		  qq|<HR>\n|
  +		. qq|<i>Back to the <a href="http://www.apache.org">Main Apache site</a>. Or\n|
  +		. qq|back to the <a href="$mirror_root/search.html">Search Form</a>.\n|
  +		. qq|$age\n|
  +		. qq|This code uses <a href="http://sunsite.berkeley.edu/SWISH-E/">Swish-e</a>.</i>\n|
  +		. qq|</BODY>\n|
  +		. qq|</HTML>\n|;
  +
  +	print $footer;
  +
  +	# Several callers rely on the script stopping once the page
  +	# is complete.
  +	exit;
  +}
  +
  +# search_error(MESSAGE)
  +#
  +# Prints a complete error page with status 500: header, a fixed
  +# explanation, MESSAGE and the trailer. MESSAGE is also left in the
  +# global $error_message.
  +
  +sub search_error {
  +	html_header(500, "Apache Site: Search Error");
  +	$error_message = $_[0];
  +
  +	print "\n" . <<"END_OF_ERROR";
  +<P>
  +Something Failed. If this problem persists, contact
  +the administrator. He or she will be able to track
  +the problem dowm. The error log file of the web
  +server should give further details.
  +</p>
  +<P>
  +$error_message
  +</P>
  +END_OF_ERROR
  +
  +	html_trailer();
  +}
  +
  +# search_bang(MESSAGE)
  +#
  +# Prints a complete "search failed" page with status 200: header,
  +# MESSAGE, a pointer back to the search form and the trailer.
  +# MESSAGE is also left in the global $error_message.
  +
  +sub search_bang {
  +	html_header(200, "Apache Site: Search Failed");
  +	$error_message = $_[0];
  +
  +	print <<"END_OF_PAGE";
  +<P ALIGN=CENTER>
  +$error_message</P>
  +
  +<P ALIGN=CENTER>
  +Just go back to the
  +<a href="$mirror_root/search.html">Search Form</a>
  +and try again.
  +</p>
  +END_OF_PAGE
  +
  +	html_trailer();
  +}
  +
  
  
  
  1.2       +9 -5      apache-search-site/search.html
  
  Index: search.html
  ===================================================================
  RCS file: /home/cvs/apache-search-site/search.html,v
  retrieving revision 1.1
  retrieving revision 1.2
  diff -u -u -r1.1 -r1.2
  --- search.html	2001/01/19 05:40:49	1.1
  +++ search.html	2001/01/19 05:52:11	1.2
  @@ -20,11 +20,15 @@
   words with a dot, a dash, a hyphen or an '@' sign are taken as one
   searchable token. Thus you are <i>strongly</i> encouraged to use
   asterisk '*' completion.
  -
  -<FORM ACTION="http://search.apache.org/index.new.cgi" METHOD="post">
  -
  -<P><INPUT TYPE="text" NAME="keyword" SIZE=50>
  +<p>
  +<i>Note that searching on 'http', 'apache' and other commonly
  +used words such as 'the' is not possible; as they appear on
  +virtually all pages.</i>
  +<P>
  +<FORM ACTION="http://search.apache.org/index.cgi" METHOD="post">
  +<INPUT TYPE="HIDDEN" NAME="version" VALUE="2">
   
  +<P><INPUT TYPE="text" NAME="keyword" SIZE=50><BR>
   <P>
   
   Maximum number of records to return: 
  @@ -53,7 +57,7 @@
   <td><input name="what" value="xml.apache.org" type="checkbox"> Apache/XML</td>
   </tr>
   </table>
  -<i>(All projects are searchif nothing is selected.)</i>
  +<i>(All projects are searched if nothing is selected.)</i>
   </blockquote>
   <P>
   <INPUT TYPE="submit" VALUE="Start Search">