You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@httpd.apache.org by Dirk-Willem van Gulik <di...@covalent.net> on 2001/01/19 07:20:00 UTC
cvs commit: apache-search-site index.cgi search.html index.new.cgi
Return-Path: <di...@apache.org>
Received: (qmail 27144 invoked by uid 1011); 19 Jan 2001 05:52:11 -0000
Date: 19 Jan 2001 05:52:11 -0000
Message-ID: <20...@apache.org>
From: dirkx@apache.org
To: apache-search-site-cvs@apache.org
Subject: cvs commit: apache-search-site index.cgi search.html index.new.cgi
dirkx 01/01/18 21:52:11
Modified: . index.cgi search.html
Removed: . index.new.cgi
Log:
Making the new search interface visible; whilst making
sure that users of the old one are forwarded to a query
page during the period when the mirrors are not in
sync with the main apache site.
Revision Changes Path
1.2 +335 -1 apache-search-site/index.cgi
Index: index.cgi
===================================================================
RCS file: /home/cvs/apache-search-site/index.cgi,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -u -r1.1 -r1.2
--- index.cgi 2001/01/19 05:40:49 1.1
+++ index.cgi 2001/01/19 05:52:11 1.2
@@ -1,8 +1,342 @@
#!/usr/local/bin/perl
#
-print "Status: 301 Has moved
+# Simple Searches using swish-e.
+#
+# (c) 1993 dirkx@webweaving.org
+# Under the Apache licence.
+#
+# Simple CGI script to do searching.
+#
+# When run from the command line; some simple
+# interactive options allow for easy creating of
+# a small config file and an index file.
+#
+# In a production environment you might want
+# to do better.
+#
+# Guess my location (_dir). Of course best
+# set manually by any sensible admin
+#
+# ($_dir=$ENV{SCRIPT_FILENAME} or
+# ($_=`pwd`,chop,($_dir=$_.'/'.$0)=~s!/\./!/!g));
+# $_dir =~ s![^/]+$!!; # Unix Centric !
+#
+$_dir='/da1/www/search.apache.org';
+
+# Admin's email address quoted at the bottom of
+# all pages.
+# NOTE(review): html_trailer in this revision does not reference $admin.
+#
+# $admin=$ENV{ SERVER_ADMIN } or 'webmaster@this.host';
+$admin='webmaster@apache.org';
+
+# Location of the swish binary. Again a sensible admin
+# would nail these values down.
+#
+$swish = '/usr/local/bin/swish-e';
+
+# # No user serviceable parts beyond here.
+#
+# Does it look like we came from a mirror?
+#
+# Defaults: assume the form was served by the master site.
+#
+$on_master = 1;
+$mirror_root = 'http://www.apache.org';
+
+# The Referer header is supplied by the client, and $mirror_root is
+# interpolated into HREF/SRC attributes of every page we emit. So only
+# accept a plain http(s) URL made of harmless characters; anything else
+# keeps the defaults above.
+#
+if (defined $ENV{HTTP_REFERER}
+    and $ENV{HTTP_REFERER} =~ m!^(https?://[A-Za-z0-9.\-]+(?::\d+)?(?:/[A-Za-z0-9._~%/\-]*)?)/search\.html$!i) {
+    $mirror_root = $1;
+
+    # We are on the master only when the *host* is apache.org or one of
+    # its sub-domains. A mirror that merely carries 'www.apache.org'
+    # somewhere in its path must not count as the master.
+    $on_master = ($mirror_root =~ m!^https?://(?:[a-z0-9\-]+\.)*apache\.org(?::\d+)?(?:/|\z)!i) ? 1 : 0;
+}
+
+# Known search collections: index name (also the basename of the
+# '<name>.idx' file) => human readable description.
+#
+%db = (
+    'apr.apache.org' => 'Apache Runtime portability',
+    'bugs.apache.org' => 'Apache bugs database',
+    'dev.apache.org' => 'Developers site',
+    'httpd.apache.org' => 'The apache web server',
+    'jakarta.apache.org' => 'Jakarta',
+    'java.apache.org' => 'Apache/Java',
+    'perl.apache.org' => 'Apache/Perl',
+    'tcl.apache.org' => 'Apache/Tcl',
+    'www.apache.org' => 'General Foundation site',
+    'xml.apache.org' => 'XML'
+);
+
+# URL rewriting rules, keyed by collection name; '_' is the fallback.
+# Each rule takes the URL reported by swish and returns the pair
+# (url to link to, url on the origin site). An empty origin url means
+# that no "(or at the origin site)" link is shown for that hit.
+#
+%map = (
+    # Default: point hits on the main site at the mirror the user came from.
+    _ => sub {
+        my $x = $_[0];
+        $x =~ s!http://www\.apache\.org/!$mirror_root/!;
+        return ($x, $_[0]);
+    },
+
+    # perl.apache.org lives below /perl/ on the mirrors.
+    'perl.apache.org' => sub {
+        my $x = $_[0];
+        $x =~ s!http://perl\.apache\.org/!$mirror_root/perl/!
+            unless $on_master;
+        return ($x, $_[0]);
+    },
+
+    # Bug reports are always served by the bug database itself; the
+    # trailing number of the indexed URL is used as the report id.
+    # Without a trailing number the URL is passed through untouched
+    # rather than building a link from a stale $1.
+    'bugs.apache.org' => sub {
+        return ($_[0], '') unless $_[0] =~ m/(\d+)$/;
+        return ("http://bugs.apache.org/index/full/" . $1, '');
+    }
+    );
+
+# This script only works behind a CGI gateway.
+#
+search_bang("Can only do CGI")
+    unless $ENV{GATEWAY_INTERFACE} =~ m!CGI/\d+\.\d+!i;
+
+$request_method = $ENV{'REQUEST_METHOD'};
+
+# Fetch the raw, still URL-encoded, form data.
+#
+if ($request_method eq 'GET') {
+    $query_string = $ENV{'QUERY_STRING'};
+} elsif ($request_method eq "POST") {
+    # A missing CONTENT_LENGTH is treated as an empty body.
+    read (STDIN, $query_string, $ENV{'CONTENT_LENGTH'} || 0);
+} else {
+    search_bang("Forms must use either GET or POST.");
+    exit 1;
+}
+
+search_bang("Hmm, I am just a bit confused as to how you got here. There seems to be \
+ no form input information telling me what to search for.")
+    unless length($query_string)>0;
+
+# Decode the key=value pairs into %form. Values of repeated keys (such
+# as the 'what' checkboxes) are concatenated; every value is followed
+# by a single space.
+#
+foreach my $pair (split(/&/,$query_string)) {
+    # Split on the first '=' only: the clean-up below deliberately lets
+    # '=' through, so a value may legitimately contain one.
+    my ($key, $val) = map {
+        tr/+/ /;
+        s/%([\dA-F]{2})/pack("C",hex($1))/iegs;
+        # Zap anything which is not 'normal' and might
+        # cause havoc when we pass it on to swish later.
+        s/[^a-zA-Z0-9\_\*\.\@\(\)\=\" \-]+/ /gs;
+        $_;
+    } split /=/, $pair, 2;
+
+    # Ignore empty pairs such as the one in 'a=b&&c=d'.
+    next unless defined $key and length $key;
+    $val = '' unless defined $val;
+
+    $form{$key} .= $val.' ';
+}
+
+# Requests that do not carry a 'version' form field of at least 2 come
+# from a copy of the old search form: answer those with a 301 pointing
+# at the current search.html and stop. (Values in %form carry a trailing
+# space, which the numeric comparison ignores; a missing field compares
+# as 0.)
+#
+if ($form{version} < 2) {
+ print "Status: 301 Has moved
Content-type: text/html
Location: http://search.apache.org/search.html
This page has moved as part of the SE upgrade
";
+ exit 0;
+};
+
+
+# Normalise the search terms and the requested maximum number of hits.
+#
+($query=$form{'keyword'}) =~ s/\s+/ /g;
+$max = int($form{'results'}) + 0;
+$max = 0 if $max < 0;    # a negative limit makes no sense; 0 means "no limit" below
+
+if ($query !~ m/\w+/) {
+    search_bang("You must enter a keyword or phrase in one or more of the text boxes".$query);
+    exit 0;
+}
+
+# Check if we know the databases to search.
+#
+@dbs=();
+foreach my $name (split /\s+/, lc $form{what}) {
+    next unless $db{$name};
+    push @dbs, $name;
+}
+
+# Nothing (valid) selected: search all collections.
+#
+push @dbs,(keys %db)
+    if ($#dbs == -1);
+
+# Build the ' -f <index file>' argument list for swish and sum up the
+# ages (-M, in days) of the index files that are actually used.
+#
+$age=0;
+$list = '';
+
+foreach my $name (@dbs) {
+    my $file = $_dir . '/' . $name . '.idx';
+
+    if (! -r $file) {
+        # Name the collection being skipped; $index is only set once
+        # swish output is being read, so it is of no use here.
+        $errors .= "<p align='CENTER'> Skipping $name - temporarily not available</p>";
+        next;
+    }
+
+    $list .= ' -f '.$file;
+    $age += -M $file;
+}
+
+# Run swish-e and collect the hits into @results.
+#
+# $query and $max originate from the client. The clean-up in the param
+# decoding loop still lets '"', '(', ')' and '*' through, which is not
+# safe inside a double-quoted shell word. So no shell is involved at
+# all: the list form of a piped open (Perl 5.8 or later) hands each
+# argument to swish-e verbatim.
+#
+# $list is the ' -f file -f file ...' string built above; splitting it
+# on white space yields the same words the shell used to produce.
+#
+my @index_args = split ' ', $list;
+
+if (!(open(SWISH, '-|', $swish, '-w', $query, '-m', $max, @index_args))) {
+    search_error("Configuration/Resource problem");
+    exit 1;
+}
+
+while (<SWISH>) {
+    chomp;
+
+    # '# Name: <index>' header lines tell us which index the following
+    # hits belong to. Trailing white space is not part of the name.
+    $index = $1 if m/^#\s*Name:\s*(.*?)\s*$/i;
+    next if m/^#/;
+
+    if ($_ eq "err: no results") {
+        next;
+    }
+    elsif ($_ eq "err: a word is too common") {
+        $errors .= "<p align='CENTER'>One of your search terms occurs too often in $index to be useful; skipping.</p>";
+        next;
+    }
+    elsif ($_ eq "err: all search words too common to be useful") {
+        $errors .= "<p align='CENTER'>All of your search term(s) occur too often in $index to be useful; skipping.</p>";
+        next;
+    }
+    elsif ($_ eq "err: could not open index file") {
+        search_error("Could not open SWISH Index File $list/$index ");
+        exit 1;
+    }
+    elsif (m/^\s*err:\s+(.*)\s*$/) {
+        search_bang("Search failed: $1");
+        exit 0;
+    }
+
+    # Hit lines start with the numeric rank; skip everything else.
+    next if /^\D/;
+
+    # A hit looks like:  <rank> <url> "<title>" <size>
+    my ($stringone, $title, $rank, $url);
+    ($stringone, $title) = split /\"/ or next;
+    ($rank, $url) = split(/ /, $stringone) or next;
+
+    # Fall back to the last path component for untitled documents.
+    $title =~ s/^[:\s]+//;
+    if ($title =~ m/^\s*$/) {
+        $title = 'no title';
+        $title = $1 if $url =~ m/([^\/]+)\s*$/;
+    }
+
+    push @results, {
+        rank => $rank,
+        db => $db{ $index },
+        title => $title,
+        url => $url
+    };
+}
+close(SWISH);
+
+# A non-zero wait status means swish-e failed or could not be run.
+if ($?) {
+    search_error("Configuration/Resource problem");
+    exit 1;
+}
+
+# Turn the summed index ages (in days) into the average age in hours.
+# Only index files that made it into $list had their age added, so
+# those are the ones to average over.
+#
+my $indexed = () = $list =~ m/ -f /g;
+$age = $indexed ? $age * 24 / $indexed : 0;
+
+if ($age<2) {
+    $age='about an hour';
+}
+elsif ($age<48) {
+    $age=int(0.5+$age).' hours';
+}
+else {
+    $age = int(0.5+$age/24) . ' days';
+}
+$age="The databases are about $age old.";
+
+if ($#results == -1) {
+    search_bang("There were no items that matched your search request.".$errors);
+    exit 1;
+}
+
+# Number of hits to show: all of them, or at most $max when a limit
+# was requested.
+#
+$count = @results;
+$count = $max if $max > 0 and $max < $count;
+
+# Note that we need to sort again, as we (might) have
+# collated results from different DBs. We have up to
+# N x $max.. that is why we do the limitation again
+# later in the foreach.
+#
+@results = sort {
+    $b->{rank} <=> $a->{rank}
+} @results;
+
+html_header(200,"Apache Sites: Search Results");
+
+$upto = '';
+$upto='(up to the maximum) '
+    if $count == $max;
+
+print <<Search_Results;
+<P ALIGN=CENTER>
+Your Search for <strong>$query</strong>, returned $upto$count Items;
+listed in order of computed relevance<BR>
+</P>
+<HR>
+<UL>
+Search_Results
+
+# Each result only remembers the description of its collection, while
+# the rewriting rules in %map are keyed by collection name; so map the
+# description back to the name first.
+#
+my %name_of = reverse %db;
+
+foreach my $r (@results[ 0 .. $count-1 ]) {
+    my $name = defined $r->{db} ? $name_of{ $r->{db} } : undef;
+    my $rule = (defined $name and exists $map{ $name }) ? $name : '_';
+    my ($url, $master) = $map{ $rule }->($r->{url});
+
+    print qq|<li><A HREF="$url">$r->{title}</A>|;
+    print " (on your Mirror)" unless $master eq '' or $on_master;
+    print "<br><dir>";
+    # Only worth mentioning when more than one collection was searched.
+    print "From the <b>$r->{db}</b> collection: " if $#dbs;
+    print $url;
+    print qq| <a href="$master">(or at the origin site)</a>|
+        unless $master eq '' or $on_master;
+    # The rank of *this* hit, not of whichever line swish printed last.
+    print " <i>Ranking $r->{rank}</i></dir>\n";
+}
+
+print "</UL>$errors\n";
+html_trailer();
+
+exit 0;
+
+
+# Print the CGI response head followed by the opening part of an HTML
+# page.
+#
+# Arguments: the value for the 'Status:' header, then the text that is
+# used both as <TITLE> and as <H1> heading. The logo is loaded from
+# below the global $mirror_root.
+
+sub html_header {
+    my $status = shift;
+    my $document_title = shift;
+
+    # CGI header lines, closed by the empty line that ends the head.
+    print "Status: $status\n", "Content-type: text/html\n", "\n";
+
+    print <<"END_OF_PAGE_TOP";
+<HTML>
+<HEAD>
+<TITLE>$document_title</TITLE>
+</HEAD>
+<BODY BGCOLOR=WHITE>
+<CENTER>
+<IMG SRC="$mirror_root/images/apache_sub.gif" ALT="">
+<H1 ALIGN=CENTER>$document_title</H1>
+</CENTER>
+END_OF_PAGE_TOP
+
+# Debugging aid: dump the decoded form fields.
+# foreach (keys(%form)) { print "<li>$_ $form{$_}"; };
+}
+
+# Print the closing part of every page: links back to the main site
+# and to the search form below $mirror_root, the text held in the
+# global $age, and the swish-e credit.
+#
+# This routine ends the process (exit status 0); nothing a caller
+# places after the call is ever executed.
+
+sub html_trailer {
+    my $page_bottom = <<"END_OF_PAGE_BOTTOM";
+<HR>
+<i>Back to the <a href="http://www.apache.org">Main Apache site</a>. Or
+back to the <a href="$mirror_root/search.html">Search Form</a>.
+$age
+This code uses <a href="http://sunsite.berkeley.edu/SWISH-E/">Swish-e</a>.</i>
+</BODY>
+</HTML>
+END_OF_PAGE_BOTTOM
+
+    print $page_bottom;
+
+    exit 0;
+}
+
+# Report an internal failure (configuration or resource problem) as a
+# complete page with status 500.
+#
+# Argument: the message to show below the generic explanation.
+# Does not return: html_trailer ends the process.
+
+sub search_error {
+    # Keep the message in a lexical instead of a package global.
+    my ($error_message) = @_;
+
+    html_header(500,"Apache Site: Search Error");
+    print "
+<P>
+Something Failed. If this problem persists, contact
+the administrator. He or she will be able to track
+the problem down. The error log file of the web
+server should give further details.
+</p>
+<P>
+$error_message
+</P>\n";
+    html_trailer();
+}
+
+# Tell the user that the search could not be carried out (bad or
+# missing input, nothing found, ...) as a complete page with status
+# 200, including a link back to the search form below $mirror_root.
+#
+# Argument: the message to show.
+# Does not return: html_trailer ends the process.
+
+sub search_bang {
+    # Keep the message in a lexical instead of a package global.
+    my ($error_message) = @_;
+
+    html_header(200,"Apache Site: Search Failed");
+    print qq|<P ALIGN=CENTER>\n$error_message</P>\n
+<P ALIGN=CENTER>
+Just go back to the
+<a href="$mirror_root/search.html">Search Form</a>
+and try again.
+</p>
+|;
+    html_trailer();
+}
+
1.2 +9 -5 apache-search-site/search.html
Index: search.html
===================================================================
RCS file: /home/cvs/apache-search-site/search.html,v
retrieving revision 1.1
retrieving revision 1.2
diff -u -u -r1.1 -r1.2
--- search.html 2001/01/19 05:40:49 1.1
+++ search.html 2001/01/19 05:52:11 1.2
@@ -20,11 +20,15 @@
words with a dot, a dash, a hyphen or an '@' sign are taken as one
searchable token. Thus you are <i>strongly</i> encouraged to use
asterisk '*' completion.
-
-<FORM ACTION="http://search.apache.org/index.new.cgi" METHOD="post">
-
-<P><INPUT TYPE="text" NAME="keyword" SIZE=50>
+<p>
+<i>Note that searching on 'http', 'apache' and other commonly
+used words such as 'the' is not possible; as they appear on
+virtually all pages.</i>
+<P>
+<FORM ACTION="http://search.apache.org/index.cgi" METHOD="post">
+<INPUT TYPE="HIDDEN" NAME="version" VALUE="2">
+<P><INPUT TYPE="text" NAME="keyword" SIZE=50><BR>
<P>
Maximum number of records to return:
@@ -53,7 +57,7 @@
<td><input name="what" value="xml.apache.org" type="checkbox"> Apache/XML</td>
</tr>
</table>
-<i>(All projects are searchif nothing is selected.)</i>
+<i>(All projects are searched if nothing is selected.)</i>
</blockquote>
<P>
<INPUT TYPE="submit" VALUE="Start Search">