You are viewing a plain text version of this content. The canonical link for it is here.
Posted to docs-cvs@perl.apache.org by st...@apache.org on 2002/01/30 07:35:01 UTC
cvs commit: modperl-docs/src/search/modules DateRanges.pm DefaultHighlight.pm PhraseHighlight.pm PhraseTest.pm SimpleHighlight.pm TemplateDefault.pm TemplateDumper.pm TemplateHTMLTemplate.pm TemplateToolkit.pm
stas 02/01/29 22:35:01
Modified: src config.cfg
Added: src/search .htaccess .swishcgi.conf search.tt
searchresults.html spider.pl swish.cgi swish.conf
src/search/modules DateRanges.pm DefaultHighlight.pm
PhraseHighlight.pm PhraseTest.pm SimpleHighlight.pm
TemplateDefault.pm TemplateDumper.pm
TemplateHTMLTemplate.pm TemplateToolkit.pm
Removed: src searchresults.html
Log:
- add the search facility
Revision Changes Path
1.7 +2 -1 modperl-docs/src/config.cfg
Index: config.cfg
===================================================================
RCS file: /home/cvs/modperl-docs/src/config.cfg,v
retrieving revision 1.6
retrieving revision 1.7
diff -u -r1.6 -r1.7
--- config.cfg 28 Jan 2002 03:36:58 -0000 1.6
+++ config.cfg 30 Jan 2002 06:35:00 -0000 1.7
@@ -51,7 +51,7 @@
qw(
404.html
creation.html
- searchresults.html
+ search/searchresults.html
)
],
],
@@ -62,6 +62,7 @@
images/*
robots.txt
.htaccess
+ search
)
],
1.1 modperl-docs/src/search/.htaccess
Index: .htaccess
===================================================================
deny from all
<files swish.cgi>
allow from all
Options +ExecCGI
</files>
1.1 modperl-docs/src/search/.swishcgi.conf
Index: .swishcgi.conf
===================================================================
return {
title => 'Search mod_perl Site',
template => {
package => 'TemplateToolkit',
file => 'search.tt',
options => {
INCLUDE_PATH => '.',
},
},
};
1.1 modperl-docs/src/search/search.tt
Index: search.tt
===================================================================
[% # This is just an example -- you would want your own "page" for the wrapper %]
[% WRAPPER searchresults.html %]
[% PROCESS swish_header %]
[% title = PROCESS title %]
[% IF ! search.results %]
[% PROCESS show_message %]
[% PROCESS search_form %]
[% ELSE %]
[% PROCESS search_form %]
[% PROCESS nav_bar %]
[% PROCESS results_list %]
[% END %]
[% PROCESS swish_footer %]
[% END %]
[% # This is just an example -- you would want your own "page" to wrap around "swish" %]
[% BLOCK page %]
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>
[% title || "Welcome" %]
</title>
<body>
[% content %]
</body>
</html>
[% END %]
[% BLOCK title %]
[% IF ! search.results %]
[% search.errstr || search.config('title') || "Search page" %]
[% ELSE %]
[% search.navigation('hits') %] Results for [[% search.query_simple | html %]]
[% END %]
[% END %]
[% BLOCK swish_header %]
<table border="0" cellpadding="0" cellspacing="0">
<tr><td>
<a href="http://swish-e.org">
<img border="0" alt="Swish-e home page" src="http://swish-e.org/Images/swish-e.gif"></a>
</td><td valign="middle">
<font size="+3">[% search.config('title') || "Search page" %]</font>
</td></tr>
</table>
[% END %]
[% BLOCK swish_footer %]
<hr>
<small>Powered by <em>Swish-e</em> <a href="http://swish-e.org">swish-e.org</a></small>
[% IF search.MOD_PERL %]
<br><small>Response brought to you by <a href="http://perl.apache.org"><em>mod_perl</em></a></small>
[% END %]
<p>
<a href="http://validator.w3.org/check/referer"><img border="0"
src="http://www.w3.org/Icons/valid-html401"
alt="Valid HTML 4.01!" height="31" width="88"></a>
</p>
[% END %]
[% BLOCK show_message %]
[% IF search.errstr %]
<font size="+2" color="red">[% search.errstr | html %]</font>
[% END %]
[% END %]
[% BLOCK search_form %]
[% CGI.start_form( '-action' => CGI.script_name) %]
[% CGI.textfield( {
name => 'query',
size => 40,
maxlength => 200,
} ) %]
[% CGI.submit('submit','Search!') %]<br>
[% search.get_meta_name_limits %]
[% search.get_sort_select_list %]
[% search.get_index_select_list %]
[% search.get_limit_select %]
[% CGI.end_form.join('') %]
[% END %]
[% BLOCK nav_bar %]
[% search.stopwords_removed %]
<table cellpadding=0 cellspacing=0 border=0 width="100%">
<tr>
<td height=20 bgcolor="#FF9999">
<font size="-1" face="Geneva, Arial, Helvetica, San-Serif">
Results for <b>[% search.query_simple | html %]</b>
[% search.navigation('from') %] to [% search.navigation('to') %] of [% search.navigation('hits') %] results.
</font>
</td>
<td align=right bgcolor="#FF9999">
<font size="-2" face="Geneva, Arial, Helvetica, San-Serif">
Run time: [% search.navigation('run_time') %] |
Search time: [% search.navigation('search_time') %]
</font>
</td>
</tr>
[% IF search.navigation('pages') %]
<tr>
<td colspan=2 bgcolor="#EEEEEE">
<font size="-1" face="Geneva, Arial, Helvetica, San-Serif"> Page:</font>
[% search.navigation('pages') %]
[% IF search.navigation('prev_count') %]
<a href="[% search.query_href %]&start=[% search.navigation('prev') %]">
Previous [% search.navigation('prev_count') %]</a>
[% END %]
[% IF search.navigation('next_count') %]
<a href="[% search.query_href %]&start=[% search.navigation('next') %]">
Next [% search.navigation('next_count') %]</a>
[% END %]
</td>
</tr>
[% END %]
</table>
[% END %]
[% BLOCK results_list %]
[% FOREACH item = search.results %]
<dl>
<dt>
[% item.swishreccount %]
<a href="[% item.swishdocpath_href %]">
[% ( item.swishtitle || item.swishdocpath ) %]
</a>
<small>-- rank: <b>[% item.swishrank %]</b></small>
</dt>
<dd>
[% item.swishdescription %]
<br>
<small>
[% item.swishdocpath %]
- [% item.swishdocsize %] bytes
- [% item.swishlastmodified %]
</small>
</dd>
</dl>
[% END %]
[% END %]
1.1 modperl-docs/src/search/searchresults.html
Index: searchresults.html
===================================================================
<html>
<head>
<title>[% title %]</title>
<meta name="Description" content="search results">
</head>
<body bgcolor="white">
[% content %]
</body>
</html>
1.1 modperl-docs/src/search/spider.pl
Index: spider.pl
===================================================================
#!/usr/local/bin/perl -w
use strict;
# $Id: spider.pl,v 1.1 2002/01/30 06:35:00 stas Exp $
#
# "prog" document source for spidering web servers
#
# For documentation, type:
#
# perldoc spider.pl
#
# Apr 7, 2001 -- added quite a bit of bulk for easier debugging
#
# Nov 19, 2001 -- to do, build a server object so we are not using the passed in hash,
# and so we can warn on invalid config settings.
$HTTP::URI_CLASS = "URI"; # prevent loading default URI::URL
# so we don't store long list of base items
# and eat up memory with >= URI 1.13
use LWP::RobotUA;
use HTML::LinkExtor;
use HTML::Tagset;
use vars '$VERSION';
$VERSION = sprintf '%d.%02d', q$Revision: 1.1 $ =~ /: (\d+)\.(\d+)/;
use vars '$bit';
use constant DEBUG_ERRORS => $bit = 1; # program errors
use constant DEBUG_URL => $bit <<= 1; # print out every URL processes
use constant DEBUG_HEADERS => $bit <<= 1; # prints the response headers
use constant DEBUG_FAILED => $bit <<= 1; # failed to return a 200
use constant DEBUG_SKIPPED => $bit <<= 1; # didn't index for some reason
use constant DEBUG_INFO => $bit <<= 1; # more verbose
use constant DEBUG_LINKS => $bit <<= 1; # prints links as they are extracted
use constant MAX_SIZE => 5_000_000; # Max size of document to fetch
#Can't locate object method "host" via package "URI::mailto" at ../prog-bin/spider.pl line 473.
#sub URI::mailto::host { return '' };
sub UNIVERSAL::host { '' };
sub UNIVERSAL::port { '' };
sub UNIVERSAL::host_port { '' };
#-----------------------------------------------------------------------
use vars '@servers';
my $config = shift || 'SwishSpiderConfig.pl';
if ( lc( $config ) eq 'default' ) {
@servers = default_urls();
} else {
do $config or die "Failed to read $0 configuration parameters '$config' $! $@";
}
print STDERR "$0: Reading parameters from '$config'\n";
my $abort;
local $SIG{HUP} = sub { $abort++ };
my %visited; # global -- I suppose would be smarter to localize it per server.
my %validated;
my %bad_links;
for my $s ( @servers ) {
if ( !$s->{base_url} ) {
die "You must specify 'base_url' in your spider config settings\n";
}
for (ref $s->{base_url} eq 'ARRAY' ? @{$s->{base_url}} : $s->{base_url} ) {
$s->{base_url} = $_;
process_server( $s );
}
}
if ( %bad_links ) {
print STDERR "\nBad Links:\n\n";
foreach my $page ( sort keys %bad_links ) {
print STDERR "On page: $page\n";
printf(STDERR " %-40s %s\n", $_, $validated{$_} ) for @{$bad_links{$page}};
print STDERR "\n";
}
}
#-----------------------------------------------------------------------
sub process_server {
my $server = shift;
# set defaults
$server->{debug} ||= 0;
$server->{debug} = 0 unless $server->{debug} =~ /^\d+$/;
$server->{max_size} ||= MAX_SIZE;
die "max_size parameter '$server->{max_size}' must be a number\n" unless $server->{max_size} =~ /^\d+$/;
$server->{link_tags} = ['a'] unless ref $server->{link_tags} eq 'ARRAY';
$server->{link_tags_lookup} = { map { lc, 1 } @{$server->{link_tags}} };
die "max_depth parameter '$server->{max_depth}' must be a number\n" if defined $server->{max_depth} && $server->{max_depth} !~ /^\d+/;
for ( qw/ test_url test_response filter_content/ ) {
next unless $server->{$_};
$server->{$_} = [ $server->{$_} ] unless ref $server->{$_} eq 'ARRAY';
my $n;
for my $sub ( @{$server->{$_}} ) {
$n++;
die "Entry number $n in $_ is not a code reference\n" unless ref $sub eq 'CODE';
}
}
my $start = time;
if ( $server->{skip} ) {
print STDERR "Skipping: $server->{base_url}\n";
return;
}
require "HTTP/Cookies.pm" if $server->{use_cookies};
require "Digest/MD5.pm" if $server->{use_md5};
# set starting URL, and remove any specified fragment
my $uri = URI->new( $server->{base_url} );
$uri->fragment(undef);
print STDERR "\n -- Starting to spider: $uri --\n" if $server->{debug};
# set the starting server name (including port) -- will only spider on server:port
$server->{authority} = $uri->authority;
$server->{same} = [ $uri->authority ];
push @{$server->{same}}, @{$server->{same_hosts}} if ref $server->{same_hosts};
$server->{same_host_lookup} = { map { $_, 1 } @{$server->{same}} };
# set time to end
$server->{max_time} = $server->{max_time} * 60 + time
if $server->{max_time};
# set default agent for log files
$server->{agent} ||= 'swish-e spider 2.2 http://swish-e.org/';
# get a user agent object
my $ua;
if ( $server->{ignore_robots_file} ) {
$ua = LWP::UserAgent->new( );
return unless $ua;
$ua->agent( $server->{agent} );
$ua->from( $server->{email} );
} else {
$ua = LWP::RobotUA->new( $server->{agent}, $server->{email} );
return unless $ua;
$ua->delay( $server->{delay_min} || 0.1 );
}
$server->{ua} = $ua; # save it for fun.
# $ua->parse_head(0); # Don't parse the content
$ua->cookie_jar( HTTP::Cookies->new ) if $server->{use_cookies};
if ( $server->{keep_alive} ) {
if ( $ua->can( 'conn_cache' ) ) {
my $keep_alive = $server->{keep_alive} =~ /^\d+$/ ? $server->{keep_alive} : 1;
$ua->conn_cache( { total_capacity => $keep_alive } );
} else {
warn "Can't use keep-alive: conn_cache method not available\n";
}
}
# uri, parent, depth
eval { spider( $server, $uri ) };
print STDERR $@ if $@;
$start = time - $start;
$start++ unless $start;
my $max_width = 0;
my $max_num = 0;
for ( keys %{$server->{counts}} ) {
$max_width = length if length > $max_width;
my $val = commify( $server->{counts}{$_} );
$max_num = length $val if length $val > $max_num;
}
printf STDERR "\nSummary for: $server->{base_url}\n";
for ( sort keys %{$server->{counts}} ) {
printf STDERR "%${max_width}s: %${max_num}s (%0.1f/sec)\n",
$_,
commify( $server->{counts}{$_} ),
$server->{counts}{$_}/$start;
}
}
#----------- Non recursive spidering ---------------------------
sub spider {
my ( $server, $uri ) = @_;
# Validate the first link, just in case
return unless check_link( $uri, $server, '', '(Base URL)' );
my @link_array = [ $uri, '', 0 ];
while ( @link_array ) {
die if $abort || $server->{abort};
my ( $uri, $parent, $depth ) = @{shift @link_array};
my $new_links = process_link( $server, $uri, $parent, $depth );
push @link_array, map { [ $_, $uri, $depth+1 ] } @$new_links if $new_links;
}
}
#----------- Process a url and return links -----------------------
sub process_link {
my ( $server, $uri, $parent, $depth ) = @_;
$server->{counts}{'Unique URLs'}++;
die "$0: Max files Reached\n"
if $server->{max_files} && $server->{counts}{'Unique URLs'} > $server->{max_files};
die "$0: Time Limit Exceeded\n"
if $server->{max_time} && $server->{max_time} < time;
# make request
my $ua = $server->{ua};
my $request = HTTP::Request->new('GET', $uri );
my $content = '';
# Really should just subclass the response object!
$server->{no_contents} = 0;
$server->{no_index} = 0;
$server->{no_spider} = 0;
my $been_here;
my $callback = sub {
die "test_response" if !$been_here++ && !check_user_function( 'test_response', $uri, $server, $_[1], \$_[0] );
if ( length( $content ) + length( $_[0] ) > $server->{max_size} ) {
print STDERR "-Skipped $uri: Document exceeded $server->{max_size} bytes\n" if $server->{debug}&DEBUG_SKIPPED;
die "too big!\n";
}
$content .= $_[0];
};
my $response = $ua->simple_request( $request, $callback, 4096 );
return if $server->{abort};
# Log the response
if ( ( $server->{debug} & DEBUG_URL ) || ( $server->{debug} & DEBUG_FAILED && !$response->is_success) ) {
print STDERR '>> ',
join( ' ',
( $response->is_success ? '+Fetched' : '-Failed' ),
$depth,
"Cnt: $server->{counts}{'Unique URLs'}",
$uri,
( $response->status_line || $response->status || 'unknown status' ),
( $response->content_type || 'Unknown content type'),
( $response->content_length || '???' ),
"parent:$parent",
),"\n";
}
# If the LWP callback aborts
if ( $response->header('client-aborted') ) {
$server->{counts}{Skipped}++;
return;
}
# skip excluded by robots.txt
if ( !$response->is_success && $response->status_line =~ 'robots.txt' ) {
print STDERR "-Skipped $depth $uri: ", $response->status_line,"\n" if $server->{debug}&DEBUG_SKIPPED;
$server->{counts}{'robots.txt'}++;
return;
}
# Report bad links (excluding those skipped by robots.txt
if ( $server->{validate_links} && !$response->is_success ) {
validate_link( $server, $uri, $parent, $response );
}
# And check for meta robots tag
# -- should probably be done in request sub to avoid fetching docs that are not needed
unless ( $server->{ignore_robots_file} ) {
if ( my $directives = $response->header('X-Meta-ROBOTS') ) {
my %settings = map { lc $_, 1 } split /\s*,\s*/, $directives;
$server->{no_contents}++ if exists $settings{nocontents}; # an extension for swish
$server->{no_index}++ if exists $settings{noindex};
$server->{no_spider}++ if exists $settings{nofollow};
}
}
print STDERR "\n----HEADERS for $uri ---\n", $response->headers_as_string,"-----END HEADERS----\n\n"
if $server->{debug} & DEBUG_HEADERS;
unless ( $response->is_success ) {
# look for redirect
if ( $response->is_redirect && $response->header('location') ) {
my $u = URI->new_abs( $response->header('location'), $response->base );
if ( $u->canonical eq $uri->canonical ) {
print STDERR "Warning: $uri redirects to itself!.\n";
return;
}
return [$u] if check_link( $u, $server, $response->base, '(redirect)','Location' );
}
return;
}
return unless $content; # $$$ any reason to index empty files?
# make sure content is unique - probably better to chunk into an MD5 object above
if ( $server->{use_md5} ) {
my $digest = Digest::MD5::md5($content);
if ( $visited{ $digest } ) {
print STDERR "-Skipped $uri has same digest as $visited{ $digest }\n"
if $server->{debug} & DEBUG_SKIPPED;
$server->{counts}{Skipped}++;
$server->{counts}{'MD5 Duplicates'}++;
return;
}
$visited{ $digest } = $uri;
}
# Extract out links (if not too deep)
my $links_extracted = extract_links( $server, \$content, $response )
unless defined $server->{max_depth} && $depth >= $server->{max_depth};
# Index the file
if ( $server->{no_index} ) {
$server->{counts}{Skipped}++;
print STDERR "-Skipped indexing $uri some callback set 'no_index' flag\n" if $server->{debug}&DEBUG_SKIPPED;
} else {
return unless check_user_function( 'filter_content', $uri, $server, $response, \$content );
output_content( $server, \$content, $uri, $response )
unless $server->{no_index};
}
return $links_extracted;
}
#===================================================================================================
# Calls a user-defined function
#
#---------------------------------------------------------------------------------------------------
sub check_user_function {
my ( $fn, $uri, $server ) = ( shift, shift, shift );
return 1 unless $server->{$fn};
my $tests = ref $server->{$fn} eq 'ARRAY' ? $server->{$fn} : [ $server->{$fn} ];
my $cnt;
for my $sub ( @$tests ) {
$cnt++;
print STDERR "?Testing '$fn' user supplied function #$cnt '$uri'\n" if $server->{debug} & DEBUG_INFO;
my $ret;
eval { $ret = $sub->( $uri, $server, @_ ) };
if ( $@ ) {
print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt death '$@'\n" if $server->{debug} & DEBUG_SKIPPED;
$server->{counts}{Skipped}++;
return;
}
next if $ret;
print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt\n" if $server->{debug} & DEBUG_SKIPPED;
$server->{counts}{Skipped}++;
return;
}
print STDERR "+Passed all $cnt tests for '$fn' user supplied function\n" if $server->{debug} & DEBUG_INFO;
return 1;
}
#==============================================================================================
# Extract links from a text/html page
#
# Call with:
# $server - server object
# $content - ref to content
# $response - response object
#
#----------------------------------------------------------------------------------------------
sub extract_links {
my ( $server, $content, $response ) = @_;
return unless $response->header('content-type') &&
$response->header('content-type') =~ m[^text/html];
# allow skipping.
if ( $server->{no_spider} ) {
print STDERR '-Links not extracted: ', $response->request->uri->canonical, " some callback set 'no_spider' flag\n" if $server->{debug}&DEBUG_SKIPPED;
return;
}
$server->{Spidered}++;
my @links;
my $base = $response->base;
print "\nExtracting links from ", $response->request->uri, ":\n" if $server->{debug} & DEBUG_LINKS;
my $p = HTML::LinkExtor->new;
$p->parse( $$content );
my %skipped_tags;
for ( $p->links ) {
my ( $tag, %attr ) = @$_;
# which tags to use ( not reported in debug )
print STDERR " ?? Looking at extracted tag '$tag'\n" if $server->{debug} & DEBUG_LINKS;
unless ( $server->{link_tags_lookup}{$tag} ) {
# each tag is reported only once per page
print STDERR
" ?? <$tag> skipped because not one of (",
join( ',', @{$server->{link_tags}} ),
")\n" if $server->{debug} & DEBUG_LINKS && !$skipped_tags{$tag}++;
if ( $server->{validate_links} && $tag eq 'img' && $attr{src} ) {
my $img = URI->new_abs( $attr{src}, $base );
validate_link( $server, $img, $base );
}
next;
}
# Grab which attribute(s) which might contain links for this tag
my $links = $HTML::Tagset::linkElements{$tag};
$links = [$links] unless ref $links;
my $found;
# Now, check each attribut to see if a link exists
for my $attribute ( @$links ) {
if ( $attr{ $attribute } ) { # ok tag
# Create a URI object
my $u = URI->new_abs( $attr{$attribute},$base );
next unless check_link( $u, $server, $base, $tag, $attribute );
push @links, $u;
print STDERR qq[ ++ <$tag $attribute="$u"> Added to list of links to follow\n] if $server->{debug} & DEBUG_LINKS;
$found++;
}
}
if ( !$found && $server->{debug} & DEBUG_LINKS ) {
my $s = "<$tag";
$s .= ' ' . qq[$_="$attr{$_}"] for sort keys %attr;
$s .= '>';
print STDERR " ?? tag $s did not include any links to follow\n";
}
}
print STDERR "! Found ", scalar @links, " links in ", $response->base, "\n\n" if $server->{debug} & DEBUG_INFO;
return \@links;
}
#=============================================================================
# This function check's if a link should be added to the list to spider
#
# Pass:
# $u - URI object
# $server - the server hash
# $base - the base or parent of the link
#
# Returns true if a valid link
#
# Calls the user function "test_url". Link rewriting before spider
# can be done here.
#
#------------------------------------------------------------------------------
sub check_link {
my ( $u, $server, $base, $tag, $attribute ) = @_;
$tag ||= '';
$attribute ||= '';
# Kill the fragment
$u->fragment( undef );
# This should not happen, but make sure we have a host to check against
unless ( $u->host ) {
print STDERR qq[ ?? <$tag $attribute="$u"> skipped because missing host name\n] if $server->{debug} & DEBUG_LINKS;
return;
}
# Here we make sure we are looking at a link pointing to the correct (or equivalent) host
unless ( $server->{same_host_lookup}{$u->authority} ) {
print STDERR qq[ ?? <$tag $attribute="$u"> skipped because different authority (server:port)\n] if $server->{debug} & DEBUG_LINKS;
$server->{counts}{'Off-site links'}++;
validate_link( $server, $u, $base ) if $server->{validate_links};
return;
}
$u->authority( $server->{authority} ); # Force all the same host name
# Allow rejection of this URL by user function
return unless check_user_function( 'test_url', $u, $server );
# Don't add the link if already seen - these are so common that we don't report
if ( $visited{ $u->canonical }++ ) {
#$server->{counts}{Skipped}++;
$server->{counts}{Duplicates}++;
# Just so it's reported for all pages
if ( $server->{validate_links} && $validated{$u->canonical} ) {
push @{$bad_links{ $base->canonical }}, $u->canonical;
}
return;
}
return 1;
}
#=============================================================================
# This function is used to validate links that are off-site.
#
# It's just a very basic link check routine that lets you validate the
# off-site links at the same time as indexing. Just because we can.
#
#------------------------------------------------------------------------------
sub validate_link {
my ($server, $uri, $base, $response ) = @_;
# Already checked?
if ( exists $validated{ $uri->canonical } )
{
# Add it to the list of bad links on that page if it's a bad link.
push @{$bad_links{ $base->canonical }}, $uri->canonical
if $validated{ $uri->canonical };
return;
}
$validated{ $uri->canonical } = 0; # mark as checked and ok.
unless ( $response ) {
my $ua = LWP::UserAgent->new;
my $request = HTTP::Request->new('HEAD', $uri->canonical );
eval {
$SIG{ALRM} = sub { die "timed out\n" };
alarm 5;
$response = $ua->simple_request( $request );
alarm 0;
};
if ( $@ ) {
$server->{counts}{'Bad Links'}++;
my $msg = $@;
$msg =~ tr/\n//s;
$validated{ $uri->canonical } = $msg;
push @{$bad_links{ $base->canonical }}, $uri->canonical;
return;
}
}
return if $response->is_success;
my $error = $response->status_line || $response->status || 'unknown status';
$error .= ' ' . URI->new_abs( $response->header('location'), $response->base )->canonical
if $response->is_redirect && $response->header('location');
$validated{ $uri->canonical } = $error;
push @{$bad_links{ $base->canonical }}, $uri->canonical;
}
sub output_content {
my ( $server, $content, $uri, $response ) = @_;
$server->{indexed}++;
unless ( length $$content ) {
print STDERR "Warning: document '", $response->request->uri, "' has no content\n";
$$content = ' ';
}
$server->{counts}{'Total Bytes'} += length $$content;
$server->{counts}{'Total Docs'}++;
my $headers = join "\n",
'Path-Name: ' . $uri,
'Content-Length: ' . length $$content,
'';
$headers .= 'Last-Mtime: ' . $response->last_modified . "\n"
if $response->last_modified;
$headers .= "No-Contents: 1\n" if $server->{no_contents};
print "$headers\n$$content";
die "$0: Max indexed files Reached\n"
if $server->{max_indexed} && $server->{counts}{'Total Docs'} >= $server->{max_indexed};
}
sub commify {
local $_ = shift;
1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
return $_;
}
sub default_urls {
die "$0: Must list URLs when using 'default'\n" unless @ARGV;
my $validate = 0;
if ( $ARGV[0] eq 'validate' ) {
shift @ARGV;
$validate = 1;
}
my @content_types = qw{ text/html text/plain };
return map {
{
#debug => DEBUG_HEADERS,
#debug => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO,
base_url => \@ARGV,
email => 'swish@domain.invalid',
delay_min => .0001,
link_tags => [qw/ a frame /],
txest_url => sub { $_[0]->path !~ /\.(?:gif|jpeg|.png)$/i },
test_response => sub {
my $content_type = $_[2]->content_type;
my $ok = grep { $_ eq $content_type } @content_types;
return 1 if $ok;
print STDERR "$_[0] $content_type != (@content_types)\n";
return;
},
validate_links => $validate,
}
} @ARGV;
}
__END__
=head1 NAME
spider.pl - Example Perl program to spider web servers
=head1 SYNOPSIS
swish.config:
IndexDir ./spider.pl
SwishProgParameters spider.config
# other swish-e settings
spider.config:
@servers = (
{
base_url => 'http://myserver.com/',
email => 'me@myself.com',
# other spider settings described below
},
);
begin indexing:
swish-e -S prog -c swish.config
=head1 DESCRIPTION
This is a swish-e "prog" document source program for spidering
web servers. It can be used instead of the C<http> method for
spidering with swish.
The spider typically uses a configuration
file that lists the URL(s) to spider, and configuration parameters that control
the behavior of the spider. In addition, you may define I<callback> perl functions
in the configuration file that can dynamically change the behavior of the spider
based on URL, HTTP response headers, or the content of the fetched document. These
callback functions can also be used to filter or convert documents (e.g. PDF, gzip, MS Word)
into a format that swish-e can parse. Some examples are provided.
You define "servers" to spider, set a few parameters,
create callback routines, and start indexing as the synopsis above shows.
The spider requires its own configuration file (unless you want the default values). This
is NOT the same configuration file that swish-e uses.
The example configuration file C<SwishSpiderConfig.pl> is
included in the C<prog-bin> directory along with this script. Please just use it as an
example, as it contains more settings than you probably want to use. Start with a tiny config file
and add settings as required by your situation.
The available configuration parameters are discussed below.
If all that sounds confusing, then you can run the spider with default settings. In fact, you can
run the spider without using swish just to make sure it works. Just run
./spider.pl default http://someserver.com/sometestdoc.html
And you should see F<sometestdoc.html> dumped to your screen. Get ready to kill the script
if the file you request contains links as the output from the fetched pages will be displayed.
./spider.pl default http://someserver.com/sometestdoc.html > output.file
might be more friendly.
If the first parameter passed to the spider is the word "default" (as in the preceeding example)
then the spider uses the default parameters,
and the following parameter(s) are expected to be URL(s) to spider.
Otherwise, the first parameter is considered to be the name of the configuration file (as described
below). When using C<-S prog>, the swish-e configuration setting
C<SwishProgParameters> is used to pass parameters to the program specified
with C<IndexDir> or the C<-i> switch.
If you do not specify any parameters the program will look for the file
SwishSpiderConfig.pl
in the current directory.
The spider does require Perl's LWP library and a few other reasonably common modules.
Most well maintained systems should have these modules installed. If not, start here:
http://search.cpan.org/search?dist=libwww-perl
http://search.cpan.org/search?dist=HTML-Parser
See more below in C<REQUIREMENTS>. It's a good idea to check that you are running
a current version of these modules.
=head2 Robots Exclusion Rules and being nice
This script will not spider files blocked by F<robots.txt>. In addition,
The script will check for <meta name="robots"..> tags, which allows finer
control over what files are indexed and/or spidered.
See http://www.robotstxt.org/wc/exclusion.html for details.
This spider provides an extension to the <meta> tag exclusion, by adding a
C<NOCONTENTS> attribute. This attribute turns on the C<no_contents> setting, which
asks swish-e to only index the document's title (or file name if not title is found).
For example:
<META NAME="ROBOTS" CONTENT="NOCONTENTS, NOFOLLOW">
says to just index the document's title, but don't index its contents, and don't follow
any links within the document. Granted, it's unlikely that this feature will ever be used...
If you are indexing your own site, and know what you are doing, you can disable robot exclusion by
the C<ignore_robots_file> configuration parameter, described below. This disables both F<robots.txt>
and the meta tag parsing.
This script only spiders one file at a time, so load on the web server is not that great.
And with libwww-perl-5.53_91 HTTP/1.1 keep alive requests can reduce the load on
the server even more (and potentially reduce spidering time considerably!)
Still, discuss spidering with a site's administrator before beginning.
Use the C<delay_min> to adjust how fast the spider fetches documents.
Consider running a second web server with a limited number of children if you really
want to fine tune the resources used by spidering.
=head2 Duplicate Documents
The spider program keeps track of URLs visited, so a document is only indexed
one time.
The Digest::MD5 module can be used to create a "fingerprint" of every page
indexed and this fingerprint is used in a hash to find duplicate pages.
For example, MD5 will prevent indexing these as two different documents:
http://localhost/path/to/some/index.html
http://localhost/path/to/some/
But note that this may have side effects you don't want. If you want this
file indexed under this URL:
http://localhost/important.html
But the spider happens to find the exact content in this file first:
http://localhost/developement/test/todo/maybeimportant.html
Then only that URL will be indexed.
MD5 may slow down indexing a tiny bit, so test with and without if speed is an
issue (which it probably isn't since you are spidering in the first place).
This feature will also use more memory.
Note: the "prog" document source in swish bypasses many swish-e configuration settings.
For example, you cannot use the C<IndexOnly> directive with the "prog" document
source. This is by design to limit the overhead when using an external program
for providing documents to swish; after all, with "prog", if you don't want to index a file, then
don't give it to swish to index in the first place.
So, for spidering, if you do not wish to index images, for example, you will
need to either filter by the URL or by the content-type returned from the web
server. See L<CALLBACK FUNCTIONS|CALLBACK FUNCTIONS> below for more information.
=head1 REQUIREMENTS
Perl 5 (hopefully at least 5.00503) or later.
You must have the LWP Bundle on your computer. Load the LWP::Bundle via the CPAN.pm shell,
or download libwww-perl-x.xx from CPAN (or via ActiveState's ppm utility).
Also required is the the HTML-Parser-x.xx bundle of modules also from CPAN
(and from ActiveState for Windows).
http://search.cpan.org/search?dist=libwww-perl
http://search.cpan.org/search?dist=HTML-Parser
You will also need Digest::MD5 if you wish to use the MD5 feature.
HTML::Tagset is also required.
Other modules may be required (for example, the pod2xml.pm module
has its own requirementes -- see perldoc pod2xml for info).
The spider.pl script, like everyone else, expects perl to live in /usr/local/bin.
If this is not the case then either add a symlink at /usr/local/bin/perl
to point to where perl is installed
or modify the shebang (#!) line at the top of the spider.pl program.
Note that the libwww-perl package does not support SSL (Secure Sockets Layer) (https)
by default. See F<README.SSL> included in the libwww-perl package for information on
installing SSL support.
=head1 CONFIGURATION FILE
Configuration is not very fancy. The spider.pl program simply does a
C<do "path";> to read in the parameters and create the callback subroutines.
The C<path> is the first parameter passed to the spider script, which is set
by the Swish-e configuration setting C<SwishProgParameters>.
For example, if in your swish-e configuration file you have
SwishProgParameters /path/to/config.pl
IndexDir /home/moseley/swish-e/prog-bin/spider.pl
And then run swish as
swish-e -c swish.config -S prog
swish will run C</home/moseley/swish-e/prog-bin/spider.pl> and the spider.pl
program will receive as its first parameter C</path/to/config.pl>, and
spider.pl will read C</path/to/config.pl> to get the spider configuration
settings. If C<SwishProgParameters> is not set, the program will try to
use C<SwishSpiderConfig.pl> by default.
There is a special case of:
SwishProgParameters default http://www.mysite/index.html ...
Where default parameters are used. This will only index documents of type
C<text/html> or C<text/plain>, and will skip any file with an extension that matches
the pattern:
/\.(?:gif|jpeg|.png)$/i
This can be useful for indexing just your web documnts, but you will probably want finer
control over your spidering by using a configuration file.
The configuration file must set a global variable C<@servers> (in package main).
Each element in C<@servers> is a reference to a hash. The elements of the has
are described next. More than one server hash may be defined -- each server
will be spidered in order listed in C<@servers>, although currently a I<global> hash is
used to prevent spidering the same URL twice.
Examples:
my %serverA = (
base_url => 'http://swish-e.org/',
same_hosts => [ qw/www.swish-e.org/ ],
email => 'my@email.address',
);
my %serverB = (
...
...
);
@servers = ( \%serverA, \%serverB, );
=head1 CONFIGURATION OPTIONS
This describes the required and optional keys in the server configuration hash, in random order...
=over 4
=item base_url
This required setting is the starting URL for spidering.
Typically, you will just list one URL for the base_url. You may specify more than one
URL as a reference to a list
base_url => [qw! http://swish-e.org/ http://othersite.org/other/index.html !],
=item same_hosts
This optional key sets equivalent B<authority> name(s) for the site you are spidering.
For example, if your site is C<www.mysite.edu> but also can be reached by
C<mysite.edu> (with or without C<www>) and also C<web.mysite.edu> then:
Example:
$serverA{base_url} = 'http://www.mysite.edu/index.html';
$serverA{same_hosts} = ['mysite.edu', 'web.mysite.edu'];
Now, if a link is found while spidering of:
http://web.mysite.edu/path/to/file.html
it will be considered on the same site, and will actually spidered and indexed
as:
http://www.mysite.edu/path/to/file.html
Note: This should probably be called B<same_authority> because it compares the URI C<authority>
against the list of host names in C<same_hosts>. So, if you specify a port name in you will
probably want to specify the port name in the the list of hosts in C<same_hosts>:
my %serverA = (
base_url => 'http://sunsite.berkeley.edu:4444/',
same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
email => 'my@email.address',
);
=item email
This required key sets the email address for the spider. Set this to
your email address.
=item agent
This optional key sets the name of the spider.
=item link_tags
This optional tag is a reference to an array of tags. Only links found in these tags will be extracted.
The default is to only extract links from C<a> tags.
For example, to extract tags from C<a> tags and from C<frame> tags:
my %serverA = (
base_url => 'http://sunsite.berkeley.edu:4444/',
same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
email => 'my@email.address',
link_tags => [qw/ a frame /],
);
=item delay_min
This optional key sets the delay in minutes to wait between requests. See the
LWP::RobotUA man page for more information. The default is 0.1 (6 seconds),
but in general you will probably want it much smaller. But, check with
the webmaster before using too small a number.
=item max_time
This optional key will set the max minutes to spider. Spidering
for this host will stop after C<max_time> minutes, and move on to the
next server, if any. The default is to not limit by time.
=item max_files
This optional key sets the max number of files to spider before aborting.
The default is to not limit by number of files. This is the number of requests
made to the remote server, not the total number of files to index (see C<max_indexed>).
This count is displayted at the end of indexing as C<Unique URLs>.
This feature can (and perhaps should) be use when spidering a web site where dynamic
content may generate unique URLs to prevent run-away spidering.
=item max_indexed
This optional key sets the max number of files that will be indexed.
The default is to not limit. This is the number of files sent to
swish for indexing (and is reported by C<Total Docs> when spidering ends).
=item max_size
This optional key sets the max size of a file read from the web server.
This B<defaults> to 5,000,000 bytes. If the size is exceeded the resource is
skipped and a message is written to STDERR if the DEBUG_SKIPPED debug flag is set.
=item keep_alive
This optional parameter will enable keep alive requests. This can dramatically speed
up searching and reduce the load on server being spidered. The default is to not use
keep alives, although enabling it will probably be the right thing to do.
To get the most out of keep alives, you may want to set up your web server to
allow a lot of requests per single connection (i.e MaxKeepAliveRequests on Apache).
Apache's default is 100, which should be good. (But, in general, don't enable KeepAlives
on a mod_perl server.)
Note: try to filter as many documents as possible B<before> making the request to the server. In
other words, use C<test_url> to look for files ending in C<.html> instead of using C<test_response> to look
for a content type of C<text/html> if possible.
Do note that aborting a request from C<test_response> will break the
current keep alive connection.
Note: you must have at least libwww-perl-5.53_90 installed to use this feature.
=item skip
This optional key can be used to skip the current server. It's only purpose
is to make it easy to disable a server in a configuration file.
=item debug
Set this to a number to display different amounts of info while spidering. Writes info
to STDERR. Zero/undefined is no debug output.
The following constants are defined for debugging. They may be or'ed together to
get the individual debugging of your choice.
Here are basically the levels:
DEBUG_ERRORS general program errors (not used at this time)
DEBUG_URL print out every URL processes
DEBUG_HEADERS prints the response headers
DEBUG_FAILED failed to return a 200
DEBUG_SKIPPED didn't index for some reason
DEBUG_INFO more verbose
DEBUG_LINKS prints links as they are extracted
For example, to display the urls processed, failed, and skipped use:
debug => DEBUG_URL | DEBUG_FAILED | DEBUG_SKIPPED,
To display the returned headers
debug => DEBUG_HEADERS,
You can easily run the spider without using swish for debugging purposes:
./spider.pl test.config > spider.out
And you will see debugging info as it runs, and the fetched documents will be saved
in the C<spider.out> file.
=item max_depth
The C<max_depth> parameter can be used to limit how deeply to recurse a web site.
The depth is just a count of levels of web pages decended, and not related to
the number of path elements in a URL.
A max_depth of zero says to only spider the page listed as the C<base_url>. A max_depth of one will
spider the C<base_url> page, plus all links on that page, and no more. The default is to spider all
pages.
=item ignore_robots_file
If this is set to true then the robots.txt file will not be checked when spidering
this server. Don't use this option unless you know what you are doing.
=item use_cookies
If this is set then a "cookie jar" will be maintained while spidering. Some
(poorly written ;) sites require cookies to be enabled on clients.
This requires the HTTP::Cookies module.
=item use_md5
If this setting is true, then a MD5 digest "fingerprint" will be made from the content of every
spidered document. This digest number will be used as a hash key to prevent
indexing the same content more than once. This is helpful if different URLs
generate the same content.
Obvious example is these two documents will only be indexed one time:
http://localhost/path/to/index.html
http://localhost/path/to/
This option requires the Digest::MD5 module. Spidering with this option might
be a tiny bit slower.
=item validate_links
Just a hack. If you set this true the spider will do HEAD requests all links (e.g. off-site links), just
to make sure that all your links work.
=back
=head1 CALLBACK FUNCTIONS
Three callback functions can be defined in your parameter hash.
These optional settings are I<callback> subroutines that are called while
processing URLs.
A little perl discussion is in order:
In perl, a scalar variable can contain a reference to a subroutine. The config example above shows
that the configuration parameters are stored in a perl I<hash>.
my %serverA = (
base_url => 'http://sunsite.berkeley.edu:4444/',
same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
email => 'my@email.address',
link_tags => [qw/ a frame /],
);
There's two ways to add a reference to a subroutine to this hash:
sub foo {
return 1;
}
my %serverA = (
base_url => 'http://sunsite.berkeley.edu:4444/',
same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
email => 'my@email.address',
link_tags => [qw/ a frame /],
test_url => \&foo, # a reference to a named subroutine
);
Or the subroutine can be coded right in place:
my %serverA = (
base_url => 'http://sunsite.berkeley.edu:4444/',
same_hosts => [ qw/www.sunsite.berkeley.edu:4444/ ],
email => 'my@email.address',
link_tags => [qw/ a frame /],
test_url => sub { reutrn 1; },
);
The above example is not very useful as it just creates a user callback function that
always returns a true value (the number 1). But, it's just an example.
The function calls are wrapped in an eval, so calling die (or doing something that dies) will just cause
that URL to be skipped. If you really want to stop processing you need to set $server->{abort} in your
subroutine (or send a kill -HUP to the spider).
The first two parameters passed are a URI object (to have access to the current URL), and
a reference to the current server hash. The C<server> hash is just a global hash for holding data, and
useful for setting flags as describe belwo.
Other parameters may be also passes, as described below.
In perl parameters are passed in an array called "@_". The first element (first parameter) of
that array is $_[0], and the second is $_[1], and so on. Depending on how complicated your
function is you may wish to shift your parameters off of the @_ list to make working with them
easier. See the examples below.
To make use of these routines you need to understand when they are called, and what changes
you can make in your routines. Each routine deals with a given step, and returning false from
your routine will stop processing for the current URL.
=over 4
=item test_url
C<test_url> allows you to skip processing of urls based on the url before the request
to the server is made. This function is called for the C<base_url> links (links you define in
the spider configuration file) and for every link extracted from a fetched web page.
This function is a good place to skip links that you are not interested in following. For example,
if you know there's no point in requesting images then you can exclude them like:
test_url => sub {
my $uri = shift;
return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;
return 1;
},
Or to write it another way:
test_url => sub { $_[0]->path !~ /\.(gif|jpeg|png)$/ },
Another feature would be if you were using a web server where path names are
NOT case sensitive (e.g. Windows). You can normalize all links in this situation
using something like
test_url => sub {
my $uri = shift;
return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;
$uri->path( lc $uri->path ); # make all path names lowercase
return 1;
},
The important thing about C<test_url> (compared to the other callback functions) is that
it is called while I<extracting> links, not while actually fetching that page from the web
server. Returning false from C<test_url> simple says to not add the URL to the list of links to
spider.
You may set a flag in the server hash (second parameter) to tell the spider to abort processing.
test_url => sub {
my $server = $_[1];
$server->{abort}++ if $_[0]->path =~ /foo\.html/;
return 1;
},
You cannot use the server flags:
no_contents
no_index
no_spider
This is discussed below.
=item test_response
This function allows you to filter based on the response from the
remote server (such as by content-type). This function is called while the
web pages is being fetched from the remote server, typically after just enought
data has been returned to read the response from the web server.
The spider requests a document in "chunks" of 4096 bytes. 4096 is only a suggestion
of how many bytes to return in each chunk. The C<test_response> routine is
called when the first chunk is received only. This allows ignoring (aborting)
reading of a very large file, for example, without having to read the entire file.
Although not much use, a reference to this chunk is passed as the forth parameter.
Web servers use a Content-Type: header to define the type of data returned from the server.
On a web server you could have a .jpeg file be a web page -- file extensions may not always
indicate the type of the file. The third parameter ($_[2]) returned is a reference to a
HTTP::Response object:
For example, to only index true HTML (text/html) pages:
test_response => sub {
my $content_type = $_[2]->content_type;
return $content_type =~ m!text/html!;
},
You can also set flags in the server hash (the second parameter) to control indexing:
no_contents -- index only the title (or file name), and not the contents
no_index -- do not index this file, but continue to spider if HTML
no_spider -- index, but do not spider this file for links to follow
abort -- stop spidering any more files
For example, to avoid index the contents of "private.html", yet still follow any links
in that file:
test_url => sub {
my $server = $_[1];
$server->{no_index}++ if $_[0]->path =~ /private\.html$/;
return 1;
},
Note: Do not modify the URI object in this call back function.
=item filter_content
This callback function is called right before sending the content to swish.
Like the other callback function, returning false will cause the URL to be skipped.
Setting the C<abort> server flag and returning false will abort spidering.
You can also set the C<no_contents> flag.
This callback function is passed four parameters.
The URI object, server hash, the HTTP::Response object,
and a reference to the content.
You can modify the content as needed. For example you might not like upper case:
filter_content => sub {
my $content_ref = $_[3];
$$content_ref = lc $$content_ref;
return 1;
},
I more reasonable example would be converting PDF or MS Word documents for parsing by swish.
Examples of this are provided in the F<prog-bin> directory of the swish-e distribution.
You may also modify the URI object to change the path name passed to swish for indexing.
filter_content => sub {
my $uri = $_[0];
$uri->host('www.other.host') ;
return 1;
},
Swish-e's ReplaceRules feature can also be used for modifying the path name indexed.
Here's a bit more advanced example of indexing text/html and PDF files only:
use pdf2xml; # included example pdf converter module
$server{filter_content} = sub {
my ( $uri, $server, $response, $content_ref ) = @_;
return 1 if $response->content_type eq 'text/html';
return 0 unless $response->content_type eq 'application/pdf';
# for logging counts
$server->{counts}{'PDF transformed'}++;
$$content_ref = ${pdf2xml( $content_ref )};
return 1;
}
=back
Note that you can create your own counters to display in the summary list when spidering
is finished by adding a value to the hash pointed to by C<$server->{counts}>.
test_url => sub {
my $server = $_[1];
$server->{no_index}++ if $_[0]->path =~ /private\.html$/;
$server->{counts}{'Private Files'}++;
return 1;
},
Each callback function B<must> return true to continue processing the URL. Returning false will
cause processing of I<the current> URL to be skipped.
=head2 More on setting flags
Swish (not this spider) has a configuration directive C<NoContents> that will instruct swish to
index only the title (or file name), and not the contents. This is often used when
indexing binary files such as image files, but can also be used with html
files to index only the document titles.
As shown above, you can turn this feature on for specific documents by setting a flag in
the server hash passed into the C<test_response> or C<filter_contents> subroutines.
For example, in your configuration file you might have the C<test_response> callback set
as:
test_response => sub {
my ( $uri, $server, $response ) = @_;
# tell swish not to index the contents if this is of type image
$server->{no_contents} = $response->content_type =~ m[^image/];
return 1; # ok to index and spider this document
}
The entire contents of the resource is still read from the web server, and passed
on to swish, but swish will also be passed a C<No-Contents> header which tells
swish to enable the NoContents feature for this document only.
Note: Swish will index the path name only when C<NoContents> is set, unless the document's
type (as set by the swish configuration settings C<IndexContents> or C<DefaultContents>) is
HTML I<and> a title is found in the html document.
Note: In most cases you probably would not want to send a large binary file to swish, just
to be ignored. Therefore, it would be smart to use a C<filter_contents> callback routine to
replace the contents with single character (you cannot use the empty string at this time).
A similar flag may be set to prevent indexing a document at all, but still allow spidering.
In general, if you want completely skip spidering a file you return false from one of the
callback routines (C<test_url>, C<test_response>, or C<filter_content>). Returning false from any of those
three callbacks will stop processing of that file, and the file will B<not> be spidered.
But there may be some cases where you still want to spider (extract links) yet, not index the file. An example
might be where you wish to index only PDF files, but you still need to spider all HTML files to find
the links to the PDF files.
$server{test_response} = sub {
my ( $uri, $server, $response ) = @_;
$server->{no_index} = $response->content_type ne 'application/pdf';
return 1; # ok to spider, but don't index
}
So, the difference between C<no_contents> and C<no_index> is that C<no_contents> will still index the file
name, just not the contents. C<no_index> will still spider the file (if it's C<text/html>) but the
file will not be processed by swish at all.
B<Note:> If C<no_index> is set in a C<test_response> callback function then
the document I<will not be filtered>. That is, your C<filter_content>
callback function will not be called.
The C<no_spider> flag can be set to avoid spiderering an HTML file. The file will still be indexed unless
C<no_index> is also set. But if you do not want to index and spider, then simply return false from one of the three
callback funtions.
=head1 SIGNALS
Sending a SIGHUP to the running spider will cause it to stop spidering. This is a good way to abort spidering, but
let swish index the documents retrieved so far.
=head1 SEE ALSO
L<URI> L<LWP::RobotUA> L<WWW::RobotRules> L<Digest::MD5>
=head1 COPYRIGHT
Copyright 2001 Bill Moseley
This program is free software; you can redistribute it and/or modify
it under the same terms as Perl itself.
=head1 SUPPORT
Send all questions to the The SWISH-E discussion list.
See http://sunsite.berkeley.edu/SWISH-E.
=cut
1.1 modperl-docs/src/search/swish.cgi
Index: swish.cgi
===================================================================
#!/usr/local/bin/perl -w
package SwishSearch;
use strict;
use lib qw( modules ); ### This must be adjusted!
####################################################################################
#
# If this text is displayed on your browser then your web server
# is not configured to run .cgi programs. Contact your web server administrator.
#
# To display documentation for this program type "perldoc swish.cgi"
#
# swish.cgi $Revision: 1.1 $ Copyright (C) 2001 Bill Moseley swishscript@hank.org
# Example CGI program for searching with SWISH-E
#
# This example program will only run under an OS that supports fork().
# Ok, piped opens.
#
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version
# 2 of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# The above lines must remain at the top of this program
#
# $Id: swish.cgi,v 1.1 2002/01/30 06:35:00 stas Exp $
#
####################################################################################
# This is written this way so the script can be used as a CGI script or a mod_perl
# module without any code changes.
# use CGI (); # might not be needed if using Apache::Request
#=================================================================================
# CGI entry point
#
#=================================================================================
# Run the script -- entry point if running as a CGI script
unless ( $ENV{MOD_PERL} ) {
my $config = default_config();
# Merge with disk config file.
$config = merge_read_config( $config );
process_request( $config );
}
#=================================================================================
# mod_perl entry point
#
# As an example, you might use a PerlSetVar to point to paths to different
# config files, and then cache the different configurations by path.
#
#=================================================================================
my %cached_configs;
sub handler {
my $r = shift;
if ( my $config_path = $r->dir_config( 'Swish_Conf_File' ) ) {
# Already cached?
if ( $cached_configs{ $config_path } ) {
process_request( $cached_configs{ $config_path } );
return Apache::Constants::OK();
}
# Else, load config
my $config = default_config();
$config->{config_file} = $config_path;
# Merge with disk config file.
$cached_configs{ $config_path } = merge_read_config( $config );
process_request( $cached_configs{ $config_path } );
return Apache::Constants::OK();
}
# Otherwise, use hard-coded config
process_request( default_config() );
return Apache::Constants::OK();
}
#==================================================================================
# This sets the default configuration
# Any configuration read from disk is merged with these settings.
#
# Only a few settings are actually required. Some reasonable defaults are used
# for most. If fact, you can probably create a complete config as:
#
# return = {
# swish_binary => '/usr/local/bin/swish-e',
# swish_index => '/usr/local/share/swish/index.swish-e',
# title_property => 'swishtitle', # Not required, but recommended
# };
#
# But, that doesn't really show all the options.
#
# You can modify the options below, or you can use a config file. The config file
# is .swishcgi.conf by default (read from the current directory) that must return
# a hash reference. For example, to create a config file that changes the default
# title and index file name, plus uses Template::Toolkit to generate output
# create a config file as:
#
# # Example config file -- returns a hash reference
# {
# title => 'Search Our Site',
# swish_index => 'index.web',
#
# template => {
# package => 'TemplateToolkit',
# file => 'search.tt',
# options => {
# INCLUDE_PATH => '/home/user/swish-e/example',
# },
# };
#
#
#-----------------------------------------------------------------------------------
sub default_config {
##### Configuration Parameters #########
#---- This lists all the options, with many commented out ---
# By default, this config is used -- see the process_request() call below.
# You should adjust for your site, and how your swish index was created.
##>>
##>> Please don't post this entire section on the swish-e list if looking for help!
##>>
##>> Send a small example, without all the comments.
# Items beginning with an "x" or "#" are commented out
return {
title => 'Search our site', # Title of your choice.
swish_binary => './swish-e', # Location of swish-e binary
# By default, this script tries to read a config file. You should probably
# comment this out if not used save a disk stat
config_file => '.swishcgi.conf', # Default config file
# The location of your index file. Typically, this would not be in
# your web tree.
# If you have more than one index to search then specify an array
# reference. e.g. swish_index =>[ qw/ index1 index2 index3 /],
swish_index => 'index.swish-e', # Location of your index file
# See "select_indexes" below for how to
# select more than one index.
page_size => 15, # Number of results per page - default 15
# Property name to use as the main link text to the indexed document.
# Typically, this will be 'swishtitle' if have indexed html documents,
# But you can specify any PropertyName defined in your document.
# By default, swish will return the pathname for documents that do not
# have a title.
title_property => 'swishtitle',
# prepend this path to the filename (swishdocpath) returned by swish. This is used to
# make the href link back to the original document. Comment out to disable.
#prepend_path => 'http://localhost/mydocs',
# Swish has a configuration directive "StoreDescription" that will save part or
# all of a document's contents in the index file. This can then be displayed
# along with results. If you are indexing a lot of files this can use a lot of disk
# space, so test carefully before indexing your entire site.
# Building swish with zlib can greatly reduce the space used by StoreDescription
#
# This settings tells this script to display this description.
# Normally, this should be 'swishdescription', but you can specify another property name.
# There is no default.
description_prop => 'swishdescription',
# Property names listed here will be displayed in a table below each result
# You may wish to modify this list if you are using document properties (PropertyNames)
# in your swish-e index configuration
# There is no default.
display_props => [qw/swishlastmodified swishdocsize swishdocpath/],
# Results can be be sorted by any of the properties listed here
# They will be displayed in a drop-down list
# Again, you may modify this list if you are using document properties of your own creation
# Swish uses the rank as the default sort
sorts => [qw/swishrank swishlastmodified swishtitle swishdocpath/],
# Secondary_sort is used to sort within a sort
# You may enter a property name followed by a direction (asc|desc)
secondary_sort => [qw/swishlastmodified desc/],
# You can limit by MetaNames here. Names listed here will be displayed in
# a line of radio buttons.
# The default is to not allow any metaname selection.
# To use this feature you must define MetaNames while indexing.
# The special "swishdefault" says to search any text that was not indexed
# as a metaname (e.g. the body of a HTML document).
# To see how this might work, add to your config file:
# MetaNames swishtitle swishdocpath
# and try:
metanames => [qw/swishdefault swishtitle swishdocpath/],
# Another example: if you indexed an email archive
# that defined the metanames subject name email (as in the swish-e discussion archive)
# you might use:
#metanames => [qw/body subject name email/],
# Note that you can do a real "all" search if you use nested metanames in your source documents.
# Nesting metanames is most common with XML documents.
# These are used to map MetaNames and PropertyNames to user-friendly names
# on the form.
name_labels => {
swishdefault => 'Title & Body',
swishtitle => 'Title',
swishrank => 'Rank',
swishlastmodified => 'Last Modified Date',
swishdocpath => 'Document Path',
swishdocsize => 'Document Size',
subject => 'Message Subject',
name => "Poster's Name",
email => "Poster's Email",
sent => 'Message Date',
ALL => 'Message text',
},
timeout => 10, # limit time used by swish when fetching results - DoS protection.
# These settings will use some crude highlighting code to highlight search terms in the
# property specified above as the description_prop (normally, 'swishdescription').
max_chars => 500, # If "highlight" is not defined, then just truncate the description to this many *chars*.
# If you want to go by *words*, enable highlighting,
# and then comment-out show_words. It will be a little slower.
# This structure defines term highlighting, and what type of highlighting to use
highlight => {
# Pick highlighting module -- you must make sure the module can be found
# Ok speed, but doesn't handle phrases.
#Deals with stemming, but not stopwords
#package => 'DefaultHighlight',
# Somewhat slow, but deals with phases, stopwords, and stemming.
# Takes into consideration WordCharacters, IgnoreFirstChars and IgnoreLastChars.
package => 'PhraseHighlight',
# Fast: phrases without regard to wordcharacter settings
# doesn't do context display, so must match in first X words,
# doesn't handle stemming or stopwords.
#package => 'SimpleHighlight',
show_words => 10, # Number of swish words words to show around highlighted word
max_words => 100, # If no words are found to highlighted then show this many words
occurrences => 6, # Limit number of occurrences of highlighted words
#highlight_on => '<b>', # HTML highlighting codes
#highlight_off => '</b>',
highlight_on => '<font style="background:#FFFF99">',
highlight_off => '</font>',
meta_to_prop_map => { # this maps search metatags to display properties
swishdefault => [ qw/swishtitle swishdescription/ ],
swishtitle => [ qw/swishtitle/ ],
swishdocpath => [ qw/swishdocpath/ ],
},
},
# If you specify more than one index file (as an array reference) you
# can set this allow selection of which indexes to search.
# The default is to search all indexes specified if this is not used.
# When used, the first index is the default index.
# You need to specify your indexes as an array reference:
#swish_index => [ qw/ index.swish-e index.other index2.other index3.other index4.other / ],
Xselect_indexes => {
#method => 'radio_group', # pico radio_group, popup_menu, or checkbox_group
method => 'checkbox_group',
#method => 'popup_menu',
columns => 3,
labels => [ 'Main Index', 'Other Index', qw/ two three four/ ], # Must match up one-to-one
description => 'Select Site: ',
},
# Similar to select_indexes, this adds a metaname search
# based on a metaname. You can use any metaname, and this will
# add an "AND" search to limit results to a subset of your records.
# i.e. it adds something like 'site=(foo or bar or baz)' if foo, bar, and baz were selected.
# Swish-e's ExtractPath would work well with this. For example, the apache docs:
# ExtractPath site regex !^/usr/local/apache/htdocs/manual/([^/]+)/.+$!$1!
# ExtractPathDefault site other
Xselect_by_meta => {
#method => 'radio_group', # pico radio_group, popup_menu, or checkbox_group
method => 'checkbox_group',
#method => 'popup_menu',
columns => 3,
metaname => 'site', # Can't be a metaname used elsewhere!
values => [qw/misc mod vhosts other/],
labels => {
misc => 'General Apache docs',
mod => 'Apache Modules',
vhosts => 'Virutal hosts',
},
description => 'Limit search to these areas: ',
},
# The 'template' setting defines what generates the output
# The default is "TemplateDefault" which is reasonably ugly.
# Note that some of the above options may not be available
# for templating, as it's up to you do layout the form
# and results in your template.
xtemplate => {
package => 'TemplateDefault',
},
xtemplate => {
package => 'TemplateDumper',
},
xtemplate => {
package => 'TemplateToolkit',
file => 'search.tt',
options => {
INCLUDE_PATH => '/home/user/swish-e/example',
#PRE_PROCESS => 'config',
},
},
xtemplate => {
package => 'TemplateHTMLTemplate',
options => {
filename => 'swish.tmpl',
die_on_bad_params => 0,
loop_context_vars => 1,
cache => 1,
},
},
# This defines the package object for reading CGI parameters
# Defaults to CGI. Might be useful with mod_perl.
# request_package => 'CGI',
# request_package => 'Apache::Request',
# Limit to date ranges
# This adds in the date_range limiting options
# You will need the DateRanges.pm module from the author to use that feature
# Noramlly, you will want to limit by the last modified date, so specify
# "swishlastmodified" as the property_name. If indexing a mail archive, and, for
# example, you store the date (a unix timestamp) as "date" then specify
# "date" as the property_name.
date_ranges => {
property_name => 'swishlastmodified', # property name to limit by
# what you specify here depends on the DateRanges.pm module.
time_periods => [
'All',
'Today',
'Yesterday',
#'Yesterday onward',
'This Week',
'Last Week',
'Last 90 Days',
'This Month',
'Last Month',
#'Past',
#'Future',
#'Next 30 Days',
],
line_break => 0,
default => 'All',
date_range => 1,
},
};
}
#============================================================================
# Read config settings from disk, and merge
# Note, all errors are ignored since by default this script looks for a
# config file.
#
#============================================================================
sub merge_read_config {
my $config = shift;
return $config unless $config->{config_file};
my $return = do $config->{config_file};
return $config unless ref $return eq 'HASH';
# Merge settings
return { %$config, %$return };
}
#============================================================================
#
# This is the main entry point, where a config hash is passed in.
#
#============================================================================
sub process_request {
my $conf = shift; # configuration parameters
# Use CGI.pm by default
my $request_package = $conf->{request_package} || 'CGI';
$request_package =~ s[::][/]g;
require "$request_package.pm";
# create search object
my $search = SwishQuery->new(
config => $conf,
request => ($conf->{request_package} ? $conf->{request_package}->new : CGI->new),
);
# run the query
my $results = $search->run_query; # currently, results is the just the $search object
my $template = $conf->{template} || { package => 'TemplateDefault' };
my $package = $template->{package};
my $file = "$package.pm";
$file =~ s[::][/]g;
require $file;
$package->show_template( $template, $results );
}
#==================================================================================================
package SwishQuery;
#==================================================================================================
use Carp;
#--------------------------------------------------------------------------------
# new() doesn't do much, just create the object
#--------------------------------------------------------------------------------
sub new {
my $class = shift;
my %options = @_;
my $conf = $options{config};
croak "Failed to set the swish index files in config setting 'swish_index'" unless $conf->{swish_index};
croak "Failed to specify 'swish_binary' in configuration" unless $conf->{swish_binary};
# initialize the request search hash
my $sh = {
prog => $conf->{swish_binary},
config => $conf,
q => $options{request},
hits => 0,
MOD_PERL => $ENV{MOD_PERL},
};
return bless $sh, $class;
}
sub hits { shift->{hits} }
sub config {
my ($self, $setting, $value ) = @_;
croak "Failed to pass 'config' a setting" unless $setting;
my $cur = $self->{config}{$setting} if exists $self->{config}{$setting};
$self->{config}{$setting} = $value if $value;
return $cur;
}
sub header {
my $self = shift;
return unless ref $self->{_headers} eq 'HASH';
return $self->{_headers}{$_[0]} || '';
}
# return a ref to an array
sub results {
my $self = shift;
return $self->{_results} || undef;
}
sub navigation {
my $self = shift;
return unless ref $self->{navigation} eq 'HASH';
return exists $self->{navigation}{$_[0]} ? $self->{navigation}{$_[0]} : '';
}
sub CGI { $_[0]->{q} };
sub swish_command {
my $self = shift;
unless ( @_ ) {
return $self->{swish_command} ? @{$self->{swish_command}} : undef;
}
push @{$self->{swish_command}}, @_;
}
sub errstr {
my ($self, $value ) = @_;
$self->{_errstr} = $value if $value;
return $self->{_errstr} || '';
}
#============================================
# This returns "$self" just in case we want to seperate out into two objects later
sub run_query {
my $self = shift;
my $q = $self->{q};
my $conf = $self->{config};
# Sets the query string, and any -L limits.
return $self unless $self->build_query;
# Set the starting position (which is offset by one)
my $start = $q->param('start') || 0;
$start = 0 unless $start =~ /^\d+$/ && $start >= 0;
$self->swish_command( '-b', $start+1 );
# Set the max hits
my $page_size = $self->config('page_size') || 15;
$self->swish_command( '-m', $page_size );
return $self unless $self->set_index_file;
# Set the sort option, if any
return $self unless $self->set_sort_order;
# Trap the call - not portable.
my $timeout = $self->config('timeout');
if ( $timeout ) {
eval {
local $SIG{ALRM} = sub { die "Timed out\n" };
alarm ( $self->config('timeout') || 5 );
$self->run_swish;
alarm 0;
};
if ( $@ ) {
$self->errstr( $@ );
return $self;
}
} else {
$self->run_swish;
}
my $hits = $self->hits;
return $self unless $hits;
# Build href for repeated search via GET (forward, backward links)
my @query_string =
map { "$_=" . $q->escape( $q->param($_) ) }
grep { $q->param($_) } qw/query metaname sort reverse/;
for my $p ( qw/si sbm/ ) {
my @settings = $q->param($p);
next unless @settings;
push @query_string, "$p=" . $q->escape( $_ ) for @settings;
}
if ( $conf->{date_ranges} ) {
my $dr = DateRanges::GetDateRangeArgs( $q );
push @query_string, $dr, if $dr;
}
$self->{query_href} = $q->script_name . '?' . join '&', @query_string;
# Return the template fields
$self->{my_url} = $q->script_name;
$self->{hits} = $hits;
$self->{navigation} = {
showing => $hits,
from => $start + 1,
to => $start + $hits,
hits => $self->header('number of hits') || 0,
run_time => $self->header('run time') || 'unknown',
search_time => $self->header('search time') || 'unknown',
};
$self->set_page ( $page_size );
return $self;
}
#============================================================
# Build a query string from swish
# Just builds the -w string
#------------------------------------------------------------
sub build_query {
my $self = shift;
my $q = $self->{q};
# set up the query string to pass to swish.
my $query = $q->param('query') || '';
for ( $query ) { # trim the query string
s/\s+$//;
s/^\s+//;
}
$self->{query_simple} = $query; # without metaname
$q->param('query', $query ); # clean up the query, if needed.
# Read in the date limits, if any. This can create a new query
return unless $self->get_date_limits( \$query );
unless ( $query ) {
$self->errstr('Please enter a query string') if $q->param('submit');
return;
}
if ( length( $query ) > 100 ) {
$self->errstr('Please enter a shorter query');
return;
}
# Adjust the query string for metaname search
# *Everything* is a metaname search
# Might also like to allow searching more than one metaname at the same time
my $metaname = $q->param('metaname') || 'swishdefault';
# make sure it's a valid metaname
my $conf = $self->{config};
my @metas = ('swishdefault');
push @metas, @{ $self->config('metanames')} if $self->config('metanames');
unless ( grep { $metaname eq $_ } @metas ) {
$self->errstr('Bad MetaName provided');
return;
}
# prepend metaname to query
$query = $metaname . "=($query)";
# save the metaname so we know what field to highlight
$self->{metaname} = $metaname;
## Look for a "limit" metaname -- perhaps used with ExtractPath
# Here we don't worry about user supplied data
my $limits = $self->config('select_by_meta');
my @limits = $q->param('sbm'); # Select By Metaname
# Note that this could be messed up by ending the query in a NOT or OR
# Should look into doing:
# $query = "( $query ) AND " . $limits->{metaname} . '=(' . join( ' OR ', @limits ) . ')';
if ( @limits && ref $limits eq 'HASH' && $limits->{metaname} ) {
$query .= ' and ' . $limits->{metaname} . '=(' . join( ' or ', @limits ) . ')';
}
$self->swish_command('-w', $query );
return 1;
}
#========================================================================
# Get the index files from the form, or from simple the config settings
#------------------------------------------------------------------------
sub set_index_file {
my $self = shift;
my $q = $self->CGI;
# Set the index file
if ( $self->config('select_indexes') && ref $self->config('swish_index') eq 'ARRAY' ) {
my @choices = $q->param('si');
if ( !@choices ) {
$self->errstr('Please select a source to search');
return;
}
my @indexes = @{$self->config('swish_index')};
my @selected_indexes = grep {/^\d+$/ && $_ >= 0 && $_ < @indexes } @choices;
if ( !@selected_indexes ) {
$self->errstr('Invalid source selected');
return $self;
}
$self->swish_command( '-f', @indexes[ @selected_indexes ] );
} else {
my $indexes = $self->config('swish_index');
$self->swish_command( '-f', ref $indexes ? @$indexes : $indexes );
}
return 1;
}
#================================================================================
# Parse out the date limits from the form or from GET request
#
#---------------------------------------------------------------------------------
sub get_date_limits {
my ( $self, $query_ref ) = @_;
my $conf = $self->{config};
# Are date ranges enabled?
return 1 unless $conf->{date_ranges};
eval { require DateRanges };
if ( $@ ) {
$self->errstr( $@ );
delete $conf->{date_ranges};
return;
}
my $q = $self->{q};
my %limits;
unless ( DateRanges::DateRangeParse( $q, \%limits ) ) {
$self->errstr( $limits{DateRanges_error} || 'Bad date range selection' );
return;
}
# Store the values for later
$self->{DateRanges_time_low} = $limits{DateRanges_time_low};
$self->{DateRanges_time_high} = $limits{DateRanges_time_high};
# Allow searchs just be date if not "All dates" search
# $$$ should place some limits here, and provide a switch to disable
if ( !$$query_ref && $limits{DateRanges_time_high} ) {
$$query_ref = 'not skaisikdeekk';
$self->{_search_all}++; # flag
}
my $limit_prop = $conf->{date_ranges}{property_name} || 'swishlastmodified';
if ( $limits{DateRanges_time_low} && $limits{DateRanges_time_high} ) {
$self->swish_command( '-L', $limit_prop, $limits{DateRanges_time_low}, $limits{DateRanges_time_high} );
}
return 1;
}
#================================================================
# Set the sort order
# Just builds the -s string
#----------------------------------------------------------------
sub set_sort_order {
my $self = shift;
my $q = $self->{q};
my $sorts_array = $self->config('sorts');
return 1 unless $sorts_array;
my $conf = $self->{config};
# Now set sort option - if a valid option submitted (or you could let swish-e return the error).
my %sorts = map { $_, 1 } @$sorts_array;
if ( $q->param('sort') && $sorts{ $q->param('sort') } ) {
my $direction = $q->param('sort') eq 'swishrank'
? $q->param('reverse') ? 'asc' : 'desc'
: $q->param('reverse') ? 'desc' : 'asc';
$self->swish_command( '-s', $q->param('sort'), $direction );
if ( $conf->{secondary_sort} && $q->param('sort') ne $conf->{secondary_sort}[0] ) {
$self->swish_command(ref $conf->{secondary_sort} ? @{ $conf->{secondary_sort} } : $conf->{secondary_sort} );
}
} else {
$self->errstr( 'Invalid Sort Option Selected' );
return;
}
return 1;
}
#========================================================
# Sets prev and next page links.
# Feel free to clean this code up!
#
# Pass:
# $resutls - reference to a hash (for access to the headers returned by swish)
# $q - CGI object
#
# Returns:
# Sets entries in the $results hash
#
sub set_page {
my ( $self, $Page_Size ) = @_;
my $q = $self->{q};
my $navigation = $self->{navigation};
my $start = $navigation->{from} - 1; # Current starting record
my $prev = $start - $Page_Size;
$prev = 0 if $prev < 0;
if ( $prev < $start ) {
$navigation->{prev} = $prev;
$navigation->{prev_count} = $start - $prev;
}
my $last = $navigation->{hits} - 1;
my $next = $start + $Page_Size;
$next = $last if $next > $last;
my $cur_end = $start + $self->{hits} - 1;
if ( $next > $cur_end ) {
$navigation->{next} = $next;
$navigation->{next_count} = $next + $Page_Size > $last
? $last - $next + 1
: $Page_Size;
}
# Calculate pages ( is this -1 correct here? )
my $pages = int (($navigation->{hits} -1) / $Page_Size);
if ( $pages ) {
my @pages = 0..$pages;
my $max_pages = 10;
if ( @pages > $max_pages ) {
my $current_page = int ( $start / $Page_Size - $max_pages/2) ;
$current_page = 0 if $current_page < 0;
if ( $current_page + $max_pages - 1 > $pages ) {
$current_page = $pages - $max_pages;
}
@pages = $current_page..$current_page + $max_pages - 1;
unshift @pages, 0 if $current_page;
push @pages, $pages unless $current_page + $max_pages - 1 == $pages;
}
$navigation->{pages} =
join ' ', map {
my $page_start = $_ * $Page_Size;
my $page = $_ + 1;
$page_start == $start
? $page
: qq[<a href="$self->{query_href}&start=$page_start">$page</a>];
} @pages;
}
}
#==================================================
# Format and return the date range options in HTML
#
#--------------------------------------------------
sub get_date_ranges {
my $self = shift;
my $q = $self->{q};
my $conf = $self->{config};
return '' unless $conf->{date_ranges};
# pass parametes, and a hash to store the returned values.
my %fields;
DateRanges::DateRangeForm( $q, $conf->{date_ranges}, \%fields );
# Set the layout:
my $string = '<br>Limit to: '
. ( $fields{buttons} ? "$fields{buttons}<br>" : '' )
. ( $fields{date_range_button} || '' )
. ( $fields{date_range_low}
? " $fields{date_range_low} through $fields{date_range_high}"
: '' );
return $string;
}
#============================================
# Run swish-e and gathers headers and results
# Currently requires fork() to run.
#
# Pass:
# $sh - an array with search parameters
#
# Returns:
# a reference to a hash that contains the headers and results
# or possibly a scalar with an error message.
#
use Symbol;
sub run_swish {
my $self = shift;
my $results = $self->{results};
my $conf = $self->{config};
my $q = $self->{q};
my @properties;
my %seen;
# Gather up the properties specified
for ( qw/ title_property description_prop display_props / ) {
push @properties, ref $conf->{$_} ? @{$conf->{$_}} : $conf->{$_}
if $conf->{$_} && !$seen{$_}++;
}
# Add in the default props
for ( qw/swishrank swishdocpath/ ) {
push @properties, $_ unless $seen{$_};
}
# add in the default prop - a number must be first (this might be a duplicate in -x, oh well)
@properties = ( 'swishreccount', @properties );
$self->swish_command( -x => join( '\t', map { "<$_>" } @properties ) . '\n' );
$self->swish_command( -H => 9 );
# Run swish
my $fh = gensym;
my $pid = open( $fh, '-|' );
die "Failed to fork: $!\n" unless defined $pid;
if ( !$pid ) { # in child
exec $self->{prog}, $self->swish_command or die "Failed to exec '$self->{prog}' Error:$!";
}
$self->{COMMAND} = join ' ', $self->{prog}, $self->swish_command;
# read in from child
my @results;
my $trim_prop = $self->config('description_prop');
my $highlight = $self->config('highlight');
my $highlight_object;
# Loop through values returned from swish.
my %stops_removed;
while (<$fh>) {
chomp;
# This will not work correctly with multiple indexes when different values are used.
if ( /^# ([^:]+):\s+(.+)$/ ) {
my $h = lc $1;
my $value = $2;
$self->{_headers}{$h} = $value;
push @{$self->{_headers}{'removed stopwords'}}, $value if $h eq 'removed stopword' && !$stops_removed{$value}++;
next;
}
# return errors as text
$self->errstr($1), return if /^err:\s*(.+)/;
# Found a result
if ( /^\d/ ) {
my %h;
@h{@properties} = split /\t/;
push @results, \%h;
# There's a chance that the docpath could be modified by highlighting
# when used in a "display_props".
$h{saved_swishdocpath} = $h{swishdocpath};
my $docpath = $h{swishdocpath};
$docpath =~ s/\s/%20/g; # Replace spaces
$h{swishdocpath_href} = ( $self->config('prepend_path') || '' ) . $docpath;
# Now do any formatting
if ( $highlight ) {
if ( !$highlight_object ) {
my $package = $highlight->{package} || 'DefaultHighlight';
eval { require "$package.pm" };
if ( $@ ) {
$self->errstr( $@ );
$highlight = '';
next;
} else {
$highlight_object = $package->new( $self, $self->{metaname} );
}
}
# Highlight any fields, as needed
$highlight_object->highlight( \%h );
next;
}
# Trim down the description if no highlight, or if highlighting some other property
# Not very nice. The highlighting code would limit by words
if ( $trim_prop && $h{$trim_prop} ) {
my $max = $conf->{max_chars} || 500;
if ( length $h{$trim_prop} > $max ) {
$h{$trim_prop} = substr( $h{$trim_prop}, 0, $max) . ' <b>...</b>';
}
}
}
# Might check for "\n." for end of results.
}
$self->{hits} = @results;
$self->{_results} = \@results if @results;
}
#=====================================================================================
# This method parses out the query from the "Parsed words" returned by swish
# for use in highlighting routines
# This returns a hash ref:
# $query->{text} # evertying is currently at level "text"
# {$metaname} # the meta name
# [ array of phrases ]
# each phrase is made up of an array of words
use constant DEBUG_QUERY_PARSED => 0;
sub extract_query_match {
my $self = shift;
my $query = $self->header('parsed words'); # grab query parsed by swish
my %query_match; # kewords broken down by layer and field.
$self->{query_match} = \%query_match;
# Loop through the query
while ( $query =~ /([a-z]+)\s+=\s+(.+?)(?=\s+[a-z]+\s+=|$)/g ) {
my ( $field, $words ) = ( $1, $2 );
my $inquotes;
my $buffer;
my %single_words;
my $layer = 'text'; # This might be used in the future to highlight tags when matching a href.
# Expand group searches -- not currently used
my @fields = ( $field );
for my $word ( split /\s+/, $words ) {
# XXX This list of swish operators could change "and or not" and is dependent on stopwords.
# remove control words and parens
next if !$inquotes && $word =~ /^(and|or|not|\(|\))$/;
$buffer = [] unless $inquotes; # is there a better way to allocate memory like this?
if ( $word eq '"' ) {
unless ( $inquotes ) {
$inquotes++;
next;
} else {
$inquotes = 0;
}
} else {
push @$buffer, $word;
}
next if $inquotes;
# Only record single words once (this will probably break something)
# Reason: to reduce the number of matches must check
next if @$buffer == 1 && $single_words{ $buffer->[0] }++;
push @{$query_match{$layer}{$_}}, $buffer foreach @fields;
}
}
# Now, sort in desending order of phrase lenght
foreach my $layer ( keys %query_match ) {
print STDERR " LAYER: $layer\n" if DEBUG_QUERY_PARSED;
foreach my $tag ( keys %{$query_match{$layer}} ) {
@{$query_match{$layer}{$tag}} = sort { @$b <=> @$a } @{$query_match{$layer}{$tag}};
if ( DEBUG_QUERY_PARSED ) {
print STDERR " TAG: '$tag'\n";
print STDERR " : '@$_'\n" foreach @{$query_match{$layer}{$tag}};
}
}
}
return \%query_match;
}
1;
__END__
=head1 NAME
swish.cgi -- Example Perl script for searching with the SWISH-E search engine.
=head1 DESCRIPTION
C<swish.cgi> is an example CGI script for searching with the SWISH-E search engine version 2.1-dev and above.
It returns results a page at a time, with matching words from the source document highlighted, showing a
few words of content on either side of the highlighted word.
The standard configuration should work with most swish index files. Customization of the parameters will be
needed if you are indexing special meta data and want to search and/or display the meta data. The
configuration can be modified by editing this script directly, or by using a configuration file (.swishcgi.conf
by default).
The script is modular in design. Both the highlighting code and output generation is handled by modules, which
are included in the F<example/modules> directory. This allows for easy customization of the output without
changing the main CGI script. A module exists to generate standard HTML output. There's also modules and
template examples to use with the popular templating systems HTML::Template and Template-Toolkit. This allows
you to tightly integrate this script with the look of an existing template-driven web site.
This scipt can also run basically unmodified as a mod_perl handler, providing much better performance than
running as a CGI script.
Due to the forking nature of this program and its use of signals,
this script probably will not run under Windows without some modifications.
There's plan to change this soon.
=head1 INSTALLATION
Installing a CGI application is dependent on your specific web server's configuration.
For this discussion we will assume you are using Apache in a typical configuration. For example,
a common location for the DocumentRoot is C</usr/local/apache/htdocs>. If you are installing this
on your shell account, your DocumentRoot might be C<~yourname/public_html>.
For the sake of this example we will assume the following:
/usr/local/apache/htdocs - Document root
/usr/local/apache/cgi-bin - CGI directory
=head2 Move the files to their locations
=over 4
=item Copy the swish.cgi file to your CGI directory
Most web servers have a directory where CGI programs are kept.
Copy the C<swish.cgi> perl script into that directory if this is the case on your
server. You will need to provide read
and execute permisssions to the file. Exactly what permissions are needed again depends on
your specific configuration. For example, under Unix:
chmod 0755 swish.cgi
This gives the file owner (that's you) write access, and everyone read and execute access.
Note that you are not required to use a cgi-bin directory with Apache. You may place the
CGI script in any directory accessible via the web server and
enable it as a CGI script with something like the following
(place either in httpd.conf or in .htaccess):
<Files swish.cgi>
Allow from all
SetHandler cgi-script
Options +ExecCGI
</Files>
Using this method you don't even need to use the C<.cgi> extension. For example, rename
the script to "search" and then use that in the C<Files> directive. Take to your web
administrator for further information.
=item Copy the modules directory
Copying the modules directory is optional, but the script needs to find additional modules so you will
need to edit the script to point to the modules directory. Unlike CPAN modules that need to
be uncompressed, built, and installed, all you need to do is make sure the modules are some place where
the web server can read them. You may decide to leave them where you uncompressed the swish-e distribution,
or you may wish to move them to your perl library.
=head1 CONFIGURATION
=head2 Configure the swish.cgi program
Use a text editor and open the C<swish.cgi> program.
=over 4
=item 1 Check the C<shebang> line
The first line of the program must point to the location of your perl program. Typical
examples are:
#!/usr/local/bin/perl -w
#!/usr/bin/perl -w
#!/opt/perl/bin/perl -w
=item 2 Set the perl library path
The script must find the modules that the script is distributed with. These modules handle
the highlighting of the search terms, and the output generation. Again, where you place the
modules is up to you, and the only requirement is that the web server can access those files.
You tell perl the location of the modules with the "use lib" directive. The default for this script is:
use lib qw( modules );
This says to look for the modules in the F<modules> directory of the current directory.
For example, say you want to leave the modules where you unpacked the swish-e distribution. If
you unpacked in your home directory of F</home/yourname/swish-e> then you must add this to the
script:
use lib qw( /home/yourname/swish-e/example/modules );
=item 3 Set the configuration parameters
To make things somewhat simple, the configuration parameters are included at the top of the program.
The parameters are all part of a perl C<hash> structure, and the comments at the top of the program should
get you going.
You will probably need to specify at least the location of the swish-e binary, your index file or files,
and a title.
You have two options for changing the configuration settings from their default:
you may edit the script directly, or you may use a configuration file. In either case, the configuration
settings are a basic perl hash reference.
Using a configuration file is described below.
The configuration settings might look like:
return {
title => 'Search the Swish-e list', # Title of your choice.
swish_binary => './swish-e', # Location of swish-e binary
swish_index => '../index.swish-e', # Location of your index file
};
Or if searching more than one index:
return {
title => 'Search the Swish-e list',
swish_binary => './swish-e',
swish_index => ['../index.swish-e', '../index2'],
};
Both of these examples return a reference to a perl hash ( C<return {...}> ). Again, this same format is
used either at the top of this program, or in a configuration file.
The examples above place the swish index file(s)
in the directory above the C<swish.cgi> CGI script. If using the example paths above
of C</usr/local/apache/cgi-bin> for the CGI bin directory, that means that the index file
is in C</usr/local/apache>. That places the index out of web space (e.g. cannot be accessed
via the web server), yet relative to where the C<swish.cgi> script is located.
(If running under mod_perl you will most likely specify absolute paths for your index files.)
There's more than one way to do it, of course.
One option is to place the index in the same directory as the <swish.cgi> script, but
then be sure to use your web server's configuration to prohibit access to the index directly.
Another common option is to maintain a separate directory of the all your swish index files. This decision is
up to you.
As mentioned above, you can either edit this script directly and modify the configuration settings, or
use an external configuration file. The settings in the configuration file are merged with (override)
the settings defined in the script.
By default, the script will attempt to read from the file F<.swishcgi.conf>.
For example, you might only wish to change the title used
in the script. Simply create a file called F<.swishcgi.conf> in the same directory as the CGI script:
> cat .swishcgi.conf
# Example swish.cgi configuration script.
return {
title => 'Search Our Mailing List Archive',
};
Look at the default configuration settings at the top of this program for information on the available settings.
=item 4 Create your index
You must index your web site before you can begin to use the C<swish.cgi> script.
Create a configuration file called C<swish.conf> in the directory where you will store
the index file.
This next example uses the file system to index your web documents.
In general, you will probably wish to I<spider> your web site if your web pages do not
map exactly to your file system, and to only index files available from links on you web
site.
See B<Spidering> below for more information.
Example C<swish.conf> file:
# Define what to index
IndexDir /usr/local/apache/htdocs
IndexOnly .html .htm
# Tell swish how to parse .html and .html documents
IndexContents HTML .html .htm
# And just in case we have files without an extension
DefaultContents HTML
# Replace the path name with a URL
ReplaceRules replace /usr/local/apache/htdocs/ http://www.myserver.name/
# Allow limiting search to titles and URLs.
MetaNames swishdocpath swishtitle
# Optionally use stemming for "fuzzy" searches
#UseStemming yes
Now to index you simply run:
swish-e -c swish.conf
The default index file C<index.swish-e> will be placed in the current directory.
Note that the above swish-e configuration defines two MetaNames "swishdocpath" and "swishtitle".
This allows searching just the document path or the title instead of the document's content.
Here's an expanded C<swish.cgi> configuration to make use of the above settings used while indexing:
return {
title => 'Search the Apache documentation',
swish_binary => './swish-e',
swish_index => 'index.swish-e',
metanames => [qw/swishdefault swishdocpath swishtitle/],
display_props => [qw/swishlastmodified swishdocsize swishdocpath/],
title_property => 'swishtitle', # Not required, but recommended
name_labels => {
swishdefault => 'Body & Title',
swishtitle => 'Title',
swishrank => 'Rank',
swishlastmodified => 'Last Modified Date',
swishdocpath => 'Document Path',
swishdocsize => 'Document Size',
},
};
The above configuration defines metanames to use on the form.
Searches can be limited to these metanames.
"display_props" tells the script to display the property "swishlastmodified" (the last modified
date of the file), the document size, and path with the search results.
The parameter "name_labels" is a hash (reference)
that is used to give friendly names to the metanames.
Swish-e can store part of all of the contents of the documents as they are indexed, and this
"document description" can be returned with search results.
# Store the text of the documents within the swish index file
StoreDescription HTML <body> 100000
Adding the above to your C<swish.conf> file tells swish-e to store up to 100,000 characters from the body of each document within the
swish-e index. To display this information in search results, highlighting search terms,
use the follow configuration in C<swish.cgi>:
return {
title => 'Search the Apache documentation',
swish_binary => './swish-e',
swish_index => 'index.swish-e',
metanames => [qw/swishdefault swishdocpath swishtitle/],
display_props => [qw/swishlastmodified swishdocsize swishdocpath/],
title_property => 'swishtitle', # Not required, but recommended
description_prop=> 'swishdescription',
name_labels => {
swishdefault => 'Body & Title',
swishtitle => 'Title',
swishrank => 'Rank',
swishlastmodified => 'Last Modified Date',
swishdocpath => 'Document Path',
swishdocsize => 'Document Size',
},
highlight => {
package => 'PhraseHighlight',
meta_to_prop_map => { # this maps search metatags to display properties
swishdefault => [ qw/swishtitle swishdescription/ ],
swishtitle => [ qw/swishtitle/ ],
swishdocpath => [ qw/swishdocpath/ ],
},
}
};
Other C<swish.cgi> configuration settings are available, and are listed at the top of the F<swish.cgi>
script.
=back
You should now be ready to run your search engine. Point your browser to:
http://www.myserver.name/cgi-bin/swish.cgi
adjusting the server and URL to match your system, of course.
=head1 MOD_PERL
This script can be run under mod_perl (see http://perl.apache.org).
This will improve the response time of the script compared to running under CGI.
Configuration is simple. In your httpd.conf or your startup.pl file you need to
load the script. For example, in httpd.conf you can use a perl section:
<perl>
use lib '/usr/local/apache/cgi-bin';
use lib '/home/yourname/swish-e/example/modules';
require "swish.cgi";
</perl>
Again, note that the paths used will depend on where you installed the script and the modules.
When running under mod_perl the swish.cgi script becomes a perl module, and therefore the script
does not need to be installed in the cgi-bin directory. (But, you can actually use the same script as
both a CGI script and a mod_perl module at the same time, read from the same location.)
The above loads the script into mod_perl. Then to configure the script to run add this to your httpd.conf
configuration file:
<location /search>
allow from all
SetHandler perl-script
PerlHandler SwishSearch
</location>
Unlike CGI, mod_perl does not change the current directory to the location of the perl module, so
your settings for the swish binary and the path to your index files must be absolute
paths (or relative to the server root).
Take a look at the C<handler()> routine in this script for ideas how to use PerlSetVar commands
in httpd.conf to control the script.
Please post to the swish-e discussion list if you have any questions about running this
script under mod_perl.
=head1 DEBUGGING
The key to debugging CGI scripts is to run them from the command line, not with a browser.
First, make sure the program compiles correctly:
> perl -c swish.cgi
swish.cgi syntax OK
Next, simply try running the program:
> ./swish.cgi | head
Content-Type: text/html; charset=ISO-8859-1
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>
Search our site
</title>
</head>
<body>
Now, you know that the program compiles and will run from the command line.
Next, try accessing the script from a web browser.
If you see the contents of the CGI script instead of its output then your web server is
not configured to run the script. You will need to look at settings like ScriptAlias, SetHandler,
and Options.
If an error is reported (such as Internal Server Error or Forbidden)
you need to locate your web server's error_log file
and carefully read what the problem is. Contact your web administrator for help.
=head1 Spidering
There are two ways to spider with swish-e. One uses the "http" input method that uses code that's
part of swish. The other way is to use the new "prog" method along with a perl helper program called
C<spider.pl>.
Here's an example of a configuration file for spidering with the "http" input method.
You can see that the configuration is not much different than the file system input method.
(But, don't use the http input method -- use the -S prog method shown below.)
# Define what to index
IndexDir http://www.myserver.name/index.html
IndexOnly .html .htm
IndexContents HTML .html .htm
DefaultContents HTML
StoreDescription HTML <body> 200000
MetaNames swishdocpath swishtitle
# Define http method specific settings -- see swish-e documentation
SpiderDirectory ../swish-e/src/
Delay 0
You index with the command:
swish-e -S http -c spider.conf
Note that this does take longer. For example, spidering the Apache documentation on
a local web server with this method took over a minute, where indexing with the
file system took less than two seconds. Using the "prog" method can speed this up.
Here's an example configuration file for using the "prog" input method:
# Define the location of the spider helper program
IndexDir ../swish-e/prog-bin/spider.pl
# Tell the spider what to index.
SwishProgParameters default http://www.myserver.name/index.html
IndexContents HTML .html .htm
DefaultContents HTML
StoreDescription HTML <body> 200000
MetaNames swishdocpath swishtitle
Then to index you use the command:
swish-e -c prog.conf -S prog -v 0
Spidering with this method took nine seconds.
=head1 Stemmed Indexes
Many people enable a feature of swish called word stemming to provide "fuzzy" search
options to their users.
The stemming code does not actually find the "stem" of word, rather removes and/or replaces
common endings on words.
Stemming is far from perfect, and many words do not stem as you might expect. But, it can
be a helpful tool for searching your site. You may wish to create both a stemmed and non-stemmed index, and
provide a checkbox for selecting the index file.
To enable a stemmed index you simply add to your configuration file:
UseStemming yes
If you want to use a stemmed index with this program and continue to highlight search terms you will need
to install a perl module that will stem words. This section explains how to do this.
The perl module is included with the swish-e distribution. It can be found in the examples directory (where
you found this file) and called something like:
SWISH-Stemmer-0.05.tar.gz
The module should also be available on CPAN (http://search.cpan.org/).
Here's an example session for installing the module. (There will be quite a bit of output
when running make.)
% gzip -dc SWISH-Stemmer-0.05.tar.gz |tar xof -
% cd SWISH-Stemmer-0.05
% perl Makefile.PL
or
% perl Makefile.PL PREFIX=$HOME/perl_lib
% make
% make test
(perhaps su root at this point if you did not use a PREFIX)
% make install
% cd ..
Use the B<PREFIX> if you do not have root access or you want to install the modules
in a local library. If you do use a PREFIX setting, add a C<use lib> statement to the top of this
swish.cgi program.
For example:
use lib qw(
/home/bmoseley/perl_lib/lib/site_perl/5.6.0
/home/bmoseley/perl_lib/lib/site_perl/5.6.0/i386-linux/
);
Once the stemmer module is installed, and you are using a stemmed index, the C<swish.cgi> script will automatically
detect this and use the stemmer module.
=head1 DISCLAIMER
Please use this CGI script at your own risk.
This script has been tested and used without problem, but you should still be aware that
any code running on your server represents a risk. If you have any concerns please carefully
review the code.
See http://www.w3.org/Security/Faq/www-security-faq.html
=head1 SUPPORT
The SWISH-E discussion list is the place to ask for any help regarding SWISH-E or this example
script. See http://swish-e.org.
Before posting please review:
http://swish-e.org/2.2/docs/INSTALL.html#When_posting_please_provide_the_
Please do not contact the author directly.
=head1 LICENSE
swish.cgi $Revision: 1.1 $ Copyright (C) 2001 Bill Moseley search@hank.org
Example CGI program for searching with SWISH-E
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version
2 of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
=head1 AUTHOR
Bill Moseley -- search@hank.org
=cut
1.1 modperl-docs/src/search/swish.conf
Index: swish.conf
===================================================================
SwishProgParameters default http://perl.apache.org/~stas/modperl-site/
#SwishProgParameters default http://localhost/modperl-site/
IndexDir ./spider.pl
DefaultContents HTML2
StoreDescription HTML2 <body> 100000
MetaNames swishtitle swishdocpath
1.1 modperl-docs/src/search/modules/DateRanges.pm
Index: DateRanges.pm
===================================================================
# $Id: DateRanges.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
package DateRanges;
use strict;
=head1 NAME
DateRanges
=head1 SYNOPSIS
use DateRanges;
use CGI;
my $cgi = CGI->new();
...
my %hash = (
date_ranges => {
# Define what buttons to include
time_periods => [
'All',
'Today',
'Yesterday',
#'Yesterday onward',
'This Week',
'Last Week',
'Last 90 Days',
'This Month',
'Last Month',
#'Past',
#'Future',
#'Next 30 Days',
],
# Default button
default => 'All',
# Should buttons be in a row or a column?
line_break => 0,
# Should a date input form be shown, too?
date_range => 1,
},
);
=head1 DESCRIPTION
This module provides I<basic> support for entering and using date ranges. It
was written to use with swish-e (http://swish-e.org).
See swish.cgi in the swish-e distribution for an example.
Sorry about the interface -- if anyone really wants to use this please let me know and I'll
rewrite as OO interface!
=head1 FUNCTIONS
=cut
require Exporter;
use vars qw/$VERSION @ISA @EXPORT/;
@ISA = qw(Exporter);
$VERSION = '0.01';
@EXPORT = qw (
DateRangeForm
DateRangeParse
GetDateRangeArgs
);
# what to pick from
my @TIME_PERIODS = (
'Today',
'Yesterday',
'Yesterday onward',
'This Week',
'Last Week',
'Last 90 Days',
'This Month',
'Last Month',
'Past',
'Future',
'Next 30 Days',
'All'
);
my %TIME_PERIODS = map { $_, 1} @TIME_PERIODS;
use Date::Calc qw /
Day_of_Week_to_Text
Day_of_Week
Date_to_Text
Monday_of_Week
Week_of_Year
Today
Add_Delta_Days
Days_in_Month
check_date
/;
use Time::Local;
=item DateRangeForm( $cgi, $params, $fields );
This function simple creates a simple form for selecting date ranges based
on the fields passed in C<$params>. Will call C<die()> on errors.
C<$params> must be a hash reference with a key named C<time_periods> as shown in
B<SYNOPSIS> above. This is used to select which time periods to display.
C<$fields> is a reference to a hash where C<DateRanges> returns data.
These store the HTML for display on your form.
buttons - the buttons to select the different time ranges
date_range_button - the button to select a date range
date_range_low - the low range select form fields
date_range_hi - the hight range select form fields
=cut
sub DateRangeForm {
my ( $CGI, $params, $fields ) = @_;
die "Must supply arrary ref for 'options'"
unless $params->{time_periods} && ref $params->{time_periods} eq 'ARRAY';
my @time_periods = grep { $TIME_PERIODS{$_} } @{ $params->{time_periods} };
$fields->{buttons} = '';
$fields->{date_range_button} = '';
$fields->{date_range_low} = '';
$fields->{date_range_high} = '';
$fields->{buttons} =
$CGI->radio_group(
-name => 'DateRanges_date_option',
-values => \@time_periods,
-default => ($params->{default} || $time_periods[0]),
-linebreak => (exists $params->{line_break} ? $params->{line_break} : 1),
#-columns=>2,
) if @time_periods;
return unless $params->{date_range};
$fields->{date_range_button} =
$CGI->radio_group(
-name => 'DateRanges_date_option',
-values => ['Select Date Range'],
-default => ($params->{default} || $time_periods[0]),
-linebreak => (exists $params->{line_break} ? $params->{line_break} : 1),
);
$fields->{date_range_low} = show_date_input($CGI, 'start');
$fields->{date_range_high} = show_date_input($CGI, 'end');
=pod
print '<br>Limit to the hour of: ',
popup_menu( -name => 'Limit_hour',
-default => ' ',
-values => [' ',0..23], ),
'<br>';
=cut
}
=item my $args = GetDateRangeArgs( $cgi );
Returns a string to use in a HREF with all the parameters set.
=cut
sub GetDateRangeArgs {
my $CGI = shift;
my %args;
$args{DateRanges_date_option} = $CGI->param('DateRanges_date_option')
if defined $CGI->param('DateRanges_date_option');
for ( qw/ mon day year / ) {
my $start = "DateRanges_start_$_";
my $end = "DateRanges_end_$_";
$args{$start} = $CGI->param($start) if defined $CGI->param($start);
$args{$end} = $CGI->param($end) if defined $CGI->param($end);
}
return '' unless %args;
return join '&', map { "$_=" . $CGI->escape($args{$_}) } keys %args;
}
=item DateRangeParse( $cgi, $form )
Parses the date range form and returns a low and high range unix timestamp.
Returns false on error with the folowing key set in C<$form>:
DateRanges_error - error string explaining the problem
C<$form> is a hash reference where the following keys may be set:
All - no date ranges were selected
DateRanges_time_low - low range unix timestamp
DateRanges_time_high - high range unix timestamp
=cut
#------------------------ Get the report dates ---------------------
sub DateRangeParse {
my ( $q, $form ) = @_;
$form->{DateRanges_error} = '';
# If requesting ALL (or not found in form) return true for all
if ( !$q->param('DateRanges_date_option') || $q->param('DateRanges_date_option') eq 'All' ) {
$form->{All}++;
return 1;
}
my $time = time();
my ( @start, @end );
for ($q->param('DateRanges_date_option') ) {
/^Today/ && do { @start = @end = Today(); last; };
/^Yesterday onward/ && do { @start = Add_Delta_Days( Today(), -1 ); last };
/^Yesterday/ && do { @start = @end = Add_Delta_Days( Today(), -1 ); last };
/^This Week/ && do {
@start = Monday_of_Week( Week_of_Year( Today() ) );
@end = Add_Delta_Days( @start, 6 );
last;
};
/^Last Week/ && do {
@start = Monday_of_Week( Week_of_Year( Add_Delta_Days( Today(), -7 ) ) );
@end = Add_Delta_Days( @start, 6 );
last;
};
/^This Month/ && do {
@start = @end = Today();
$start[2] = 1;
$end[2] = Days_in_Month($end[0],$end[1]);
last;
};
/^Last Month/ && do {
@start = Today();
$start[2] = 1;
$start[1] = 1 if --$start[1] == 0;
@end = @start;
$end[2] = Days_in_Month($end[0],$end[1]);
last;
};
/^Last 90 Days/ && do {
@end = Today();
@start = Add_Delta_Days( Today(), -90 );
last
};
/^Past/ && return 1; # use defaults;
/^Future/ && do {
$form->{DateRanges_time_low} = time;
delete $form->{DateRanges_time_high};
return 1;
};
/^Next 30 Days/ && do {
@start = Today();
@end = Add_Delta_Days( Today(), +30 );
last
};
/^Select/ && do {
my ( $day, $mon, $year );
$day = $q->param('DateRanges_start_day') || 0;
$mon = $q->param('DateRanges_start_mon') || 0;
$year = $q->param('DateRanges_start_year') || 0;
@start = ( $year, $mon, $day );
$day = $q->param('DateRanges_end_day') || 0;
$mon = $q->param('DateRanges_end_mon') || 0;
$year = $q->param('DateRanges_end_year') || 0;
@end = ( $year, $mon, $day );
last;
};
$form->{DateRanges_error} = 'Invalid Date Option ' . $q->param('DateRanges_date_option') . ' Selected';
return;
}
$form->{DateRanges_error} = 'Invalid Start Date' && return if @start && !check_date( @start );
$form->{DateRanges_error} = 'Invalid Ending Date' && return if @end && !check_date( @end );
my $start_time = @start ? timelocal( 0, 0, 0, $start[2], $start[1]-1, $start[0]-1900 ) : 0;
my $end_time = @end ? timelocal( 59, 59, 23, $end[2], $end[1]-1, $end[0]-1900 ) : 0;
$form->{DateRanges_error} = "Starting time should be before now, don't you think?" && return
if $start_time && $start_time > time();
$form->{DateRanges_error} = 'Start date must be same day or before end date' && return
if $start_time && $end_time && $start_time > $end_time;
$form->{DateRanges_time_low} = $start_time;
$form->{DateRanges_time_high} = $end_time;
return 1;
}
#----------------------------------------------------------------------------
sub show_date_input {
my ( $CGI, $name ) = @_;
$name = "DateRanges_$name";
my @months = qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;
my $x = 1;
my %months = map { $x++, $_ } @months;
my ($mon, $day, $year) = (localtime)[4,3,5];
$year = $year + 1900;
$mon++;
my $cur_year = $year;
$cur_year += 5;
($year,$mon,$day) = Date::Calc::Add_Delta_Days($year,$mon,$day, -28 ) if $name eq 'start';
my $str = join "\n",
$CGI->popup_menu(
-name => "${name}_mon",
-values => [1..12],
-default => $mon,
-labels => \%months
),
' ',
$CGI->popup_menu(
-name => "${name}_day",
-default => $day,
-values => [1..31],
),
' ',
$CGI->popup_menu(
-name => "${name}_year",
-default => $year,
-values => [$year-5..$cur_year],
);
return $str;
}
#----------------------- ymd_to_unix --------------------------
# Silly Date::Calc
sub ymd_to_unix {
my ($y, $m, $d ) = @_;
$m--;
$y -= 1900;
return timelocal( 0, 0, 0, $d, $m, $y );
}
1;
=back
=head1 COPYRIGHT
Copyright 2001 Bill Moseley
This library is free software; you can redistribute it and/or
modify it under the same terms as Perl itself.
=cut
1.1 modperl-docs/src/search/modules/DefaultHighlight.pm
Index: DefaultHighlight.pm
===================================================================
#=======================================================================
# Default Highlighting Code
#
# $Id: DefaultHighlight.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
#=======================================================================
package DefaultHighlight;
use strict;
sub new {
my ( $class, $results, $metaname ) = @_;
my $self = bless {
results => $results, # just in case we need a method
settings=> $results->config('highlight'),
metaname=> $metaname,
}, $class;
# parse out the query into words
my $query = $results->extract_query_match;
# Do words exist for this layer (all text at this time) and metaname?
# This is a reference to an array of phrases and words
$self->{description_prop} = $results->config('description_prop') || '';
if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
eval { require SWISH::Stemmer };
if ( $@ ) {
$results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
} else {
$self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
}
}
if ( $query && exists $query->{text}{$metaname} ) {
$self->{query} = $query->{text}{$metaname};
$self->set_match_regexp;
}
return $self;
}
sub highlight {
my ( $self, $properties ) = @_;
return unless $self->{query};
my $phrase_array = $self->{query};
my $settings = $self->{settings};
my $metaname = $self->{metaname};
# Do we care about this meta?
return unless exists $settings->{meta_to_prop_map}{$metaname};
# Get the related properties
my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
my %checked;
for ( @props ) {
if ( $properties->{$_} ) {
$checked{$_}++;
$self->highlight_text( \$properties->{$_}, $phrase_array );
}
}
# Truncate the description, if not processed.
my $description = $self->{description_prop};
if ( $description && !$checked{ $description } && $properties->{$description} ) {
my $max_words = $settings->{max_words} || 100;
my @words = split /\s+/, $properties->{$description};
if ( @words > $max_words ) {
$properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
}
}
}
#==========================================================================
#
sub highlight_text {
my ( $self, $text_ref, $phrase_array ) = @_;
my $wc_regexp = $self->{wc_regexp};
my $extract_regexp = $self->{extract_regexp};
my $match_regexp = $self->{match_regexp};
my $last = 0;
my $settings = $self->{settings};
my $Show_Words = $settings->{show_words} || 10;
my $Occurrences = $settings->{occurrences} || 5;
my $Max_Words = $settings->{max_words} || 100;
my $On = $settings->{highlight_on} || '<b>';
my $Off = $settings->{highlight_off} || '</b>';
my $stemmer_function = $self->{stemmer_function};
# Should really call unescapeHTML(), but then would need to escape <b> from escaping.
my @words = split /$wc_regexp/, $$text_ref;
return 'No Content saved: Check StoreDescription setting' unless @words;
my @flags;
$flags[$#words] = 0; # Extend array.
my $occurrences = $Occurrences ;
my $pos = 0;
while ( $Show_Words && $pos <= $#words ) {
# Check if the word is a swish word (ignoring begin and end chars)
if ( $words[$pos] =~ /$extract_regexp/ ) {
my ( $begin, $word, $end ) = ( $1, $2, $3 );
my $test = $stemmer_function
? $stemmer_function->($word)
: lc $word;
$test ||= lc $word;
# Not check if word matches
if ( $test =~ /$match_regexp/ ) {
$words[$pos] = "$begin$On$word$Off$end";
my $start = $pos - ($Show_Words-1)* 2;
my $end = $pos + ($Show_Words-1)* 2;
if ( $start < 0 ) {
$end = $end - $start;
$start = 0;
}
$end = $#words if $end > $#words;
$flags[$_]++ for $start .. $end;
# All done, and mark where to stop looking
if ( $occurrences-- <= 0 ) {
$last = $end;
last;
}
}
}
$pos += 2; # Skip to next wordchar word
}
my $dotdotdot = ' <b>...</b> ';
my @output;
my $printing;
my $first = 1;
my $some_printed;
if ( $Show_Words && @words > 50 ) { # don't limit context if a small number of words
for my $i ( 0 ..$#words ) {
if ( $last && $i >= $last && $i < $#words ) {
push @output, $dotdotdot;
last;
}
if ( $flags[$i] ) {
push @output, $dotdotdot if !$printing++ && !$first;
push @output, $words[$i];
$some_printed++;
} else {
$printing = 0;
}
$first = 0;
}
}
if ( !$some_printed ) {
for my $i ( 0 .. $Max_Words ) {
if ( $i > $#words ) {
$printing++;
last;
}
push @output, $words[$i];
}
}
push @output, $dotdotdot if !$printing;
$$text_ref = join '', @output;
}
#============================================
# Returns compiled regular expressions for matching
#
sub set_match_regexp {
my $self = shift;
my $results = $self->{results};
my $wc = $results->header('wordcharacters');
my $ignoref = $results->header('ignorefirstchar');
my $ignorel = $results->header('ignorelastchar');
my $query = join ' ', map { join ' ', @$_} @{$self->{query}}; # join everything together!
$wc = quotemeta $wc;
my $match_string =
join '|',
map { substr( $_, -1, 1 ) eq '*'
? quotemeta( substr( $_, 0, -1) ) . "[$wc]*?"
: quotemeta
}
grep { ! /^(and|or|not|["()=])$/oi } # left over code
split /\s+/, $query;
return unless $match_string;
for ( $ignoref, $ignorel ) {
if ( $_ ) {
$_ = quotemeta;
$_ = "([$_]*)";
} else {
$_ = '()';
}
}
# Yuck!
$wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing
# Now, wait a minute. Look at this more, as I'd hope that making a
# qr// go out of scope would release the compiled pattern.
if ( $ENV{MOD_PERL} ) {
$self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare
$self->{match_regexp} = qr/^(?:$match_string)$/; # regexp for comparing extracted words to query
} else {
$self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare
$self->{match_regexp} = qr/^(?:$match_string)$/o; # regexp for comparing extracted words to query
}
}
1;
1.1 modperl-docs/src/search/modules/PhraseHighlight.pm
Index: PhraseHighlight.pm
===================================================================
#=======================================================================
# Phrase Highlighting Code
#
# $Id: PhraseHighlight.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
#=======================================================================
package PhraseHighlight;
use strict;
use constant DEBUG_HIGHLIGHT => 0;
sub new {
my ( $class, $results, $metaname ) = @_;
my $self = bless {
results => $results, # just in case we need a method
settings=> $results->config('highlight'),
metaname=> $metaname,
}, $class;
# parse out the query into words
my $query = $results->extract_query_match;
# Do words exist for this layer (all text at this time) and metaname?
# This is a reference to an array of phrases and words
$self->{description_prop} = $results->config('description_prop') || '';
if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
eval { require SWISH::Stemmer };
if ( $@ ) {
$results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
} else {
$self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
}
}
my %stopwords = map { $_, 1 } split /\s+/, $results->header('stopwords');
$self->{stopwords} = \%stopwords;
if ( $query && exists $query->{text}{$metaname} ) {
$self->{query} = $query->{text}{$metaname};
$self->set_match_regexp;
}
return $self;
}
sub highlight {
my ( $self, $properties ) = @_;
return unless $self->{query};
my $phrase_array = $self->{query};
my $settings = $self->{settings};
my $metaname = $self->{metaname};
# Do we care about this meta?
return unless exists $settings->{meta_to_prop_map}{$metaname};
# Get the related properties
my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
my %checked;
for ( @props ) {
if ( $properties->{$_} ) {
$checked{$_}++;
$self->highlight_text( \$properties->{$_}, $phrase_array );
}
}
# Truncate the description, if not processed.
my $description = $self->{description_prop};
if ( $description && !$checked{ $description } && $properties->{$description} ) {
my $max_words = $settings->{max_words} || 100;
my @words = split /\s+/, $properties->{$description};
if ( @words > $max_words ) {
$properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
}
}
}
#==========================================================================
#
sub highlight_text {
my ( $self, $text_ref, $phrase_array ) = @_;
my $wc_regexp = $self->{wc_regexp};
my $extract_regexp = $self->{extract_regexp};
my $last = 0;
my $settings = $self->{settings};
my $Show_Words = $settings->{show_words} || 10;
my $Occurrences = $settings->{occurrences} || 5;
my $Max_Words = $settings->{max_words} || 100;
my $On = $settings->{highlight_on} || '<b>';
my $Off = $settings->{highlight_off} || '</b>';
my $on_flag = 'sw' . time . 'on';
my $off_flag = 'sw' . time . 'off';
my $stemmer_function = $self->{stemmer_function};
# Should really call unescapeHTML(), but then would need to escape <b> from escaping.
# Split into words. For speed, should work on a stream method.
my @words = split /$wc_regexp/, $$text_ref;
return 'No Content saved: Check StoreDescription setting' unless @words;
my @flags; # This marks where to start and stop display.
$flags[$#words] = 0; # Extend array.
my $occurrences = $Occurrences ;
my $word_pos = $words[0] eq '' ? 2 : 0; # Start depends on if first word was wordcharacters or not
my @phrases = @{ $self->{query} };
# Remember, that the swish words are every other in @words.
WORD:
while ( $Show_Words && $word_pos * 2 < @words ) {
PHRASE:
foreach my $phrase ( @phrases ) {
print STDERR " Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT;
next PHRASE if ($word_pos + @$phrase -1) * 2 > @words; # phrase is longer than what's left
my $end_pos = 0; # end offset of the current phrase
# now compare all the words in the phrase
my ( $begin, $word, $end );
for my $match_word ( @$phrase ) {
my $cur_word = $words[ ($word_pos + $end_pos) * 2 ];
unless ( $cur_word =~ /$extract_regexp/ ) {
my $idx = ($word_pos + $end_pos) * 2;
my ( $s, $e ) = ( $idx - 10, $idx + 10 );
$s = 0 if $s < 0;
$e = @words-1 if $e >= @words;
warn "Failed to parse IgnoreFirst/Last from word '"
. (defined $cur_word ? $cur_word : '*undef')
. "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:"
. scalar @words
. "\n-search pharse words-\n"
. join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 )
. "\n-Words-\n"
. join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e )
. "\n";
next PHRASE;
}
# Strip ignorefirst and ignorelast
( $begin, $word, $end ) = ( $1, $2, $3 ); # this is a waste, as it can operate on the same word over and over
my $check_word = lc $word;
if ( $end_pos && exists $self->{stopwords}{$check_word} ) {
$end_pos++;
print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT;
redo if ( $word_pos + $end_pos ) * 2 < @words; # go on to check this match word with the next word.
# No more words to match with, so go on to next pharse.
next PHRASE;
}
if ( $stemmer_function ) {
my $w = $stemmer_function->($check_word);
$check_word = $w if $w;
}
print STDERR " comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT;
if ( substr( $match_word, -1 ) eq '*' ) {
next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0;
} else {
next PHRASE if $check_word ne $match_word;
}
print STDERR " *** Word Matched '$check_word' *** \n" if DEBUG_HIGHLIGHT;
$end_pos++;
}
print STDERR " *** PHRASE MATCHED (word:$word_pos offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT;
# We are currently at the end word, so it's easy to set that highlight
$end_pos--;
if ( !$end_pos ) { # only one word
$words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end";
} else {
$words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end";
#Now, reload first word of match
$words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?";
# Strip ignorefirst and ignorelast
( $begin, $word, $end ) = ( $1, $2, $3 ); # probably should cache this!
$words[$word_pos * 2] = "$begin$on_flag$word$end";
}
# Now, flag the words around to be shown
my $start = ($word_pos - $Show_Words + 1) * 2;
my $stop = ($word_pos + $end_pos + $Show_Words - 2) * 2;
if ( $start < 0 ) {
$stop = $stop - $start;
$start = 0;
}
$stop = $#words if $stop > $#words;
$flags[$_]++ for $start .. $stop;
# All done, and mark where to stop looking
if ( $occurrences-- <= 0 ) {
$last = $end;
last WORD;
}
# Now reset $word_pos to word following
$word_pos += $end_pos; # continue will still be executed
next WORD;
}
} continue {
$word_pos ++;
}
my $dotdotdot = ' ... ';
my @output;
my $printing;
my $first = 1;
my $some_printed;
if ( $Show_Words && @words > 50 ) { # don't limit context if a small number of words
for my $i ( 0 ..$#words ) {
if ( $last && $i >= $last && $i < $#words ) {
push @output, $dotdotdot;
last;
}
if ( $flags[$i] ) {
push @output, $dotdotdot if !$printing++ && !$first;
push @output, $words[$i];
$some_printed++;
} else {
$printing = 0;
}
$first = 0;
}
}
if ( !$some_printed ) {
for my $i ( 0 .. $Max_Words ) {
if ( $i > $#words ) {
$printing++;
last;
}
push @output, $words[$i];
}
}
push @output, $dotdotdot if !$printing;
$$text_ref = join '', @output;
my %entities = (
'&' => '&',
'>' => '>',
'<' => '<',
'"' => '"',
);
my %highlight = (
$on_flag => $On,
$off_flag => $Off,
);
$$text_ref =~ s/([&"<>])/$entities{$1}/ge;
$$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge;
# $$text_ref = join '', @words; # interesting that this seems reasonably faster
}
#============================================
# Returns compiled regular expressions for matching
#
#
sub set_match_regexp {
my $self = shift;
my $results = $self->{results};
my $wc = $results->header('wordcharacters');
my $ignoref = $results->header('ignorefirstchar');
my $ignorel = $results->header('ignorelastchar');
$wc = quotemeta $wc;
#Convert query into regular expressions
for ( $ignoref, $ignorel ) {
if ( $_ ) {
$_ = quotemeta;
$_ = "([$_]*)";
} else {
$_ = '()';
}
}
$wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing
# Now, wait a minute. Look at this more, as I'd hope that making a
# qr// go out of scope would release the compiled pattern.
if ( $ENV{MOD_PERL} ) {
$self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare
} else {
$self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare
}
}
1;
1.1 modperl-docs/src/search/modules/PhraseTest.pm
Index: PhraseTest.pm
===================================================================
#!/usr/local/bin/perl -w
use strict;
=pod
To give some background.
when you search with swish you do something like:
./swish-e -w swishdefault=foo
that says: search the field "swishdefault" for the word foo. swishdefault
is the default "metaname". When indexing, say html, swish indexes the body
text AND the title as swishdefault.
So, a document will match if "foo" is found in the title or in the body when searching
the swishdefault metaname. This is why in the sample code below supplies both the title
and description to the highlighting code.
The other part to swish is "properties". A document property is something like
the path name, title, or last modified date. Some bit of data that can be returned
with search results. So, when you do a search for "foo" swish will
return a list of documents, and for each document it will list properties.
Since properties and metanames may or may not be related, in the "config" below you see a hash that maps
one or more properties to metanames.
Note, here's a search for "apache" limiting to four results.
> ./swish-e -w apache -m 4
# SWISH format: 2.1-dev-24
# Search words: apache
# Number of hits: 120
# Search time: 0.001 seconds
# Run time: 0.006 seconds
1000 /usr/local/apache/htdocs/manual/misc/FAQ.html "Apache Server Frequently Asked Questions" 107221
973 /usr/local/apache/htdocs/manual/windows.html "Using Apache with Microsoft Windows" 21664
953 /usr/local/apache/htdocs/manual/mod/core.html "Apache Core Features" 121406
933 /usr/local/apache/htdocs/manual/netware.html "Using Apache with Novell NetWare 5" 11345
That's returning the properties rank, path, title (called swishtitle), document size by default.
Swish can also store as a property the words extracted while indexing and return that
text in the search results. This property is called "swishdescription". It's a lot
faster for swish to return this in results than to go and fetch the source document by path
and then extract out the content.
=cut
# here's the emulated results from swish placed in the "properties" hash
show() for ( 1..15 );
sub show {
my %properties = (
swishtitle => 'Apache module mod_foobar',
swishdescription => Content::content(),
);
# emulate a result object
my $result = result->new;
my $hl = PhraseHighlight->new($result, 'swishdefault' );
$hl->highlight( \%properties );
use Text::Wrap qw(wrap);
print "\nTitle: $properties{ swishtitle }\n\nDescription:\n",
wrap(' ',' ',$properties{ swishdescription }),"\n";
}
#===============================================================
package result;
use strict;
use Carp;
sub new {
bless {
config => {
description_prop => 'swishdescription',
highlight => {
package => 'PhraseHighlight',
show_words => 10, # Number of swish words words to show around highlighted word
max_words => 100, # If no words are found to highlighted then show this many words
occurrences => 6, # Limit number of occurrences of highlighted words
highlight_on => '<<on>>',
highlight_off => '<</off>>',
meta_to_prop_map => { # this maps search metatags to display properties
swishdefault => [ qw/swishtitle swishdescription/ ],
swishtitle => [ qw/swishtitle/ ],
swishdocpath => [ qw/swishdocpath/ ],
},
},
},
}, shift;
}
sub header {
my ( $self, $value ) = @_;
my %values = (
wordcharacters => 'abcdefghijklmnopqrstuvwxyz-,.',
ignorefirstchar => '.-,',
ignorelastchar => '.-,',
'stemming applied' => 0,
stopwords => 'and the is for',
);
return $values{$value} || '';
}
sub config {
my ($self, $setting, $value ) = @_;
croak "Failed to pass 'config' a setting" unless $setting;
my $cur = $self->{config}{$setting} if exists $self->{config}{$setting};
$self->{config}{$setting} = $value if $value;
return $cur;
}
# This emulates the parsing of the query passed to swish-e
sub extract_query_match {
return {
text => { ## can be text or url "layer"
swishdefault => [ ## metaname searched
[qw/directive not compatible/], ## phrase made up of three words (not stopword missing)
[qw/ foobar /], ## phrase of one word
[qw/ des* /], ## wildcard search
],
},
};
}
#=======================================================================
# Phrase Highlighting Code
#
# copyright 2001 - Bill Moseley moseley@hank.org
#
# $Id: PhraseTest.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
#=======================================================================
package PhraseHighlight;
use strict;
use constant DEBUG_HIGHLIGHT => 0;
sub new {
my ( $class, $results, $metaname ) = @_;
my $self = bless {
results => $results, # just in case we need a method
settings=> $results->config('highlight'),
metaname=> $metaname,
}, $class;
# parse out the query into words
my $query = $results->extract_query_match;
# Do words exist for this layer (all text at this time) and metaname?
# This is a reference to an array of phrases and words
$self->{description_prop} = $results->config('description_prop') || '';
if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
eval { require SWISH::Stemmer };
if ( $@ ) {
$results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
} else {
$self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
}
}
my %stopwords = map { $_, 1 } split /\s+/, $results->header('stopwords');
$self->{stopwords} = \%stopwords;
if ( $query && exists $query->{text}{$metaname} ) {
$self->{query} = $query->{text}{$metaname};
$self->set_match_regexp;
}
return $self;
}
sub highlight {
my ( $self, $properties ) = @_;
return unless $self->{query};
my $phrase_array = $self->{query};
my $settings = $self->{settings};
my $metaname = $self->{metaname};
# Do we care about this meta?
return unless exists $settings->{meta_to_prop_map}{$metaname};
# Get the related properties
my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
my %checked;
for ( @props ) {
if ( $properties->{$_} ) {
$checked{$_}++;
$self->highlight_text( \$properties->{$_}, $phrase_array );
}
}
# Truncate the description, if not processed.
my $description = $self->{description_prop};
if ( $description && !$checked{ $description } && $properties->{$description} ) {
my $max_words = $settings->{max_words} || 100;
my @words = split /\s+/, $properties->{$description};
if ( @words > $max_words ) {
$properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
}
}
}
#==========================================================================
#
sub highlight_text {
my ( $self, $text_ref, $phrase_array ) = @_;
my $wc_regexp = $self->{wc_regexp};
my $extract_regexp = $self->{extract_regexp};
my $last = 0;
my $settings = $self->{settings};
my $Show_Words = $settings->{show_words} || 10;
my $Occurrences = $settings->{occurrences} || 5;
my $on_flag = 'sw' . time . 'on';
my $off_flag = 'sw' . time . 'off';
my $stemmer_function = $self->{stemmer_function};
# Should really call unescapeHTML(), but then would need to escape <b> from escaping.
# Split into words. For speed, should work on a stream method.
my @words;
$self->split_by_wordchars( \@words, $text_ref );
return 'No Content saved: Check StoreDescription setting' unless @words;
my @flags; # This marks where to start and stop display.
$flags[$#words] = 0; # Extend array.
my $occurrences = $Occurrences ;
my $word_pos = $words[0] eq '' ? 2 : 0; # Start depends on if first word was wordcharacters or not
my @phrases = @{ $self->{query} };
# Remember, that the swish words are every other in @words.
WORD:
while ( $Show_Words && $word_pos * 2 < @words ) {
PHRASE:
foreach my $phrase ( @phrases ) {
print STDERR " Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT;
next PHRASE if ($word_pos + @$phrase -1) * 2 > @words; # phrase is longer than what's left
my $end_pos = 0; # end offset of the current phrase
# now compare all the words in the phrase
my ( $begin, $word, $end );
for my $match_word ( @$phrase ) {
my $cur_word = $words[ ($word_pos + $end_pos) * 2 ];
unless ( $cur_word =~ /$extract_regexp/ ) {
my $idx = ($word_pos + $end_pos) * 2;
my ( $s, $e ) = ( $idx - 10, $idx + 10 );
$s = 0 if $s < 0;
$e = @words-1 if $e >= @words;
warn "Failed to parse IgnoreFirst/Last from word '"
. (defined $cur_word ? $cur_word : '*undef')
. "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:"
. scalar @words
. "\n-search pharse words-\n"
. join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 )
. "\n-Words-\n"
. join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e )
. "\n";
next PHRASE;
}
# Strip ignorefirst and ignorelast
( $begin, $word, $end ) = ( $1, $2, $3 ); # this is a waste, as it can operate on the same word over and over
my $check_word = lc $word;
if ( $end_pos && exists $self->{stopwords}{$check_word} ) {
$end_pos++;
print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT;
redo if ( $word_pos + $end_pos ) * 2 < @words; # go on to check this match word with the next word.
# No more words to match with, so go on to next pharse.
next PHRASE;
}
if ( $stemmer_function ) {
my $w = $stemmer_function->($check_word);
$check_word = $w if $w;
}
print STDERR " comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT;
if ( substr( $match_word, -1 ) eq '*' ) {
next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0;
} else {
next PHRASE if $check_word ne $match_word;
}
print STDERR " *** Word Matched '$check_word' *** \n" if DEBUG_HIGHLIGHT;
$end_pos++;
}
print STDERR " *** PHRASE MATCHED (word:$word_pos offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT;
# We are currently at the end word, so it's easy to set that highlight
$end_pos--;
if ( !$end_pos ) { # only one word
$words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end";
} else {
$words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end";
#Now, reload first word of match
$words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?";
# Strip ignorefirst and ignorelast
( $begin, $word, $end ) = ( $1, $2, $3 ); # probably should cache this!
$words[$word_pos * 2] = "$begin$on_flag$word$end";
}
# Now, flag the words around to be shown
my $start = ($word_pos - $Show_Words + 1) * 2;
my $stop = ($word_pos + $end_pos + $Show_Words - 2) * 2;
if ( $start < 0 ) {
$stop = $stop - $start;
$start = 0;
}
$stop = $#words if $stop > $#words;
$flags[$_]++ for $start .. $stop;
# All done, and mark where to stop looking
if ( $occurrences-- <= 0 ) {
$last = $end;
last WORD;
}
# Now reset $word_pos to word following
$word_pos += $end_pos; # continue will still be executed
next WORD;
}
} continue {
$word_pos ++;
}
my @output;
$self->build_highlighted_text( \@output, \@words, \@flags, $last );
$self->join_words( \@output, $text_ref );
$self->escape_entities( $text_ref );
$self->substitue_highlight( $text_ref, $on_flag, $off_flag );
# $$text_ref = join '', @words; # interesting that this seems reasonably faster
}
#====================================================================
# Split the source text into swish and non-swish words
sub split_by_wordchars {
my ( $self, $words, $text_ref ) = @_;
my $wc_regexp = $self->{wc_regexp};
@$words = split /$wc_regexp/, $$text_ref;
}
#=======================================================================
# Put all the words together for display
#
sub build_highlighted_text {
my ( $self, $output, $words, $flags, $last ) = @_;
my $dotdotdot = ' ... ';
my $printing;
my $first = 1;
my $some_printed;
my $settings = $self->{settings};
my $Show_Words = $settings->{show_words} || 10;
if ( $Show_Words && @$words > 50 ) { # don't limit context if a small number of words
for my $i ( 0 ..$#$words ) {
if ( $last && $i >= $last && $i < $#$words ) {
push @$output, $dotdotdot;
last;
}
if ( $flags->[$i] ) {
push @$output, $dotdotdot if !$printing++ && !$first;
push @$output, $words->[$i];
$some_printed++;
} else {
$printing = 0;
}
$first = 0;
}
}
if ( !$some_printed ) {
my $Max_Words = $settings->{max_words} || 100;
for my $i ( 0 .. $Max_Words ) {
if ( $i > $#$words ) {
$printing++;
last;
}
push @$output, $words->[$i];
}
}
push @$output, $dotdotdot if !$printing;
}
#==================================================================
#
sub join_words {
my ( $self, $output, $text_ref ) = @_;
$$text_ref = join '', @$output;
}
sub escape_entities {
my ( $self, $text_ref ) = @_;
my %entities = (
'&' => '&',
'>' => '>',
'<' => '<',
'"' => '"',
);
$$text_ref =~ s/([&"<>])/$entities{$1}/ge;
}
#========================================================
# replace the highlight codes
sub substitue_highlight {
my ( $self, $text_ref, $on_flag, $off_flag ) = @_;
my $settings = $self->{settings};
my $On = $settings->{highlight_on} || '<b>';
my $Off = $settings->{highlight_off} || '</b>';
my %highlight = (
$on_flag => $On,
$off_flag => $Off,
);
$$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge;
}
#============================================
# Returns compiled regular expressions for matching
#
#
sub set_match_regexp {
my $self = shift;
my $results = $self->{results};
my $wc = $results->header('wordcharacters');
my $ignoref = $results->header('ignorefirstchar');
my $ignorel = $results->header('ignorelastchar');
$wc = quotemeta $wc;
#Convert query into regular expressions
for ( $ignoref, $ignorel ) {
if ( $_ ) {
$_ = quotemeta;
$_ = "([$_]*)";
} else {
$_ = '()';
}
}
$wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'; # Warning: dependent on tolower used while indexing
# Now, wait a minute. Look at this more, as I'd hope that making a
# qr// go out of scope would release the compiled pattern.
if ( $ENV{MOD_PERL} ) {
$self->{wc_regexp} = qr/([^$wc]+)/; # regexp for splitting into swish-words
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i; # regexp for extracting out the words to compare
} else {
$self->{wc_regexp} = qr/([^$wc]+)/o; # regexp for splitting into swish-words
$self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi; # regexp for extracting out the words to compare
}
}
package Content;
sub content {
my $content = <<EOF;
Apache HTTP Server Version 1.3 Module mod_foobar Add this file as a link in mod/index.html
This module is contained in the mod_foobar.c file, and is/is not compiled in by default.
It provides for the foobar feature. Any document with the mime type foo/bar will be processed by this module.
Add the magic mime type to the list in magic_types.html Summary General module documentation here.
Directives ADirective Add these directives to the list in directives.html
ADirective directive Syntax: ADirective some args Default:
ADirective default value Context: context-list context-list is where this directive can appear;
allowed: server config, virtual host, directory, .htaccess Override: override required if the
directive is allowed in .htaccess files; the AllowOverride option that allows the directive.
Status: status Core if in core apache, Base if in one of the standard modules,
Extension if in an extension module (not compiled in by default) or
Experimental Module: mod_foobar Compatibility: compatibility notes Describe any compatibility issues,
such as "Only available in Apache 1.2 or later," or "The Apache syntax for this directive is not
compatible with the NCSA directive of the same name." The ADirective directive does something.
Apache HTTP Server Version 1.3
EOF
$content =~ s/\n/ /g;
return $content;
}
1.1 modperl-docs/src/search/modules/SimpleHighlight.pm
Index: SimpleHighlight.pm
===================================================================
#=======================================================================
# Simple Highlighting Code
# $Id: SimpleHighlight.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
#=======================================================================
package SimpleHighlight;
use strict;
sub new {
my ( $class, $results, $metaname ) = @_;
my $self = bless {
results => $results, # just in case we need a method
settings=> $results->config('highlight'),
metaname=> $metaname,
}, $class;
# parse out the query into words
my $query = $results->extract_query_match;
# Do words exist for this layer (all text at this time) and metaname?
# This is a reference to an array of phrases and words
if ( $query && exists $query->{text}{$metaname} ) {
$self->{query} = $query->{text}{$metaname};
$self->set_match_regexp;
}
return $self;
}
sub highlight {
my ( $self, $properties ) = @_;
return unless $self->{query};
my $phrase_array = $self->{query};
my $settings = $self->{settings};
my $metaname = $self->{metaname};
# Do we care about this meta?
return unless exists $settings->{meta_to_prop_map}{$metaname};
# Get the related properties
my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
for ( @props ) {
if ( $properties->{$_} ) {
$self->highlight_text( \$properties->{$_}, $phrase_array );
}
}
}
#==========================================================================
#
sub highlight_text {
my ( $self, $text_ref, $phrase_array ) = @_;
my $settings = $self->{settings};
my $Show_Words = $settings->{show_words} || 10;
my $Occurrences = $settings->{occurrences} || 5;
my $Max_Words = $settings->{max_words} || 100;
my $On = $settings->{highlight_on} || '<b>';
my $Off = $settings->{highlight_off} || '</b>';
my @words = split /\s+/, $$text_ref;
if ( @words > $Max_Words ) {
$$text_ref = join ' ', @words[0..$Max_Words], '<b>...</b>';
}
for ( @{ $self->{matches} } ) {
$$text_ref =~ s/($_)/$On$1$Off/g;
}
}
#============================================
# Returns compiled regular expressions for matching
#
# This builds a list of expressions to match against the text.
sub set_match_regexp {
my $self = shift;
my $results = $self->{results};
my $wc = $results->header('wordcharacters');
$wc = quotemeta $wc;
my @matches;
# loop through all the phrase
for ( @{$self->{query}} ) {
# Fix up wildcards
my $exp = join '[^$wc]+',
map { '\b' . $_ . '\b' }
map { substr( $_, -1, 1 ) eq '*'
? quotemeta( substr( $_, 0, -1) ) . "[$wc]*?"
: quotemeta
} @$_;
push @matches, qr/$exp/i;
}
$self->{matches} = \@matches;
}
1;
1.1 modperl-docs/src/search/modules/TemplateDefault.pm
Index: TemplateDefault.pm
===================================================================
#=====================================================================
# These routines format the HTML output.
# $Id: TemplateDefault.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
#=====================================================================
package TemplateDefault;
use strict;
use CGI;
sub show_template {
my ( $class, $template_params, $results ) = @_;
my $q = $results->CGI;
my $output = $q->header . page_header( $results );
# Show form at top always
$output .= show_form( $results );
if ( $results->results ) {
$output .= results_header( $results );
$output .= show_result( $results, $_ ) for @{ $results->results };
}
# Form after results (or at top if no results)
#$output .= show_form( $results );
$output .= footer( $results );
print $output;
}
#=====================================================================
# This generates the header
sub page_header {
my $results = shift;
my $title = $results->config('title') || 'Search our site with Swish-e';
my $message = $results->errstr;
$message = $message
? qq[<br><font color=red>$message</font>]
: '' ;
my $html_title = $results->results
? ( $results->navigation('hits')
. ' Results for ['
. CGI::escapeHTML( $results->{query_simple} )
. ']'
)
: ( $results->errstr || $title );
return <<EOF;
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
<html>
<head>
<title>
$html_title
</title>
</head>
<body>
<h2>
<a href="http://swish-e.org">
<img border="0" alt="Swish-e home page" src="http://swish-e.org/Images/swish-e.gif"></a> $title $message
</h2>
EOF
}
#=====================================================================
# This generates the form
#
# Pass:
# $results hash
sub show_form {
my $results = shift;
my $q = $results->{q};
my $query = $q->param('query') || '';
$query = CGI::escapeHTML( $query ); # May contain quotes
# Here's some form components
my $meta_select_list = get_meta_name_limits( $results );
my $sorts = get_sort_select_list( $results );
my $select_index = get_index_select_list( $results );
my $limit_select = get_limit_select( $results );
my $date_ranges_select = $results->get_date_ranges;
my $form = $q->script_name;
return <<EOF;
<form method="post" action="$form" enctype="application/x-www-form-urlencoded" class="form">
<input / maxlength="200" value="$query" size="32" type="text" name="query">
<input / value="Search!" type="submit" name="submit"><br>
$meta_select_list
$sorts
$select_index
$limit_select
$date_ranges_select
</form>
EOF
}
#=====================================================================
# This routine creates the results header display
# and navigation bar
#
#
#
sub results_header {
my $results = shift;
my $config = $results->{config};
my $q = $results->{q};
my $swr = $results->header('removed stopwords');
my $stopwords = '';
if ( $swr && ref $swr eq 'ARRAY' ) {
$stopwords = @$swr > 1
? join( ', ', map { "<b>$_</b>" } @$swr ) . ' are very common words and were not included in your search'
: join( ', ', map { "<b>$_</b>" } @$swr ) . ' is a very common word and was not included in your search';
}
my $limits = '';
# Ok, this is ugly.
if ( $results->{DateRanges_time_low} && $results->{DateRanges_time_high} ) {
my $low = scalar localtime $results->{DateRanges_time_low};
my $high = scalar localtime $results->{DateRanges_time_high};
$limits = <<EOF;
<tr>
<td colspan=2>
<font size="-2" face="Geneva, Arial, Helvetica, San-Serif">
Results limited to dates $low to $high
</font>
</td>
</tr>
EOF
}
my $query_href = $results->{query_href};
my $query_simple = CGI::escapeHTML( $results->{query_simple} );
my $pages = $results->navigation('pages');
my $prev = $results->navigation('prev');
my $prev_count = $results->navigation('prev_count');
my $next = $results->navigation('next');
my $next_count = $results->navigation('next_count');
my $hits = $results->navigation('hits');
my $from = $results->navigation('from');
my $to = $results->navigation('to');
my $run_time = $results->navigation('run_time');
my $search_time = $results->navigation('search_time');
my $links = '';
$links .= '<font size="-1" face="Geneva, Arial, Helvetica, San-Serif"> Page:</font>' . $pages
if $pages;
$links .= qq[ <a href="$query_href&start=$prev">Previous $prev_count</a>]
if $prev_count;
$links .= qq[ <a href="$query_href&start=$next">Next $next_count</a>]
if $next_count;
# Save for the bottom of the screen.
$results->{LINKS} = $links;
$links = qq[<tr><td colspan=2 bgcolor="#EEEEEE">$links</td></tr>] if $links;
$query_simple = $query_simple
? " Results for <b>$query_simple</b>"
: '';
return <<EOF;
<table cellpadding=0 cellspacing=0 border=0 width="100%">
<tr>
<td height=20 bgcolor="#FF9999">
<font size="-1" face="Geneva, Arial, Helvetica, San-Serif">
$query_simple
$from to $to of $hits results.
</font>
</td>
<td align=right bgcolor="#FF9999">
<font size="-2" face="Geneva, Arial, Helvetica, San-Serif">
Run time: $run_time |
Search time: $search_time
</font>
</td>
</tr>
$links
$limits
$stopwords
</table>
EOF
}
#=====================================================================
# This routine formats a single result
#
#
sub show_result {
my ($results, $this_result ) = @_;
my $conf = $results->{conf};
my $DocTitle = $results->config('title_property') || 'swishtitle';
my $title = $this_result->{$DocTitle} || $this_result->{swishdocpath} || '?';
my $name_labels = $results->config('name_labels');
# The the properties to display
my $props = '';
my $display_props = $results->config('display_props');
if ( $display_props ) {
$props = join "\n",
'<br><table cellpadding=0 cellspacing=0>',
map ( {
'<tr><td><small>'
. ( $name_labels->{$_} || $_ )
. ':</small></td><td><small> '
. '<b>'
. $this_result->{$_}
. '</b>'
. '</small></td></tr>'
} @$display_props
),
'</table>';
}
my $description_prop = $results->config('description_prop');
my $description = '';
if ( $description_prop ) {
$description = $this_result->{ $description_prop } || '';
}
return <<EOF;
<dl>
<dt>$this_result->{swishreccount} <a href="$this_result->{swishdocpath_href}">$title</a> <small>-- rank: <b>$this_result->{swishrank}</b></small></dt>
<dd>$description
$props
</dd>
</dl>
EOF
}
#=====================================================================
# This is displayed on the bottom of every page
#
#
sub footer {
my $mod_perl = $ENV{MOD_PERL}
? '<br><small>Response brought to you by <a href="http://perl.apache.org"><em>mod_perl</em></a></small>'
: '';
return <<EOF;
<hr>
<small>Powered by <em>Swish-e</em> <a href="http://swish-e.org">swish-e.org</a></small>
$mod_perl
<p>
<a href="http://validator.w3.org/check/referer"><img border="0"
src="http://www.w3.org/Icons/valid-html401"
alt="Valid HTML 4.01!" height="31" width="88"></a>
</p>
</body>
</html>
EOF
}
#==================================================================
# Form setup for sorts and metas
#
# This could be methods of $results object
# (and then available for Template-Toolkit)
# But that's too much HTML in the object, perhaps.
#
#
#==================================================================
sub get_meta_name_limits {
my ( $results ) = @_;
my $metanames = $results->config('metanames');
return '' unless $metanames;
my $name_labels = $results->config('name_labels');
my $q = $results->CGI;
return join "\n",
'Limit search to:',
$q->radio_group(
-name =>'metaname',
-values => $metanames,
-default=>$metanames->[0],
-labels =>$name_labels
),
'<br>';
}
sub get_sort_select_list {
my ( $results ) = @_;
my $sort_metas = $results->config('sorts');
return '' unless $sort_metas;
my $name_labels = $results->config('name_labels');
my $q = $results->CGI;
return join "\n",
'Sort by:',
$q->popup_menu(
-name =>'sort',
-values => $sort_metas,
-default=>$sort_metas->[0],
-labels =>$name_labels
),
$q->checkbox(
-name => 'reverse',
-label => 'Reverse Sort'
);
}
sub get_index_select_list {
my ( $results ) = @_;
my $q = $results->CGI;
my $indexes = $results->config('swish_index');
return '' unless ref $indexes eq 'ARRAY';
my $select_config = $results->config('select_indexes');
return '' unless $select_config && ref $select_config eq 'HASH';
# Should return a warning, as this might be a likely mistake
# This jumps through hoops so that real index file name is not exposed
return '' unless exists $select_config->{labels}
&& ref $select_config->{labels} eq 'ARRAY'
&& @$indexes == @{$select_config->{labels}};
my @labels = @{$select_config->{labels}};
my %map;
for ( 0..$#labels ) {
$map{$_} = $labels[$_];
}
my $method = $select_config->{method} || 'checkbox_group';
my @cols = $select_config->{columns} ? ('-columns', $select_config->{columns}) : ();
return join "\n",
'<br>',
( $select_config->{description} || 'Select: '),
$q->$method(
-name => 'si',
-values => [0..$#labels],
-default=> 0,
-labels => \%map,
@cols );
}
sub get_limit_select {
my ( $results ) = @_;
my $q = $results->CGI;
my $limit = $results->config('select_by_meta');
return '' unless ref $limit eq 'HASH';
my $method = $limit->{method} || 'checkbox_group';
my @options = (
-name => 'sbm',
-values => $limit->{values},
-labels => $limit->{labels} || {},
);
push @options, ( -columns=> $limit->{columns} ) if $limit->{columns};
return join "\n",
'<br>',
( $limit->{description} || 'Select: '),
$q->$method( @options );
}
1;
1.1 modperl-docs/src/search/modules/TemplateDumper.pm
Index: TemplateDumper.pm
===================================================================
#===============================================================================
# This just dumps out the results object.
# Need to use a query string to do anything interesting
#
# $Id: TemplateDumper.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
#
#===============================================================================
package TemplateDumper;
use strict;
use Data::Dumper;
sub show_template {
my ( $class, $template_params, $results ) = @_;
$Data::Dumper::Quotekeys = 0;
print "Content-Type: text/plain\n\n",
Dumper $template_params,
Dumper $results;
}
1;
1.1 modperl-docs/src/search/modules/TemplateHTMLTemplate.pm
Index: TemplateHTMLTemplate.pm
===================================================================
#=======================================================================
# Module for using Template-Toolkit for generating output
# $Id: TemplateHTMLTemplate.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
#
# This module probably does not automatically support all the features
# of the swish.cgi script (such as selecting index files). See
# the TemplateToolkit.pm module for examples.
#
#=======================================================================
package TemplateHTMLTemplate;
use strict;
use HTML::Template;
use HTML::FillInForm;
use CGI ();
use vars '$Template';
use TemplateDefault; # ugly hack
sub show_template {
my ( $class, $template_params, $results ) = @_;
my $cgi = $results->CGI;
my $template = HTML::Template->new( %{$template_params->{options}} );
my $params = {
TITLE => ($results->config('title') || 'Search Page'),
QUERY_SIMPLE => CGI::escapeHTML( $results->{query_simple} ),
MESSAGE => CGI::escapeHTML( $results->errstr ),
QUERY_HREF => $results->{query_href},
MY_URL => $cgi->script_name,
HITS => $results->navigation('hits'),
FROM => $results->navigation('from'),
TO => $results->navigation('to'),
SHOWING => $results->navigation('showing'),
PAGES => $results->navigation('pages'),
NEXT => $results->navigation('next'),
NEXT_COUNT => $results->navigation('next_count'),
PREV => $results->navigation('prev'),
PREV_COUNT => $results->navigation('prev_count'),
RUN_TIME => $results->header('run time') || 'unknown',
SEARCH_TIME => $results->header('search time') || 'unknown',
MOD_PERL => $ENV{MOD_PERL},
};
$params->{FILES} = $results->results if $results->results;
my $MapNames = $results->config('name_labels') || {};
my $Sorts = $results->config('sorts');
my $MetaNames = $results->config('metanames');
# set a default
if ( $MetaNames && !$cgi->param('metanames') ) {
$cgi->param('metaname', $MetaNames->[0] );
}
$params->{SORTS} = [ map { { NAME => $_, LABEL => ($MapNames->{$_} || $_) } } @$Sorts ] if $Sorts;
$params->{METANAMES} = [ map { { NAME => $_, LABEL => ($MapNames->{$_} || $_) } } @$MetaNames ] if $MetaNames;
$template->param( $params );
my $page = $template->output;
my $fif = new HTML::FillInForm;
print $cgi->header,
'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">',
$fif->fill(
scalarref => \$page,
fobject => $cgi,
);
}
1;
1.1 modperl-docs/src/search/modules/TemplateToolkit.pm
Index: TemplateToolkit.pm
===================================================================
#=======================================================================
# Module for using Template-Toolkit for generating output
# $Id: TemplateToolkit.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
#
#=======================================================================
package TemplateToolkit;
use strict;
use Template;
use vars '$Template';
sub show_template {
my ( $class, $template_params, $results ) = @_;
my $cgi = $results->CGI;
#/* Cached if running under mod_perl */
$Template ||= Template->new( $template_params->{options} );
die $Template->error() unless $Template;
print $cgi->header;
my $subclass = TemplateToolkit::Helpers->new( $results );
my $vars = {
search => $subclass,
CGI => $results->CGI,
};
$Template->process( $template_params->{file}, $vars )
|| die "Template process failed for page '$template_params->{file}' ", $Template->error(), "\n";
}
#==================================================================
# Form setup for sorts and metas
#
# This could be methods of $results object
# (and then available for Template-Toolkit)
# But that's too much HTML in the object, perhaps.
#
#
#==================================================================
package TemplateToolkit::Helpers;
use strict;
sub new {
my ( $class, $results ) = @_;
@TemplateToolkit::Helpers::ISA = ref $results; # that doesn't look right.
return bless $results, $class;
}
sub get_meta_name_limits {
my ( $results ) = @_;
my $metanames = $results->config('metanames');
return '' unless $metanames;
my $name_labels = $results->config('name_labels');
my $q = $results->CGI;
return join "\n",
'Limit search to:',
$q->radio_group(
-name =>'metaname',
-values => $metanames,
-default=>$metanames->[0],
-labels =>$name_labels
),
'<br>';
}
sub get_sort_select_list {
my ( $results ) = @_;
my $sort_metas = $results->config('sorts');
return '' unless $sort_metas;
my $name_labels = $results->config('name_labels');
my $q = $results->CGI;
return join "\n",
'Sort by:',
$q->popup_menu(
-name =>'sort',
-values => $sort_metas,
-default=>$sort_metas->[0],
-labels =>$name_labels
),
$q->checkbox(
-name => 'reverse',
-label => 'Reverse Sort'
);
}
sub get_index_select_list {
my ( $results ) = @_;
my $q = $results->CGI;
my $indexes = $results->config('swish_index');
return '' unless ref $indexes eq 'ARRAY';
my $select_config = $results->config('select_indexes');
return '' unless $select_config && ref $select_config eq 'HASH';
# Should return a warning, as this might be a likely mistake
# This jumps through hoops so that real index file name is not exposed
return '' unless exists $select_config->{labels}
&& ref $select_config->{labels} eq 'ARRAY'
&& @$indexes == @{$select_config->{labels}};
my @labels = @{$select_config->{labels}};
my %map;
for ( 0..$#labels ) {
$map{$_} = $labels[$_];
}
my $method = $select_config->{method} || 'checkbox_group';
my @cols = $select_config->{columns} ? ('-columns', $select_config->{columns}) : ();
return join "\n",
'<br>',
( $select_config->{description} || 'Select: '),
$q->$method(
-name => 'si',
-values => [0..$#labels],
-default=> 0,
-labels => \%map,
@cols );
}
sub get_limit_select {
my ( $results ) = @_;
my $q = $results->CGI;
my $limit = $results->config('select_by_meta');
return '' unless ref $limit eq 'HASH';
my $method = $limit->{method} || 'checkbox_group';
my @options = (
-name => 'sbm',
-values => $limit->{values},
-labels => $limit->{labels} || {},
);
push @options, ( -columns=> $limit->{columns} ) if $limit->{columns};
return join "\n",
'<br>',
( $limit->{description} || 'Select: '),
$q->$method( @options );
}
sub stopwords_removed {
my $results = shift;
my $swr = $results->header('removed stopwords');
my $stopwords = '';
if ( $swr && ref $swr eq 'ARRAY' ) {
$stopwords = @$swr > 1
? join( ', ', map { "<b>$_</b>" } @$swr ) . ' are very common words and were not included in your search'
: join( ', ', map { "<b>$_</b>" } @$swr ) . ' is a very common word and was not included in your search';
}
return $stopwords;
}
1;
---------------------------------------------------------------------
To unsubscribe, e-mail: docs-cvs-unsubscribe@perl.apache.org
For additional commands, e-mail: docs-cvs-help@perl.apache.org