You are viewing a plain text version of this content. The canonical link for it is here.
Posted to docs-cvs@perl.apache.org by st...@apache.org on 2002/01/30 07:35:01 UTC

cvs commit: modperl-docs/src/search/modules DateRanges.pm DefaultHighlight.pm PhraseHighlight.pm PhraseTest.pm SimpleHighlight.pm TemplateDefault.pm TemplateDumper.pm TemplateHTMLTemplate.pm TemplateToolkit.pm

stas        02/01/29 22:35:01

  Modified:    src      config.cfg
  Added:       src/search .htaccess .swishcgi.conf search.tt
                        searchresults.html spider.pl swish.cgi swish.conf
               src/search/modules DateRanges.pm DefaultHighlight.pm
                        PhraseHighlight.pm PhraseTest.pm SimpleHighlight.pm
                        TemplateDefault.pm TemplateDumper.pm
                        TemplateHTMLTemplate.pm TemplateToolkit.pm
  Removed:     src      searchresults.html
  Log:
  - add the search facility
  
  Revision  Changes    Path
  1.7       +2 -1      modperl-docs/src/config.cfg
  
  Index: config.cfg
  ===================================================================
  RCS file: /home/cvs/modperl-docs/src/config.cfg,v
  retrieving revision 1.6
  retrieving revision 1.7
  diff -u -r1.6 -r1.7
  --- config.cfg	28 Jan 2002 03:36:58 -0000	1.6
  +++ config.cfg	30 Jan 2002 06:35:00 -0000	1.7
  @@ -51,7 +51,7 @@
                qw(
                   404.html
                   creation.html
  -                searchresults.html
  +                search/searchresults.html
                  )
            ],
        ],
  @@ -62,6 +62,7 @@
               images/*
               robots.txt
               .htaccess
  +            search
              )
        ],
   
  
  
  
  1.1                  modperl-docs/src/search/.htaccess
  
  Index: .htaccess
  ===================================================================
  deny from all
  <files swish.cgi>
     allow from all
     Options +ExecCGI
  </files>
  
  
  
  
  1.1                  modperl-docs/src/search/.swishcgi.conf
  
  Index: .swishcgi.conf
  ===================================================================
  return {
      title => 'Search mod_perl Site',
      template => {
              package         => 'TemplateToolkit',
              file            => 'search.tt',
              options         => {
                  INCLUDE_PATH    => '.',
              },
          },
  };
  
  
  
  1.1                  modperl-docs/src/search/search.tt
  
  Index: search.tt
  ===================================================================
  
  
  [% # This is just an example -- you would want your own "page" for the wrapper %]
  
  [% WRAPPER searchresults.html %]
  
      [% PROCESS swish_header %]
  
  
      [% title = PROCESS title %]
  
      [% IF ! search.results %]
          [% PROCESS show_message %]
          [% PROCESS search_form %]
  
      [% ELSE %]
          [% PROCESS search_form %]
          [% PROCESS nav_bar %]
          [% PROCESS results_list %]
      [% END %]
  
      [% PROCESS swish_footer %]
  
  [% END %]
  
  
  [% # This is just an example -- you would want your own "page" to wrap around "swish" %]
  [% BLOCK page %]
  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
  <html>
  <head>
  <title>
    [% title || "Welcome"  %]
  </title>
  <body>
     [% content %]
  </body>    
  </html>    
  [% END %]
  
  
  [% BLOCK title %]
      [% IF ! search.results %]
          [% search.errstr || search.config('title') || "Search page" %]
  
      [% ELSE %]
          [% search.navigation('hits') %] Results for [[% search.query_simple | html %]]
      [% END %]
  [% END %]
  
  [% BLOCK swish_header %]    
  
      <table border="0" cellpadding="0" cellspacing="0">
      <tr><td>
          <a href="http://swish-e.org">
          <img border="0" alt="Swish-e home page" src="http://swish-e.org/Images/swish-e.gif"></a>
      </td><td valign="middle">
          <font size="+3">[% search.config('title') || "Search page"  %]</font>
      </td></tr>
      </table>
  
  [% END %]    
  
  [% BLOCK swish_footer %]
      <hr>
      <small>Powered by <em>Swish-e</em> <a href="http://swish-e.org">swish-e.org</a></small>
      [% IF search.MOD_PERL %]
          <br><small>Response brought to you by <a href="http://perl.apache.org"><em>mod_perl</em></a></small>
      [% END %]
    <p>
      <a href="http://validator.w3.org/check/referer"><img border="0"
          src="http://www.w3.org/Icons/valid-html401"
          alt="Valid HTML 4.01!" height="31" width="88"></a>
    </p>
  [% END %]    
  
  
  
  [% BLOCK show_message %]
      [% IF search.errstr %]
          <font size="+2" color="red">[% search.errstr | html %]</font>
      [% END %]
  [% END %]
  
  
  
  [% BLOCK search_form %]
      [% CGI.start_form( '-action' => CGI.script_name) %]
          [% CGI.textfield( {
              name    => 'query',
              size    => 40,
              maxlength => 200,
              } ) %]
  
          [% CGI.submit('submit','Search!') %]<br>
          [% search.get_meta_name_limits %]
          [% search.get_sort_select_list %]
          [% search.get_index_select_list %]
          [% search.get_limit_select %]
          
      [% CGI.end_form.join('') %]
  
  [% END %]
  
  
  [% BLOCK nav_bar %]
      [% search.stopwords_removed %]
      <table cellpadding=0 cellspacing=0 border=0 width="100%">
          <tr>
              <td height=20 bgcolor="#FF9999">
                  <font size="-1" face="Geneva, Arial, Helvetica, San-Serif">
                  &nbsp;Results for <b>[% search.query_simple | html %]</b>
                  &nbsp; [% search.navigation('from') %] to [% search.navigation('to') %] of [% search.navigation('hits') %] results.
                  </font>
              </td>
              <td align=right bgcolor="#FF9999">
                  <font size="-2" face="Geneva, Arial, Helvetica, San-Serif">
                  Run time: [% search.navigation('run_time') %] |
                  Search time: [% search.navigation('search_time') %] &nbsp; &nbsp;
                  </font>
              </td>
          </tr>
  
          [% IF search.navigation('pages') %]
  
          <tr>
              <td colspan=2 bgcolor="#EEEEEE">
                  <font size="-1" face="Geneva, Arial, Helvetica, San-Serif">&nbsp;Page:</font>
                  [% search.navigation('pages') %]
  
                  [% IF search.navigation('prev_count') %]
                      <a href="[% search.query_href %]&amp;start=[% search.navigation('prev') %]">
                      Previous [% search.navigation('prev_count') %]</a>
                  [% END %]
          
                  [% IF search.navigation('next_count') %]
                      <a href="[% search.query_href %]&amp;start=[% search.navigation('next') %]">
                      Next [% search.navigation('next_count') %]</a>
                  [% END %]
              </td>
          </tr>
          [% END %]
  
      </table>
  [% END %]
  
  
  [% BLOCK results_list %]
      [% FOREACH item = search.results %]
          <dl>
              <dt>
                  [% item.swishreccount %]
                   <a href="[% item.swishdocpath_href %]">
                  [% ( item.swishtitle || item.swishdocpath )  %]
                  </a>
  
                  <small>-- rank: <b>[% item.swishrank %]</b></small>
              </dt>
  
              <dd>
                  [% item.swishdescription %]
                  <br>
                  <small>
                      [% item.swishdocpath %]
                      - [% item.swishdocsize %] bytes
                      - [% item.swishlastmodified %]
                  </small>
              </dd>
          </dl>
      [% END %]
  [% END %]
      
  
      
  
  
  
  
  1.1                  modperl-docs/src/search/searchresults.html
  
  Index: searchresults.html
  ===================================================================
  <html>
    <head>
      <title>[% title %]</title>
      <meta name="Description" content="search results">
    </head>
    <body bgcolor="white">
  
        [% content %]
  
    </body>
  </html>
  
  
  
  1.1                  modperl-docs/src/search/spider.pl
  
  Index: spider.pl
  ===================================================================
  #!/usr/local/bin/perl -w
  use strict;
  
  
  # $Id: spider.pl,v 1.1 2002/01/30 06:35:00 stas Exp $
  #
  # "prog" document source for spidering web servers
  #
  # For documentation, type:
  #
  #       perldoc spider.pl
  #
  # Apr 7, 2001 -- added quite a bit of bulk for easier debugging
  #
  # Nov 19, 2001 -- to do, build a server object so we are not using the passed in hash,
  #                 and so we can warn on invalid config settings.
  
  $HTTP::URI_CLASS = "URI";   # prevent loading default URI::URL
                              # so we don't store long list of base items
                              # and eat up memory with >= URI 1.13
  use LWP::RobotUA;
  use HTML::LinkExtor;
  use HTML::Tagset;
  
  use vars '$VERSION';
  $VERSION = sprintf '%d.%02d', q$Revision: 1.1 $ =~ /: (\d+)\.(\d+)/;
  
  use vars '$bit';
  use constant DEBUG_ERRORS   => $bit = 1;    # program errors
  use constant DEBUG_URL      => $bit <<= 1;  # print out every URL processes
  use constant DEBUG_HEADERS  => $bit <<= 1;  # prints the response headers
  use constant DEBUG_FAILED   => $bit <<= 1;  # failed to return a 200
  use constant DEBUG_SKIPPED  => $bit <<= 1;  # didn't index for some reason
  use constant DEBUG_INFO     => $bit <<= 1;  # more verbose
  use constant DEBUG_LINKS    => $bit <<= 1;  # prints links as they are extracted
  
  
  use constant MAX_SIZE       => 5_000_000;   # Max size of document to fetch
  
  #Can't locate object method "host" via package "URI::mailto" at ../prog-bin/spider.pl line 473.
  #sub URI::mailto::host { return '' };
  
  sub UNIVERSAL::host { '' };
  sub UNIVERSAL::port { '' };
  sub UNIVERSAL::host_port { '' };
  
  
      
  #-----------------------------------------------------------------------
  
      use vars '@servers';
  
      my $config = shift || 'SwishSpiderConfig.pl';
  
      if ( lc( $config ) eq 'default' ) {
          @servers = default_urls();
      } else {
          do $config or die "Failed to read $0 configuration parameters '$config' $! $@";
      }
  
  
      print STDERR "$0: Reading parameters from '$config'\n";
  
      my $abort;
      local $SIG{HUP} = sub { $abort++ };
  
      my %visited;  # global -- I suppose would be smarter to localize it per server.
  
      my %validated;
      my %bad_links;
  
      for my $s ( @servers ) {
          if ( !$s->{base_url} ) {
              die "You must specify 'base_url' in your spider config settings\n";
          }
  
          for (ref $s->{base_url} eq 'ARRAY' ? @{$s->{base_url}} : $s->{base_url} ) {
              $s->{base_url} = $_;
              process_server( $s );
          }
      }
  
  
      if ( %bad_links ) {
          print STDERR "\nBad Links:\n\n";
          foreach my $page ( sort keys %bad_links ) {
              print STDERR "On page: $page\n";
              printf(STDERR " %-40s  %s\n", $_, $validated{$_} ) for @{$bad_links{$page}};
              print STDERR "\n";
          }
      }
  
  
  #-----------------------------------------------------------------------
  
  
  sub process_server {
      my $server = shift;
  
      # set defaults
  
      $server->{debug} ||= 0;
      $server->{debug} = 0 unless $server->{debug} =~ /^\d+$/;
  
  
      $server->{max_size} ||= MAX_SIZE;
      die "max_size parameter '$server->{max_size}' must be a number\n" unless $server->{max_size} =~ /^\d+$/;
  
      $server->{link_tags} = ['a'] unless ref $server->{link_tags} eq 'ARRAY';
      $server->{link_tags_lookup} = { map { lc, 1 } @{$server->{link_tags}} };
  
      die "max_depth parameter '$server->{max_depth}' must be a number\n" if defined $server->{max_depth} && $server->{max_depth} !~ /^\d+/;
  
  
      for ( qw/ test_url test_response filter_content/ ) {
          next unless $server->{$_};
          $server->{$_} = [ $server->{$_} ] unless ref $server->{$_} eq 'ARRAY';
          my $n;
          for my $sub ( @{$server->{$_}} ) {
              $n++;
              die "Entry number $n in $_ is not a code reference\n" unless ref $sub eq 'CODE';
          }
      }
  
      
  
      my $start = time;
  
      if ( $server->{skip} ) {
          print STDERR "Skipping: $server->{base_url}\n";
          return;
      }
  
      require "HTTP/Cookies.pm" if $server->{use_cookies};
      require "Digest/MD5.pm" if $server->{use_md5};
              
  
      # set starting URL, and remove any specified fragment
      my $uri = URI->new( $server->{base_url} );
      $uri->fragment(undef);
  
      print STDERR "\n -- Starting to spider: $uri --\n" if $server->{debug};
  
      
  
      # set the starting server name (including port) -- will only spider on server:port
      
      $server->{authority} = $uri->authority;
      $server->{same} = [ $uri->authority ];
      push @{$server->{same}}, @{$server->{same_hosts}} if ref $server->{same_hosts};
  
      $server->{same_host_lookup} = { map { $_, 1 } @{$server->{same}} };
  
      
  
  
      # set time to end
      
      $server->{max_time} = $server->{max_time} * 60 + time
          if $server->{max_time};
  
  
      # set default agent for log files
  
      $server->{agent} ||= 'swish-e spider 2.2 http://swish-e.org/';
  
  
      # get a user agent object
  
  
      my $ua;
      if ( $server->{ignore_robots_file} ) {
          $ua = LWP::UserAgent->new( );
          return unless $ua;
          $ua->agent( $server->{agent} );
          $ua->from( $server->{email} );
  
      } else {
          $ua = LWP::RobotUA->new( $server->{agent}, $server->{email} );
          return unless $ua;
          $ua->delay( $server->{delay_min} || 0.1 );
      }
  
          
      $server->{ua} = $ua;  # save it for fun.
      # $ua->parse_head(0);   # Don't parse the content
  
      $ua->cookie_jar( HTTP::Cookies->new ) if $server->{use_cookies};
  
      if ( $server->{keep_alive} ) {
  
          if ( $ua->can( 'conn_cache' ) ) {
              my $keep_alive = $server->{keep_alive} =~ /^\d+$/ ? $server->{keep_alive} : 1;
              $ua->conn_cache( { total_capacity => $keep_alive } );
  
          } else {
              warn "Can't use keep-alive: conn_cache method not available\n";
          }
      }
  
  
      # uri, parent, depth
      eval { spider( $server, $uri ) };
      print STDERR $@ if $@;
  
      $start = time - $start;
      $start++ unless $start;
  
      my $max_width = 0;
      my $max_num = 0;
      for ( keys %{$server->{counts}} ) {
          $max_width = length if length > $max_width;
          my $val = commify( $server->{counts}{$_} );
          $max_num = length $val if length $val > $max_num;
      }
  
      printf STDERR "\nSummary for: $server->{base_url}\n";
  
      for ( sort keys %{$server->{counts}} ) {
          printf STDERR "%${max_width}s: %${max_num}s  (%0.1f/sec)\n",
              $_,
              commify( $server->{counts}{$_} ),
              $server->{counts}{$_}/$start;
      }
  }
  
  #----------- Non recursive spidering ---------------------------
  
  sub spider {
      my ( $server, $uri ) = @_;
  
      # Validate the first link, just in case
      return unless check_link( $uri, $server, '', '(Base URL)' );
  
      my @link_array = [ $uri, '', 0 ];
  
      while ( @link_array ) {
  
          die if $abort || $server->{abort};
  
          my ( $uri, $parent, $depth ) = @{shift @link_array};
          
          my $new_links = process_link( $server, $uri, $parent, $depth );
  
          push @link_array, map { [ $_, $uri, $depth+1 ] } @$new_links if $new_links;
              
      }
  }    
          
  
  #----------- Process a url and return links -----------------------
  sub process_link {
      my ( $server, $uri, $parent, $depth ) = @_;
  
  
      $server->{counts}{'Unique URLs'}++;
  
      die "$0: Max files Reached\n"
          if $server->{max_files} && $server->{counts}{'Unique URLs'} > $server->{max_files};
  
      die "$0: Time Limit Exceeded\n"
          if $server->{max_time} && $server->{max_time} < time;
  
  
  
      # make request
      my $ua = $server->{ua};
      my $request = HTTP::Request->new('GET', $uri );
  
  
      my $content = '';
  
      # Really should just subclass the response object!
      $server->{no_contents} = 0;
      $server->{no_index} = 0;
      $server->{no_spider} = 0;
  
      my $been_here;
      my $callback = sub {
  
          die "test_response" if !$been_here++ && !check_user_function( 'test_response', $uri, $server, $_[1], \$_[0]  );
              
  
          if ( length( $content ) + length( $_[0] ) > $server->{max_size} ) {
              print STDERR "-Skipped $uri: Document exceeded $server->{max_size} bytes\n" if $server->{debug}&DEBUG_SKIPPED;
              die "too big!\n";
          }
  
          $content .= $_[0];
  
      };
  
      my $response = $ua->simple_request( $request, $callback, 4096 );
  
  
      return if $server->{abort};
  
  
      # Log the response
      
      if ( ( $server->{debug} & DEBUG_URL ) || ( $server->{debug} & DEBUG_FAILED && !$response->is_success)  ) {
          print STDERR '>> ',
            join( ' ',
                  ( $response->is_success ? '+Fetched' : '-Failed' ),
                  $depth,
                  "Cnt: $server->{counts}{'Unique URLs'}",
                  $uri,
                  ( $response->status_line || $response->status || 'unknown status' ),
                  ( $response->content_type || 'Unknown content type'),
                  ( $response->content_length || '???' ),
                  "parent:$parent",
             ),"\n";
      }
  
  
  
  
      # If the LWP callback aborts
  
      if ( $response->header('client-aborted') ) {
          $server->{counts}{Skipped}++;
          return;
      }
  
  
  
      # skip excluded by robots.txt
      
      if ( !$response->is_success && $response->status_line =~ 'robots.txt' ) {
          print STDERR "-Skipped $depth $uri: ", $response->status_line,"\n" if $server->{debug}&DEBUG_SKIPPED;
          $server->{counts}{'robots.txt'}++;
          return;
      }
  
  
      # Report bad links (excluding those skipped by robots.txt
  
      if ( $server->{validate_links} && !$response->is_success ) {
          validate_link( $server, $uri, $parent, $response );
      }
  
  
  
      # And check for meta robots tag
      # -- should probably be done in request sub to avoid fetching docs that are not needed
  
      unless ( $server->{ignore_robots_file} ) {
          if ( my $directives = $response->header('X-Meta-ROBOTS') ) {
              my %settings = map { lc $_, 1 } split /\s*,\s*/, $directives;
              $server->{no_contents}++ if exists $settings{nocontents};  # an extension for swish
              $server->{no_index}++    if exists $settings{noindex};
              $server->{no_spider}++   if exists $settings{nofollow};
          }
      }
  
  
  
  
      print STDERR "\n----HEADERS for $uri ---\n", $response->headers_as_string,"-----END HEADERS----\n\n"
         if $server->{debug} & DEBUG_HEADERS;
  
  
      unless ( $response->is_success ) {
  
          # look for redirect
          if ( $response->is_redirect && $response->header('location') ) {
              my $u = URI->new_abs( $response->header('location'), $response->base );
  
              if ( $u->canonical eq $uri->canonical ) {
                  print STDERR "Warning: $uri redirects to itself!.\n";
                  return;
              }
  
              return [$u] if check_link( $u, $server, $response->base, '(redirect)','Location' );
          }
          return;
      }
  
      return unless $content;  # $$$ any reason to index empty files?
      
  
      # make sure content is unique - probably better to chunk into an MD5 object above
  
      if ( $server->{use_md5} ) {
          my $digest =  Digest::MD5::md5($content);
  
          if ( $visited{ $digest } ) {
  
              print STDERR "-Skipped $uri has same digest as $visited{ $digest }\n"
                  if $server->{debug} & DEBUG_SKIPPED;
          
              $server->{counts}{Skipped}++;
              $server->{counts}{'MD5 Duplicates'}++;
              return;
          }
          $visited{ $digest } = $uri;
      }
  
  
      # Extract out links (if not too deep)
  
      my $links_extracted = extract_links( $server, \$content, $response )
          unless defined $server->{max_depth} && $depth >= $server->{max_depth};
  
  
      # Index the file
      
      if ( $server->{no_index} ) {
          $server->{counts}{Skipped}++;
          print STDERR "-Skipped indexing $uri some callback set 'no_index' flag\n" if $server->{debug}&DEBUG_SKIPPED;
  
      } else {
          return unless check_user_function( 'filter_content', $uri, $server, $response, \$content );
  
          output_content( $server, \$content, $uri, $response )
              unless $server->{no_index};
      }
  
  
  
      return $links_extracted;
  }
  
  #===================================================================================================
  #  Calls a user-defined function
  #
  #---------------------------------------------------------------------------------------------------
  
  sub check_user_function {
      my ( $fn, $uri, $server ) = ( shift, shift, shift );
      
      return 1 unless $server->{$fn};
  
      my $tests = ref $server->{$fn} eq 'ARRAY' ? $server->{$fn} : [ $server->{$fn} ];
  
      my $cnt;
  
      for my $sub ( @$tests ) {
          $cnt++;
          print STDERR "?Testing '$fn' user supplied function #$cnt '$uri'\n" if $server->{debug} & DEBUG_INFO;
  
          my $ret;
          
          eval { $ret = $sub->( $uri, $server, @_ ) };
  
          if ( $@ ) {
              print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt death '$@'\n" if $server->{debug} & DEBUG_SKIPPED;
              $server->{counts}{Skipped}++;
              return;
          }
              
          next if $ret;
          
          print STDERR "-Skipped $uri due to '$fn' user supplied function #$cnt\n" if $server->{debug} & DEBUG_SKIPPED;
          $server->{counts}{Skipped}++;
          return;
      }
      print STDERR "+Passed all $cnt tests for '$fn' user supplied function\n" if $server->{debug} & DEBUG_INFO;
      return 1;
  }
  
  
  #==============================================================================================
  #  Extract links from a text/html page
  #
  #   Call with:
  #       $server - server object
  #       $content - ref to content
  #       $response - response object
  #
  #----------------------------------------------------------------------------------------------
      
  sub extract_links {
      my ( $server, $content, $response ) = @_;
  
      return unless $response->header('content-type') &&
                       $response->header('content-type') =~ m[^text/html];
  
      # allow skipping.
      if ( $server->{no_spider} ) {
          print STDERR '-Links not extracted: ', $response->request->uri->canonical, " some callback set 'no_spider' flag\n" if $server->{debug}&DEBUG_SKIPPED;
          return;
      }
  
      $server->{Spidered}++;
  
      my @links;
  
  
      my $base = $response->base;
  
      print "\nExtracting links from ", $response->request->uri, ":\n" if $server->{debug} & DEBUG_LINKS;
  
      my $p = HTML::LinkExtor->new;
      $p->parse( $$content );
  
      my %skipped_tags;
  
      for ( $p->links ) {
          my ( $tag, %attr ) = @$_;
  
          # which tags to use ( not reported in debug )
  
          print STDERR " ?? Looking at extracted tag '$tag'\n" if $server->{debug} & DEBUG_LINKS;
  
          unless ( $server->{link_tags_lookup}{$tag} ) {
          
              # each tag is reported only once per page
              print STDERR
                  " ?? <$tag> skipped because not one of (",
                  join( ',', @{$server->{link_tags}} ),
                  ")\n" if $server->{debug} & DEBUG_LINKS && !$skipped_tags{$tag}++;
  
              if ( $server->{validate_links} && $tag eq 'img' && $attr{src} ) {
                  my $img = URI->new_abs( $attr{src}, $base );
                  validate_link( $server, $img, $base );
              }
                       
              next;
          }
          
          # Grab which attribute(s) which might contain links for this tag
          my $links = $HTML::Tagset::linkElements{$tag};
          $links = [$links] unless ref $links;
  
  
          my $found;
  
  
          # Now, check each attribut to see if a link exists
          
          for my $attribute ( @$links ) {
              if ( $attr{ $attribute } ) {  # ok tag
  
                  # Create a URI object
                  
                  my $u = URI->new_abs( $attr{$attribute},$base );
  
                  next unless check_link( $u, $server, $base, $tag, $attribute );
                  
                  push @links, $u;
                  print STDERR qq[ ++ <$tag $attribute="$u"> Added to list of links to follow\n] if $server->{debug} & DEBUG_LINKS;
                  $found++;
              }
          }
  
  
          if ( !$found && $server->{debug} & DEBUG_LINKS ) {
              my $s = "<$tag";
              $s .= ' ' . qq[$_="$attr{$_}"] for sort keys %attr;
              $s .= '>';
                  
              print STDERR " ?? tag $s did not include any links to follow\n";
          }
          
      }
  
      print STDERR "! Found ", scalar @links, " links in ", $response->base, "\n\n" if $server->{debug} & DEBUG_INFO;
  
  
      return \@links;
  }
  
  
  
  
  #=============================================================================
  # This function check's if a link should be added to the list to spider
  #
  #   Pass:
  #       $u - URI object
  #       $server - the server hash
  #       $base - the base or parent of the link
  #
  #   Returns true if a valid link
  #
  #   Calls the user function "test_url".  Link rewriting before spider
  #   can be done here.
  #
  #------------------------------------------------------------------------------
  sub check_link {
      my ( $u, $server, $base, $tag, $attribute ) = @_;
      
      $tag ||= '';
      $attribute ||= '';
  
  
      # Kill the fragment
      $u->fragment( undef );
  
  
      # This should not happen, but make sure we have a host to check against
  
      unless ( $u->host ) {
          print STDERR qq[ ?? <$tag $attribute="$u"> skipped because missing host name\n] if $server->{debug} & DEBUG_LINKS;
          return;
      }
  
  
      # Here we make sure we are looking at a link pointing to the correct (or equivalent) host
  
      unless ( $server->{same_host_lookup}{$u->authority} ) {
  
          print STDERR qq[ ?? <$tag $attribute="$u"> skipped because different authority (server:port)\n] if $server->{debug} & DEBUG_LINKS;
          $server->{counts}{'Off-site links'}++;
          validate_link( $server, $u, $base ) if $server->{validate_links};
          return;
      }
      
      $u->authority( $server->{authority} );  # Force all the same host name
  
      # Allow rejection of this URL by user function
  
      return unless check_user_function( 'test_url', $u, $server );
  
  
      # Don't add the link if already seen  - these are so common that we don't report
  
      if ( $visited{ $u->canonical }++ ) {
          #$server->{counts}{Skipped}++;
          $server->{counts}{Duplicates}++;
  
  
          # Just so it's reported for all pages 
          if ( $server->{validate_links} && $validated{$u->canonical} ) {
              push @{$bad_links{ $base->canonical }}, $u->canonical;
          }
  
          return;
      }
  
      return 1;
  }
  
  
  #=============================================================================
  # This function is used to validate links that are off-site.
  #
  #   It's just a very basic link check routine that lets you validate the
  #   off-site links at the same time as indexing.  Just because we can.
  #
  #------------------------------------------------------------------------------
  sub validate_link {
      my ($server, $uri, $base, $response ) = @_;
  
     # Already checked? 
  
      if ( exists $validated{ $uri->canonical } )
      {
          # Add it to the list of bad links on that page if it's a bad link.
          push @{$bad_links{ $base->canonical }}, $uri->canonical
              if $validated{ $uri->canonical };
  
          return;            
      }
  
      $validated{ $uri->canonical } = 0;  # mark as checked and ok.
  
      unless ( $response ) {
          my $ua = LWP::UserAgent->new;
          my $request = HTTP::Request->new('HEAD', $uri->canonical );
  
          eval {
              $SIG{ALRM} = sub { die "timed out\n" };
              alarm 5;
              $response = $ua->simple_request( $request );
              alarm 0;
          };
  
          if ( $@ ) {
              $server->{counts}{'Bad Links'}++;
              my $msg = $@;
              $msg =~ tr/\n//s;
              $validated{ $uri->canonical } = $msg;
              push @{$bad_links{ $base->canonical }}, $uri->canonical;
              return;
          }
      }
  
      return if $response->is_success;
  
      my $error = $response->status_line || $response->status || 'unknown status';
  
      $error .= ' ' . URI->new_abs( $response->header('location'), $response->base )->canonical
          if $response->is_redirect && $response->header('location');
  
      $validated{ $uri->canonical } = $error;
      push @{$bad_links{ $base->canonical }}, $uri->canonical;
  }
      
  
  sub output_content {
      my ( $server, $content, $uri, $response ) = @_;
  
      $server->{indexed}++;
  
      unless ( length $$content ) {
          print STDERR "Warning: document '", $response->request->uri, "' has no content\n";
          $$content = ' ';
      }
  
  
      $server->{counts}{'Total Bytes'} += length $$content;
      $server->{counts}{'Total Docs'}++;
  
  
      my $headers = join "\n",
          'Path-Name: ' .  $uri,
          'Content-Length: ' . length $$content,
          '';
  
      $headers .= 'Last-Mtime: ' . $response->last_modified . "\n"
          if $response->last_modified;
  
  
      $headers .= "No-Contents: 1\n" if $server->{no_contents};
      print "$headers\n$$content";
  
      die "$0: Max indexed files Reached\n"
          if $server->{max_indexed} && $server->{counts}{'Total Docs'} >= $server->{max_indexed};
  }
          
  
  
  sub commify {
      local $_  = shift;
      1 while s/^([-+]?\d+)(\d{3})/$1,$2/;
      return $_;
  }
  
  sub default_urls {
      die "$0: Must list URLs when using 'default'\n" unless @ARGV;
  
      my $validate = 0;
      if ( $ARGV[0] eq 'validate' ) {
          shift @ARGV;
          $validate = 1;
      }
  
      my @content_types  = qw{ text/html text/plain };
  
      return map {
          {
              #debug => DEBUG_HEADERS,
              #debug => DEBUG_URL|DEBUG_SKIPPED|DEBUG_INFO,
              base_url        => \@ARGV,
              email           => 'swish@domain.invalid',
              delay_min       => .0001,
              link_tags       => [qw/ a frame /],
              txest_url        => sub { $_[0]->path !~ /\.(?:gif|jpeg|.png)$/i },
  
              test_response   => sub {
                  my $content_type = $_[2]->content_type;
                  my $ok = grep { $_ eq $content_type } @content_types;
                  return 1 if $ok;
                  print STDERR "$_[0] $content_type != (@content_types)\n";
                  return;
              },
              validate_links => $validate,
  
          }
      } @ARGV;
  }
  
              
  
  __END__
  
  =head1 NAME
  
  spider.pl - Example Perl program to spider web servers
  
  =head1 SYNOPSIS
  
    swish.config:
      IndexDir ./spider.pl
      SwishProgParameters spider.config
      # other swish-e settings
  
    spider.config:
      @servers = (
          {
              base_url    => 'http://myserver.com/',
              email       => 'me@myself.com',
              # other spider settings described below
          },
      );
  
    begin indexing:
      swish-e -S prog -c swish.config
  
  =head1 DESCRIPTION
  
  This is a swish-e "prog" document source program for spidering
  web servers.  It can be used instead of the C<http> method for
  spidering with swish.
  
  The spider typically uses a configuration
  file that lists the URL(s) to spider, and configuration parameters that control
  the behavior of the spider.  In addition, you may define I<callback> perl functions
  in the configuration file that can dynamically change the behavior of the spider
  based on URL, HTTP response headers, or the content of the fetched document.  These
  callback functions can also be used to filter or convert documents (e.g. PDF, gzip, MS Word)
  into a format that swish-e can parse.  Some examples are provided.
  
  You define "servers" to spider, set a few parameters,
  create callback routines, and start indexing as the synopsis above shows.
  The spider requires its own configuration file (unless you want the default values).  This
  is NOT the same configuration file that swish-e uses.
  
  The example configuration file C<SwishSpiderConfig.pl> is
  included in the C<prog-bin> directory along with this script.  Please just use it as an
  example, as it contains more settings than you probably want to use.  Start with a tiny config file
  and add settings as required by your situation.
  
  The available configuration parameters are discussed below.
  
  If all that sounds confusing, then you can run the spider with default settings.  In fact, you can
  run the spider without using swish just to make sure it works.  Just run
  
      ./spider.pl default http://someserver.com/sometestdoc.html
  
  And you should see F<sometestdoc.html> dumped to your screen.  Get ready to kill the script
  if the file you request contains links as the output from the fetched pages will be displayed.
  
      ./spider.pl default http://someserver.com/sometestdoc.html > output.file
  
  might be more friendly.    
  
  If the first parameter passed to the spider is the word "default" (as in the preceeding example)
  then the spider uses the default parameters,
  and the following parameter(s) are expected to be URL(s) to spider.
  Otherwise, the first parameter is considered to be the name of the configuration file (as described
  below).  When using C<-S prog>, the swish-e configuration setting
  C<SwishProgParameters> is used to pass parameters to the program specified
  with C<IndexDir> or the C<-i> switch.
  
  If you do not specify any parameters the program will look for the file
  
      SwishSpiderConfig.pl
  
  in the current directory.    
  
  The spider does require Perl's LWP library and a few other reasonably common modules.
  Most well maintained systems should have these modules installed.  If not, start here:
  
      http://search.cpan.org/search?dist=libwww-perl
      http://search.cpan.org/search?dist=HTML-Parser
  
  See more below in C<REQUIREMENTS>.  It's a good idea to check that you are running
  a current version of these modules.
  
  =head2 Robots Exclusion Rules and being nice
  
  This script will not spider files blocked by F<robots.txt>.  In addition,
  The script will check for <meta name="robots"..> tags, which allows finer
  control over what files are indexed and/or spidered.
  See http://www.robotstxt.org/wc/exclusion.html for details.
  
  This spider provides an extension to the <meta> tag exclusion, by adding a
  C<NOCONTENTS> attribute.  This attribute turns on the C<no_contents> setting, which
  asks swish-e to only index the document's title (or file name if not title is found).
  
  For example:
  
        <META NAME="ROBOTS" CONTENT="NOCONTENTS, NOFOLLOW">
  
  says to just index the document's title, but don't index its contents, and don't follow
  any links within the document.  Granted, it's unlikely that this feature will ever be used...
  
  If you are indexing your own site, and know what you are doing, you can disable robot exclusion by
  the C<ignore_robots_file> configuration parameter, described below.  This disables both F<robots.txt>
  and the meta tag parsing.
  
  This script only spiders one file at a time, so load on the web server is not that great.
  And with libwww-perl-5.53_91 HTTP/1.1 keep alive requests can reduce the load on
  the server even more (and potentially reduce spidering time considerably!)
  
  Still, discuss spidering with a site's administrator before beginning.
  Use the C<delay_min> to adjust how fast the spider fetches documents.
  Consider running a second web server with a limited number of children if you really
  want to fine tune the resources used by spidering.
  
  =head2 Duplicate Documents
  
  The spider program keeps track of URLs visited, so a document is only indexed
  one time.  
  
  The Digest::MD5 module can be used to create a "fingerprint" of every page
  indexed and this fingerprint is used in a hash to find duplicate pages.
  For example, MD5 will prevent indexing these as two different documents:
  
      http://localhost/path/to/some/index.html
      http://localhost/path/to/some/
  
  But note that this may have side effects you don't want.  If you want this
  file indexed under this URL:
  
      http://localhost/important.html
  
  But the spider happens to find the exact content in this file first:
  
      http://localhost/developement/test/todo/maybeimportant.html
  
  Then only that URL will be indexed.    
  
  MD5 may slow down indexing a tiny bit, so test with and without if speed is an
  issue (which it probably isn't since you are spidering in the first place).
  This feature will also use more memory.
  
  Note: the "prog" document source in swish bypasses many swish-e configuration settings.
  For example, you cannot use the C<IndexOnly> directive with the "prog" document
  source.  This is by design to limit the overhead when using an external program
  for providing documents to swish; after all, with "prog", if you don't want to index a file, then
  don't give it to swish to index in the first place.
  
  So, for spidering, if you do not wish to index images, for example, you will
  need to either filter by the URL or by the content-type returned from the web
  server.  See L<CALLBACK FUNCTIONS|CALLBACK FUNCTIONS> below for more information.
  
  =head1 REQUIREMENTS
  
  Perl 5 (hopefully at least 5.00503) or later.
  
  You must have the LWP Bundle on your computer.  Load the LWP::Bundle via the CPAN.pm shell,
  or download libwww-perl-x.xx from CPAN (or via ActiveState's ppm utility).
  Also required is the the HTML-Parser-x.xx bundle of modules also from CPAN
  (and from ActiveState for Windows).
  
      http://search.cpan.org/search?dist=libwww-perl
      http://search.cpan.org/search?dist=HTML-Parser
  
  You will also need Digest::MD5 if you wish to use the MD5 feature.
  HTML::Tagset is also required.
  Other modules may be required (for example, the pod2xml.pm module
  has its own requirementes -- see perldoc pod2xml for info).
  
  The spider.pl script, like everyone else, expects perl to live in /usr/local/bin.
  If this is not the case then either add a symlink at /usr/local/bin/perl
  to point to where perl is installed
  or modify the shebang (#!) line at the top of the spider.pl program.
  
  Note that the libwww-perl package does not support SSL (Secure Sockets Layer) (https)
  by default.  See F<README.SSL> included in the libwww-perl package for information on
  installing SSL support.
  
  =head1 CONFIGURATION FILE
  
  Configuration is not very fancy.  The spider.pl program simply does a
  C<do "path";> to read in the parameters and create the callback subroutines.
  The C<path> is the first parameter passed to the spider script, which is set
  by the Swish-e configuration setting C<SwishProgParameters>.
  
  For example, if in your swish-e configuration file you have
  
      SwishProgParameters /path/to/config.pl
      IndexDir /home/moseley/swish-e/prog-bin/spider.pl
  
  And then run swish as
  
      swish-e -c swish.config -S prog
  
  swish will run C</home/moseley/swish-e/prog-bin/spider.pl> and the spider.pl
  program will receive as its first parameter C</path/to/config.pl>, and
  spider.pl will read C</path/to/config.pl> to get the spider configuration
  settings.  If C<SwishProgParameters> is not set, the program will try to
  use C<SwishSpiderConfig.pl> by default.
  
  There is a special case of:
  
      SwishProgParameters default http://www.mysite/index.html ...
  
  Where default parameters are used.  This will only index documents of type
  C<text/html> or C<text/plain>, and will skip any file with an extension that matches
  the pattern:
  
      /\.(?:gif|jpeg|.png)$/i
  
  This can be useful for indexing just your web documnts, but you will probably want finer
  control over your spidering by using a configuration file.
  
  The configuration file must set a global variable C<@servers> (in package main).
  Each element in C<@servers> is a reference to a hash.  The elements of the has
  are described next.  More than one server hash may be defined -- each server
  will be spidered in order listed in C<@servers>, although currently a I<global> hash is
  used to prevent spidering the same URL twice.
  
  Examples:
  
      my %serverA = (
          base_url    => 'http://swish-e.org/',
          same_hosts  => [ qw/www.swish-e.org/ ],
          email       => 'my@email.address',
      );
      my %serverB = (
          ...
          ...
      );
      @servers = ( \%serverA, \%serverB, );
  
  =head1 CONFIGURATION OPTIONS
  
  This describes the required and optional keys in the server configuration hash, in random order...
  
  =over 4
  
  =item base_url
  
  This required setting is the starting URL for spidering.
  
  Typically, you will just list one URL for the base_url.  You may specify more than one
  URL as a reference to a list
  
      base_url => [qw! http://swish-e.org/ http://othersite.org/other/index.html !],
  
  =item same_hosts
  
  This optional key sets equivalent B<authority> name(s) for the site you are spidering.
  For example, if your site is C<www.mysite.edu> but also can be reached by
  C<mysite.edu> (with or without C<www>) and also C<web.mysite.edu> then:
  
  
  Example:
  
      $serverA{base_url} = 'http://www.mysite.edu/index.html';
      $serverA{same_hosts} = ['mysite.edu', 'web.mysite.edu'];
  
  Now, if a link is found while spidering of:
  
      http://web.mysite.edu/path/to/file.html
  
  it will be considered on the same site, and will actually spidered and indexed
  as:
  
      http://www.mysite.edu/path/to/file.html
  
  Note: This should probably be called B<same_authority> because it compares the URI C<authority>
  against the list of host names in C<same_hosts>.  So, if you specify a port name in you will
  probably want to specify the port name in the the list of hosts in C<same_hosts>:
  
      my %serverA = (
          base_url    => 'http://sunsite.berkeley.edu:4444/',
          same_hosts  => [ qw/www.sunsite.berkeley.edu:4444/ ],
          email       => 'my@email.address',
      );
  
  
  =item email
  
  This required key sets the email address for the spider.  Set this to
  your email address.
  
  =item agent
  
  This optional key sets the name of the spider.
  
  =item link_tags
  
  This optional tag is a reference to an array of tags.  Only links found in these tags will be extracted.
  The default is to only extract links from C<a> tags.
  
  For example, to extract tags from C<a> tags and from C<frame> tags:
  
      my %serverA = (
          base_url    => 'http://sunsite.berkeley.edu:4444/',
          same_hosts  => [ qw/www.sunsite.berkeley.edu:4444/ ],
          email       => 'my@email.address',
          link_tags   => [qw/ a frame /],
      );
  
  
  =item delay_min
  
  This optional key sets the delay in minutes to wait between requests.  See the
  LWP::RobotUA man page for more information.  The default is 0.1 (6 seconds),
  but in general you will probably want it much smaller.  But, check with
  the webmaster before using too small a number.
  
  =item max_time
  
  This optional key will set the max minutes to spider.   Spidering
  for this host will stop after C<max_time> minutes, and move on to the
  next server, if any.  The default is to not limit by time.
  
  =item max_files
  
  This optional key sets the max number of files to spider before aborting.
  The default is to not limit by number of files.  This is the number of requests
  made to the remote server, not the total number of files to index (see C<max_indexed>).
  This count is displayted at the end of indexing as C<Unique URLs>.
  
  This feature can (and perhaps should) be use when spidering a web site where dynamic
  content may generate unique URLs to prevent run-away spidering.
  
  =item max_indexed
  
  This optional key sets the max number of files that will be indexed.
  The default is to not limit.  This is the number of files sent to
  swish for indexing (and is reported by C<Total Docs> when spidering ends).
  
  =item max_size
  
  This optional key sets the max size of a file read from the web server.
  This B<defaults> to 5,000,000 bytes.  If the size is exceeded the resource is
  skipped and a message is written to STDERR if the DEBUG_SKIPPED debug flag is set.
  
  =item keep_alive
  
  This optional parameter will enable keep alive requests.  This can dramatically speed
  up searching and reduce the load on server being spidered.  The default is to not use
  keep alives, although enabling it will probably be the right thing to do.
  
  To get the most out of keep alives, you may want to set up your web server to
  allow a lot of requests per single connection (i.e MaxKeepAliveRequests on Apache).
  Apache's default is 100, which should be good.  (But, in general, don't enable KeepAlives
  on a mod_perl server.)
  
  Note: try to filter as many documents as possible B<before> making the request to the server.  In
  other words, use C<test_url> to look for files ending in C<.html> instead of using C<test_response> to look
  for a content type of C<text/html> if possible.
  Do note that aborting a request from C<test_response> will break the
  current keep alive connection.
  
  Note: you must have at least libwww-perl-5.53_90 installed to use this feature.
  
  =item skip
  
  This optional key can be used to skip the current server.  It's only purpose
  is to make it easy to disable a server in a configuration file.
  
  =item debug
  
  Set this to a number to display different amounts of info while spidering.  Writes info
  to STDERR.  Zero/undefined is no debug output.
  
  The following constants are defined for debugging.  They may be or'ed together to
  get the individual debugging of your choice.
  
  Here are basically the levels:
  
      DEBUG_ERRORS   general program errors (not used at this time)
      DEBUG_URL      print out every URL processes
      DEBUG_HEADERS  prints the response headers
      DEBUG_FAILED   failed to return a 200
      DEBUG_SKIPPED  didn't index for some reason
      DEBUG_INFO     more verbose
      DEBUG_LINKS    prints links as they are extracted
  
  For example, to display the urls processed, failed, and skipped use:
  
      debug => DEBUG_URL | DEBUG_FAILED | DEBUG_SKIPPED,
  
  To display the returned headers
  
      debug => DEBUG_HEADERS,
  
  You can easily run the spider without using swish for debugging purposes:
  
      ./spider.pl test.config > spider.out
  
  And you will see debugging info as it runs, and the fetched documents will be saved
  in the C<spider.out> file.
  
  =item max_depth
  
  The C<max_depth> parameter can be used to limit how deeply to recurse a web site.
  The depth is just a count of levels of web pages decended, and not related to
  the number of path elements in a URL.
  
  A max_depth of zero says to only spider the page listed as the C<base_url>.  A max_depth of one will
  spider the C<base_url> page, plus all links on that page, and no more.  The default is to spider all
  pages.
  
  
  =item ignore_robots_file
  
  If this is set to true then the robots.txt file will not be checked when spidering
  this server.  Don't use this option unless you know what you are doing.
  
  =item use_cookies
  
  If this is set then a "cookie jar" will be maintained while spidering.  Some
  (poorly written ;) sites require cookies to be enabled on clients.
  
  This requires the HTTP::Cookies module.
  
  =item use_md5
  
  If this setting is true, then a MD5 digest "fingerprint" will be made from the content of every
  spidered document.  This digest number will be used as a hash key to prevent
  indexing the same content more than once.  This is helpful if different URLs
  generate the same content.
  
  Obvious example is these two documents will only be indexed one time:
  
      http://localhost/path/to/index.html
      http://localhost/path/to/
  
  This option requires the Digest::MD5 module.  Spidering with this option might
  be a tiny bit slower.
  
  =item validate_links
  
  Just a hack.  If you set this true the spider will do HEAD requests all links (e.g. off-site links), just
  to make sure that all your links work.
  
  =back
  
  =head1 CALLBACK FUNCTIONS
  
  Three callback functions can be defined in your parameter hash.
  These optional settings are I<callback> subroutines that are called while
  processing URLs.
  
  A little perl discussion is in order:
  
  In perl, a scalar variable can contain a reference to a subroutine.  The config example above shows
  that the configuration parameters are stored in a perl I<hash>.
  
      my %serverA = (
          base_url    => 'http://sunsite.berkeley.edu:4444/',
          same_hosts  => [ qw/www.sunsite.berkeley.edu:4444/ ],
          email       => 'my@email.address',
          link_tags   => [qw/ a frame /],
      );
  
  There's two ways to add a reference to a subroutine to this hash:
  
  sub foo {
      return 1;
  }
  
      my %serverA = (
          base_url    => 'http://sunsite.berkeley.edu:4444/',
          same_hosts  => [ qw/www.sunsite.berkeley.edu:4444/ ],
          email       => 'my@email.address',
          link_tags   => [qw/ a frame /],
          test_url    => \&foo,  # a reference to a named subroutine
      );
  
  Or the subroutine can be coded right in place:    
  
      my %serverA = (
          base_url    => 'http://sunsite.berkeley.edu:4444/',
          same_hosts  => [ qw/www.sunsite.berkeley.edu:4444/ ],
          email       => 'my@email.address',
          link_tags   => [qw/ a frame /],
          test_url    => sub { reutrn 1; },
      );
  
  The above example is not very useful as it just creates a user callback function that
  always returns a true value (the number 1).  But, it's just an example.
  
  The function calls are wrapped in an eval, so calling die (or doing something that dies) will just cause
  that URL to be skipped.  If you really want to stop processing you need to set $server->{abort} in your
  subroutine (or send a kill -HUP to the spider).
  
  The first two parameters passed are a URI object (to have access to the current URL), and
  a reference to the current server hash.  The C<server> hash is just a global hash for holding data, and
  useful for setting flags as describe belwo.
  
  Other parameters may be also passes, as described below.
  In perl parameters are passed in an array called "@_".  The first element (first parameter) of
  that array is $_[0], and the second is $_[1], and so on.  Depending on how complicated your
  function is you may wish to shift your parameters off of the @_ list to make working with them
  easier.  See the examples below.
  
  
  To make use of these routines you need to understand when they are called, and what changes
  you can make in your routines.  Each routine deals with a given step, and returning false from
  your routine will stop processing for the current URL.
  
  =over 4
  
  =item test_url
  
  C<test_url> allows you to skip processing of urls based on the url before the request
  to the server is made.  This function is called for the C<base_url> links (links you define in
  the spider configuration file) and for every link extracted from a fetched web page.
  
  This function is a good place to skip links that you are not interested in following.  For example,
  if you know there's no point in requesting images then you can exclude them like:
  
      test_url => sub {
          my $uri = shift;
          return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;
          return 1;
      },
  
  Or to write it another way:
  
      test_url => sub { $_[0]->path !~ /\.(gif|jpeg|png)$/ },
  
  Another feature would be if you were using a web server where path names are
  NOT case sensitive (e.g. Windows).  You can normalize all links in this situation
  using something like
  
      test_url => sub {
          my $uri = shift;
          return 0 if $uri->path =~ /\.(gif|jpeg|png)$/;
  
          $uri->path( lc $uri->path ); # make all path names lowercase
          return 1;
      },
  
  The important thing about C<test_url> (compared to the other callback functions) is that
  it is called while I<extracting> links, not while actually fetching that page from the web
  server.  Returning false from C<test_url> simple says to not add the URL to the list of links to
  spider.
  
  You may set a flag in the server hash (second parameter) to tell the spider to abort processing.
  
      test_url => sub {
          my $server = $_[1];
          $server->{abort}++ if $_[0]->path =~ /foo\.html/;
          return 1;
      },
  
  You cannot use the server flags:
  
      no_contents
      no_index
      no_spider
  
  
  This is discussed below.
  
  =item test_response
  
  This function allows you to filter based on the response from the
  remote server (such as by content-type).  This function is called while the
  web pages is being fetched from the remote server, typically after just enought
  data has been returned to read the response from the web server.
  
  The spider requests a document in "chunks" of 4096 bytes.  4096 is only a suggestion
  of how many bytes to return in each chunk.  The C<test_response> routine is
  called when the first chunk is received only.  This allows ignoring (aborting)
  reading of a very large file, for example, without having to read the entire file.
  Although not much use, a reference to this chunk is passed as the forth parameter.
  
  Web servers use a Content-Type: header to define the type of data returned from the server.
  On a web server you could have a .jpeg file be a web page -- file extensions may not always
  indicate the type of the file.  The third parameter ($_[2]) returned is a reference to a
  HTTP::Response object:
  
  For example, to only index true HTML (text/html) pages:
  
      test_response => sub {
          my $content_type = $_[2]->content_type;
          return $content_type =~ m!text/html!;
      },
  
  You can also set flags in the server hash (the second parameter) to control indexing:
  
      no_contents -- index only the title (or file name), and not the contents
      no_index    -- do not index this file, but continue to spider if HTML
      no_spider   -- index, but do not spider this file for links to follow
      abort       -- stop spidering any more files
  
  For example, to avoid index the contents of "private.html", yet still follow any links
  in that file:
  
      test_url => sub {
          my $server = $_[1];
          $server->{no_index}++ if $_[0]->path =~ /private\.html$/;
          return 1;
      },
  
  Note: Do not modify the URI object in this call back function.
      
  
  =item filter_content
  
  This callback function is called right before sending the content to swish.
  Like the other callback function, returning false will cause the URL to be skipped.
  Setting the C<abort> server flag and returning false will abort spidering.
  
  You can also set the C<no_contents> flag.
  
  This callback function is passed four parameters.
  The URI object, server hash, the HTTP::Response object,
  and a reference to the content.
  
  You can modify the content as needed.  For example you might not like upper case:
  
      filter_content => sub {
          my $content_ref = $_[3];
  
          $$content_ref = lc $$content_ref;
          return 1;
      },
  
  I more reasonable example would be converting PDF or MS Word documents for parsing by swish.
  Examples of this are provided in the F<prog-bin> directory of the swish-e distribution.
  
  You may also modify the URI object to change the path name passed to swish for indexing.
  
      filter_content => sub {
          my $uri = $_[0];
          $uri->host('www.other.host') ;
          return 1;
      },
  
  Swish-e's ReplaceRules feature can also be used for modifying the path name indexed.
  
  Here's a bit more advanced example of indexing text/html and PDF files only:
  
      use pdf2xml;  # included example pdf converter module
      $server{filter_content} = sub {
         my ( $uri, $server, $response, $content_ref ) = @_;
  
         return 1 if $response->content_type eq 'text/html';
         return 0 unless $response->content_type eq 'application/pdf';
  
         # for logging counts
         $server->{counts}{'PDF transformed'}++;
  
         $$content_ref = ${pdf2xml( $content_ref )};
         return 1;
      }
  
  
  
  =back
  
  Note that you can create your own counters to display in the summary list when spidering
  is finished by adding a value to the hash pointed to by C<$server->{counts}>.
  
      test_url => sub {
          my $server = $_[1];
          $server->{no_index}++ if $_[0]->path =~ /private\.html$/;
          $server->{counts}{'Private Files'}++;
          return 1;
      },
  
  
  Each callback function B<must> return true to continue processing the URL.  Returning false will
  cause processing of I<the current> URL to be skipped.
  
  =head2 More on setting flags
  
  Swish (not this spider) has a configuration directive C<NoContents> that will instruct swish to
  index only the title (or file name), and not the contents.  This is often used when
  indexing binary files such as image files, but can also be used with html
  files to index only the document titles.
  
  As shown above, you can turn this feature on for specific documents by setting a flag in
  the server hash passed into the C<test_response> or C<filter_contents> subroutines.
  For example, in your configuration file you might have the C<test_response> callback set
  as:
  
     
      test_response => sub {
          my ( $uri, $server, $response ) = @_;
          # tell swish not to index the contents if this is of type image
          $server->{no_contents} = $response->content_type =~ m[^image/];
          return 1;  # ok to index and spider this document
      }
  
  The entire contents of the resource is still read from the web server, and passed
  on to swish, but swish will also be passed a C<No-Contents> header which tells
  swish to enable the NoContents feature for this document only.
  
  Note: Swish will index the path name only when C<NoContents> is set, unless the document's
  type (as set by the swish configuration settings C<IndexContents> or C<DefaultContents>) is
  HTML I<and> a title is found in the html document.
  
  Note: In most cases you probably would not want to send a large binary file to swish, just
  to be ignored.  Therefore, it would be smart to use a C<filter_contents> callback routine to
  replace the contents with single character (you cannot use the empty string at this time).
  
  A similar flag may be set to prevent indexing a document at all, but still allow spidering.
  In general, if you want completely skip spidering a file you return false from one of the 
  callback routines (C<test_url>, C<test_response>, or C<filter_content>).  Returning false from any of those
  three callbacks will stop processing of that file, and the file will B<not> be spidered.
  
  But there may be some cases where you still want to spider (extract links) yet, not index the file.  An example
  might be where you wish to index only PDF files, but you still need to spider all HTML files to find
  the links to the PDF files.
  
      $server{test_response} = sub {
          my ( $uri, $server, $response ) = @_;
          $server->{no_index} = $response->content_type ne 'application/pdf';
          return 1;  # ok to spider, but don't index
      }
  
  So, the difference between C<no_contents> and C<no_index> is that C<no_contents> will still index the file
  name, just not the contents.  C<no_index> will still spider the file (if it's C<text/html>) but the
  file will not be processed by swish at all.
  
  B<Note:> If C<no_index> is set in a C<test_response> callback function then
  the document I<will not be filtered>.  That is, your C<filter_content>
  callback function will not be called.
  
  The C<no_spider> flag can be set to avoid spiderering an HTML file.  The file will still be indexed unless
  C<no_index> is also set.  But if you do not want to index and spider, then simply return false from one of the three
  callback funtions.
  
  
  =head1 SIGNALS
  
  Sending a SIGHUP to the running spider will cause it to stop spidering.  This is a good way to abort spidering, but
  let swish index the documents retrieved so far.
  
  =head1 SEE ALSO
  
  L<URI> L<LWP::RobotUA> L<WWW::RobotRules> L<Digest::MD5>
  
  =head1 COPYRIGHT
  
  Copyright 2001 Bill Moseley
  
  This program is free software; you can redistribute it and/or modify
  it under the same terms as Perl itself.
  
  =head1 SUPPORT
  
  Send all questions to the The SWISH-E discussion list.
  
  See http://sunsite.berkeley.edu/SWISH-E.
  
  =cut   
  
  
  
  
  1.1                  modperl-docs/src/search/swish.cgi
  
  Index: swish.cgi
  ===================================================================
  #!/usr/local/bin/perl -w
  package SwishSearch;
  use strict;
  
  use lib qw( modules );  ### This must be adjusted!
  
  
  ####################################################################################
  #
  #    If this text is displayed on your browser then your web server
  #    is not configured to run .cgi programs.  Contact your web server administrator.
  #
  #    To display documentation for this program type "perldoc swish.cgi"
  #
  #    swish.cgi $Revision: 1.1 $ Copyright (C) 2001 Bill Moseley swishscript@hank.org
  #    Example CGI program for searching with SWISH-E
  #
  #    This example program will only run under an OS that supports fork().
  #    Ok, piped opens.
  #
  #
  #    This program is free software; you can redistribute it and/or
  #    modify it under the terms of the GNU General Public License
  #    as published by the Free Software Foundation; either version
  #    2 of the License, or (at your option) any later version.
  #    
  #    This program is distributed in the hope that it will be useful,
  #    but WITHOUT ANY WARRANTY; without even the implied warranty of
  #    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  #    GNU General Public License for more details.
  #
  #    The above lines must remain at the top of this program
  #
  #    $Id: swish.cgi,v 1.1 2002/01/30 06:35:00 stas Exp $
  #
  ####################################################################################
  
  # This is written this way so the script can be used as a CGI script or a mod_perl
  # module without any code changes.
  
  
  # use CGI ();  # might not be needed if using Apache::Request
  
  #=================================================================================
  #   CGI entry point
  #
  #=================================================================================
  
  
  
  # Run the script -- entry point if running as a CGI script
  
      unless ( $ENV{MOD_PERL} ) {
          my $config = default_config();
  
          # Merge with disk config file.
          $config = merge_read_config( $config );
          process_request( $config );
      }
  
  
  #=================================================================================
  #   mod_perl entry point
  #
  #   As an example, you might use a PerlSetVar to point to paths to different
  #   config files, and then cache the different configurations by path.
  #
  #=================================================================================
  
  my %cached_configs;
  
  sub handler {
      my $r = shift;
  
      if ( my $config_path = $r->dir_config( 'Swish_Conf_File' ) ) {
  
          # Already cached?
          if ( $cached_configs{ $config_path } ) {
              process_request( $cached_configs{ $config_path } );
              return Apache::Constants::OK();
          }
  
          # Else, load config
          my $config = default_config();
          $config->{config_file} = $config_path;
  
          # Merge with disk config file.
          $cached_configs{ $config_path } = merge_read_config( $config );
  
          process_request( $cached_configs{ $config_path } );
          return Apache::Constants::OK();
      }
  
  
      # Otherwise, use hard-coded config
      process_request( default_config() );
  
      return Apache::Constants::OK();
  
  }
  
  
  
  #==================================================================================
  #   This sets the default configuration
  #   Any configuration read from disk is merged with these settings.
  #
  #   Only a few settings are actually required.  Some reasonable defaults are used
  #   for most.  If fact, you can probably create a complete config as:
  #
  #    return = {
  #        swish_binary    => '/usr/local/bin/swish-e',
  #        swish_index     => '/usr/local/share/swish/index.swish-e',
  #        title_property  => 'swishtitle',  # Not required, but recommended
  #    };
  #
  #   But, that doesn't really show all the options.
  #
  #   You can modify the options below, or you can use a config file.  The config file
  #   is .swishcgi.conf by default (read from the current directory) that must return
  #   a hash reference.  For example, to create a config file that changes the default
  #   title and index file name, plus uses Template::Toolkit to generate output
  #   create a config file as:
  #
  #       # Example config file -- returns a hash reference
  #       {
  #           title           => 'Search Our Site',
  #           swish_index     => 'index.web',
  #
  #           template => {
  #            package         => 'TemplateToolkit',
  #            file            => 'search.tt',
  #            options         => {
  #                INCLUDE_PATH    => '/home/user/swish-e/example',
  #            },
  #       };
  #
  #
  #-----------------------------------------------------------------------------------
  
  sub default_config {
  
  
      
      ##### Configuration Parameters #########
  
      #---- This lists all the options, with many commented out ---
      # By default, this config is used -- see the process_request() call below.
      
      # You should adjust for your site, and how your swish index was created.
  
      ##>>
      ##>>  Please don't post this entire section on the swish-e list if looking for help!
      ##>>
      ##>>  Send a small example, without all the comments.
  
      # Items beginning with an "x" or "#" are commented out
      
      return {
          title           => 'Search our site',  # Title of your choice.
          swish_binary    => './swish-e',        # Location of swish-e binary
  
  
          # By default, this script tries to read a config file.  You should probably
          # comment this out if not used save a disk stat
          config_file     => '.swishcgi.conf',    # Default config file
  
  
          # The location of your index file.  Typically, this would not be in
          # your web tree.
          # If you have more than one index to search then specify an array
          # reference.  e.g. swish_index =>[ qw/ index1 index2 index3 /],
          
          swish_index     => 'index.swish-e',  # Location of your index file
                                                  
                                                 # See "select_indexes" below for how to
                                                 # select more than one index.
  
          page_size       => 15,                 # Number of results per page  - default 15
  
  
          # Property name to use as the main link text to the indexed document.
          # Typically, this will be 'swishtitle' if have indexed html documents,
          # But you can specify any PropertyName defined in your document.
          # By default, swish will return the pathname for documents that do not
          # have a title.
  
          title_property => 'swishtitle',
  
  
  
          # prepend this path to the filename (swishdocpath) returned by swish.  This is used to
          # make the href link back to the original document.  Comment out to disable.
  
          #prepend_path    => 'http://localhost/mydocs',
  
  
          # Swish has a configuration directive "StoreDescription" that will save part or
          # all of a document's contents in the index file.  This can then be displayed
          # along with results.  If you are indexing a lot of files this can use a lot of disk
          # space, so test carefully before indexing your entire site.
          # Building swish with zlib can greatly reduce the space used by StoreDescription
          #
          # This settings tells this script to display this description.
          # Normally, this should be 'swishdescription', but you can specify another property name.
          # There is no default.
         
          description_prop    => 'swishdescription',
  
  
  
          # Property names listed here will be displayed in a table below each result
          # You may wish to modify this list if you are using document properties (PropertyNames)
          # in your swish-e index configuration
          # There is no default.
          
          display_props   => [qw/swishlastmodified swishdocsize swishdocpath/],
  
  
  
          # Results can be be sorted by any of the properties listed here
          # They will be displayed in a drop-down list
          # Again, you may modify this list if you are using document properties of your own creation
          # Swish uses the rank as the default sort
  
          sorts           => [qw/swishrank swishlastmodified swishtitle swishdocpath/],
  
  
          # Secondary_sort is used to sort within a sort
          # You may enter a property name followed by a direction (asc|desc)
  
          secondary_sort  => [qw/swishlastmodified desc/],
  
  
  
  
  
          # You can limit by MetaNames here.  Names listed here will be displayed in
          # a line of radio buttons.
          # The default is to not allow any metaname selection.
          # To use this feature you must define MetaNames while indexing.
  
          # The special "swishdefault" says to search any text that was not indexed
          # as a metaname (e.g. the body of a HTML document).
  
          # To see how this might work, add to your config file:
          #   MetaNames swishtitle swishdocpath
          # and try:
          metanames       => [qw/swishdefault swishtitle swishdocpath/],
          
          
  
          # Another example: if you indexed an email archive
          # that defined the metanames subject name email (as in the swish-e discussion archive)
          # you might use:
          #metanames       => [qw/body subject name email/],
  
          # Note that you can do a real "all" search if you use nested metanames in your source documents.
          # Nesting metanames is most common with XML documents.
          
  
          # These are used to map MetaNames and PropertyNames to user-friendly names
          # on the form.
  
          name_labels => {
              swishdefault        => 'Title & Body',
              swishtitle          => 'Title',
              swishrank           => 'Rank',
              swishlastmodified   => 'Last Modified Date',
              swishdocpath        => 'Document Path',
              swishdocsize        => 'Document Size',
              subject             => 'Message Subject',
              name                => "Poster's Name",
              email               => "Poster's Email",
              sent                => 'Message Date',
              ALL                 => 'Message text',
          },
  
  
          timeout         => 10,    # limit time used by swish when fetching results - DoS protection.
  
  
  
          # These settings will use some crude highlighting code to highlight search terms in the
          # property specified above as the description_prop (normally, 'swishdescription').
  
  
          max_chars       => 500,   # If "highlight" is not defined, then just truncate the description to this many *chars*.
                                    # If you want to go by *words*, enable highlighting,
                                    # and then comment-out show_words.  It will be a little slower.
  
  
          # This structure defines term highlighting, and what type of highlighting to use
          highlight       => {
  
              # Pick highlighting module -- you must make sure the module can be found
  
              # Ok speed, but doesn't handle phrases.
              #Deals with stemming, but not stopwords
              #package         => 'DefaultHighlight', 
  
              # Somewhat slow, but deals with phases, stopwords, and stemming.
              # Takes into consideration WordCharacters, IgnoreFirstChars and IgnoreLastChars.
              package         => 'PhraseHighlight',
  
              # Fast: phrases without regard to wordcharacter settings
              # doesn't do context display, so must match in first X words,
              # doesn't handle stemming or stopwords.
              #package         => 'SimpleHighlight',  
  
              show_words      => 10,    # Number of swish words words to show around highlighted word
              max_words       => 100,   # If no words are found to highlighted then show this many words
              occurrences     => 6,     # Limit number of occurrences of highlighted words
              #highlight_on   => '<b>', # HTML highlighting codes
              #highlight_off  => '</b>',
              highlight_on    => '<font style="background:#FFFF99">',
              highlight_off   => '</font>',
              meta_to_prop_map => {   # this maps search metatags to display properties
                  swishdefault    => [ qw/swishtitle swishdescription/ ],
                  swishtitle      => [ qw/swishtitle/ ],
                  swishdocpath    => [ qw/swishdocpath/ ],
              },
          },
  
  
  
          # If you specify more than one index file (as an array reference) you
          # can set this allow selection of which indexes to search.
          # The default is to search all indexes specified if this is not used.
          # When used, the first index is the default index.
  
          # You need to specify your indexes as an array reference: 
          #swish_index     => [ qw/ index.swish-e index.other index2.other index3.other index4.other / ], 
  
          Xselect_indexes  => {
              #method  => 'radio_group',  # pico radio_group, popup_menu, or checkbox_group
              method  => 'checkbox_group',
              #method => 'popup_menu',
              columns => 3,
              labels  => [ 'Main Index', 'Other Index', qw/ two three four/ ],  # Must match up one-to-one
              description => 'Select Site: ',
          },
  
  
          # Similar to select_indexes, this adds a metaname search
          # based on a metaname.  You can use any metaname, and this will
          # add an "AND" search to limit results to a subset of your records.
          # i.e. it adds something like  'site=(foo or bar or baz)' if foo, bar, and baz were selected.
  
          # Swish-e's ExtractPath would work well with this.  For example, the apache docs:
          # ExtractPath site regex !^/usr/local/apache/htdocs/manual/([^/]+)/.+$!$1!
          # ExtractPathDefault site other
          
  
          Xselect_by_meta  => {
              #method      => 'radio_group',  # pico radio_group, popup_menu, or checkbox_group
              method      => 'checkbox_group',
              #method      => 'popup_menu',
              columns     => 3,
              metaname    => 'site',     # Can't be a metaname used elsewhere!
              values      => [qw/misc mod vhosts other/],
              labels  => {
                  misc    => 'General Apache docs',
                  mod     => 'Apache Modules',
                  vhosts  => 'Virutal hosts',
              },
              description => 'Limit search to these areas: ',
          },
                                                
  
  
  
          # The 'template' setting defines what generates the output
          # The default is "TemplateDefault" which is reasonably ugly.
          # Note that some of the above options may not be available
          # for templating, as it's up to you do layout the form
          # and results in your template.
          
  
          xtemplate => {
              package     => 'TemplateDefault',
          },
  
          xtemplate => {
              package     => 'TemplateDumper',
          },
  
          xtemplate => {
              package         => 'TemplateToolkit',
              file            => 'search.tt',
              options         => {
                  INCLUDE_PATH    => '/home/user/swish-e/example',
                  #PRE_PROCESS     => 'config',
              },
          },
  
          xtemplate => {
              package         => 'TemplateHTMLTemplate',
              options         => {
                  filename            => 'swish.tmpl',
                  die_on_bad_params   => 0,
                  loop_context_vars   => 1,
                  cache               => 1,
              },
          },
  
  
          # This defines the package object for reading CGI parameters
          # Defaults to CGI.  Might be useful with mod_perl.
          # request_package     => 'CGI',
          # request_package     => 'Apache::Request',
  
  
  
  
  
          # Limit to date ranges
  
  
  
          # This adds in the date_range limiting options
          # You will need the DateRanges.pm module from the author to use that feature
  
          # Noramlly, you will want to limit by the last modified date, so specify
          # "swishlastmodified" as the property_name.  If indexing a mail archive, and, for
          # example, you store the date (a unix timestamp) as "date" then specify
          # "date" as the property_name.
  
          date_ranges     => {
              property_name   => 'swishlastmodified',      # property name to limit by
  
              # what you specify here depends on the DateRanges.pm module.
              time_periods    => [  
                  'All',
                  'Today',
                  'Yesterday',
                  #'Yesterday onward',
                  'This Week',
                  'Last Week',
                  'Last 90 Days',
                  'This Month',
                  'Last Month',
                  #'Past',
                  #'Future',
                  #'Next 30 Days',
              ],
  
              line_break      => 0,
              default         => 'All',
              date_range      => 1,
          },
  
      };
  
  }
  
  #============================================================================
  #   Read config settings from disk, and merge
  #   Note, all errors are ignored since by default this script looks for a
  #   config file.
  #
  #============================================================================
  sub merge_read_config {
      my $config = shift;
  
      return $config unless $config->{config_file};
  
      my $return = do $config->{config_file};
  
      return $config unless ref $return eq 'HASH';
  
      # Merge settings
      return { %$config, %$return };
  }
  
  #============================================================================
  #
  #   This is the main entry point, where a config hash is passed in.
  #
  #============================================================================
  
  sub process_request {
      my $conf = shift;  # configuration parameters
  
      # Use CGI.pm by default
      my $request_package = $conf->{request_package} || 'CGI';
      $request_package =~ s[::][/]g;
      require "$request_package.pm";
  
  
      # create search object
      my $search = SwishQuery->new(
          config    => $conf,
          request   => ($conf->{request_package} ? $conf->{request_package}->new : CGI->new),
      );
  
  
      # run the query
      my $results = $search->run_query;  # currently, results is the just the $search object
  
      my $template = $conf->{template} || { package => 'TemplateDefault' };
  
      my $package = $template->{package};
  
      my $file = "$package.pm";
      $file =~ s[::][/]g;
  
      require $file;
      $package->show_template( $template, $results );
  }
  
  
  
  
  
  #==================================================================================================
  package SwishQuery;
  #==================================================================================================
  
  use Carp;
  
  #--------------------------------------------------------------------------------
  # new() doesn't do much, just create the object
  #--------------------------------------------------------------------------------
  sub new {
      my $class = shift;
      my %options = @_;
  
      my $conf = $options{config};
  
      croak "Failed to set the swish index files in config setting 'swish_index'" unless $conf->{swish_index};
      croak "Failed to specify 'swish_binary' in configuration" unless $conf->{swish_binary};
  
      # initialize the request search hash
      my $sh = {
         prog         => $conf->{swish_binary},
         config       => $conf,
         q            => $options{request},
         hits         => 0,
         MOD_PERL     => $ENV{MOD_PERL},
      };
  
      return bless $sh, $class;
  }
  
  
  sub hits { shift->{hits} }
  
  sub config {
      my ($self, $setting, $value ) = @_;
  
      croak "Failed to pass 'config' a setting" unless $setting;
  
      my $cur = $self->{config}{$setting} if exists $self->{config}{$setting};
  
      $self->{config}{$setting} = $value if $value;
  
      return $cur;
  }
  
  sub header {
      my $self = shift;
      return unless ref $self->{_headers} eq 'HASH';
      
      return $self->{_headers}{$_[0]} || '';
  }
  
  
  # return a ref to an array
  sub results {
      my $self = shift;
      return $self->{_results} || undef;
  }
  
  sub navigation {
      my $self = shift;
      return unless ref $self->{navigation} eq 'HASH';
      
      return exists $self->{navigation}{$_[0]} ? $self->{navigation}{$_[0]} : '';
  }
  
  sub CGI { $_[0]->{q} };
      
  
  
  
  sub swish_command {
  
      my $self = shift;
  
      unless ( @_ ) {
          return $self->{swish_command} ? @{$self->{swish_command}} : undef;
      }
  
      push @{$self->{swish_command}}, @_;
  }
  
  
  sub errstr {
      my ($self, $value ) = @_;
  
  
      $self->{_errstr} = $value if $value;
  
      return $self->{_errstr} || '';
  }
  
  
  
  
      
  
  #============================================
  # This returns "$self" just in case we want to seperate out into two objects later
  
  
  sub run_query {
  
      my $self = shift;
  
      my $q = $self->{q};
      my $conf = $self->{config};
  
  
  
      # Sets the query string, and any -L limits.
      return $self unless $self->build_query;
  
  
     
      # Set the starting position (which is offset by one)
  
      my $start = $q->param('start') || 0;
      $start = 0 unless $start =~ /^\d+$/ && $start >= 0;
  
      $self->swish_command( '-b', $start+1 );
  
      
  
      # Set the max hits
  
      my $page_size = $self->config('page_size') || 15;
      $self->swish_command( '-m', $page_size );
  
  
      return $self unless $self->set_index_file;
  
  
  
      # Set the sort option, if any
      return $self unless $self->set_sort_order;
  
  
  
      # Trap the call - not portable.
  
      my $timeout = $self->config('timeout');
  
      if ( $timeout ) {
          eval {
              local $SIG{ALRM} = sub { die "Timed out\n" };
              alarm ( $self->config('timeout') || 5 );
              $self->run_swish;
              alarm 0;
          };
  
          if ( $@ ) {
              $self->errstr( $@ );
              return $self;
          }
      } else {
          $self->run_swish;
      }
  
  
  
      my $hits = $self->hits;
      return $self unless $hits;
  
  
  
      # Build href for repeated search via GET (forward, backward links)
  
  
      my @query_string =
           map { "$_=" . $q->escape( $q->param($_) ) }
              grep { $q->param($_) }  qw/query metaname sort reverse/;
  
  
      for my $p ( qw/si sbm/ ) {
          my @settings = $q->param($p);
          next unless @settings;
          push @query_string,  "$p=" . $q->escape( $_ ) for @settings;
      }
  
      
  
  
      if ( $conf->{date_ranges} ) {
          my $dr = DateRanges::GetDateRangeArgs( $q );
          push @query_string, $dr, if $dr;
      }
  
  
      $self->{query_href} = $q->script_name . '?' . join '&amp;', @query_string;
  
  
  
      # Return the template fields
  
      $self->{my_url} = $q->script_name;
  
      $self->{hits} = $hits;
          
      $self->{navigation}  = {
              showing     => $hits,
              from        => $start + 1,
              to          => $start + $hits,
              hits        => $self->header('number of hits') ||  0,
              run_time    => $self->header('run time') ||  'unknown',
              search_time => $self->header('search time') ||  'unknown',
      };
  
  
      $self->set_page ( $page_size );
  
      return $self;
      
  }        
  
  
  #============================================================
  # Build a query string from swish
  # Just builds the -w string
  #------------------------------------------------------------
  
  sub build_query {
      my $self = shift;
  
      my $q = $self->{q};
      
  
      # set up the query string to pass to swish.
      my $query = $q->param('query') || '';
  
      for ( $query ) {  # trim the query string
          s/\s+$//;
          s/^\s+//;
      }
  
      $self->{query_simple} = $query;    # without metaname
      $q->param('query', $query );  # clean up the query, if needed.
      
  
      # Read in the date limits, if any.  This can create a new query
      return unless $self->get_date_limits( \$query );
  
  
      unless ( $query ) {
          $self->errstr('Please enter a query string') if $q->param('submit');
          return;
      }
      if ( length( $query ) > 100 ) {
          $self->errstr('Please enter a shorter query');
          return;
      }
  
  
  
      # Adjust the query string for metaname search
      # *Everything* is a metaname search
      # Might also like to allow searching more than one metaname at the same time
  
      my $metaname = $q->param('metaname') || 'swishdefault';
  
  
      # make sure it's a valid metaname
  
      my $conf = $self->{config};
      my @metas = ('swishdefault');
      push @metas, @{ $self->config('metanames')} if $self->config('metanames');
      unless ( grep { $metaname eq $_ } @metas  ) {
          $self->errstr('Bad MetaName provided');
          return;
      }
  
      # prepend metaname to query
      $query = $metaname . "=($query)";
  
      # save the metaname so we know what field to highlight
      $self->{metaname} = $metaname;
  
  
      ## Look for a "limit" metaname -- perhaps used with ExtractPath
      # Here we don't worry about user supplied data
  
      my $limits = $self->config('select_by_meta');
      my @limits = $q->param('sbm');  # Select By Metaname
  
  
      # Note that this could be messed up by ending the query in a NOT or OR
      # Should look into doing:
      # $query = "( $query ) AND " . $limits->{metaname} . '=(' . join( ' OR ', @limits ) . ')';
      if ( @limits && ref $limits eq 'HASH' && $limits->{metaname} ) {
          $query .= ' and ' . $limits->{metaname} . '=(' . join( ' or ', @limits ) . ')';
      }
  
  
      $self->swish_command('-w', $query );
  
      return 1;
  }
  
  #========================================================================
  #  Get the index files from the form, or from simple the config settings
  #------------------------------------------------------------------------
  
  sub set_index_file {
      my $self = shift;
  
      my $q = $self->CGI;
      
      # Set the index file
      
      if ( $self->config('select_indexes') && ref $self->config('swish_index') eq 'ARRAY'  ) {
  
          my @choices = $q->param('si');
          if ( !@choices ) {
              $self->errstr('Please select a source to search');
              return;
          }
  
          my @indexes = @{$self->config('swish_index')};
  
  
          my @selected_indexes = grep {/^\d+$/ && $_ >= 0 && $_ < @indexes } @choices;
  
          if ( !@selected_indexes ) {
              $self->errstr('Invalid source selected');
              return $self;
          }
          $self->swish_command( '-f', @indexes[ @selected_indexes ] );
  
  
      } else {
          my $indexes = $self->config('swish_index');        
          $self->swish_command( '-f', ref $indexes ? @$indexes : $indexes );
      }
  
      return 1;
  }
  
  #================================================================================
  #   Parse out the date limits from the form or from GET request
  #
  #---------------------------------------------------------------------------------
  
  sub get_date_limits {
  
      my ( $self, $query_ref ) = @_;
  
      my $conf = $self->{config};
  
      # Are date ranges enabled?
      return 1 unless $conf->{date_ranges};
  
  
      eval { require DateRanges };
      if ( $@ ) {
          $self->errstr( $@ );
          delete $conf->{date_ranges};
          return;
      }
      
      my $q = $self->{q};
  
      my %limits;
  
      unless ( DateRanges::DateRangeParse( $q, \%limits ) ) {
          $self->errstr( $limits{DateRanges_error} || 'Bad date range selection' );
          return;
      }
  
      # Store the values for later
      
      $self->{DateRanges_time_low} = $limits{DateRanges_time_low};
      $self->{DateRanges_time_high} = $limits{DateRanges_time_high};
  
  
      # Allow searchs just be date if not "All dates" search
      # $$$ should place some limits here, and provide a switch to disable
      if ( !$$query_ref && $limits{DateRanges_time_high} ) {
          $$query_ref = 'not skaisikdeekk';
          $self->{_search_all}++;  # flag
      }
  
  
      my $limit_prop = $conf->{date_ranges}{property_name} || 'swishlastmodified';
  
  
      if ( $limits{DateRanges_time_low} && $limits{DateRanges_time_high} ) {
          $self->swish_command( '-L', $limit_prop, $limits{DateRanges_time_low}, $limits{DateRanges_time_high} );
      }
  
      return 1;
  }
  
  
  
  #================================================================
  #  Set the sort order
  #  Just builds the -s string
  #----------------------------------------------------------------
  
  sub set_sort_order {
      my $self = shift;
  
      my $q = $self->{q};
  
      my $sorts_array = $self->config('sorts');
      return 1 unless $sorts_array;
  
  
      my $conf = $self->{config};
  
  
      # Now set sort option - if a valid option submitted (or you could let swish-e return the error).
      my %sorts = map { $_, 1 } @$sorts_array;
  
      if ( $q->param('sort') && $sorts{ $q->param('sort') } ) {
  
          my $direction = $q->param('sort') eq 'swishrank'
              ? $q->param('reverse') ? 'asc' : 'desc'
              : $q->param('reverse') ? 'desc' : 'asc';
                  
          $self->swish_command( '-s', $q->param('sort'), $direction );
  
          if ( $conf->{secondary_sort} && $q->param('sort') ne $conf->{secondary_sort}[0] ) {
                  $self->swish_command(ref $conf->{secondary_sort} ? @{ $conf->{secondary_sort} } : $conf->{secondary_sort} );
          }
  
      } else {
          $self->errstr( 'Invalid Sort Option Selected' );
          return;
      }
  
      return 1;
  }
  
  
  
  #========================================================
  # Sets prev and next page links.
  # Feel free to clean this code up!
  #
  #   Pass:
  #       $resutls - reference to a hash (for access to the headers returned by swish)
  #       $q       - CGI object
  #
  #   Returns:
  #       Sets entries in the $results hash
  #
      
  sub set_page {
  
      my ( $self, $Page_Size ) = @_;
  
      my $q = $self->{q};
  
      my $navigation = $self->{navigation};
      
          
      my $start = $navigation->{from} - 1;   # Current starting record
      
          
      my $prev = $start - $Page_Size;
      $prev = 0 if $prev < 0;
  
      if ( $prev < $start ) {
          $navigation->{prev} = $prev;
          $navigation->{prev_count} = $start - $prev;
      }
  
      
      my $last = $navigation->{hits} - 1;
  
      
      my $next = $start + $Page_Size;
      $next = $last if $next > $last;
      my $cur_end   = $start + $self->{hits} - 1;
      if ( $next > $cur_end ) {
          $navigation->{next} = $next;
          $navigation->{next_count} = $next + $Page_Size > $last
                                  ? $last - $next + 1
                                  : $Page_Size;
      }
  
  
      # Calculate pages  ( is this -1 correct here? )
      
      my $pages = int (($navigation->{hits} -1) / $Page_Size);
      if ( $pages ) {
  
          my @pages = 0..$pages;
  
          my $max_pages = 10;
  
          if ( @pages > $max_pages ) {
              my $current_page = int ( $start / $Page_Size - $max_pages/2) ;
              $current_page = 0 if $current_page < 0;
              if ( $current_page + $max_pages - 1 > $pages ) {
                  $current_page = $pages - $max_pages;
              }
              
              @pages = $current_page..$current_page + $max_pages - 1;
              unshift @pages, 0 if $current_page;
              push @pages, $pages unless $current_page + $max_pages - 1 == $pages;
          }
  
      
          $navigation->{pages} =
              join ' ', map {
                  my $page_start = $_ * $Page_Size;
                  my $page = $_ + 1;
                  $page_start == $start
                  ? $page
                  : qq[<a href="$self->{query_href}&amp;start=$page_start">$page</a>];
                          } @pages;
      }
  
  }
  
  #==================================================
  # Format and return the date range options in HTML
  #
  #--------------------------------------------------
  sub get_date_ranges {
  
      my $self = shift;
  
      my $q = $self->{q};
      my $conf = $self->{config};
  
      return '' unless $conf->{date_ranges};
  
      # pass parametes, and a hash to store the returned values.
  
      my %fields;
      
      DateRanges::DateRangeForm( $q, $conf->{date_ranges}, \%fields );
  
  
      # Set the layout:
      
      my $string = '<br>Limit to: '
                   . ( $fields{buttons} ? "$fields{buttons}<br>" : '' )
                   . ( $fields{date_range_button} || '' )
                   . ( $fields{date_range_low}
                       ? " $fields{date_range_low} through $fields{date_range_high}"
                       : '' );
  
      return $string;
  }
  
  
      
  #============================================
  # Run swish-e and gathers headers and results
  # Currently requires fork() to run.
  #
  #   Pass:
  #       $sh - an array with search parameters
  #
  #   Returns:
  #       a reference to a hash that contains the headers and results
  #       or possibly a scalar with an error message.
  #
  
  use Symbol;
  
  sub run_swish {
  
  
      my $self = shift;
  
      my $results = $self->{results};
      my $conf    = $self->{config};
      my $q       = $self->{q};
  
  
  
      
      my @properties;
      my %seen;
  
      # Gather up the properties specified
      
      for ( qw/ title_property description_prop display_props / ) {
          push @properties, ref $conf->{$_} ? @{$conf->{$_}} : $conf->{$_}
              if $conf->{$_} && !$seen{$_}++;
      }
  
      # Add in the default props 
      for ( qw/swishrank swishdocpath/ ) {
          push @properties, $_ unless $seen{$_};
      }
  
      
      # add in the default prop - a number must be first (this might be a duplicate in -x, oh well)
      @properties = ( 'swishreccount', @properties );
  
      $self->swish_command( -x => join( '\t', map { "<$_>" } @properties ) . '\n' );
      $self->swish_command( -H => 9 );
  
  
      # Run swish 
      my $fh = gensym;
      my $pid = open( $fh, '-|' );
  
      die "Failed to fork: $!\n" unless defined $pid;
  
      if ( !$pid ) {  # in child
          exec $self->{prog},  $self->swish_command or die "Failed to exec '$self->{prog}' Error:$!";
      }
  
      $self->{COMMAND} = join ' ', $self->{prog},  $self->swish_command;
  
  
      # read in from child
  
  
      my @results;
      
      my $trim_prop = $self->config('description_prop');
  
      my $highlight = $self->config('highlight');
      my $highlight_object;
  
      # Loop through values returned from swish.
  
      my %stops_removed;
      
      while (<$fh>) {
  
          chomp;
  
          # This will not work correctly with multiple indexes when different values are used.
          if ( /^# ([^:]+):\s+(.+)$/ ) {
              my $h = lc $1;
              my $value = $2;
              $self->{_headers}{$h} = $value;
  
              push @{$self->{_headers}{'removed stopwords'}}, $value if $h eq 'removed stopword' && !$stops_removed{$value}++;
  
              next;
          }
  
  
          # return errors as text
          $self->errstr($1), return if /^err:\s*(.+)/;
  
  
          # Found a result
          if ( /^\d/ ) {
  
              my %h;
              @h{@properties} = split /\t/;
              push @results, \%h;
  
              # There's a chance that the docpath could be modified by highlighting
              # when used in a "display_props".
              $h{saved_swishdocpath} = $h{swishdocpath};
  
              my $docpath = $h{swishdocpath};
              $docpath =~ s/\s/%20/g;  # Replace spaces
              $h{swishdocpath_href} = ( $self->config('prepend_path') || '' ) . $docpath;
  
              
  
  
  
              # Now do any formatting
              if ( $highlight ) {
                  if ( !$highlight_object ) {
                      my $package = $highlight->{package} || 'DefaultHighlight';
  
                      eval { require "$package.pm" };
                      if ( $@ ) {
                          $self->errstr( $@ );
                          $highlight = '';
                          next;
                      } else {
                          $highlight_object = $package->new( $self, $self->{metaname} );
                      }
                  }
  
                  # Highlight any fields, as needed
                  $highlight_object->highlight( \%h  );
  
                  next;
              }
  
  
  
  
              # Trim down the description if no highlight, or if highlighting some other property
              # Not very nice.  The highlighting code would limit by words
  
              if ( $trim_prop && $h{$trim_prop} ) {
                  my $max = $conf->{max_chars} || 500;
  
                  if ( length $h{$trim_prop} > $max ) {
                      $h{$trim_prop} = substr( $h{$trim_prop}, 0, $max) . ' <b>...</b>';
                  }
              }
      
          }
  
          # Might check for "\n." for end of results.
  
          
      }
  
      $self->{hits} = @results;
      $self->{_results} = \@results if @results;
          
  }
  
  
  #=====================================================================================
  # This method parses out the query from the "Parsed words" returned by swish
  # for use in highlighting routines
  # This returns a hash ref:
  #  $query->{text}  # evertying is currently at level "text"
  #                {$metaname}  # the meta name
  #                           [ array of phrases ]
  #                    each phrase is made up of an array of words
  
  
  
  
  
  use constant DEBUG_QUERY_PARSED => 0;
  
  sub extract_query_match {
      my $self = shift;
  
      my $query = $self->header('parsed words');  # grab query parsed by swish
  
      
  
      my %query_match;  # kewords broken down by layer and field.
      $self->{query_match} = \%query_match;
  
  
      # Loop through the query
  
      while ( $query =~ /([a-z]+)\s+=\s+(.+?)(?=\s+[a-z]+\s+=|$)/g ) {
  
          my ( $field, $words ) = ( $1, $2 );
  
  
          my $inquotes;
          my $buffer;
          my %single_words;
  
          my $layer = 'text';  # This might be used in the future to highlight tags when matching a href.
  
          # Expand group searches -- not currently used
          my @fields = ( $field );
  
  
          for my $word ( split /\s+/, $words ) {
  
  
              # XXX This list of swish operators could change "and or not" and is dependent on stopwords.
              # remove control words and parens
              next if !$inquotes && $word =~ /^(and|or|not|\(|\))$/; 
  
              $buffer = [] unless $inquotes;  # is there a better way to allocate memory like this?
  
              if ( $word eq '"' ) {
                  unless ( $inquotes ) {
                      $inquotes++;
                      next;
                  } else {
                      $inquotes = 0;
                  }
  
              } else {
          
                  push @$buffer, $word;
              }
  
          
              next if $inquotes;
  
  
              # Only record single words once (this will probably break something)
              # Reason: to reduce the number of matches must check
              next if @$buffer == 1 && $single_words{ $buffer->[0] }++;
  
          
              push @{$query_match{$layer}{$_}}, $buffer foreach @fields;
  
  
          }
      }
  
  
  
      # Now, sort in desending order of phrase lenght
  
  
      foreach my $layer ( keys %query_match ) {
          print STDERR "         LAYER: $layer\n" if DEBUG_QUERY_PARSED;
  
  
          foreach my $tag ( keys %{$query_match{$layer}} ) {
  
              @{$query_match{$layer}{$tag}} = sort { @$b <=> @$a } @{$query_match{$layer}{$tag}};
  
  
              if ( DEBUG_QUERY_PARSED ) {
                  print STDERR "            TAG: '$tag'\n";
                  print STDERR "                 : '@$_'\n" foreach @{$query_match{$layer}{$tag}};
              }
          }
      }
  
  
      return \%query_match;
  }
  
  
  1;
  
  
  __END__
  
  =head1 NAME
  
  swish.cgi -- Example Perl script for searching with the SWISH-E search engine.
  
  =head1 DESCRIPTION
  
  C<swish.cgi> is an example CGI script for searching with the SWISH-E search engine version 2.1-dev and above.
  It returns results a page at a time, with matching words from the source document highlighted, showing a
  few words of content on either side of the highlighted word.
  
  The standard configuration should work with most swish index files.  Customization of the parameters will be
  needed if you are indexing special meta data and want to search and/or display the meta data.  The
  configuration can be modified by editing this script directly, or by using a configuration file (.swishcgi.conf
  by default).
  
  The script is modular in design.  Both the highlighting code and output generation is handled by modules, which
  are included in the F<example/modules> directory.  This allows for easy customization of the output without
  changing the main CGI script.  A module exists to generate standard HTML output.  There's also modules and
  template examples to use with the popular templating systems HTML::Template and Template-Toolkit.  This allows
  you to tightly integrate this script with the look of an existing template-driven web site.
  
  This scipt can also run basically unmodified as a mod_perl handler, providing much better performance than
  running as a CGI script.
  
  Due to the forking nature of this program and its use of signals,
  this script probably will not run under Windows without some modifications.
  There's plan to change this soon.
  
  
  =head1 INSTALLATION
  
  Installing a CGI application is dependent on your specific web server's configuration.
  For this discussion we will assume you are using Apache in a typical configuration.  For example,
  a common location for the DocumentRoot is C</usr/local/apache/htdocs>.  If you are installing this
  on your shell account, your DocumentRoot might be C<~yourname/public_html>.
  
  For the sake of this example we will assume the following:
  
      /usr/local/apache/htdocs        - Document root
      /usr/local/apache/cgi-bin       - CGI directory
  
  =head2 Move the files to their locations
  
  =over 4
  
  =item Copy the swish.cgi file to your CGI directory
  
  Most web servers have a directory where CGI programs are kept.
  Copy the C<swish.cgi> perl script into that directory if this is the case on your
  server.  You will need to provide read
  and execute permisssions to the file.  Exactly what permissions are needed again depends on
  your specific configuration.  For example, under Unix:
  
      chmod 0755 swish.cgi
  
  This gives the file owner (that's you) write access, and everyone read and execute access.    
  
  Note that you are not required to use a cgi-bin directory with Apache.  You may place the
  CGI script in any directory accessible via the web server and
  enable it as a CGI script with something like the following
  (place either in httpd.conf or in .htaccess):
  
      <Files swish.cgi>
          Allow from all
          SetHandler cgi-script
          Options +ExecCGI
      </Files>        
  
  Using this method you don't even need to use the C<.cgi> extension.  For example, rename
  the script to "search" and then use that in the C<Files> directive.  Take to your web
  administrator for further information.
  
  =item Copy the modules directory
  
  Copying the modules directory is optional, but the script needs to find additional modules so you will
  need to edit the script to point to the modules directory.  Unlike CPAN modules that need to
  be uncompressed, built, and installed, all you need to do is make sure the modules are some place where
  the web server can read them.  You may decide to leave them where you uncompressed the swish-e distribution,
  or you may wish to move them to your perl library.
  
  =head1 CONFIGURATION
  
  =head2 Configure the swish.cgi program
  
  Use a text editor and open the C<swish.cgi> program.
  
  =over 4
  
  =item 1 Check the C<shebang> line
  
  The first line of the program must point to the location of your perl program.  Typical
  examples are:
  
      #!/usr/local/bin/perl -w
      #!/usr/bin/perl -w
      #!/opt/perl/bin/perl -w
  
  =item 2 Set the perl library path
  
  The script must find the modules that the script is distributed with.  These modules handle
  the highlighting of the search terms, and the output generation.  Again, where you place the
  modules is up to you, and the only requirement is that the web server can access those files.
  
  You tell perl the location of the modules with the "use lib" directive.  The default for this script is:
  
      use lib qw( modules );
  
  This says to look for the modules in the F<modules> directory of the current directory.
  
  For example, say you want to leave the modules where you unpacked the swish-e distribution.  If
  you unpacked in your home directory of F</home/yourname/swish-e> then you must add this to the
  script:
  
      use lib qw( /home/yourname/swish-e/example/modules );
  
      
  
  =item 3 Set the configuration parameters
  
  To make things somewhat simple, the configuration parameters are included at the top of the program.
  The parameters are all part of a perl C<hash> structure, and the comments at the top of the program should
  get you going.
  
  You will probably need to specify at least the location of the swish-e binary, your index file or files,
  and a title.
  
  You have two options for changing the configuration settings from their default:
  you may edit the script directly, or you may use a configuration file.  In either case, the configuration
  settings are a basic perl hash reference.
  
  Using a configuration file is described below.
  
  The configuration settings might look like:
  
      return {
          title           => 'Search the Swish-e list',   # Title of your choice.
          swish_binary    => './swish-e',                 # Location of swish-e binary
          swish_index     => '../index.swish-e',          # Location of your index file
      };
  
  Or if searching more than one index:
      
      return {
          title           => 'Search the Swish-e list',
          swish_binary    => './swish-e',
          swish_index     => ['../index.swish-e', '../index2'],
      };
  
  Both of these examples return a reference to a perl hash ( C<return {...}> ).  Again, this same format is
  used either at the top of this program, or in a configuration file.
  
  The examples above place the swish index file(s)
  in the directory above the C<swish.cgi> CGI script.  If using the example paths above
  of C</usr/local/apache/cgi-bin> for the CGI bin directory, that means that the index file
  is in C</usr/local/apache>.  That places the index out of web space (e.g. cannot be accessed
  via the web server), yet relative to where the C<swish.cgi> script is located.
  
  (If running under mod_perl you will most likely specify absolute paths for your index files.)
  
  There's more than one way to do it, of course.
  One option is to place the index in the same directory as the <swish.cgi> script, but
  then be sure to use your web server's configuration to prohibit access to the index directly.
  
  Another common option is to maintain a separate directory of the all your swish index files.  This decision is
  up to you.
  
  As mentioned above, you can either edit this script directly and modify the configuration settings, or
  use an external configuration file.  The settings in the configuration file are merged with (override)
  the settings defined in the script.
  
  By default, the script will attempt to read from the file F<.swishcgi.conf>.
  For example, you might only wish to change the title used
  in the script.  Simply create a file called F<.swishcgi.conf> in the same directory as the CGI script:
  
      > cat .swishcgi.conf
      # Example swish.cgi configuration script.
      return {
         title => 'Search Our Mailing List Archive',
      };
  
  Look at the default configuration settings at the top of this program for information on the available settings.    
  
  =item 4 Create your index
  
  You must index your web site before you can begin to use the C<swish.cgi> script.
  Create a configuration file called C<swish.conf> in the directory where you will store
  the index file.
  
  This next example uses the file system to index your web documents.
  In general, you will probably wish to I<spider> your web site if your web pages do not
  map exactly to your file system, and to only index files available from links on you web
  site.
  
  See B<Spidering> below for more information.
  
  Example C<swish.conf> file:
  
      # Define what to index
      IndexDir /usr/local/apache/htdocs
      IndexOnly .html .htm
  
      # Tell swish how to parse .html and .html documents
      IndexContents HTML .html .htm
      # And just in case we have files without an extension
      DefaultContents HTML
  
      # Replace the path name with a URL
      ReplaceRules replace /usr/local/apache/htdocs/ http://www.myserver.name/
  
      # Allow limiting search to titles and URLs.
      MetaNames swishdocpath swishtitle
  
      # Optionally use stemming for "fuzzy" searches
      #UseStemming yes
  
  Now to index you simply run:
  
      swish-e -c swish.conf
  
  The default index file C<index.swish-e> will be placed in the current directory.
  
  Note that the above swish-e configuration defines two MetaNames "swishdocpath" and "swishtitle".
  This allows searching just the document path or the title instead of the document's content.
  
  Here's an expanded C<swish.cgi> configuration to make use of the above settings used while indexing:
  
      return {
          title           => 'Search the Apache documentation',
          swish_binary    => './swish-e',
          swish_index     => 'index.swish-e',
          metanames       => [qw/swishdefault swishdocpath swishtitle/],
          display_props   => [qw/swishlastmodified swishdocsize swishdocpath/],
          title_property  => 'swishtitle',  # Not required, but recommended
  
          name_labels => {
              swishdefault        => 'Body & Title',
              swishtitle          => 'Title',
              swishrank           => 'Rank',
              swishlastmodified   => 'Last Modified Date',
              swishdocpath        => 'Document Path',
              swishdocsize        => 'Document Size',
          },
  
      };
  
  The above configuration defines metanames to use on the form.
  Searches can be limited to these metanames.
  
  "display_props" tells the script to display the property "swishlastmodified" (the last modified
  date of the file), the document size, and path with the search results.
  
  The parameter "name_labels" is a hash (reference)
  that is used to give friendly names to the metanames.
  
  Swish-e can store part of all of the contents of the documents as they are indexed, and this
  "document description" can be returned with search results.
  
      # Store the text of the documents within the swish index file
      StoreDescription HTML <body> 100000
  
  Adding the above to your C<swish.conf> file tells swish-e to store up to 100,000 characters from the body of each document within the
  swish-e index.  To display this information in search results, highlighting search terms,
  use the follow configuration in C<swish.cgi>:
  
      return {
          title           => 'Search the Apache documentation',
          swish_binary    => './swish-e',
          swish_index     => 'index.swish-e',
          metanames       => [qw/swishdefault swishdocpath swishtitle/],
          display_props   => [qw/swishlastmodified swishdocsize swishdocpath/],
          title_property  => 'swishtitle',  # Not required, but recommended
          description_prop=> 'swishdescription',
  
          name_labels => {
              swishdefault        => 'Body & Title',
              swishtitle          => 'Title',
              swishrank           => 'Rank',
              swishlastmodified   => 'Last Modified Date',
              swishdocpath        => 'Document Path',
              swishdocsize        => 'Document Size',
          },
          highlight       => {
              package         => 'PhraseHighlight',
              meta_to_prop_map => {   # this maps search metatags to display properties
                  swishdefault    => [ qw/swishtitle swishdescription/ ],
                  swishtitle      => [ qw/swishtitle/ ],
                  swishdocpath    => [ qw/swishdocpath/ ],
              },
         }
  
      };
  
  
  Other C<swish.cgi> configuration settings are available, and are listed at the top of the F<swish.cgi>
  script.
  
  
  =back
  
  You should now be ready to run your search engine.  Point your browser to:
  
      http://www.myserver.name/cgi-bin/swish.cgi
  
  adjusting the server and URL to match your system, of course.    
  
  =head1 MOD_PERL
  
  This script can be run under mod_perl (see http://perl.apache.org).
  This will improve the response time of the script compared to running under CGI.
  
  Configuration is simple.  In your httpd.conf or your startup.pl file you need to
  load the script.  For example, in httpd.conf you can use a perl section:
  
      <perl>
          use lib '/usr/local/apache/cgi-bin';
          use lib '/home/yourname/swish-e/example/modules';
          require "swish.cgi";
      </perl>
  
  Again, note that the paths used will depend on where you installed the script and the modules.
  When running under mod_perl the swish.cgi script becomes a perl module, and therefore the script
  does not need to be installed in the cgi-bin directory.  (But, you can actually use the same script as
  both a CGI script and a mod_perl module at the same time, read from the same location.)
  
  The above loads the script into mod_perl.  Then to configure the script to run add this to your httpd.conf
  configuration file:
  
      <location /search>
          allow from all
          SetHandler perl-script
          PerlHandler SwishSearch
      </location>
  
  Unlike CGI, mod_perl does not change the current directory to the location of the perl module, so
  your settings for the swish binary and the path to your index files must be absolute
  paths (or relative to the server root).
  
  Take a look at the C<handler()> routine in this script for ideas how to use PerlSetVar commands
  in httpd.conf to control the script.
  
  Please post to the swish-e discussion list if you have any questions about running this
  script under mod_perl.
  
  
  =head1 DEBUGGING
  
  The key to debugging CGI scripts is to run them from the command line, not with a browser.
  
  First, make sure the program compiles correctly:
  
      > perl -c swish.cgi
      swish.cgi syntax OK
  
  Next, simply try running the program:
  
      > ./swish.cgi | head
      Content-Type: text/html; charset=ISO-8859-1
  
      <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
      <html>
          <head>
             <title>
                Search our site
             </title>
          </head>
          <body>
  
  Now, you know that the program compiles and will run from the command line.
  Next, try accessing the script from a web browser.
  
  If you see the contents of the CGI script instead of its output then your web server is
  not configured to run the script.  You will need to look at settings like ScriptAlias, SetHandler,
  and Options.
  
  If an error is reported (such as Internal Server Error or Forbidden)
  you need to locate your web server's error_log file
  and carefully read what the problem is.  Contact your web administrator for help.
      
      
  =head1 Spidering
  
  There are two ways to spider with swish-e.  One uses the "http" input method that uses code that's
  part of swish.  The other way is to use the new "prog" method along with a perl helper program called
  C<spider.pl>.
  
  Here's an example of a configuration file for spidering with the "http" input method.
  You can see that the configuration is not much different than the file system input method.
  (But, don't use the http input method -- use the -S prog method shown below.)
  
      # Define what to index
      IndexDir http://www.myserver.name/index.html
      IndexOnly .html .htm
  
      IndexContents HTML .html .htm
      DefaultContents HTML
      StoreDescription HTML <body> 200000
      MetaNames swishdocpath swishtitle
  
      # Define http method specific settings -- see swish-e documentation
      SpiderDirectory ../swish-e/src/
      Delay 0
  
  You index with the command:
  
      swish-e -S http -c spider.conf
  
  Note that this does take longer.  For example, spidering the Apache documentation on
  a local web server with this method took over a minute, where indexing with the
  file system took less than two seconds.  Using the "prog" method can speed this up.
  
  Here's an example configuration file for using the "prog" input method:
  
      # Define the location of the spider helper program
      IndexDir ../swish-e/prog-bin/spider.pl
  
      # Tell the spider what to index.
      SwishProgParameters default http://www.myserver.name/index.html
  
      IndexContents HTML .html .htm
      DefaultContents HTML
      StoreDescription HTML <body> 200000
      MetaNames swishdocpath swishtitle
  
  Then to index you use the command:
  
      swish-e -c prog.conf -S prog -v 0
  
  Spidering with this method took nine seconds.    
  
  
  =head1 Stemmed Indexes
  
  Many people enable a feature of swish called word stemming to provide "fuzzy" search
  options to their users.
  The stemming code does not actually find the "stem" of word, rather removes and/or replaces
  common endings on words.
  Stemming is far from perfect, and many words do not stem as you might expect.  But, it can
  be a helpful tool for searching your site.  You may wish to create both a stemmed and non-stemmed index, and
  provide a checkbox for selecting the index file.
  
  To enable a stemmed index you simply add to your configuration file:
  
      UseStemming yes
  
  If you want to use a stemmed index with this program and continue to highlight search terms you will need
  to install a perl module that will stem words.  This section explains how to do this.
  
  The perl module is included with the swish-e distribution.  It can be found in the examples directory (where
  you found this file) and called something like:
  
      SWISH-Stemmer-0.05.tar.gz
  
  The module should also be available on CPAN (http://search.cpan.org/).    
  
  Here's an example session for installing the module.  (There will be quite a bit of output
  when running make.)
  
  
      % gzip -dc SWISH-Stemmer-0.05.tar.gz |tar xof -
      % cd SWISH-Stemmer-0.05
      % perl Makefile.PL
      or
      % perl Makefile.PL PREFIX=$HOME/perl_lib
      % make
      % make test
  
      (perhaps su root at this point if you did not use a PREFIX)
      % make install
      % cd ..
  
  Use the B<PREFIX> if you do not have root access or you want to install the modules
  in a local library.  If you do use a PREFIX setting, add a C<use lib> statement to the top of this
  swish.cgi program.
  
  For example:
  
      use lib qw(
          /home/bmoseley/perl_lib/lib/site_perl/5.6.0
          /home/bmoseley/perl_lib/lib/site_perl/5.6.0/i386-linux/
      );
  
  Once the stemmer module is installed, and you are using a stemmed index, the C<swish.cgi> script will automatically
  detect this and use the stemmer module.
  
  =head1 DISCLAIMER
  
  Please use this CGI script at your own risk.
  
  This script has been tested and used without problem, but you should still be aware that
  any code running on your server represents a risk.  If you have any concerns please carefully
  review the code.
  
  See http://www.w3.org/Security/Faq/www-security-faq.html
  
  =head1 SUPPORT
  
  The SWISH-E discussion list is the place to ask for any help regarding SWISH-E or this example
  script. See http://swish-e.org.
  
  Before posting please review:
  
      http://swish-e.org/2.2/docs/INSTALL.html#When_posting_please_provide_the_
  
  Please do not contact the author directly.
  
  =head1 LICENSE
  
  swish.cgi $Revision: 1.1 $ Copyright (C) 2001 Bill Moseley search@hank.org
  Example CGI program for searching with SWISH-E
  
  
  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation; either version
  2 of the License, or (at your option) any later version.
  
  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  
  
  =head1 AUTHOR
  
  Bill Moseley -- search@hank.org
  
  =cut
  
  
  
  
  
  1.1                  modperl-docs/src/search/swish.conf
  
  Index: swish.conf
  ===================================================================
  SwishProgParameters default http://perl.apache.org/~stas/modperl-site/
  #SwishProgParameters default http://localhost/modperl-site/
  IndexDir ./spider.pl
  DefaultContents HTML2
  StoreDescription HTML2 <body> 100000
  MetaNames swishtitle swishdocpath
  
  
  
  1.1                  modperl-docs/src/search/modules/DateRanges.pm
  
  Index: DateRanges.pm
  ===================================================================
  # $Id: DateRanges.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
  
  package DateRanges;
  use strict;
  
  =head1 NAME
  
  DateRanges
  
  =head1 SYNOPSIS
  
      use DateRanges;
      use CGI;
      my $cgi = CGI->new();
       ...
  
      my %hash = (
          date_ranges     => {
  
              # Define what buttons to include
              time_periods    => [
                  'All',
                  'Today',
                  'Yesterday',
                  #'Yesterday onward',
                  'This Week',
                  'Last Week',
                  'Last 90 Days',
                  'This Month',
                  'Last Month',
                  #'Past',
                  #'Future',
                  #'Next 30 Days',
              ],
  
              # Default button
              default         => 'All',
  
              # Should buttons be in a row or a column?
              line_break      => 0,
  
              # Should a date input form be shown, too?
              date_range      => 1,
          },
      );
  
      
  
  
  =head1 DESCRIPTION
  
  This module provides I<basic> support for entering and using date ranges.  It
  was written to use with swish-e (http://swish-e.org).
  
  See swish.cgi in the swish-e distribution for an example.
  
  Sorry about the interface -- if anyone really wants to use this please let me know and I'll
  rewrite as OO interface!
  
  =head1 FUNCTIONS
  
  =cut
  
  require Exporter;
  
  use vars qw/$VERSION @ISA @EXPORT/;
  
  
  @ISA = qw(Exporter);
  $VERSION = '0.01';
  
  @EXPORT = qw (
      DateRangeForm
      DateRangeParse
      GetDateRangeArgs
  );
  
      # what to pick from
      my @TIME_PERIODS = (
          'Today',
          'Yesterday',
          'Yesterday onward',
          'This Week',
          'Last Week',
          'Last 90 Days',
          'This Month',
          'Last Month',
          'Past',
          'Future',
          'Next 30 Days',
          'All'
      );
  
      my %TIME_PERIODS = map { $_, 1} @TIME_PERIODS;
  
  
  use Date::Calc qw /
      Day_of_Week_to_Text
      Day_of_Week
      Date_to_Text
      Monday_of_Week
      Week_of_Year
      Today
      Add_Delta_Days
      Days_in_Month
      check_date
  /;
  
  use Time::Local;
  
  
  =item DateRangeForm( $cgi, $params, $fields );
  
  This function simple creates a simple form for selecting date ranges based
  on the fields passed in C<$params>.  Will call C<die()> on errors.
  
  C<$params> must be a hash reference with a key named C<time_periods> as shown in
  B<SYNOPSIS> above.  This is used to select which time periods to display.
  
  C<$fields> is a reference to a hash where C<DateRanges> returns data.
  
  These store the HTML for display on your form.
  
      buttons - the buttons to select the different time ranges
      date_range_button - the button to select a date range
      date_range_low - the low range select form fields
      date_range_hi  - the hight range select form fields
  
  =cut
  
  sub DateRangeForm {
      my ( $CGI, $params, $fields ) = @_;
  
      die "Must supply arrary ref for 'options'"
          unless $params->{time_periods} && ref $params->{time_periods} eq 'ARRAY';
  
      my @time_periods = grep { $TIME_PERIODS{$_} } @{ $params->{time_periods} };
  
  
      $fields->{buttons} = '';
      $fields->{date_range_button} = '';
      $fields->{date_range_low}  =  '';
      $fields->{date_range_high} =  '';
  
      $fields->{buttons} = 
            $CGI->radio_group( 
              -name       => 'DateRanges_date_option',
              -values     => \@time_periods,
              -default    => ($params->{default} || $time_periods[0]),
              -linebreak  => (exists $params->{line_break} ? $params->{line_break} : 1),
              #-columns=>2,
            ) if @time_periods;
  
  
      return unless $params->{date_range};
  
      $fields->{date_range_button} = 
          $CGI->radio_group(
              -name       => 'DateRanges_date_option',
              -values     => ['Select Date Range'],
              -default    => ($params->{default} || $time_periods[0]),
              -linebreak  => (exists $params->{line_break} ? $params->{line_break} : 1),
            );
  
  
  
      $fields->{date_range_low}  =  show_date_input($CGI, 'start');
      $fields->{date_range_high} =  show_date_input($CGI, 'end');
  
  =pod
      print '<br>Limit to the hour of: ',
            popup_menu( -name       => 'Limit_hour',
                        -default    => ' ',
                        -values     => [' ',0..23], ),
            '<br>';
  =cut          
            
  }
  
  =item my $args = GetDateRangeArgs( $cgi );
  
  Returns a string to use in a HREF with all the parameters set.
  
  =cut
  
  sub GetDateRangeArgs {
      my $CGI = shift;
  
      my %args;
  
      
      $args{DateRanges_date_option} = $CGI->param('DateRanges_date_option')
          if defined $CGI->param('DateRanges_date_option');
  
  
      for ( qw/ mon day year / ) {
          my $start = "DateRanges_start_$_";
          my $end   =  "DateRanges_end_$_";
          $args{$start} = $CGI->param($start) if defined $CGI->param($start);
          $args{$end} = $CGI->param($end) if defined $CGI->param($end);
      }
  
      return '' unless %args;
  
      return join '&amp;', map { "$_=" . $CGI->escape($args{$_}) } keys %args;
      
  }
  
  =item DateRangeParse( $cgi, $form )
  
  Parses the date range form and returns a low and high range unix timestamp.
  Returns false on error with the folowing key set in C<$form>:
  
      DateRanges_error - error string explaining the problem
  
  C<$form> is a hash reference where the following keys may be set:
  
      All - no date ranges were selected
      DateRanges_time_low - low range unix timestamp
      DateRanges_time_high - high range unix timestamp
  
  =cut    
      
  
  
  
  #------------------------ Get the report dates ---------------------
  sub DateRangeParse {
      my ( $q, $form ) = @_;
  
      $form->{DateRanges_error} = '';
      
  
  
      # If requesting ALL (or not found in form) return true for all
      if ( !$q->param('DateRanges_date_option') || $q->param('DateRanges_date_option') eq 'All' ) {
          $form->{All}++;
          return 1;
      }
  
      my $time = time();
  
      my ( @start, @end );
  
      for ($q->param('DateRanges_date_option') ) {
      
          /^Today/      && do { @start = @end = Today(); last; };
  
          /^Yesterday onward/  && do { @start = Add_Delta_Days( Today(), -1 ); last };
          
          /^Yesterday/  && do { @start = @end = Add_Delta_Days( Today(), -1 ); last };
  
  
          /^This Week/ && do {
              @start = Monday_of_Week( Week_of_Year( Today() ) );
              @end   = Add_Delta_Days( @start, 6 );
              last;
          };
  
          /^Last Week/ && do {
              @start = Monday_of_Week( Week_of_Year( Add_Delta_Days( Today(), -7 ) ) );
              @end   = Add_Delta_Days( @start, 6 );
              last;
          };
  
          /^This Month/ && do {
              @start = @end = Today();
              $start[2] = 1;
              $end[2] = Days_in_Month($end[0],$end[1]);
              last;
          };
          
          /^Last Month/ && do {
              @start = Today();
              $start[2] = 1;
              $start[1] = 1 if --$start[1] == 0;
              @end = @start;
              $end[2] = Days_in_Month($end[0],$end[1]);
              last;
          };
  
          /^Last 90 Days/ && do {
              @end = Today();
              @start = Add_Delta_Days( Today(), -90 );
              last
          };
  
          /^Past/ && return 1;  # use defaults;
  
          /^Future/ && do {
              $form->{DateRanges_time_low} = time;
              delete $form->{DateRanges_time_high};
              return 1;
          };
  
          /^Next 30 Days/   && do {
              @start = Today();
              @end = Add_Delta_Days( Today(), +30 );
              last
          };
          
  
  
          /^Select/ && do {
              my ( $day, $mon, $year );
  
              $day    = $q->param('DateRanges_start_day') || 0;
              $mon    = $q->param('DateRanges_start_mon') || 0;
              $year   = $q->param('DateRanges_start_year') || 0;
              @start = ( $year, $mon, $day );
  
              $day    = $q->param('DateRanges_end_day') || 0;
              $mon    = $q->param('DateRanges_end_mon') || 0;
              $year   = $q->param('DateRanges_end_year') || 0;
              @end = ( $year, $mon, $day );
              last;
          };
  
          $form->{DateRanges_error} = 'Invalid Date Option ' . $q->param('DateRanges_date_option') . ' Selected';
          return;
      }
  
  
      
      $form->{DateRanges_error} = 'Invalid Start Date' && return if @start && !check_date( @start );
      $form->{DateRanges_error} = 'Invalid Ending Date' && return if @end && !check_date( @end );
  
  
      my $start_time = @start ? timelocal( 0, 0, 0, $start[2], $start[1]-1, $start[0]-1900 ) : 0;
      my $end_time   = @end   ? timelocal( 59, 59, 23, $end[2], $end[1]-1, $end[0]-1900 ) : 0;
  
  
      $form->{DateRanges_error} = "Starting time should be before now, don't you think?" && return
          if $start_time && $start_time > time();
          
      $form->{DateRanges_error} = 'Start date must be same day or before end date' && return
          if $start_time && $end_time && $start_time > $end_time;
  
  
      $form->{DateRanges_time_low} = $start_time;
      $form->{DateRanges_time_high} = $end_time;
      return 1;
  }
  
  
  #----------------------------------------------------------------------------
  sub show_date_input {
      my ( $CGI, $name ) = @_;
  
      $name = "DateRanges_$name";
  
      my @months = qw/Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec/;
      my $x = 1;
      my %months = map { $x++, $_ } @months;
  
      my ($mon, $day, $year) = (localtime)[4,3,5];
  
      $year = $year + 1900;
      $mon++;
  
      my $cur_year = $year;
  
      $cur_year += 5;
  
      ($year,$mon,$day) = Date::Calc::Add_Delta_Days($year,$mon,$day, -28 ) if $name eq 'start';
  
  
      my $str = join "\n", 
          $CGI->popup_menu(
              -name   => "${name}_mon",
              -values     => [1..12],
              -default    => $mon,
              -labels     => \%months
          ),
          '&nbsp',
          $CGI->popup_menu(
              -name       => "${name}_day",
              -default    => $day,
              -values     => [1..31],
          ),
  
          '&nbsp',
          $CGI->popup_menu(
              -name       => "${name}_year",
              -default    => $year,
              -values     => [$year-5..$cur_year],
          );
  
  
      return $str;
  }
  
  #----------------------- ymd_to_unix --------------------------
  # Silly Date::Calc
  sub ymd_to_unix {
      my ($y, $m, $d ) = @_;
      $m--;
      $y -= 1900;
      return timelocal( 0, 0, 0, $d, $m, $y );
  }
  
  1;
  
  =back
  
  =head1 COPYRIGHT
  
  Copyright 2001 Bill Moseley
  
  This library is free software; you can redistribute it and/or
  modify it under the same terms as Perl itself.
  
  =cut
  
  
  
  
  1.1                  modperl-docs/src/search/modules/DefaultHighlight.pm
  
  Index: DefaultHighlight.pm
  ===================================================================
  #=======================================================================
  #  Default Highlighting Code
  #
  #    $Id: DefaultHighlight.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
  #=======================================================================
  package DefaultHighlight;
  use strict;
  
  sub new {
      my ( $class, $results, $metaname ) = @_;
  
  
      my $self = bless {
          results => $results,  # just in case we need a method
          settings=> $results->config('highlight'),
          metaname=> $metaname,
      }, $class;
  
      # parse out the query into words
      my $query = $results->extract_query_match;
  
  
      # Do words exist for this layer (all text at this time) and metaname?
      # This is a reference to an array of phrases and words
  
      $self->{description_prop} = $results->config('description_prop') || '';
  
      if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
          eval { require SWISH::Stemmer };
          if ( $@ ) {
              $results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
          } else {
              $self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
          }
      }
  
  
      if ( $query && exists $query->{text}{$metaname} ) {
          $self->{query} = $query->{text}{$metaname};
  
          $self->set_match_regexp;
      }
  
      return $self;
  }
  
  sub highlight {
      my ( $self, $properties ) = @_;
  
  
      return unless $self->{query};
  
      my $phrase_array = $self->{query};
  
      my $settings = $self->{settings};
      my $metaname = $self->{metaname};
  
      # Do we care about this meta?
      return unless exists $settings->{meta_to_prop_map}{$metaname};
  
      # Get the related properties
      my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
  
      my %checked;
  
      for ( @props ) {
          if ( $properties->{$_} ) {
              $checked{$_}++;
              $self->highlight_text( \$properties->{$_}, $phrase_array );
          }
      }
  
  
      # Truncate the description, if not processed.
      my $description = $self->{description_prop};
      if ( $description && !$checked{ $description } && $properties->{$description} ) {
          my $max_words = $settings->{max_words} || 100;
          my @words = split /\s+/, $properties->{$description};
          if ( @words > $max_words ) {
              $properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
          }
      }
  
  }
  
  
  
  #==========================================================================
  #
  
  sub highlight_text {
  
      my ( $self, $text_ref, $phrase_array ) = @_;
      
      my $wc_regexp = $self->{wc_regexp};
      my $extract_regexp = $self->{extract_regexp};
      my $match_regexp = $self->{match_regexp};
  
  
      my $last = 0;
  
      my $settings = $self->{settings};
  
      my $Show_Words = $settings->{show_words} || 10;
      my $Occurrences = $settings->{occurrences} || 5;
      my $Max_Words = $settings->{max_words} || 100;
      my $On = $settings->{highlight_on} || '<b>';
      my $Off = $settings->{highlight_off} || '</b>';
  
  
      my $stemmer_function = $self->{stemmer_function};
  
  
      # Should really call unescapeHTML(), but then would need to escape <b> from escaping.
      my @words = split /$wc_regexp/, $$text_ref;
  
  
      return 'No Content saved: Check StoreDescription setting' unless @words;
  
      my @flags;
      $flags[$#words] = 0;  # Extend array.
  
      my $occurrences = $Occurrences ;
  
  
      my $pos = 0;
  
      while ( $Show_Words && $pos <= $#words ) {
  
          # Check if the word is a swish word (ignoring begin and end chars)
          if ( $words[$pos] =~ /$extract_regexp/ ) {
  
  
              my ( $begin, $word, $end ) = ( $1, $2, $3 );
  
              my $test = $stemmer_function
                         ? $stemmer_function->($word)
                         : lc $word;
  
              $test ||= lc $word;                       
  
              # Not check if word matches
              if ( $test =~ /$match_regexp/ ) {
  
                  $words[$pos] = "$begin$On$word$Off$end";
  
  
                  my $start = $pos - ($Show_Words-1)* 2;
                  my $end   = $pos + ($Show_Words-1)* 2;
                  if ( $start < 0 ) {
                      $end = $end - $start;
                      $start = 0;
                  }
                  
                  $end = $#words if $end > $#words;
  
                  $flags[$_]++ for $start .. $end;
  
  
                  # All done, and mark where to stop looking
                  if ( $occurrences-- <= 0 ) {
                      $last = $end;
                      last;
                  }
              }
          }
  
         $pos += 2;  # Skip to next wordchar word
      }
  
  
      my $dotdotdot = ' <b>...</b> ';
  
  
      my @output;
  
      my $printing;
      my $first = 1;
      my $some_printed;
  
      if ( $Show_Words && @words > 50 ) {  # don't limit context if a small number of words
          for my $i ( 0 ..$#words ) {
  
  
              if ( $last && $i >= $last && $i < $#words ) {
                  push @output, $dotdotdot;
                  last;
              }
  
              if ( $flags[$i] ) {
  
                  push @output, $dotdotdot if !$printing++ && !$first;
                  push @output, $words[$i];
                  $some_printed++;
  
              } else {
                  $printing = 0;
              }
  
          $first = 0;
  
          
          }
      }
  
      if ( !$some_printed ) {
          for my $i ( 0 .. $Max_Words ) {
              if ( $i > $#words ) {
                  $printing++;
                  last;
              }
              push @output, $words[$i];
          }
      }
          
          
  
      push @output, $dotdotdot if !$printing;
  
      $$text_ref = join '', @output;
  
  
  }
  
  #============================================
  # Returns compiled regular expressions for matching
  #
  
  sub set_match_regexp {
      my $self = shift;
  
      my $results = $self->{results};
  
  
      my $wc = $results->header('wordcharacters');
      my $ignoref = $results->header('ignorefirstchar');
      my $ignorel = $results->header('ignorelastchar');
  
  
  
      my $query = join ' ', map { join ' ', @$_} @{$self->{query}};  # join everything together!
  
  
      $wc = quotemeta $wc;
  
  
      my $match_string =
          join '|',
             map { substr( $_, -1, 1 ) eq '*'
                      ? quotemeta( substr( $_, 0, -1) ) . "[$wc]*?"
                      : quotemeta
                 }
                  grep { ! /^(and|or|not|["()=])$/oi }  # left over code
                      split /\s+/, $query;
  
  
  
      return unless $match_string;
  
      for ( $ignoref, $ignorel ) {
          if ( $_ ) {
              $_ = quotemeta;
              $_ = "([$_]*)";
          } else {
              $_ = '()';
          }
      }
  
  
      # Yuck!
      $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';  # Warning: dependent on tolower used while indexing
  
  
      # Now, wait a minute.  Look at this more, as I'd hope that making a
      # qr// go out of scope would release the compiled pattern.
  
      if ( $ENV{MOD_PERL} ) {
          $self->{wc_regexp}      = qr/([^$wc]+)/;                     # regexp for splitting into swish-words
          $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i;  # regexp for extracting out the words to compare
          $self->{match_regexp}   = qr/^(?:$match_string)$/;           # regexp for comparing extracted words to query
  
       } else {
          $self->{wc_regexp}      = qr/([^$wc]+)/o;                    # regexp for splitting into swish-words
          $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi;  # regexp for extracting out the words to compare
          $self->{match_regexp}   = qr/^(?:$match_string)$/o;          # regexp for comparing extracted words to query
       }
  }    
  
  
  1;
  
  
  
  
  1.1                  modperl-docs/src/search/modules/PhraseHighlight.pm
  
  Index: PhraseHighlight.pm
  ===================================================================
  #=======================================================================
  #  Phrase Highlighting Code
  #
  #    $Id: PhraseHighlight.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
  #=======================================================================
  package PhraseHighlight;
  use strict;
  
  use constant DEBUG_HIGHLIGHT => 0;
  
  sub new {
      my ( $class, $results, $metaname ) = @_;
  
  
      my $self = bless {
          results => $results,  # just in case we need a method
          settings=> $results->config('highlight'),
          metaname=> $metaname,
      }, $class;
  
      # parse out the query into words
      my $query = $results->extract_query_match;
  
  
      # Do words exist for this layer (all text at this time) and metaname?
      # This is a reference to an array of phrases and words
  
      $self->{description_prop} = $results->config('description_prop') || '';
  
  
  
      if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
          eval { require SWISH::Stemmer };
          if ( $@ ) {
              $results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
          } else {
              $self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
          }
      }
  
  
  
      my %stopwords =  map { $_, 1 } split /\s+/, $results->header('stopwords');
      $self->{stopwords} = \%stopwords;
  
  
      if ( $query && exists $query->{text}{$metaname} ) {
          $self->{query} = $query->{text}{$metaname};
  
          $self->set_match_regexp;
      }
  
      return $self;
  }
  
  sub highlight {
      my ( $self, $properties ) = @_;
  
  
      return unless $self->{query};
  
      my $phrase_array = $self->{query};
  
      my $settings = $self->{settings};
      my $metaname = $self->{metaname};
  
      # Do we care about this meta?
      return unless exists $settings->{meta_to_prop_map}{$metaname};
  
      # Get the related properties
      my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
  
      my %checked;
  
      for ( @props ) {
          if ( $properties->{$_} ) {
              $checked{$_}++;
              $self->highlight_text( \$properties->{$_}, $phrase_array );
          }
      }
  
  
      # Truncate the description, if not processed.
      my $description = $self->{description_prop};
      if ( $description && !$checked{ $description } && $properties->{$description} ) {
          my $max_words = $settings->{max_words} || 100;
          my @words = split /\s+/, $properties->{$description};
          if ( @words > $max_words ) {
              $properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
          }
      }
  
  }
  
  
  
  #==========================================================================
  #
  
  sub highlight_text {
  
      my ( $self, $text_ref, $phrase_array ) = @_;
      
      my $wc_regexp = $self->{wc_regexp};
      my $extract_regexp = $self->{extract_regexp};
  
  
      my $last = 0;
  
      my $settings = $self->{settings};
  
      my $Show_Words = $settings->{show_words} || 10;
      my $Occurrences = $settings->{occurrences} || 5;
      my $Max_Words = $settings->{max_words} || 100;
  
  
  
      my $On = $settings->{highlight_on} || '<b>';
      my $Off = $settings->{highlight_off} || '</b>';
  
      my $on_flag  = 'sw' . time . 'on';
      my $off_flag = 'sw' . time . 'off';
  
  
      my $stemmer_function = $self->{stemmer_function};
  
      # Should really call unescapeHTML(), but then would need to escape <b> from escaping.
  
      # Split into words.  For speed, should work on a stream method.
      my @words = split /$wc_regexp/, $$text_ref;
  
  
      return 'No Content saved: Check StoreDescription setting' unless @words;
  
      my @flags;  # This marks where to start and stop display.
      $flags[$#words] = 0;  # Extend array.
  
      my $occurrences = $Occurrences ;
  
  
      my $word_pos = $words[0] eq '' ? 2 : 0;  # Start depends on if first word was wordcharacters or not
  
      my @phrases = @{ $self->{query} };
  
      # Remember, that the swish words are every other in @words.
  
      WORD:
      while ( $Show_Words && $word_pos * 2 < @words ) {
  
          PHRASE:
          foreach my $phrase ( @phrases ) {
  
              print STDERR "  Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT;
              next PHRASE if ($word_pos + @$phrase -1) * 2 > @words;  # phrase is longer than what's left
              
  
              my $end_pos = 0;  # end offset of the current phrase
  
              # now compare all the words in the phrase
  
              my ( $begin, $word, $end );
              
              for my $match_word ( @$phrase ) {
  
                  my $cur_word = $words[ ($word_pos + $end_pos) * 2 ];
                  unless ( $cur_word =~ /$extract_regexp/ ) {
  
                      my $idx = ($word_pos + $end_pos) * 2;
                      my ( $s, $e ) = ( $idx - 10, $idx + 10 );
                      $s = 0 if $s < 0;
                      $e = @words-1 if $e >= @words;
                     
                  
                      warn  "Failed to parse IgnoreFirst/Last from word '"
                      . (defined $cur_word ? $cur_word : '*undef')
                      . "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:"
                      . scalar @words
                      . "\n-search pharse words-\n"
                      . join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 )
                      . "\n-Words-\n"
                      . join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e )
                      . "\n";
  
                      next PHRASE;
                  }
  
  
  
  
                  # Strip ignorefirst and ignorelast
                  ( $begin, $word, $end ) = ( $1, $2, $3 );  # this is a waste, as it can operate on the same word over and over
  
                  my $check_word = lc $word;
  
                  if ( $end_pos && exists $self->{stopwords}{$check_word} ) {
                      $end_pos++;
                      print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT;
                      redo if  ( $word_pos + $end_pos ) * 2 < @words;  # go on to check this match word with the next word.
  
                      # No more words to match with, so go on to next pharse.
                      next PHRASE;
                  }
  
                  if ( $stemmer_function ) {
                      my $w = $stemmer_function->($check_word);
                      $check_word = $w if $w;
                  }
  
  
  
                  print STDERR "     comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT;
      
                  if ( substr( $match_word, -1 ) eq '*' ) {
                      next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0;
  
                  } else {
                      next PHRASE if $check_word ne $match_word;
                  }
  
  
                  print STDERR "      *** Word Matched '$check_word' *** \n" if DEBUG_HIGHLIGHT;
                  $end_pos++;  
              }
  
              print STDERR "      *** PHRASE MATCHED (word:$word_pos offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT;
  
  
              # We are currently at the end word, so it's easy to set that highlight
  
              $end_pos--;
  
              if ( !$end_pos ) { # only one word
                  $words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end";
              } else {
                  $words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end";
  
                  #Now, reload first word of match
                  $words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?";
                  # Strip ignorefirst and ignorelast
                  ( $begin, $word, $end ) = ( $1, $2, $3 );  # probably should cache this!
                  $words[$word_pos * 2] = "$begin$on_flag$word$end";
              }
  
  
              # Now, flag the words around to be shown
              my $start = ($word_pos - $Show_Words + 1) * 2;
              my $stop   = ($word_pos + $end_pos + $Show_Words - 2) * 2;
              if ( $start < 0 ) {
                  $stop = $stop - $start;
                  $start = 0;
              }
              
              $stop = $#words if $stop > $#words;
  
              $flags[$_]++ for $start .. $stop;
  
  
              # All done, and mark where to stop looking
              if ( $occurrences-- <= 0 ) {
                  $last = $end;
                  last WORD;
              }
  
  
              # Now reset $word_pos to word following
              $word_pos += $end_pos; # continue will still be executed
              next WORD;
          }
      } continue {
          $word_pos ++;
      }
  
  
  
  
      my $dotdotdot = ' ... ';
  
  
      my @output;
  
      my $printing;
      my $first = 1;
      my $some_printed;
  
      if ( $Show_Words && @words > 50 ) {  # don't limit context if a small number of words
          for my $i ( 0 ..$#words ) {
  
  
              if ( $last && $i >= $last && $i < $#words ) {
                  push @output, $dotdotdot;
                  last;
              }
  
              if ( $flags[$i] ) {
  
                  push @output, $dotdotdot if !$printing++ && !$first;
                  push @output, $words[$i];
                  $some_printed++;
  
              } else {
                  $printing = 0;
              }
  
          $first = 0;
  
          
          }
      }
  
      if ( !$some_printed ) {
          for my $i ( 0 .. $Max_Words ) {
              if ( $i > $#words ) {
                  $printing++;
                  last;
              }
              push @output, $words[$i];
          }
      }
          
          
  
      push @output, $dotdotdot if !$printing;
  
      $$text_ref = join '', @output;
      my %entities = (
          '&' => '&amp;',
          '>' => '&gt;',
          '<' => '&lt;',
          '"' => '&quot;',
      );
      my %highlight = (
          $on_flag => $On,
          $off_flag => $Off,
      );
          
  
      $$text_ref =~ s/([&"<>])/$entities{$1}/ge;
  
      $$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge;
  
  
  
      
      # $$text_ref = join '', @words;  # interesting that this seems reasonably faster
  
  
  }
  
  #============================================
  # Returns compiled regular expressions for matching
  #
  #
  
  sub set_match_regexp {
      my $self = shift;
  
      my $results = $self->{results};
  
  
      my $wc = $results->header('wordcharacters');
      my $ignoref = $results->header('ignorefirstchar');
      my $ignorel = $results->header('ignorelastchar');
  
  
      $wc = quotemeta $wc;
  
      #Convert query into regular expressions
  
  
      for ( $ignoref, $ignorel ) {
          if ( $_ ) {
              $_ = quotemeta;
              $_ = "([$_]*)";
          } else {
              $_ = '()';
          }
      }
  
  
      $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';  # Warning: dependent on tolower used while indexing
  
  
      # Now, wait a minute.  Look at this more, as I'd hope that making a
      # qr// go out of scope would release the compiled pattern.
  
      if ( $ENV{MOD_PERL} ) {
          $self->{wc_regexp}      = qr/([^$wc]+)/;                     # regexp for splitting into swish-words
          $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i;  # regexp for extracting out the words to compare
  
       } else {
          $self->{wc_regexp}      = qr/([^$wc]+)/o;                    # regexp for splitting into swish-words
          $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi;  # regexp for extracting out the words to compare
       }
  }    
  
  1;
  
  
  
  
  
  
  1.1                  modperl-docs/src/search/modules/PhraseTest.pm
  
  Index: PhraseTest.pm
  ===================================================================
  #!/usr/local/bin/perl -w
  use strict;
  
  =pod
  
  To give some background.
  
  when you search with swish you do something like:
  
    ./swish-e -w swishdefault=foo
  
  that says: search the field "swishdefault" for the word foo.  swishdefault
  is the default "metaname".  When indexing, say html, swish indexes the body
  text AND the title as swishdefault.
  
  So, a document will match if "foo" is found in the title or in the body when searching
  the swishdefault metaname.  This is why in the sample code below supplies both the title
  and description to the highlighting code.
  
  The other part to swish is "properties".  A document property is something like
  the path name, title, or last modified date.  Some bit of data that can be returned
  with search results.  So, when you do a search for "foo" swish will
  return a list of documents, and for each document it will list properties.
  
  Since properties and metanames may or may not be related, in the "config" below you see a hash that maps
  one or more properties to metanames.
  
  Note, here's a search for "apache" limiting to four results.
  
  > ./swish-e -w apache -m 4
  # SWISH format: 2.1-dev-24
  # Search words: apache
  # Number of hits: 120
  # Search time: 0.001 seconds
  # Run time: 0.006 seconds
  1000 /usr/local/apache/htdocs/manual/misc/FAQ.html "Apache Server Frequently Asked Questions" 107221
  973 /usr/local/apache/htdocs/manual/windows.html "Using Apache with Microsoft Windows" 21664
  953 /usr/local/apache/htdocs/manual/mod/core.html "Apache Core Features" 121406
  933 /usr/local/apache/htdocs/manual/netware.html "Using Apache with Novell NetWare 5" 11345
  
  That's returning the properties rank, path, title (called swishtitle), document size by default.
  
  Swish can also store as a property the words extracted while indexing and return that
  text in the search results.  This property is called "swishdescription".  It's a lot
  faster for swish to return this in results than to go and fetch the source document by path
  and then extract out the content.
  
  
  =cut 
  
  
  # here's the emulated results from swish placed in the "properties" hash
  
  show() for ( 1..15 );
  
  sub show {
  
      my %properties = (
          swishtitle          => 'Apache module mod_foobar',
          swishdescription    => Content::content(),
      );
  
      # emulate a result object
  
      my $result = result->new;
  
  
  
      my $hl = PhraseHighlight->new($result, 'swishdefault' );
      $hl->highlight( \%properties );
      use Text::Wrap qw(wrap);
      print "\nTitle: $properties{ swishtitle }\n\nDescription:\n",
            wrap(' ',' ',$properties{ swishdescription }),"\n";
  
  }          
  
  
  
  
  
  
  #===============================================================
      
  package result;
  use strict;
  
  use Carp;
  
  
  
  sub new {
      bless {
          config => {
              description_prop => 'swishdescription',
  
              highlight       => {
                  package         => 'PhraseHighlight',
                  show_words      => 10,    # Number of swish words words to show around highlighted word
                  max_words       => 100,   # If no words are found to highlighted then show this many words
                  occurrences     => 6,     # Limit number of occurrences of highlighted words
                  highlight_on    => '<<on>>',
                  highlight_off   => '<</off>>',
                  meta_to_prop_map => {   # this maps search metatags to display properties
                      swishdefault    => [ qw/swishtitle swishdescription/ ],
                      swishtitle      => [ qw/swishtitle/ ],
                      swishdocpath    => [ qw/swishdocpath/ ],
                  },
              },
          },
      }, shift;
  
  }
  
  
  sub header {
     my ( $self, $value ) = @_;
  
     my %values = (
          wordcharacters      => 'abcdefghijklmnopqrstuvwxyz-,.',
          ignorefirstchar     => '.-,',
          ignorelastchar      => '.-,',
          'stemming applied'  => 0,
          stopwords           => 'and the is for',
     );
  
     return $values{$value} || '';
  }
  
  sub config {
      my ($self, $setting, $value ) = @_;
  
      croak "Failed to pass 'config' a setting" unless $setting;
  
      my $cur = $self->{config}{$setting} if exists $self->{config}{$setting};
  
      $self->{config}{$setting} = $value if $value;
  
      return $cur;
  }
  
  # This emulates the parsing of the query passed to swish-e
  
  sub extract_query_match {
      return {
          text => {                                ## can be text or url "layer"
              swishdefault    => [                 ## metaname searched
                  [qw/directive not compatible/],  ## phrase made up of three words (not stopword missing)
                  [qw/ foobar /],                  ## phrase of one word
                  [qw/ des* /],                  ## wildcard search
              ],
          },
       };
  }
      
  
  
  #=======================================================================
  #  Phrase Highlighting Code
  #
  #  copyright 2001 - Bill Moseley moseley@hank.org
  #
  #    $Id: PhraseTest.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
  #=======================================================================
  package PhraseHighlight;
  use strict;
  
  use constant DEBUG_HIGHLIGHT => 0;
  
  sub new {
      my ( $class, $results, $metaname ) = @_;
  
      my $self = bless {
          results => $results,  # just in case we need a method
          settings=> $results->config('highlight'),
          metaname=> $metaname,
      }, $class;
  
  
      # parse out the query into words
      my $query = $results->extract_query_match;
  
  
      # Do words exist for this layer (all text at this time) and metaname?
      # This is a reference to an array of phrases and words
  
      $self->{description_prop} = $results->config('description_prop') || '';
  
  
  
      if ( $results->header('stemming applied') =~ /^(?:1|yes)$/i ) {
          eval { require SWISH::Stemmer };
          if ( $@ ) {
              $results->errstr('Stemmed index needs Stemmer.pm to highlight: ' . $@);
          } else {
              $self->{stemmer_function} = \&SWISH::Stemmer::SwishStem;
          }
      }
  
  
  
      my %stopwords =  map { $_, 1 } split /\s+/, $results->header('stopwords');
      $self->{stopwords} = \%stopwords;
  
  
      if ( $query && exists $query->{text}{$metaname} ) {
          $self->{query} = $query->{text}{$metaname};
          $self->set_match_regexp;
      }
  
      return $self;
  }
  
  sub highlight {
      my ( $self, $properties ) = @_;
  
      return unless $self->{query};
  
      my $phrase_array = $self->{query};
  
      my $settings = $self->{settings};
      my $metaname = $self->{metaname};
  
      # Do we care about this meta?
      return unless exists $settings->{meta_to_prop_map}{$metaname};
  
      # Get the related properties
      my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
  
      my %checked;
  
      for ( @props ) {
          if ( $properties->{$_} ) {
              $checked{$_}++;
              $self->highlight_text( \$properties->{$_}, $phrase_array );
          }
      }
  
  
      # Truncate the description, if not processed.
      my $description = $self->{description_prop};
      if ( $description && !$checked{ $description } && $properties->{$description} ) {
          my $max_words = $settings->{max_words} || 100;
          my @words = split /\s+/, $properties->{$description};
          if ( @words > $max_words ) {
              $properties->{$description} = join ' ', @words[0..$max_words], '<b>...</b>';
          }
      }
  
  }
  
  
  
  #==========================================================================
  #
  
  sub highlight_text {
  
      my ( $self, $text_ref, $phrase_array ) = @_;
  
      my $wc_regexp = $self->{wc_regexp};
      my $extract_regexp = $self->{extract_regexp};
  
  
      my $last = 0;
  
      my $settings = $self->{settings};
  
      my $Show_Words = $settings->{show_words} || 10;
      my $Occurrences = $settings->{occurrences} || 5;
  
  
  
  
      my $on_flag  = 'sw' . time . 'on';
      my $off_flag = 'sw' . time . 'off';
  
  
      my $stemmer_function = $self->{stemmer_function};
  
      # Should really call unescapeHTML(), but then would need to escape <b> from escaping.
  
      # Split into words.  For speed, should work on a stream method.
      my @words;
      $self->split_by_wordchars( \@words, $text_ref );
  
  
      return 'No Content saved: Check StoreDescription setting' unless @words;
  
      my @flags;  # This marks where to start and stop display.
      $flags[$#words] = 0;  # Extend array.
  
      my $occurrences = $Occurrences ;
  
  
      my $word_pos = $words[0] eq '' ? 2 : 0;  # Start depends on if first word was wordcharacters or not
  
      my @phrases = @{ $self->{query} };
  
      # Remember, that the swish words are every other in @words.
  
      WORD:
      while ( $Show_Words && $word_pos * 2 < @words ) {
  
          PHRASE:
          foreach my $phrase ( @phrases ) {
  
              print STDERR "  Search phrase '@$phrase'\n" if DEBUG_HIGHLIGHT;
              next PHRASE if ($word_pos + @$phrase -1) * 2 > @words;  # phrase is longer than what's left
              
  
              my $end_pos = 0;  # end offset of the current phrase
  
              # now compare all the words in the phrase
  
              my ( $begin, $word, $end );
              
              for my $match_word ( @$phrase ) {
  
                  my $cur_word = $words[ ($word_pos + $end_pos) * 2 ];
                  unless ( $cur_word =~ /$extract_regexp/ ) {
  
                      my $idx = ($word_pos + $end_pos) * 2;
                      my ( $s, $e ) = ( $idx - 10, $idx + 10 );
                      $s = 0 if $s < 0;
                      $e = @words-1 if $e >= @words;
                     
                  
                      warn  "Failed to parse IgnoreFirst/Last from word '"
                      . (defined $cur_word ? $cur_word : '*undef')
                      . "' (index: $idx) word_pos:$word_pos end_pos:$end_pos total:"
                      . scalar @words
                      . "\n-search pharse words-\n"
                      . join( "\n", map { "$_ '$phrase->[$_]'" } 0..@$phrase -1 )
                      . "\n-Words-\n"
                      . join( "\n", map { "$_ '$words[$_]'" . ($_ == $idx ? ' <<< this word' : '') } $s..$e )
                      . "\n";
  
                      next PHRASE;
                  }
  
  
  
  
                  # Strip ignorefirst and ignorelast
                  ( $begin, $word, $end ) = ( $1, $2, $3 );  # this is a waste, as it can operate on the same word over and over
  
                  my $check_word = lc $word;
  
                  if ( $end_pos && exists $self->{stopwords}{$check_word} ) {
                      $end_pos++;
                      print STDERR " Found stopword '$check_word' in the middle of phrase - * MATCH *\n" if DEBUG_HIGHLIGHT;
                      redo if  ( $word_pos + $end_pos ) * 2 < @words;  # go on to check this match word with the next word.
  
                      # No more words to match with, so go on to next pharse.
                      next PHRASE;
                  }
  
                  if ( $stemmer_function ) {
                      my $w = $stemmer_function->($check_word);
                      $check_word = $w if $w;
                  }
  
  
  
                  print STDERR "     comparing source # (word:$word_pos offset:$end_pos) '$check_word' == '$match_word'\n" if DEBUG_HIGHLIGHT;
      
                  if ( substr( $match_word, -1 ) eq '*' ) {
                      next PHRASE if index( $check_word, substr($match_word, 0, length( $match_word ) - 1) ) != 0;
  
                  } else {
                      next PHRASE if $check_word ne $match_word;
                  }
  
  
                  print STDERR "      *** Word Matched '$check_word' *** \n" if DEBUG_HIGHLIGHT;
                  $end_pos++;  
              }
  
              print STDERR "      *** PHRASE MATCHED (word:$word_pos offset:$end_pos) *** \n" if DEBUG_HIGHLIGHT;
  
  
              # We are currently at the end word, so it's easy to set that highlight
  
              $end_pos--;
  
              if ( !$end_pos ) { # only one word
                  $words[$word_pos * 2] = "$begin$on_flag$word$off_flag$end";
              } else {
                  $words[($word_pos + $end_pos) * 2 ] = "$begin$word$off_flag$end";
  
                  #Now, reload first word of match
                  $words[$word_pos * 2] =~ /$extract_regexp/ or die "2 Why didn't '$words[$word_pos]' =~ /$extract_regexp/?";
                  # Strip ignorefirst and ignorelast
                  ( $begin, $word, $end ) = ( $1, $2, $3 );  # probably should cache this!
                  $words[$word_pos * 2] = "$begin$on_flag$word$end";
              }
  
  
              # Now, flag the words around to be shown
              my $start = ($word_pos - $Show_Words + 1) * 2;
              my $stop   = ($word_pos + $end_pos + $Show_Words - 2) * 2;
              if ( $start < 0 ) {
                  $stop = $stop - $start;
                  $start = 0;
              }
              
              $stop = $#words if $stop > $#words;
  
              $flags[$_]++ for $start .. $stop;
  
  
              # All done, and mark where to stop looking
              if ( $occurrences-- <= 0 ) {
                  $last = $end;
                  last WORD;
              }
  
  
              # Now reset $word_pos to word following
              $word_pos += $end_pos; # continue will still be executed
              next WORD;
          }
      } continue {
          $word_pos ++;
      }
  
  
  
      my @output;
      $self->build_highlighted_text( \@output, \@words, \@flags, $last );
  
  
      $self->join_words( \@output, $text_ref );
  
  
      $self->escape_entities( $text_ref );
      $self->substitue_highlight( $text_ref, $on_flag, $off_flag );
  
      
      # $$text_ref = join '', @words;  # interesting that this seems reasonably faster
  
  
  }
  
  #====================================================================
  #  Split the source text into swish and non-swish words
  
  sub split_by_wordchars {
      my ( $self, $words, $text_ref ) = @_;
      my $wc_regexp = $self->{wc_regexp};
      
      @$words = split /$wc_regexp/, $$text_ref;
  }
  
  
  #=======================================================================
  #  Put all the words together for display
  #
  sub build_highlighted_text {
      my ( $self, $output, $words, $flags, $last ) = @_;
      
      my $dotdotdot = ' ... ';
  
      my $printing;
      my $first = 1;
      my $some_printed;
  
      my $settings = $self->{settings};
      my $Show_Words = $settings->{show_words} || 10;
      
  
      if ( $Show_Words && @$words > 50 ) {  # don't limit context if a small number of words
          
          for my $i ( 0 ..$#$words ) {
  
  
              if ( $last && $i >= $last && $i < $#$words ) {
                  push @$output, $dotdotdot;
                  last;
              }
  
              if ( $flags->[$i] ) {
  
                  push @$output, $dotdotdot if !$printing++ && !$first;
                  push @$output, $words->[$i];
                  $some_printed++;
  
              } else {
                  $printing = 0;
              }
  
          $first = 0;
  
          
          }
      }
  
  
  
      if ( !$some_printed ) {
          my $Max_Words = $settings->{max_words} || 100;
  
          for my $i ( 0 .. $Max_Words ) {
              if ( $i > $#$words ) {
                  $printing++;
                  last;
              }
              push @$output, $words->[$i];
          }
      }
          
      push @$output, $dotdotdot if !$printing;
  }    
  
  #==================================================================
  #
  sub join_words {
      my ( $self, $output, $text_ref ) = @_;
      
      $$text_ref = join '', @$output;
  }
  
  sub escape_entities {
      my ( $self, $text_ref ) = @_;
  
      my %entities = (
          '&' => '&amp;',
          '>' => '&gt;',
          '<' => '&lt;',
          '"' => '&quot;',
      );
      $$text_ref =~ s/([&"<>])/$entities{$1}/ge;
  }
  
  #========================================================
  # replace the highlight codes
  
  sub substitue_highlight {
      my ( $self, $text_ref, $on_flag, $off_flag ) = @_;
  
      my $settings = $self->{settings};
      my $On = $settings->{highlight_on} || '<b>';
      my $Off = $settings->{highlight_off} || '</b>';
  
      my %highlight = (
          $on_flag => $On,
          $off_flag => $Off,
      );
          
      $$text_ref =~ s/($on_flag|$off_flag)/$highlight{$1}/ge;
  }
  
  #============================================
  # Returns compiled regular expressions for matching
  #
  #
  
  sub set_match_regexp {
      my $self = shift;
  
      my $results = $self->{results};
  
  
      my $wc = $results->header('wordcharacters');
      my $ignoref = $results->header('ignorefirstchar');
      my $ignorel = $results->header('ignorelastchar');
  
  
      $wc = quotemeta $wc;
  
      #Convert query into regular expressions
  
  
      for ( $ignoref, $ignorel ) {
          if ( $_ ) {
              $_ = quotemeta;
              $_ = "([$_]*)";
          } else {
              $_ = '()';
          }
      }
  
  
      $wc .= 'ABCDEFGHIJKLMNOPQRSTUVWXYZ';  # Warning: dependent on tolower used while indexing
  
  
      # Now, wait a minute.  Look at this more, as I'd hope that making a
      # qr// go out of scope would release the compiled pattern.
  
      if ( $ENV{MOD_PERL} ) {
          $self->{wc_regexp}      = qr/([^$wc]+)/;                     # regexp for splitting into swish-words
          $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/i;  # regexp for extracting out the words to compare
  
       } else {
          $self->{wc_regexp}      = qr/([^$wc]+)/o;                    # regexp for splitting into swish-words
          $self->{extract_regexp} = qr/^$ignoref([$wc]+?)$ignorel$/oi;  # regexp for extracting out the words to compare
       }
  }
  
  
  
  package Content;
  
  sub content {
  
      my $content = <<EOF;
  Apache HTTP Server Version 1.3 Module mod_foobar Add this file as a link in mod/index.html
  This module is contained in the mod_foobar.c file, and is/is not compiled in by default.
  It provides for the foobar feature. Any document with the mime type foo/bar will be processed by this module.
  Add the magic mime type to the list in magic_types.html Summary General module documentation here.
  Directives ADirective Add these directives to the list in directives.html
  ADirective directive Syntax: ADirective some args Default:
  ADirective default value Context: context-list context-list is where this directive can appear;
  allowed: server config, virtual host, directory, .htaccess Override: override required if the
  directive is allowed in .htaccess files; the AllowOverride option that allows the directive.
  Status: status Core if in core apache, Base if in one of the standard modules,
  Extension if in an extension module (not compiled in by default) or
  Experimental Module: mod_foobar Compatibility: compatibility notes Describe any compatibility issues,
  such as "Only available in Apache 1.2 or later," or "The Apache syntax for this directive is not
  compatible with the NCSA directive of the same name." The ADirective directive does something.
  Apache HTTP Server Version 1.3
  EOF
  
      $content =~ s/\n/ /g;
      return $content;
  }
  
  
  
  
  
  
  1.1                  modperl-docs/src/search/modules/SimpleHighlight.pm
  
  Index: SimpleHighlight.pm
  ===================================================================
  #=======================================================================
  #  Simple Highlighting Code
  #    $Id: SimpleHighlight.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
  #=======================================================================
  package SimpleHighlight;
  use strict;
  
  sub new {
      my ( $class, $results, $metaname ) = @_;
  
  
      my $self = bless {
          results => $results,  # just in case we need a method
          settings=> $results->config('highlight'),
          metaname=> $metaname,
      }, $class;
  
      # parse out the query into words
      my $query = $results->extract_query_match;
  
  
      # Do words exist for this layer (all text at this time) and metaname?
      # This is a reference to an array of phrases and words
  
  
  
      if ( $query && exists $query->{text}{$metaname} ) {
          $self->{query} = $query->{text}{$metaname};
  
          $self->set_match_regexp;
      }
  
  
      return $self;
  }
  
  sub highlight {
      my ( $self, $properties ) = @_;
  
  
      return unless $self->{query};
  
      my $phrase_array = $self->{query};
  
      my $settings = $self->{settings};
      my $metaname = $self->{metaname};
  
      # Do we care about this meta?
      return unless exists $settings->{meta_to_prop_map}{$metaname};
  
      # Get the related properties
      my @props = @{ $settings->{meta_to_prop_map}{$metaname} };
  
      for ( @props ) {
          if ( $properties->{$_} ) {
              $self->highlight_text( \$properties->{$_}, $phrase_array );
          }
      }
  
  }
  
  
  
  #==========================================================================
  #
  
  sub highlight_text {
  
      my ( $self, $text_ref, $phrase_array ) = @_;
      
  
      my $settings = $self->{settings};
  
      my $Show_Words = $settings->{show_words} || 10;
      my $Occurrences = $settings->{occurrences} || 5;
      my $Max_Words = $settings->{max_words} || 100;
      my $On = $settings->{highlight_on} || '<b>';
      my $Off = $settings->{highlight_off} || '</b>';
  
      my @words = split /\s+/, $$text_ref;
      if ( @words > $Max_Words ) {
          $$text_ref = join ' ', @words[0..$Max_Words], '<b>...</b>';
      }
  
  
      for ( @{ $self->{matches} } ) {
          $$text_ref =~ s/($_)/$On$1$Off/g;
      }
  
      
  
  }
  
  #============================================
  # Returns compiled regular expressions for matching
  #
  # This builds a list of expressions to match against the text.
  
  sub set_match_regexp {
      my $self = shift;
  
      my $results = $self->{results};
  
  
      my $wc = $results->header('wordcharacters');
      $wc = quotemeta $wc;
  
      my @matches;
  
      # loop through all the phrase 
      for ( @{$self->{query}} ) {
  
          # Fix up wildcards
          my $exp = join '[^$wc]+',
              map { '\b' . $_ . '\b' }
              map { substr( $_, -1, 1 ) eq '*'
                  ? quotemeta( substr( $_, 0, -1) ) . "[$wc]*?"
                  : quotemeta
              } @$_;
  
          push @matches, qr/$exp/i;
      }
  
  
      $self->{matches} = \@matches;
  
  }    
  1;
  
  
  
  
  1.1                  modperl-docs/src/search/modules/TemplateDefault.pm
  
  Index: TemplateDefault.pm
  ===================================================================
  #=====================================================================
  # These routines format the HTML output.
  #    $Id: TemplateDefault.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
  #=====================================================================
  package TemplateDefault;
  use strict;
  
  use CGI;
  
  sub show_template {
      my ( $class, $template_params, $results ) = @_;
  
  
      my $q = $results->CGI;
  
      my $output =  $q->header . page_header( $results );
  
      # Show form at top always
      $output .= show_form( $results );
  
  
      if ( $results->results ) {
          $output .=  results_header( $results );
          $output .=  show_result( $results, $_ ) for @{ $results->results };
      }
  
      # Form after results (or at top if no results)
      #$output .= show_form( $results );
  
      $output .= footer( $results );
  
      print $output;
  
  }
  
  #=====================================================================
  # This generates the header
  
  sub page_header {
      my $results = shift;
      my $title = $results->config('title') || 'Search our site with Swish-e';
      my $message = $results->errstr;
  
      $message = $message
          ? qq[<br><font color=red>$message</font>]
          : '' ;
  
  
      my $html_title = $results->results
          ? ( $results->navigation('hits')
              . ' Results for ['
              . CGI::escapeHTML( $results->{query_simple} )
              . ']'
             )
  
          : ( $results->errstr || $title );
  
      return <<EOF;
  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
  <html>
      <head>
         <title>
            $html_title
         </title>
      </head>
      <body>
          <h2>
          <a href="http://swish-e.org">
          <img border="0" alt="Swish-e home page" src="http://swish-e.org/Images/swish-e.gif"></a> $title $message
          
          </h2>
  EOF
  }
  
  #=====================================================================
  # This generates the form
  #
  #   Pass:
  #       $results hash
  
  sub show_form {
  
      my $results = shift;
      my $q = $results->{q};
  
  
      my $query = $q->param('query') || '';
  
      $query = CGI::escapeHTML( $query );  # May contain quotes
  
  
      # Here's some form components
      
      my $meta_select_list    = get_meta_name_limits( $results );
      my $sorts               = get_sort_select_list( $results );
      my $select_index        = get_index_select_list( $results );
      my $limit_select        = get_limit_select( $results );
      
      my $date_ranges_select  = $results->get_date_ranges;
  
  
      my $form = $q->script_name;
      
      return <<EOF;
      <form method="post" action="$form" enctype="application/x-www-form-urlencoded" class="form">
          <input / maxlength="200" value="$query" size="32" type="text" name="query">
          <input / value="Search!" type="submit" name="submit"><br>
  
          $meta_select_list
          $sorts
          $select_index
          $limit_select
          $date_ranges_select
      </form>
  EOF
  }
  
  
  #=====================================================================
  # This routine creates the results header display
  # and navigation bar
  #
  #
  #
  
  sub results_header {
  
      my $results = shift;
      my $config = $results->{config};
      my $q = $results->{q};
  
  
  
      my $swr = $results->header('removed stopwords');
      my $stopwords = '';
  
  
      if ( $swr && ref $swr eq 'ARRAY' ) {
          $stopwords = @$swr > 1
          ? join( ', ', map { "<b>$_</b>" } @$swr ) . ' are very common words and were not included in your search'
          : join( ', ', map { "<b>$_</b>" } @$swr ) . ' is a very common word and was not included in your search';
      }
  
      my $limits = '';
  
      #  Ok, this is ugly.
  
  
      if ( $results->{DateRanges_time_low} && $results->{DateRanges_time_high} ) {
          my $low = scalar localtime $results->{DateRanges_time_low};
          my $high = scalar localtime $results->{DateRanges_time_high};
          $limits = <<EOF;
          <tr>
              <td colspan=2>
                  <font size="-2" face="Geneva, Arial, Helvetica, San-Serif">
                  &nbsp;Results limited to dates $low to $high
                  </font>
              </td>
          </tr>
  EOF
      }
  
      my $query_href = $results->{query_href};
      my $query_simple = CGI::escapeHTML( $results->{query_simple} );
      my $pages       = $results->navigation('pages');
  
      my $prev        = $results->navigation('prev');
      my $prev_count  = $results->navigation('prev_count');
      my $next        = $results->navigation('next');
      my $next_count  = $results->navigation('next_count');
      
      my $hits        = $results->navigation('hits');
      my $from        = $results->navigation('from');
      my $to          = $results->navigation('to');
      
      my $run_time    = $results->navigation('run_time');
      my $search_time = $results->navigation('search_time');
  
      
      
  
  
      my $links = '';
  
      $links .= '<font size="-1" face="Geneva, Arial, Helvetica, San-Serif">&nbsp;Page:</font>' . $pages
          if $pages;
  
      $links .= qq[ <a href="$query_href&amp;start=$prev">Previous $prev_count</a>]
          if $prev_count;
  
      $links .= qq[ <a href="$query_href&amp;start=$next">Next $next_count</a>]
          if $next_count;
  
  
      # Save for the bottom of the screen.
      $results->{LINKS} = $links;
  
      $links = qq[<tr><td colspan=2 bgcolor="#EEEEEE">$links</td></tr>] if $links;
  
      $query_simple = $query_simple
          ? "&nbsp;Results for <b>$query_simple</b>"
          : '';
          
      
  
      return <<EOF;
  
      <table cellpadding=0 cellspacing=0 border=0 width="100%">
          <tr>
              <td height=20 bgcolor="#FF9999">
                  <font size="-1" face="Geneva, Arial, Helvetica, San-Serif">
                  $query_simple
                  &nbsp; $from to $to of $hits results.
                  </font>
              </td>
              <td align=right bgcolor="#FF9999">
                  <font size="-2" face="Geneva, Arial, Helvetica, San-Serif">
                  Run time: $run_time |
                  Search time: $search_time &nbsp; &nbsp;
                  </font>
              </td>
          </tr>
  
          $links
          $limits
          $stopwords
  
      </table>
  
  EOF
  
  }
  
  #=====================================================================
  # This routine formats a single result
  #
  #
  sub show_result {
      my ($results, $this_result ) = @_;
  
      my $conf = $results->{conf};
  
      my $DocTitle = $results->config('title_property') || 'swishtitle';
  
  
      my $title = $this_result->{$DocTitle} || $this_result->{swishdocpath} || '?';
  
      my $name_labels = $results->config('name_labels');
  
  
                  
  
      # The the properties to display
      
      my $props = '';
  
      my $display_props = $results->config('display_props');
      if ( $display_props ) {
  
          
          $props = join "\n",
              '<br><table cellpadding=0 cellspacing=0>',
              map ( {
                  '<tr><td><small>'
                  . ( $name_labels->{$_} || $_ )
                  . ':</small></td><td><small> '
                  . '<b>'
                  . $this_result->{$_}
                  . '</b>'
                  . '</small></td></tr>'
                   }  @$display_props
              ),
              '</table>';
      }
  
  
      my $description_prop = $results->config('description_prop');
      
      my $description = '';
      if ( $description_prop ) {
          $description = $this_result->{ $description_prop } || '';
      }
  
  
      return <<EOF;
      <dl>
          <dt>$this_result->{swishreccount} <a href="$this_result->{swishdocpath_href}">$title</a> <small>-- rank: <b>$this_result->{swishrank}</b></small></dt>
          <dd>$description
  
          $props
          </dd>
      </dl>
  
  EOF
  
  }
  
  #=====================================================================
  # This is displayed on the bottom of every page
  #
  #
  
  
  sub footer {
  
      my $mod_perl = $ENV{MOD_PERL}
                 ? '<br><small>Response brought to you by <a href="http://perl.apache.org"><em>mod_perl</em></a></small>'
                 : '';
  
      return <<EOF;
  
      <hr>
      <small>Powered by <em>Swish-e</em> <a href="http://swish-e.org">swish-e.org</a></small>
      $mod_perl
  
      <p>
          <a href="http://validator.w3.org/check/referer"><img border="0"
              src="http://www.w3.org/Icons/valid-html401"
              alt="Valid HTML 4.01!" height="31" width="88"></a>
      </p>
      </body>
  </html>
  EOF
  }
  
  #==================================================================
  #  Form setup for sorts and metas
  #
  #  This could be methods of $results object
  #  (and then available for Template-Toolkit)
  #  But that's too much HTML in the object, perhaps.
  #
  #
  #==================================================================
  
  sub get_meta_name_limits {
      my ( $results ) = @_;
  
      my $metanames = $results->config('metanames');
      return '' unless $metanames;
  
      
      my $name_labels = $results->config('name_labels');
      my $q = $results->CGI;
  
  
      return join "\n",
          'Limit search to:',
          $q->radio_group(
              -name   =>'metaname',
              -values => $metanames,
              -default=>$metanames->[0],
              -labels =>$name_labels
          ),
          '<br>';
  }
  
  sub get_sort_select_list {
      my ( $results ) = @_;
  
      my $sort_metas = $results->config('sorts');
      return '' unless $sort_metas;
  
      
      my $name_labels = $results->config('name_labels');
      my $q = $results->CGI;
  
  
  
      return join "\n",
          'Sort by:',
          $q->popup_menu(
              -name   =>'sort',
              -values => $sort_metas,
              -default=>$sort_metas->[0],
              -labels =>$name_labels
          ),
          $q->checkbox(
              -name   => 'reverse',
              -label  => 'Reverse Sort'
          );
  }
  
  
  
  sub get_index_select_list {
      my ( $results ) = @_;
      my $q = $results->CGI;
  
  
      my $indexes = $results->config('swish_index');
      return '' unless ref $indexes eq 'ARRAY';
  
      my $select_config = $results->config('select_indexes');
      return '' unless $select_config && ref $select_config eq 'HASH';
  
  
      # Should return a warning, as this might be a likely mistake
      # This jumps through hoops so that real index file name is not exposed
      
      return '' unless exists $select_config->{labels}
                && ref $select_config->{labels} eq 'ARRAY'
                && @$indexes == @{$select_config->{labels}};
  
  
      my @labels = @{$select_config->{labels}};
      my %map;
  
      for ( 0..$#labels ) {
          $map{$_} = $labels[$_];
      }
  
      my $method = $select_config->{method} || 'checkbox_group';
      my @cols = $select_config->{columns} ? ('-columns', $select_config->{columns}) : ();
  
      return join "\n",
          '<br>',
          ( $select_config->{description} || 'Select: '),
          $q->$method(
          -name   => 'si',
          -values => [0..$#labels],
          -default=> 0,
          -labels => \%map,
          @cols );
  }
  
  
  sub get_limit_select {
      my ( $results ) = @_;
      my $q = $results->CGI;
  
  
      my $limit = $results->config('select_by_meta');
      return '' unless ref $limit eq 'HASH';
  
      my $method = $limit->{method} || 'checkbox_group';
  
      my @options = (
          -name   => 'sbm',
          -values => $limit->{values},
          -labels => $limit->{labels} || {},
      );
  
      push @options, ( -columns=> $limit->{columns} ) if $limit->{columns};
      
  
      return join "\n",
          '<br>',
          ( $limit->{description} || 'Select: '),
          $q->$method( @options );
  }
  1;
  
  
  
  
  1.1                  modperl-docs/src/search/modules/TemplateDumper.pm
  
  Index: TemplateDumper.pm
  ===================================================================
  #===============================================================================
  # This just dumps out the results object.
  # Need to use a query string to do anything interesting
  #
  #    $Id: TemplateDumper.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
  #
  #===============================================================================
  package TemplateDumper;
  use strict;
  
  use Data::Dumper;
  
  sub show_template {
      my ( $class, $template_params, $results ) = @_;
  
  $Data::Dumper::Quotekeys = 0;
      print "Content-Type: text/plain\n\n",
            Dumper $template_params,
            Dumper $results;
  }
  
  1;
  
  
  
  
  
  1.1                  modperl-docs/src/search/modules/TemplateHTMLTemplate.pm
  
  Index: TemplateHTMLTemplate.pm
  ===================================================================
  #=======================================================================
  #  Module for using Template-Toolkit for generating output
  #    $Id: TemplateHTMLTemplate.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
  #
  #  This module probably does not automatically support all the features
  #  of the swish.cgi script (such as selecting index files).  See
  #  the TemplateToolkit.pm module for examples.
  #
  #=======================================================================
  package TemplateHTMLTemplate;
  use strict;
  
  use HTML::Template;
  use HTML::FillInForm;
  use CGI ();
  
  use vars '$Template';
  
  use TemplateDefault;  # ugly hack
  
  
  sub show_template {
      my ( $class, $template_params, $results ) = @_;
  
      my $cgi = $results->CGI;
  
  
      my $template = HTML::Template->new( %{$template_params->{options}} );
  
      my $params = {
          TITLE           => ($results->config('title') || 'Search Page'),
          QUERY_SIMPLE    => CGI::escapeHTML( $results->{query_simple} ),
          MESSAGE         => CGI::escapeHTML( $results->errstr ),
          QUERY_HREF      => $results->{query_href},
          MY_URL          => $cgi->script_name,
  
          HITS            => $results->navigation('hits'),
          FROM            => $results->navigation('from'),
          TO              => $results->navigation('to'),
          SHOWING         => $results->navigation('showing'),
  
          PAGES           => $results->navigation('pages'),
          NEXT            => $results->navigation('next'),
          NEXT_COUNT      => $results->navigation('next_count'),
          PREV            => $results->navigation('prev'),
          PREV_COUNT      => $results->navigation('prev_count'),
          
  
          RUN_TIME        => $results->header('run time') ||  'unknown',
          SEARCH_TIME     => $results->header('search time') ||  'unknown',
          MOD_PERL        => $ENV{MOD_PERL},
      };
  
      $params->{FILES} =  $results->results if $results->results;
  
  
  
      my $MapNames  = $results->config('name_labels') || {};
      my $Sorts     = $results->config('sorts');
      my $MetaNames = $results->config('metanames');
  
  
      # set a default
      if ( $MetaNames && !$cgi->param('metanames') ) {
          $cgi->param('metaname', $MetaNames->[0] );
      }
  
      $params->{SORTS} = [ map { { NAME => $_, LABEL => ($MapNames->{$_} || $_) } } @$Sorts ] if $Sorts;
      $params->{METANAMES} = [ map { { NAME => $_, LABEL => ($MapNames->{$_} || $_) } } @$MetaNames ] if $MetaNames;
  
  
      $template->param( $params );
      my $page = $template->output;
  
  
      my $fif = new HTML::FillInForm;
  
      print $cgi->header,
            '<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">',
            $fif->fill(
              scalarref => \$page,
              fobject   => $cgi,
            );
  }
     
  1;
  
  
  
  
  1.1                  modperl-docs/src/search/modules/TemplateToolkit.pm
  
  Index: TemplateToolkit.pm
  ===================================================================
  #=======================================================================
  #  Module for using Template-Toolkit for generating output
  #    $Id: TemplateToolkit.pm,v 1.1 2002/01/30 06:35:00 stas Exp $
  #
  #=======================================================================
  package TemplateToolkit;
  use strict;
  
  use Template;
  use vars '$Template';
  
  
  sub show_template {
      my ( $class, $template_params, $results ) = @_;
  
      my $cgi = $results->CGI;
  
  
      #/* Cached if running under mod_perl */
      $Template ||= Template->new( $template_params->{options} );
  
  
      die $Template->error() unless $Template;
  
  
      print $cgi->header;
  
      my $subclass = TemplateToolkit::Helpers->new( $results );
  
      my $vars = {
          search  => $subclass,
          CGI     => $results->CGI,
      };
  
      $Template->process( $template_params->{file}, $vars )
  	|| die "Template process failed for page '$template_params->{file}' ", $Template->error(), "\n";
  }
  
  
  #==================================================================
  #  Form setup for sorts and metas
  #
  #  This could be methods of $results object
  #  (and then available for Template-Toolkit)
  #  But that's too much HTML in the object, perhaps.
  #
  #
  #==================================================================
  package TemplateToolkit::Helpers;
  use strict;
  
  sub new {
      my ( $class, $results ) = @_;
  
  
      @TemplateToolkit::Helpers::ISA = ref $results;  # that doesn't look right.
  
      return bless $results, $class;
  }
  
  sub get_meta_name_limits {
      my ( $results ) = @_;
  
      my $metanames = $results->config('metanames');
      return '' unless $metanames;
  
      
      my $name_labels = $results->config('name_labels');
      my $q = $results->CGI;
  
  
      return join "\n",
          'Limit search to:',
          $q->radio_group(
              -name   =>'metaname',
              -values => $metanames,
              -default=>$metanames->[0],
              -labels =>$name_labels
          ),
          '<br>';
  }
  
  sub get_sort_select_list {
      my ( $results ) = @_;
  
      my $sort_metas = $results->config('sorts');
      return '' unless $sort_metas;
  
      
      my $name_labels = $results->config('name_labels');
      my $q = $results->CGI;
  
  
  
      return join "\n",
          'Sort by:',
          $q->popup_menu(
              -name   =>'sort',
              -values => $sort_metas,
              -default=>$sort_metas->[0],
              -labels =>$name_labels
          ),
          $q->checkbox(
              -name   => 'reverse',
              -label  => 'Reverse Sort'
          );
  }
  
  
  
  
  sub get_index_select_list {
      my ( $results ) = @_;
      my $q = $results->CGI;
  
  
      my $indexes = $results->config('swish_index');
      return '' unless ref $indexes eq 'ARRAY';
  
      my $select_config = $results->config('select_indexes');
      return '' unless $select_config && ref $select_config eq 'HASH';
  
  
      # Should return a warning, as this might be a likely mistake
      # This jumps through hoops so that real index file name is not exposed
      
      return '' unless exists $select_config->{labels}
                && ref $select_config->{labels} eq 'ARRAY'
                && @$indexes == @{$select_config->{labels}};
  
  
      my @labels = @{$select_config->{labels}};
      my %map;
  
      for ( 0..$#labels ) {
          $map{$_} = $labels[$_];
      }
  
      my $method = $select_config->{method} || 'checkbox_group';
      my @cols = $select_config->{columns} ? ('-columns', $select_config->{columns}) : ();
  
      return join "\n",
          '<br>',
          ( $select_config->{description} || 'Select: '),
          $q->$method(
          -name   => 'si',
          -values => [0..$#labels],
          -default=> 0,
          -labels => \%map,
          @cols );
  }
  
  
  sub get_limit_select {
      my ( $results ) = @_;
      my $q = $results->CGI;
  
  
      my $limit = $results->config('select_by_meta');
      return '' unless ref $limit eq 'HASH';
  
      my $method = $limit->{method} || 'checkbox_group';
  
      my @options = (
          -name   => 'sbm',
          -values => $limit->{values},
          -labels => $limit->{labels} || {},
      );
  
      push @options, ( -columns=> $limit->{columns} ) if $limit->{columns};
      
  
      return join "\n",
          '<br>',
          ( $limit->{description} || 'Select: '),
          $q->$method( @options );
  }
  
  sub stopwords_removed {
      my $results = shift;
      
      my $swr = $results->header('removed stopwords');
      my $stopwords = '';
  
  
      if ( $swr && ref $swr eq 'ARRAY' ) {
          $stopwords = @$swr > 1
          ? join( ', ', map { "<b>$_</b>" } @$swr ) . ' are very common words and were not included in your search'
          : join( ', ', map { "<b>$_</b>" } @$swr ) . ' is a very common word and was not included in your search';
      }
  
      return $stopwords;
  }
  
  
     
  1;
  
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: docs-cvs-unsubscribe@perl.apache.org
For additional commands, e-mail: docs-cvs-help@perl.apache.org