You are viewing a plain text version of this content. The canonical link for it is here.
Posted to cvs@httpd.apache.org by Dean Gaudet <dg...@hyperreal.org> on 1997/09/11 20:50:38 UTC

cvs commit: apachen/src/modules/standard mod_speling.c

dgaudet     97/09/11 11:50:35

  Modified:    src      Configuration.tmpl
  Added:       htdocs/manual/mod mod_speling.html
               src/modules/standard mod_speling.c
  Log:
  Add in mod_speling.
  
  Submitted by:	Martin Kraemer, Alexei Kosut
  Reviewed by:	various
  
  Revision  Changes    Path
  1.1                  apachen/htdocs/manual/mod/mod_speling.html
  
  Index: mod_speling.html
  ===================================================================
  <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 3.2 Final//EN">
  <HTML>
   <HEAD>
    <TITLE>Apache module mod_speling</TITLE>
   </HEAD>
  <!-- Background white, links blue (unvisited), navy (visited), red (active) -->
   <BODY
    BGCOLOR="#FFFFFF"
    TEXT="#000000"
    LINK="#0000FF"
    VLINK="#000080"
    ALINK="#FF0000"
   >
  <!--#include virtual="header.html" -->
    <H1 ALIGN="CENTER">Module mod_speling</H1>
    <P>
    This module is contained in the <code>mod_speling.c</code> file,
    and is <strong>not</strong> compiled in by default.
    It attempts to correct misspellings of
    URLs that users might have entered, by ignoring capitalization
    and by allowing up to one misspelling.<br>
    This catches the majority of misspelled requests. An automatic
    "spelling corrected" redirection is returned if only one matching
    document was found, and a list of matches is returned if more than
    one document with a sufficiently similar name is found.
    </P>
  
    <h2>Summary</h2>
    <p>
    Requests to documents sometimes cannot be served by the core apache
    server because the request was misspelled or miscapitalized. This
    module addresses this problem by trying to find a matching document,
    even after all other modules gave up. It does its work by comparing
    each document name in the requested directory against the requested
    document name <STRONG>without regard to case</STRONG>, and allowing
    <STRONG>up to one misspelling</STRONG> (character insertion / omission
    / transposition or wrong character). A list is built with all document
    names which were matched using this strategy.
    </p>
    <p>
    If, after scanning the directory,
    <ul>
     <li>no matching document was found, Apache will proceed as usual
         and return a "document not found" error.
     <li>only one document is found that "almost" matches the request,
         then it is returned in the form of a redirection response.
     <li>more than one document with a close match was found, then
         the list of the matches is returned to the client, and the client
         can select the correct candidate.
    </ul>
    </p>
  
    <h2>Directives</h2>
  
    <menu>
    <li><A HREF="#checkspelling">CheckSpelling</A>
    </menu>
  
    <HR> <!-- the HR is part of the directive description -->
    <A name="checkspelling"><h2>CheckSpelling</h2></A>
    <!--%plaintext &lt;?INDEX {\tt CheckSpelling} directive&gt; -->
    <strong>Syntax:</strong> CheckSpelling <em>on/off</em><br>
    <strong>Default:</strong> <code>CheckSpelling Off</code><br>
    <Strong>Context:</strong> server config, virtual host<br>
    <strong>Status:</strong> Base<br>
    <strong>Module:</strong> mod_speling<br>
    <strong>Compatibility:</strong> CheckSpelling was available as a separate
    module for Apache 1.1, but was limited to miscapitalizations.
    As of Apache 1.3, it is part of the apache distribution<!-- or:
    available as a separate module-->.<p>
  
    This directive enables or disables the spelling module. When enabled,
    keep in mind that
    <UL>
    <LI>the directory scan which is necessary for the spelling
        correction will have an impact on the server's performance
        when many spelling corrections have to be performed at the same time.
    <LI>the document trees should not contain sensitive files which could
        be matched inadvertently by a spelling "correction".
    <LI>the module is unable to correct misspelled user names
        (as in <code>http://my.host/~apahce/</code>), just file names or
        directory names.
    </UL>
  
  <!--#include virtual="footer.html" -->
   </BODY>
  </HTML>
  
  
  
  
  1.76      +9 -0      apachen/src/Configuration.tmpl
  
  Index: Configuration.tmpl
  ===================================================================
  RCS file: /export/home/cvs/apachen/src/Configuration.tmpl,v
  retrieving revision 1.75
  retrieving revision 1.76
  diff -u -r1.75 -r1.76
  --- Configuration.tmpl	1997/08/31 22:36:22	1.75
  +++ Configuration.tmpl	1997/09/11 18:50:31	1.76
  @@ -211,6 +211,15 @@
   ##
   ## URL translation modules.
   ##
  +
  +## The Speling module attempts to correct misspellings of URLs that
  +## users might have entered, namely by checking capitalizations
  +## or by allowing up to one misspelling (character insertion / omission /
  +## transposition/typo). This catches the majority of misspelled requests.
  +## If it finds a match, a "spelling corrected" redirection is returned.
  +
  +# AddModule modules/standard/mod_speling.o
  +
   ## The UserDir module for selecting resource directories by user name
   ## and a common prefix, e.g., /~<user> , /usr/web/<user> , etc.
   
  
  
  
  1.1                  apachen/src/modules/standard/mod_speling.c
  
  Index: mod_speling.c
  ===================================================================
  #define WANT_BASENAME_MATCH
  /* ====================================================================
   * Copyright (c) 1996,1997 The Apache Group.  All rights reserved.
   *
   * Redistribution and use in source and binary forms, with or without
   * modification, are permitted provided that the following conditions
   * are met:
   *
   * 1. Redistributions of source code must retain the above copyright
   *    notice, this list of conditions and the following disclaimer. 
   *
   * 2. Redistributions in binary form must reproduce the above copyright
   *    notice, this list of conditions and the following disclaimer in
   *    the documentation and/or other materials provided with the
   *    distribution.
   *
   * 3. All advertising materials mentioning features or use of this
   *    software must display the following acknowledgment:
   *    "This product includes software developed by the Apache Group
   *    for use in the Apache HTTP server project (http://www.apache.org/)."
   *
   * 4. The names "Apache Server" and "Apache Group" must not be used to
   *    endorse or promote products derived from this software without
   *    prior written permission.
   *
   * 5. Redistributions of any form whatsoever must retain the following
   *    acknowledgment:
   *    "This product includes software developed by the Apache Group
   *    for use in the Apache HTTP server project (http://www.apache.org/)."
   *
   * THIS SOFTWARE IS PROVIDED BY THE APACHE GROUP ``AS IS'' AND ANY
   * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
   * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
   * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE APACHE GROUP OR
   * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
   * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
   * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
   * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
   * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
   * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
   * OF THE POSSIBILITY OF SUCH DAMAGE.
   * ====================================================================
   *
   * This software consists of voluntary contributions made by many
   * individuals on behalf of the Apache Group and was originally based
   * on public domain software written at the National Center for
   * Supercomputing Applications, University of Illinois, Urbana-Champaign.
   * For more information on the Apache Group and the Apache HTTP server
   * project, please see <http://www.apache.org/>.
   *
   */
  
  #include "httpd.h"
  #include "http_config.h"
  #include "http_log.h"
  
  /* mod_speling.c - by Alexei Kosut <ak...@organic.com> June, 1996
   *
   * This module is transparent, and simple. It attempts to correct
   * misspellings of URLs that users might have entered, namely by checking
   * capitalizations. If it finds a match, it sends a redirect.
   *
   * 08-Aug-1997 <Ma...@Mch.SNI.De>
   * o Upgraded module interface to apache_1.3a2-dev API (more NULL's in speling_module).
   * o Integrated tcsh's "spelling correction" routine which allows one
   *   misspelling (character insertion/omission/typo/transposition).
   *   Rewrote it to ignore case as well. This ought to catch the majority
   *   of misspelled requests.
   * o Commented out the second pass where files' suffixes are stripped.
   *   Given the better hit rate of the first pass, this rather ugly
   *   (request index.html, receive index.db ?!?!) solution can be
   *   omitted.
   * o wrote a "kind of" html page for mod_speling
   *
   * Activate it with "CheckSpelling On"
   */
  
  /* Forward declaration: set_speling() and check_speling() take the address
   * of the module record, which is only defined further below.
   */
  module speling_module;
  
  /* We use the "unconventional" mod_userdir approach here. And heck,
   * here it's just one int!
   */
  
  /*
   * Server-config constructor.
   *
   * This module's whole per-server "configuration" is one on/off flag that
   * is kept directly in the config slot (see set_speling), so the initial
   * value is simply a null pointer, i.e. "off".  Neither argument is used.
   */
  static void *create_speling_config(pool * dummy, server_rec * s)
  {
      return NULL;
  }
  
  /*
   * Handler for the "CheckSpelling" directive.
   *
   * The flag value `arg', cast to a pointer, is stored as this module's
   * entry in the server's module_config; check_speling() reads it back.
   * `dummy' is unused.  Always returns NULL.
   */
  static const char *set_speling(cmd_parms * cmd, void *dummy, int arg)
  {
      set_module_config(cmd->server->module_config, &speling_module,
                        (void *) arg);
      return NULL;
  }
  
  /*
   * Configuration directives handled by this module.  The only one is
   * "CheckSpelling", which is processed by set_speling().
   * NOTE(review): the trailing {NULL} entry presumably terminates the
   * table for the config parser -- confirm against http_config.h.
   */
  command_rec speling_cmds[] =
  {
      {"CheckSpelling", set_speling, NULL, RSRC_CONF, FLAG,
         "whether or not to fix miscapitalized/misspelled requests"},
      {NULL}
  };
  
  /*
   * How a directory entry relates to the requested (misspelled) name.
   *
   * The numeric order is significant: candidates are sorted in ascending
   * order of this value (see sort_by_quality), and the value is also used
   * as the index into sp_reason_str[].
   */
  typedef enum {
      SP_IDENTICAL = 0,       /* 0 */
      SP_MISCAPITALIZED,      /* 1: names differ in case only */
      SP_TRANSPOSITION,       /* 2: two adjacent characters swapped */
      SP_MISSINGCHAR,         /* 3: request lacks one character */
      SP_EXTRACHAR,           /* 4: request has one character too many */
      SP_SIMPLETYPO,          /* 5: exactly one character differs */
      SP_VERYDIFFERENT        /* 6: too different for spdist(); also used
                               *    to tag basename-only matches */
  } sp_reason;
  
  /*
   * Human-readable text for each sp_reason value, indexed by the enum's
   * numeric value.  check_speling() prints these next to each candidate
   * in the "Multiple Choices" list.
   */
  static const char *sp_reason_str[] = {
      "identical",                /* SP_IDENTICAL */
      "miscapitalized",           /* SP_MISCAPITALIZED */
      "transposed characters",    /* SP_TRANSPOSITION */
      "character missing",        /* SP_MISSINGCHAR */
      "extra character",          /* SP_EXTRACHAR */
      "mistyped character",       /* SP_SIMPLETYPO */
      "common basename"           /* SP_VERYDIFFERENT */
  };
  
  /* One directory entry that check_speling() considers a possible
   * correction of the requested name.
   */
  typedef struct {
      const char *name;		/* directory entry name, copied into the request pool */
      sp_reason quality;	/* how it differs from the request; also the sort key */
  } misspelled_file;
  
  /*
   * spdist() is taken from Kernighan & Pike,
   *  _The_UNIX_Programming_Environment_
   * and adapted somewhat to correspond better to psychological reality.
   * (Note the changes to the return values)
   *
   * According to Pollock and Zamora, CACM April 1984 (V. 27, No. 4),
   * page 363, the correct order for this is:
   * OMISSION = TRANSPOSITION > INSERTION > SUBSTITUTION
   * thus, it was exactly backwards in the old version. -- PWP
   *
   * This routine was taken out of tcsh's spelling correction code
   * (tcsh-6.07.04) and re-converted to apache data types ("char" type
   * instead of tcsh's NLS'ed "Char"). Plus it now ignores the case
   * during comparisons, so it is an "approximate strcasecmp()".
   * NOTE that it still allows only _one_ real "typo",
   * it does NOT try to correct multiple errors.
   *
   * Parameters:
   *   s - the name as requested (possibly misspelled)
   *   t - the candidate name to compare against
   *
   * Returns:
   *   SP_MISCAPITALIZED - s and t are equal when case is ignored (this
   *                       includes byte-identical strings; SP_IDENTICAL
   *                       is never returned)
   *   SP_TRANSPOSITION  - two adjacent characters are swapped
   *   SP_SIMPLETYPO     - exactly one character differs
   *   SP_EXTRACHAR      - s has one character that t lacks
   *   SP_MISSINGCHAR    - s lacks one character that t has
   *   SP_VERYDIFFERENT  - none of the above
   */

  /*
   * Lower-case a single byte of a name.  <ctype.h> functions are only
   * defined for EOF and values representable as unsigned char, so a plain
   * (possibly signed) char holding a byte >= 0x80 must be converted first.
   */
  static int sp_lower(char c)
  {
      return tolower((unsigned char) c);
  }

  static sp_reason spdist(const char *s, const char *t)
  {
      /* Skip the common (case-insensitive) prefix. */
      for (; sp_lower(*s) == sp_lower(*t); t++, s++) {
          if (*t == '\0')
              return SP_MISCAPITALIZED;   /* exact match (sans case) */
      }

      /* s and t now point at the first position where they differ. */
      if (*s) {
          if (*t) {
              if (s[1] && t[1]
                  && sp_lower(*s) == sp_lower(t[1])
                  && sp_lower(*t) == sp_lower(s[1])
                  && strcasecmp(s + 2, t + 2) == 0)
                  return SP_TRANSPOSITION;    /* transposition */
              if (strcasecmp(s + 1, t + 1) == 0)
                  return SP_SIMPLETYPO;       /* 1 char mismatch */
          }
          if (strcasecmp(s + 1, t) == 0)
              return SP_EXTRACHAR;            /* extra character */
      }
      if (*t && strcasecmp(s, t + 1) == 0)
          return SP_MISSINGCHAR;              /* missing character */
      return SP_VERYDIFFERENT;                /* distance too large to fix. */
  }
  
  static int sort_by_quality(const void *left, const void *rite)
  {
      return (int) (((misspelled_file *) left)->quality)
  	 - (int) (((misspelled_file *) rite)->quality);
  }
  
  static int check_speling(request_rec * r)
  {
      void *server_conf = r->server->module_config;
      char *good, *bad, *postgood, *url;
      int filoc, dotloc, urlen, pglen;
      DIR *dirp;
      struct DIR_TYPE *dir_entry;
      array_header *candidates = NULL;
  
      if (!(int) get_module_config(server_conf, &speling_module))
  	return DECLINED;
  
      /* We only want to worry about GETs */
      if (r->method_number != M_GET)
  	return DECLINED;
  
      /* We've already got a file of some kind or another */
      if (r->proxyreq || (r->finfo.st_mode != 0))
  	return DECLINED;
  
      /* This is a sub request - don't mess with it */
      if (r->main)
  	return DECLINED;
  
      /* The request should end up looking like this:
       * r->uri: /correct-url/mispelling/more
       * r->filename: /correct-file/mispelling r->path_info: /more
       *
       * So we do this in steps. First break r->filename into two peices
       */
  
      filoc = rind(r->filename, '/');
      if (filoc == -1)
  	return DECLINED;
  
      /* good = /correct-file */
      good = pstrndup(r->pool, r->filename, filoc);
      /* bad = mispelling */
      bad = pstrdup(r->pool, r->filename + filoc + 1);
      /* postgood = mispelling/more */
      postgood = pstrcat(r->pool, bad, r->path_info, NULL);
  
      urlen = strlen(r->uri);
      pglen = strlen(postgood);
  
      /* Check to see if the URL pieces add up */
      if (strcmp(postgood, r->uri + (urlen - pglen)))
  	return DECLINED;
  
      /* url = /correct-url */
      url = pstrndup(r->pool, r->uri, (urlen - pglen));
  
      /* Now open the directory and do ourselves a check... */
      dirp = opendir(good);
      if (dirp == NULL)		/* Oops, not a directory... */
  	return DECLINED;
  
      candidates = make_array(r->pool, 2, sizeof(misspelled_file));
  
      dotloc = ind(bad, '.');
      if (dotloc == -1)
  	dotloc = strlen(bad);
  
      while ((dir_entry = readdir(dirp))) {
  	sp_reason q;
  
  	/* If we end up with a "fixed" URL which is identical to the
  	 * requested one, we must have found a broken symlink or some such.
  	 * Do _not_ try to redirect this, it causes a loop!
  	 */
  	if (strcmp(bad, dir_entry->d_name) == 0)
  	{
  	    closedir(dirp);
  	    return OK;
  	}
  	/*
  	 * miscapitalization errors are checked first
  	 * (like, e.g., lower case file, upper case request)
  	 */
  	else if (strcasecmp(bad, dir_entry->d_name) == 0) {
  	    misspelled_file *sp_new = (misspelled_file *) push_array(candidates);
  	    sp_new->name = pstrdup(r->pool, dir_entry->d_name);
  	    sp_new->quality = SP_MISCAPITALIZED;
  	}
  	/*
  	 * simple typing errors are checked next
  	 * (like, e.g., missing/extra/transposed char)
  	 */
  	else if ((q = spdist(bad, dir_entry->d_name)) != SP_VERYDIFFERENT) {
  	    misspelled_file *sp_new = (misspelled_file *) push_array(candidates);
  	    sp_new->name = pstrdup(r->pool, dir_entry->d_name);
  	    sp_new->quality = q;
  	}
  	/* The spdist() should have found the majority of the misspelled requests.
  	 * it is of questionable use to continue looking for files with the same
  	 * base name, but potentially of totally wrong type (index.html <-> index.db)
  	 * I would propose to not set the WANT_BASENAME_MATCH define.
  	 *      08-Aug-1997 <Ma...@Mch.SNI.De>
  	 *
  	 * However, Alexei replied giving some reasons to add it anyway:
  	 * > Oh, by the way, I remembered why having the
  	 * > extension-stripping-and-matching stuff is a good idea:
  	 * >
  	 * > If you're using MultiViews, and have a file named foobar.html, which you
  	 * > refer to as "foobar", and someone tried to access "Foobar", mod_speling
  	 * > won't find it, because it won't find anything matching that
  	 * > spelling. With the extension-munging, it would locate "foobar.html". Not
  	 * > perfect, but I ran into that problem when I first wrote the module.
  	 */
  	else {
  #ifdef WANT_BASENAME_MATCH
  	    /* Okay... we didn't find anything. Now we take out the hard-core
  	     * power tools. There are several cases here. Someone might have
  	     * entered a wrong extension (.htm instead of .html or vice versa)
  	     * or the document could be negotiated. At any rate, now we just compare
  	     * stuff before the first dot. If it matches, we figure we got us a
  	     * match. This can result in wrong things if there are files of
  	     * different content types but the same prefix (e.g. foo.gif and foo.html)
  	     * This code will pick the first one it finds. Better than a Not Found,
  	     * though.
  	     */
  	    int entloc = ind(dir_entry->d_name, '.');
  	    if (entloc == -1)
  		entloc = strlen(dir_entry->d_name);
  
  	    if ((dotloc == entloc)
  		&& !strncasecmp(bad, dir_entry->d_name, dotloc)) {
  		misspelled_file *sp_new = (misspelled_file *) push_array(candidates);
  		sp_new->name = pstrdup(r->pool, dir_entry->d_name);
  		sp_new->quality = SP_VERYDIFFERENT;
  	    }
  #endif
  	}
      }
      closedir(dirp);
  
      if (candidates->nelts != 0) {
  	/* Wow... we found us a mispelling. Construct a fixed url */
  	char *nuri, *ref;
  	misspelled_file *variant = (misspelled_file *) candidates->elts;
  	int i;
  
  	ref = table_get(r->headers_in, "Referer");
  
  	qsort((void *) candidates->elts, candidates->nelts,
  	      sizeof(misspelled_file), sort_by_quality);
  
  	/*
  	 * Conditions for immediate redirection:
  	 *     a) the first candidate was not found by stripping the suffix
  	 * AND b) there exists only one candidate OR the best match is not ambigous
  	 *
  	 * Otherwise, a "[300] Multiple Choices" list with the variants is returned.
  	 */
  	if (variant[0].quality != SP_VERYDIFFERENT &&
  	    (candidates->nelts == 1 || variant[0].quality != variant[1].quality)) {
  
  	    nuri = pstrcat(r->pool, url, variant[0].name,
  			   r->path_info, NULL);
  
  	    table_set(r->headers_out, "Location",
  		      construct_url(r->pool, nuri, r->server));
  
  	    aplog_error(APLOG_MARK, APLOG_ERR, r->server,
  			ref ? "Fixed spelling: %s to %s from %s"
  			: "Fixed spelling: %s to %s",
  			r->uri, nuri, ref);
  
  	    return HTTP_MOVED_PERMANENTLY;
  	}
  	/*
  	 * Otherwise, a "[300] Multiple Choices" list with the variants is returned.
  	 */
  	else {
  	    char *t;
  	    pool *pool;
  	    table *notes;
  
  	    if (r->main == NULL) {
  		pool = r->pool;
  		notes = r->notes;
  	    }
  	    else {
  		pool = r->main->pool;
  		notes = r->main->notes;
  	    }
  
  	    /* Generate the reponse text. */
  	    t = pstrcat(pool, "The document name you requested (<code>",
  		     r->uri, "</code>) could not be found on this server.\n"
  			"However, we found documents with names similar to the one you requested.<p>"
  			"Available documents:\n<ul>\n", NULL);
  
  	    for (i = 0; i < candidates->nelts; ++i) {
  
  		/* The format isn't very neat... */
  		t = pstrcat(pool, t, "<li><a href=\"", variant[i].name, "\">",
  			    variant[i].name, "</a> (",
  			    sp_reason_str[(int) (variant[i].quality)], ")\n", NULL);
  
  		/* when we have printed the "close matches" and there
  		 * are more "distant matches" (matched by stripping the
  		 * suffix), then we insert an additional separator text
  		 * to suggest that the user LOOK CLOSELY whether these
  		 * are really the files she wanted.
  		 */
  		if (i > 0 && i < candidates->nelts - 1
  		    && variant[i].quality != SP_VERYDIFFERENT
  		    && variant[i + 1].quality == SP_VERYDIFFERENT) {
  		    t = pstrcat(pool, t, "</ul>\nFurthermore, the following related documents were found:\n<ul>\n", NULL);
  		}
  	    }
  	    t = pstrcat(pool, t, "</ul>\n", NULL);
  
  	    /* If we know there was a referring page, add a note: */
  	    if (ref != NULL)
  		t = pstrcat(pool, t, "Please consider informing the owner of the <a href=\"",
  		ref, "\">referring page</a> about the broken link.\n", NULL);
  
  	    /* Pass our table to http_protocol.c (see mod_negotiation): */
  	    table_set(notes, "variant-list", t);
  
  	    aplog_error(APLOG_MARK, APLOG_WARNING, r->server,
  			ref ? "Spelling fix: %s: %d candidates from %s"
  			: "Spelling fix: %s: %d candidates",
  			r->uri, candidates->nelts, ref);
  
  	    return HTTP_MULTIPLE_CHOICES;
  	}
      }
  
      return OK;
  }
  
  /*
   * Module record.  This module supplies only a server-config constructor,
   * a command table and a fixups hook (check_speling); every other slot is
   * left NULL.  The initializer is positional, so the order of the entries
   * must not be changed.
   */
  module MODULE_VAR_EXPORT speling_module = {
     STANDARD_MODULE_STUFF,
     NULL,                        /* initializer */
     NULL,                        /* create per-dir config */
     NULL,                        /* merge per-dir config */
     create_speling_config,       /* server config */
     NULL,                        /* merge server config */
     speling_cmds,                /* command table */
     NULL,                        /* handlers */
     NULL,                        /* filename translation */
     NULL,                        /* check_user_id */
     NULL,                        /* check auth */
     NULL,                        /* check access */
     NULL,                        /* type_checker */
     check_speling,               /* fixups */
     NULL,                        /* logger */
     NULL,                        /* header parser */
     NULL,                        /* child_init */
     NULL,                        /* child_exit */
     NULL                         /* post read-request */
  };