You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2010/10/08 02:27:46 UTC

svn commit: r1005681 [2/2] - /incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/

Modified: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1005681&r1=1005680&r2=1005681&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Oct  8 00:27:46 2010
@@ -120,14 +120,6 @@ public class WebcrawlerConnector extends
     understoodProtocols.put("https","https");
   }
 
-  protected static final Map mapLookup = new HashMap();
-  static
-  {
-    mapLookup.put("amp","&");
-    mapLookup.put("lt","<");
-    mapLookup.put("gt",">");
-    mapLookup.put("quot","\"");
-  }
 
   // Usage flag values
   protected static final int ROBOTS_NONE = 0;
@@ -5343,283 +5335,6 @@ public class WebcrawlerConnector extends
 
   }
 
-  /** This class accumulates form data and allows overrides */
-  protected static class FormDataAccumulator implements FormData
-  {
-    // Note well: We don't handle multipart posts at this time!!
-
-    /** The form's action URI */
-    protected String actionURI;
-    /** The form's submit method */
-    protected int submitMethod;
-
-    /** The set of elements */
-    protected ArrayList elementList = new ArrayList();
-
-    public FormDataAccumulator(String actionURI, int submitMethod)
-    {
-      this.actionURI = actionURI;
-      this.submitMethod = submitMethod;
-    }
-
-    public void addElement(Map attributes)
-    {
-      // Interpret the input tag, and make a list of the potential elements we'll want to submit
-      String type = (String)attributes.get("type");
-      if (type != null)
-      {
-        String name = (String)attributes.get("name");
-        if (name != null)
-        {
-          String lowerType = type.toLowerCase();
-          if (lowerType.equals("submit"))
-          {
-            String value = (String)attributes.get("value");
-            if (value == null)
-              value = "Submit Form";
-            elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FREEFORM,true));
-          }
-          else if (lowerType.equals("hidden") || lowerType.equals("text") || lowerType.equals("password"))
-          {
-            String value = (String)attributes.get("value");
-            if (value == null)
-              value = "";
-            elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FREEFORM,true));
-          }
-          else if (lowerType.equals("select"))
-          {
-            String value = (String)attributes.get("value");
-            if (value == null)
-              value = "";
-            String selected = (String)attributes.get("selected");
-            boolean isSelected = false;
-            if (selected != null)
-              isSelected = true;
-            String multiple = (String)attributes.get("multiple");
-            boolean isMultiple = false;
-            if (multiple != null)
-              isMultiple = true;
-            elementList.add(new FormItem(name,value,isMultiple?ELEMENTCATEGORY_FIXEDINCLUSIVE:ELEMENTCATEGORY_FIXEDEXCLUSIVE,isSelected));
-          }
-          else if (lowerType.equals("radio"))
-          {
-            String value = (String)attributes.get("value");
-            if (value == null)
-              value = "";
-            String selected = (String)attributes.get("checked");
-            boolean isSelected = false;
-            if (selected != null)
-              isSelected = true;
-            elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FIXEDEXCLUSIVE,isSelected));
-          }
-          else if (lowerType.equals("checkbox"))
-          {
-            String value = (String)attributes.get("value");
-            if (value == null)
-              value = "";
-            String selected = (String)attributes.get("checked");
-            boolean isSelected = false;
-            if (selected != null)
-              isSelected = true;
-            elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FIXEDINCLUSIVE,isSelected));
-          }
-          else if (lowerType.equals("textarea"))
-          {
-            elementList.add(new FormItem(name,"",ELEMENTCATEGORY_FREEFORM,true));
-          }
-        }
-      }
-    }
-
-    public void applyOverrides(LoginParameters lp)
-    {
-      // This map contains the control names we have ALREADY wiped clean.
-      Map overrideMap = new HashMap();
-
-      // Override the specified elements with the specified values
-      int i = 0;
-      while (i < lp.getParameterCount())
-      {
-        Pattern namePattern = lp.getParameterNamePattern(i);
-        String value = lp.getParameterValue(i);
-        i++;
-
-        // For each parameter specified, go through the element list and do the right thing.  This will require us to keep some state around about
-        // what exactly we've done to the element list so far, so that each parameter rule in turn applies properly.
-        //
-        // Each rule regular expression will be deemed to apply to all matching controls.  If the rule matches the control name, then the precise behavior
-        // will depend on the type of the control.
-        //
-        // Controls can be categorized in the following way:
-        // - free-form value
-        // - specified exclusive value (e.g. radio button)
-        // - specified inclusive value (e.g. checkbox)
-        //
-        // For free-form values, the value given will simply override the value of the element.
-        // For exclusive controls, all values in the family will be disabled, and the value matching the one specified will be enabled.
-        // For inclusive controls, all values in the family will be cleared ONCE, and then subsequently the value matching the one specified will be enabled.
-        //
-        int j = 0;
-        while (j < elementList.size())
-        {
-          FormItem fi = (FormItem)elementList.get(j++);
-          Matcher m = namePattern.matcher(fi.getElementName());
-          if (m.find())
-          {
-            // Hey, it seems to apply!
-            switch (fi.getType())
-            {
-            case ELEMENTCATEGORY_FREEFORM:
-              // Override immediately
-              fi.setValue(value);
-              break;
-            case ELEMENTCATEGORY_FIXEDEXCLUSIVE:
-              // If it doesn't match the value, disable.
-              fi.setEnabled(fi.getElementValue().equals(value));
-              break;
-            case ELEMENTCATEGORY_FIXEDINCLUSIVE:
-              // Make sure we clear the entire control ONCE (and only once).
-              if (overrideMap.get(fi.getElementName()) == null)
-              {
-                // Zip through the entire list
-                int k = 0;
-                while (k < elementList.size())
-                {
-                  FormItem fi2 = (FormItem)elementList.get(k++);
-                  if (fi2.getElementName().equals(fi.getElementName()))
-                    fi.setEnabled(false);
-                }
-                overrideMap.put(fi.getElementName(),fi.getElementName());
-              }
-              if (fi.getElementValue().equals(value))
-                fi.setEnabled(true);
-            default:
-              break;
-            }
-          }
-        }
-      }
-    }
-
-    /** Get the full action URI for this form. */
-    public String getActionURI()
-    {
-      return actionURI;
-    }
-
-    /** Get the submit method for this form. */
-    public int getSubmitMethod()
-    {
-      return submitMethod;
-    }
-
-    /** Iterate over the active form data elements.  The returned iterator returns FormDataElement objects. */
-    public Iterator getElementIterator()
-    {
-      return new FormItemIterator(elementList);
-    }
-
-  }
-
-  /** Iterator over FormItems */
-  protected static class FormItemIterator implements Iterator
-  {
-    protected ArrayList elementList;
-    protected int currentIndex = 0;
-
-    public FormItemIterator(ArrayList elementList)
-    {
-      this.elementList = elementList;
-    }
-
-    public boolean hasNext()
-    {
-      while (true)
-      {
-        if (currentIndex == elementList.size())
-          return false;
-        if (((FormItem)elementList.get(currentIndex)).getEnabled() == false)
-          currentIndex++;
-        else
-          break;
-      }
-      return true;
-    }
-
-    public Object next()
-    {
-      while (true)
-      {
-        if (currentIndex == elementList.size())
-          throw new NoSuchElementException("No such element");
-        if (((FormItem)elementList.get(currentIndex)).getEnabled() == false)
-          currentIndex++;
-        else
-          break;
-      }
-      return elementList.get(currentIndex++);
-    }
-
-    public void remove()
-    {
-      throw new UnsupportedOperationException("Unsupported operation");
-    }
-  }
-
-  // Element categorization
-  protected final static int ELEMENTCATEGORY_FREEFORM = 0;
-  protected final static int ELEMENTCATEGORY_FIXEDEXCLUSIVE = 1;
-  protected final static int ELEMENTCATEGORY_FIXEDINCLUSIVE = 2;
-
-  /** This class provides an individual data item */
-  protected static class FormItem implements FormDataElement
-  {
-    protected String name;
-    protected String value;
-    protected boolean isEnabled;
-    protected int type;
-
-    public FormItem(String name, String value, int type, boolean isEnabled)
-    {
-      this.name = name;
-      this.value = value;
-      this.isEnabled = isEnabled;
-      this.type = type;
-    }
-
-    public void setEnabled(boolean enabled)
-    {
-      isEnabled = enabled;
-    }
-
-    public boolean getEnabled()
-    {
-      return isEnabled;
-    }
-
-    public void setValue(String value)
-    {
-      this.value = value;
-    }
-
-    public int getType()
-    {
-      return type;
-    }
-
-    /** Get the element name */
-    public String getElementName()
-    {
-      return name;
-    }
-
-    /** Get the element value */
-    public String getElementValue()
-    {
-      return value;
-    }
-
-  }
 
   /** This class is the handler for HTML parsing during state transitions */
   protected class FindHTMLHrefHandler extends FindHandler implements IHTMLHandler
@@ -7429,713 +7144,6 @@ public class WebcrawlerConnector extends
 
   }
 
-  /** This interface describes the functionality needed by a link extractor to note a discovered link.
-  */
-  protected static interface IDiscoveredLinkHandler
-  {
-    /** Inform the world of a discovered link.
-    *@param rawURL is the raw discovered url.  This may be relative, malformed, or otherwise unsuitable for use until final form is acheived.
-    */
-    public void noteDiscoveredLink(String rawURL)
-      throws ManifoldCFException;
-  }
-
-  /** This interface describes the functionality needed by an redirection processor in order to handle a redirection.
-  */
-  protected static interface IRedirectionHandler extends IDiscoveredLinkHandler
-  {
-  }
-
-  /** This interface describes the functionality needed by an XML processor in order to handle an XML document.
-  */
-  protected static interface IXMLHandler extends IDiscoveredLinkHandler
-  {
-    /** Inform the world of a discovered ttl value.
-    *@param rawTtlValue is the raw discovered ttl value.
-    */
-    public void noteDiscoveredTtlValue(String rawTtlValue)
-      throws ManifoldCFException;
-
-  }
-
-  /** This interface describes the functionality needed by an HTML processor in order to handle an HTML document.
-  */
-  protected static interface IHTMLHandler extends IDiscoveredLinkHandler
-  {
-    /** Note the start of a form */
-    public void noteFormStart(Map formAttributes)
-      throws ManifoldCFException;
-
-    /** Note an input tag */
-    public void noteFormInput(Map inputAttributes)
-      throws ManifoldCFException;
-
-    /** Note the end of a form */
-    public void noteFormEnd()
-      throws ManifoldCFException;
-
-    /** Note discovered href */
-    public void noteAHREF(String rawURL)
-      throws ManifoldCFException;
-
-    /** Note discovered href */
-    public void noteLINKHREF(String rawURL)
-      throws ManifoldCFException;
-
-    /** Note discovered IMG SRC */
-    public void noteIMGSRC(String rawURL)
-      throws ManifoldCFException;
-
-    /** Note discovered FRAME SRC */
-    public void noteFRAMESRC(String rawURL)
-      throws ManifoldCFException;
-  }
-
-  // HTML parsing classes and constants
-
-  /** Is a character HTML whitespace? */
-  protected static boolean isHTMLWhitespace(char x)
-  {
-    return x <= ' ';
-  }
-
-  /** Decode an html attribute */
-  protected static String htmlAttributeDecode(String input)
-  {
-    StringBuffer output = new StringBuffer();
-    int i = 0;
-    while (i < input.length())
-    {
-      char x = input.charAt(i++);
-      if (x == '&')
-      {
-        int index = input.indexOf(";",i);
-        if (index != -1)
-        {
-          String chunk = input.substring(i,index);
-          String replacement = mapChunk(chunk);
-          if (replacement != null)
-          {
-            output.append(replacement);
-            i = index + 1;
-            continue;
-          }
-        }
-      }
-      output.append(x);
-    }
-    return output.toString();
-  }
-
-  /** Map an entity reference back to a character */
-  protected static String mapChunk(String input)
-  {
-    if (input.startsWith("#"))
-    {
-      // Treat as a decimal value
-      try
-      {
-        int value = Integer.parseInt(input.substring(1));
-        StringBuffer sb = new StringBuffer();
-        sb.append((char)value);
-        return sb.toString();
-      }
-      catch (NumberFormatException e)
-      {
-        return null;
-      }
-    }
-    else
-      return (String)mapLookup.get(input);
-  }
-
-  // Basic parse states (lexical analysis)
-
-  protected static final int BASICPARSESTATE_NORMAL = 0;
-  protected static final int BASICPARSESTATE_SAWLEFTBRACKET = 1;
-  protected static final int BASICPARSESTATE_SAWEXCLAMATION = 2;
-  protected static final int BASICPARSESTATE_SAWDASH = 3;
-  protected static final int BASICPARSESTATE_IN_COMMENT = 4;
-  protected static final int BASICPARSESTATE_SAWCOMMENTDASH = 5;
-  protected static final int BASICPARSESTATE_SAWSECONDCOMMENTDASH = 6;
-  protected static final int BASICPARSESTATE_IN_TAG_NAME = 7;
-  protected static final int BASICPARSESTATE_IN_ATTR_NAME = 8;
-  protected static final int BASICPARSESTATE_IN_ATTR_VALUE = 9;
-  protected static final int BASICPARSESTATE_IN_TAG_SAW_SLASH = 10;
-  protected static final int BASICPARSESTATE_IN_END_TAG_NAME = 11;
-  protected static final int BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE = 12;
-  protected static final int BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE = 13;
-  protected static final int BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE = 14;
-  protected static final int BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE = 15;
-
-
-  /** This class represents the basic, outermost parse state. */
-  protected static class BasicParseState
-  {
-    protected int currentState = BASICPARSESTATE_NORMAL;
-
-    protected StringBuffer currentTagNameBuffer = null;
-    protected StringBuffer currentAttrNameBuffer = null;
-    protected StringBuffer currentValueBuffer = null;
-
-    protected String currentTagName = null;
-    protected String currentAttrName = null;
-    protected Map currentAttrMap = null;
-
-    public BasicParseState()
-    {
-    }
-
-    /** Deal with a character.  No exceptions are allowed, since those would represent syntax errors, and we don't want those to cause difficulty. */
-    public void dealWithCharacter(char thisChar)
-      throws ManifoldCFException
-    {
-      // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it.
-      char thisCharLower = Character.toLowerCase(thisChar);
-      switch (currentState)
-      {
-      case BASICPARSESTATE_NORMAL:
-        if (thisChar == '<')
-          currentState = BASICPARSESTATE_SAWLEFTBRACKET;
-        break;
-      case BASICPARSESTATE_SAWLEFTBRACKET:
-        if (thisChar == '!')
-          currentState = BASICPARSESTATE_SAWEXCLAMATION;
-        else if (thisChar == '/')
-        {
-          currentState = BASICPARSESTATE_IN_END_TAG_NAME;
-          currentTagNameBuffer = new StringBuffer();
-        }
-        else
-        {
-          currentState = BASICPARSESTATE_IN_TAG_NAME;
-          currentTagNameBuffer = new StringBuffer();
-          if (!isHTMLWhitespace(thisChar))
-            currentTagNameBuffer.append(thisCharLower);
-        }
-        break;
-      case BASICPARSESTATE_SAWEXCLAMATION:
-        if (thisChar == '-')
-          currentState = BASICPARSESTATE_SAWDASH;
-        else
-          currentState = BASICPARSESTATE_NORMAL;
-        break;
-      case BASICPARSESTATE_SAWDASH:
-        if (thisChar == '-')
-          currentState = BASICPARSESTATE_IN_COMMENT;
-        else
-          currentState = BASICPARSESTATE_NORMAL;
-        break;
-      case BASICPARSESTATE_IN_COMMENT:
-        // We're in a comment.  All we should look for is the end of the comment.
-        if (thisChar == '-')
-          currentState = BASICPARSESTATE_SAWCOMMENTDASH;
-        break;
-      case BASICPARSESTATE_SAWCOMMENTDASH:
-        if (thisChar == '-')
-          currentState = BASICPARSESTATE_SAWSECONDCOMMENTDASH;
-        else
-          currentState = BASICPARSESTATE_IN_COMMENT;
-        break;
-      case BASICPARSESTATE_SAWSECONDCOMMENTDASH:
-        if (thisChar == '>')
-          currentState = BASICPARSESTATE_NORMAL;
-        else if (thisChar != '-')
-          currentState = BASICPARSESTATE_IN_COMMENT;
-        break;
-      case BASICPARSESTATE_IN_TAG_NAME:
-        if (isHTMLWhitespace(thisChar))
-        {
-          if (currentTagNameBuffer.length() > 0)
-          {
-            // Done with the tag name!
-            currentTagName = currentTagNameBuffer.toString();
-            currentTagNameBuffer = null;
-            currentAttrMap = new HashMap();
-            currentState = BASICPARSESTATE_IN_ATTR_NAME;
-            currentAttrNameBuffer = new StringBuffer();
-          }
-        }
-        else if (thisChar == '/')
-        {
-          if (currentTagNameBuffer.length() > 0)
-          {
-            currentTagName = currentTagNameBuffer.toString();
-            currentTagNameBuffer = null;
-            currentAttrMap = new HashMap();
-            currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
-            noteTag(currentTagName,currentAttrMap);
-          }
-          else
-          {
-            currentState = BASICPARSESTATE_NORMAL;
-            currentTagNameBuffer = null;
-          }
-        }
-        else if (thisChar == '>')
-        {
-          if (currentTagNameBuffer.length() > 0)
-          {
-            currentTagName = currentTagNameBuffer.toString();
-            currentTagNameBuffer = null;
-            currentAttrMap = new HashMap();
-          }
-          if (currentTagName != null)
-          {
-            noteTag(currentTagName,currentAttrMap);
-          }
-          currentState = BASICPARSESTATE_NORMAL;
-          currentTagName = null;
-          currentAttrMap = null;
-        }
-        else
-          currentTagNameBuffer.append(thisCharLower);
-        break;
-      case BASICPARSESTATE_IN_ATTR_NAME:
-        if (isHTMLWhitespace(thisChar))
-        {
-          if (currentAttrNameBuffer.length() > 0)
-          {
-            // Done with attr name!
-            currentAttrName = currentAttrNameBuffer.toString();
-            currentAttrNameBuffer = null;
-            currentState = BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE;
-          }
-        }
-        else if (thisChar == '=')
-        {
-          if (currentAttrNameBuffer.length() > 0)
-          {
-            currentAttrName = currentAttrNameBuffer.toString();
-            currentAttrNameBuffer = null;
-            currentState = BASICPARSESTATE_IN_ATTR_VALUE;
-            currentValueBuffer = new StringBuffer();
-          }
-        }
-        else if (thisChar == '/')
-        {
-          if (currentAttrNameBuffer.length() > 0)
-          {
-            currentAttrName = currentAttrNameBuffer.toString();
-            currentAttrNameBuffer = null;
-          }
-          if (currentAttrName != null)
-          {
-            currentAttrMap.put(currentAttrName,"");
-            currentAttrName = null;
-          }
-          noteTag(currentTagName,currentAttrMap);
-          currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
-        }
-        else if (thisChar == '>')
-        {
-          if (currentAttrNameBuffer.length() > 0)
-          {
-            currentAttrName = currentAttrNameBuffer.toString();
-            currentAttrNameBuffer = null;
-          }
-          if (currentAttrName != null)
-          {
-            currentAttrMap.put(currentAttrName,"");
-            currentAttrName = null;
-          }
-          currentState = BASICPARSESTATE_NORMAL;
-          noteTag(currentTagName,currentAttrMap);
-          currentTagName = null;
-          currentAttrMap = null;
-        }
-        else
-          currentAttrNameBuffer.append(thisCharLower);
-        break;
-      case BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE:
-        if (thisChar == '=')
-        {
-          currentState = BASICPARSESTATE_IN_ATTR_VALUE;
-          currentValueBuffer = new StringBuffer();
-        }
-        else if (thisChar == '>')
-        {
-          currentState = BASICPARSESTATE_NORMAL;
-          noteTag(currentTagName,currentAttrMap);
-          currentTagName = null;
-          currentAttrMap = null;
-        }
-        else if (thisChar == '/')
-        {
-          currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
-          currentAttrMap.put(currentAttrName,"");
-          currentAttrName = null;
-          noteTag(currentTagName,currentAttrMap);
-        }
-        else if (!isHTMLWhitespace(thisChar))
-        {
-          currentAttrMap.put(currentAttrName,"");
-          currentState = BASICPARSESTATE_IN_ATTR_NAME;
-          currentAttrNameBuffer = new StringBuffer();
-          currentAttrNameBuffer.append(thisCharLower);
-          currentAttrName = null;
-        }
-        break;
-      case BASICPARSESTATE_IN_ATTR_VALUE:
-        if (thisChar == '\'')
-          currentState = BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE;
-        else if (thisChar == '"')
-          currentState = BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE;
-        else if (!isHTMLWhitespace(thisChar))
-        {
-          currentState = BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE;
-          currentValueBuffer.append(thisChar);
-        }
-        break;
-      case BASICPARSESTATE_IN_TAG_SAW_SLASH:
-        if (thisChar == '>')
-        {
-          noteEndTag(currentTagName);
-          currentState = BASICPARSESTATE_NORMAL;
-          currentTagName = null;
-          currentAttrMap = null;
-        }
-        break;
-      case BASICPARSESTATE_IN_END_TAG_NAME:
-        if (isHTMLWhitespace(thisChar))
-        {
-          if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
-          {
-            // Done with the tag name!
-            currentTagName = currentTagNameBuffer.toString();
-            currentTagNameBuffer = null;
-          }
-        }
-        else if (thisChar == '>')
-        {
-          if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
-          {
-            currentTagName = currentTagNameBuffer.toString();
-            currentTagNameBuffer = null;
-          }
-          if (currentTagName != null)
-          {
-            noteEndTag(currentTagName);
-          }
-          currentTagName = null;
-          currentState = BASICPARSESTATE_NORMAL;
-        }
-        else if (currentTagNameBuffer != null)
-          currentTagNameBuffer.append(thisCharLower);
-        break;
-      case BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE:
-        if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
-        {
-          currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
-          currentAttrName = null;
-          currentValueBuffer = null;
-          currentState = BASICPARSESTATE_IN_ATTR_NAME;
-          currentAttrNameBuffer = new StringBuffer();
-        }
-        else
-          currentValueBuffer.append(thisChar);
-        break;
-      case BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE:
-        if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
-        {
-          currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
-          currentAttrName = null;
-          currentValueBuffer = null;
-          currentState = BASICPARSESTATE_IN_ATTR_NAME;
-          currentAttrNameBuffer = new StringBuffer();
-        }
-        else
-          currentValueBuffer.append(thisChar);
-        break;
-      case BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
-        if (isHTMLWhitespace(thisChar))
-        {
-          currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
-          currentAttrName = null;
-          currentValueBuffer = null;
-          currentState = BASICPARSESTATE_IN_ATTR_NAME;
-          currentAttrNameBuffer = new StringBuffer();
-        }
-        else if (thisChar == '/')
-        {
-          currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
-          noteTag(currentTagName,currentAttrMap);
-          currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
-        }
-        else if (thisChar == '>')
-        {
-          currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
-          currentAttrName = null;
-          currentValueBuffer = null;
-          currentState = BASICPARSESTATE_NORMAL;
-          noteTag(currentTagName,currentAttrMap);
-          currentTagName = null;
-          currentAttrMap = null;
-        }
-        else
-          currentValueBuffer.append(thisChar);
-        break;
-      default:
-        throw new ManifoldCFException("Invalid state: "+Integer.toString(currentState));
-      }
-    }
-
-    protected void noteTag(String tagName, Map attributes)
-      throws ManifoldCFException
-    {
-      Logging.connectors.debug(" Saw tag '"+tagName+"'");
-    }
-
-    protected void noteEndTag(String tagName)
-      throws ManifoldCFException
-    {
-      Logging.connectors.debug(" Saw end tag '"+tagName+"'");
-    }
-
-    public void finishUp()
-      throws ManifoldCFException
-    {
-      // Does nothing
-    }
-
-  }
-
-  // Script tag parsing states
-
-  protected static final int SCRIPTPARSESTATE_NORMAL = 0;
-  protected static final int SCRIPTPARSESTATE_INSCRIPT = 1;
-
-  /** This class interprets the tag stream generated by the BasicParseState class, and causes script sections to be skipped */
-  protected static class ScriptParseState extends BasicParseState
-  {
-    protected int scriptParseState = SCRIPTPARSESTATE_NORMAL;
-
-    public ScriptParseState()
-    {
-      super();
-    }
-
-    // Override methods having to do with notification of tag discovery
-
-    protected void noteTag(String tagName, Map attributes)
-      throws ManifoldCFException
-    {
-      super.noteTag(tagName,attributes);
-      switch (scriptParseState)
-      {
-      case SCRIPTPARSESTATE_NORMAL:
-        if (tagName.equals("script"))
-          scriptParseState = SCRIPTPARSESTATE_INSCRIPT;
-        else
-          noteNonscriptTag(tagName,attributes);
-        break;
-      case SCRIPTPARSESTATE_INSCRIPT:
-        // Skip all tags until we see the end script one.
-        break;
-      default:
-        throw new ManifoldCFException("Unknown script parse state: "+Integer.toString(scriptParseState));
-      }
-    }
-
-    protected void noteEndTag(String tagName)
-      throws ManifoldCFException
-    {
-      super.noteEndTag(tagName);
-      switch (scriptParseState)
-      {
-      case SCRIPTPARSESTATE_NORMAL:
-        noteNonscriptEndTag(tagName);
-        break;
-      case SCRIPTPARSESTATE_INSCRIPT:
-        // Skip all tags until we see the end script one.
-        if (tagName.equals("script"))
-          scriptParseState = SCRIPTPARSESTATE_NORMAL;
-        break;
-      default:
-        break;
-      }
-    }
-
-    protected void noteNonscriptTag(String tagName, Map attributes)
-      throws ManifoldCFException
-    {
-    }
-
-    protected void noteNonscriptEndTag(String tagName)
-      throws ManifoldCFException
-    {
-    }
-
-  }
-
-  /** This class recognizes and interprets all links */
-  protected static class LinkParseState extends ScriptParseState
-  {
-
-    protected IHTMLHandler handler;
-
-    public LinkParseState(IHTMLHandler handler)
-    {
-      super();
-      this.handler = handler;
-    }
-
-    protected void noteNonscriptTag(String tagName, Map attributes)
-      throws ManifoldCFException
-    {
-      super.noteNonscriptTag(tagName,attributes);
-      String lowerTagName = tagName.toLowerCase();
-      if (tagName.equals("a"))
-      {
-        String hrefValue = (String)attributes.get("href");
-        if (hrefValue != null && hrefValue.length() > 0)
-          handler.noteAHREF(hrefValue);
-      }
-      else if (tagName.equals("link"))
-      {
-        String hrefValue = (String)attributes.get("href");
-        if (hrefValue != null && hrefValue.length() > 0)
-          handler.noteLINKHREF(hrefValue);
-      }
-      else if (tagName.equals("img"))
-      {
-        String srcValue = (String)attributes.get("src");
-        if (srcValue != null && srcValue.length() > 0)
-          handler.noteIMGSRC(srcValue);
-      }
-      else if (tagName.equals("frame"))
-      {
-        String srcValue = (String)attributes.get("src");
-        if (srcValue != null && srcValue.length() > 0)
-          handler.noteFRAMESRC(srcValue);
-      }
-    }
-
-  }
-
-  // States for form handling.
-
-  protected final static int FORMPARSESTATE_NORMAL = 0;
-  protected final static int FORMPARSESTATE_IN_FORM = 1;
-  protected final static int FORMPARSESTATE_IN_SELECT = 2;
-  protected final static int FORMPARSESTATE_IN_TEXTAREA = 3;
-
-  /** This class interprets the tag stream generated by the BasicParseState class, and keeps track of the form tags. */
-  protected static class FormParseState extends LinkParseState
-  {
-    protected int formParseState = FORMPARSESTATE_NORMAL;
-    protected String selectName = null;
-    protected String selectMultiple = null;
-
-    public FormParseState(IHTMLHandler handler)
-    {
-      super(handler);
-    }
-
-    // Override methods having to do with notification of tag discovery
-
-    protected void noteNonscriptTag(String tagName, Map attributes)
-      throws ManifoldCFException
-    {
-      super.noteNonscriptTag(tagName,attributes);
-      switch (formParseState)
-      {
-      case FORMPARSESTATE_NORMAL:
-        if (tagName.equals("form"))
-        {
-          formParseState = FORMPARSESTATE_IN_FORM;
-          handler.noteFormStart(attributes);
-        }
-        break;
-      case FORMPARSESTATE_IN_FORM:
-        if (tagName.equals("input"))
-        {
-          String type = (String)attributes.get("type");
-          // We're only interested in form elements that can actually transmit data
-          if (type != null && !type.toLowerCase().equals("button") && !type.toLowerCase().equals("reset") && !type.toLowerCase().equals("image"))
-            handler.noteFormInput(attributes);
-        }
-        else if (tagName.equals("select"))
-        {
-          selectName = (String)attributes.get("name");
-          selectMultiple = (String)attributes.get("multiple");
-          formParseState = FORMPARSESTATE_IN_SELECT;
-        }
-        else if (tagName.equals("textarea"))
-        {
-          formParseState = FORMPARSESTATE_IN_TEXTAREA;
-          Map textareaMap = new HashMap();
-          textareaMap.put("type","textarea");
-          // Default value is too tough to meaningfully compute because of the embedded tags etc.  Known limitation.
-          textareaMap.put("value","");
-          handler.noteFormInput(textareaMap);
-        }
-        else if (tagName.equals("button"))
-        {
-          String type = (String)attributes.get("type");
-          if (type == null || type.toLowerCase().equals("submit"))
-          {
-            // Same as input type="submit"
-            handler.noteFormInput(attributes);
-          }
-        }
-        else if (tagName.equals("isindex"))
-        {
-          Map indexMap = new HashMap();
-          indexMap.put("type","text");
-        }
-        break;
-      case FORMPARSESTATE_IN_SELECT:
-        if (tagName.equals("option"))
-        {
-          String optionValue = (String)attributes.get("value");
-          String optionSelected = (String)attributes.get("selected");
-          Map optionMap = new HashMap();
-          optionMap.put("type","select");
-          optionMap.put("name",selectName);
-          optionMap.put("multiple",selectMultiple);
-          optionMap.put("value",optionValue);
-          optionMap.put("selected",optionSelected);
-          handler.noteFormInput(optionMap);
-        }
-        break;
-      case FORMPARSESTATE_IN_TEXTAREA:
-        break;
-      default:
-        throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
-      }
-    }
-
-    protected void noteNonscriptEndTag(String tagName)
-      throws ManifoldCFException
-    {
-      super.noteNonscriptEndTag(tagName);
-      switch (formParseState)
-      {
-      case FORMPARSESTATE_NORMAL:
-        break;
-      case FORMPARSESTATE_IN_FORM:
-        if (tagName.equals("form"))
-        {
-          handler.noteFormEnd();
-          formParseState = FORMPARSESTATE_NORMAL;
-        }
-        break;
-      case FORMPARSESTATE_IN_SELECT:
-        formParseState = FORMPARSESTATE_IN_FORM;
-        selectName = null;
-        selectMultiple = null;
-        break;
-      case FORMPARSESTATE_IN_TEXTAREA:
-        formParseState = FORMPARSESTATE_IN_FORM;
-        break;
-      default:
-        throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
-      }
-    }
-
-  }
-
 }