You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2010/10/08 02:27:46 UTC

svn commit: r1005681 [1/2] - /incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/

Author: kwright
Date: Fri Oct  8 00:27:46 2010
New Revision: 1005681

URL: http://svn.apache.org/viewvc?rev=1005681&view=rev
Log:
Separate out a good chunk of the inner classes, to reduce the size of the main connector class to some degree.

Added:
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java   (with props)
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java   (with props)
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java   (with props)
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java   (with props)
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java   (with props)
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java   (with props)
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java   (with props)
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java   (with props)
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java   (with props)
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java   (with props)
Modified:
    incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,437 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.crawler.system.Logging;
+import java.util.*;
+
+/**
+ * This class represents the basic, outermost parse state.
+ *
+ * It is a character-at-a-time lexical scanner for HTML: callers feed it one character at a time
+ * through {@link #dealWithCharacter(char)}, and it recognizes comments, start tags (with their
+ * attributes) and end tags.  Each recognized start tag is reported through
+ * {@link #noteTag(String,Map)}, and each end tag (including the implied end of a self-closing tag)
+ * through {@link #noteEndTag(String)}.  Subclasses override those two methods to do real work;
+ * the implementations here only log.
+ *
+ * Tag names and attribute names are lower-cased; attribute values keep their case and have
+ * entity references decoded.
+ */
+public class BasicParseState
+{
+  // Scanner states.
+  protected static final int BASICPARSESTATE_NORMAL = 0;
+  protected static final int BASICPARSESTATE_SAWLEFTBRACKET = 1;
+  protected static final int BASICPARSESTATE_SAWEXCLAMATION = 2;
+  protected static final int BASICPARSESTATE_SAWDASH = 3;
+  protected static final int BASICPARSESTATE_IN_COMMENT = 4;
+  protected static final int BASICPARSESTATE_SAWCOMMENTDASH = 5;
+  protected static final int BASICPARSESTATE_SAWSECONDCOMMENTDASH = 6;
+  protected static final int BASICPARSESTATE_IN_TAG_NAME = 7;
+  protected static final int BASICPARSESTATE_IN_ATTR_NAME = 8;
+  protected static final int BASICPARSESTATE_IN_ATTR_VALUE = 9;
+  protected static final int BASICPARSESTATE_IN_TAG_SAW_SLASH = 10;
+  protected static final int BASICPARSESTATE_IN_END_TAG_NAME = 11;
+  protected static final int BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE = 12;
+  protected static final int BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE = 13;
+  protected static final int BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE = 14;
+  protected static final int BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE = 15;
+
+
+  /** The current scanner state; one of the BASICPARSESTATE_* constants. */
+  protected int currentState = BASICPARSESTATE_NORMAL;
+
+  // Accumulators for the token currently being scanned; each is null when not in use.
+  protected StringBuffer currentTagNameBuffer = null;
+  protected StringBuffer currentAttrNameBuffer = null;
+  protected StringBuffer currentValueBuffer = null;
+
+  // The completed pieces of the tag currently being scanned; each is null between tags.
+  protected String currentTagName = null;
+  protected String currentAttrName = null;
+  protected Map currentAttrMap = null;
+
+  /** Named entity references understood by mapChunk(): entity name -> replacement text. */
+  protected static final Map mapLookup = new HashMap();
+  static
+  {
+    mapLookup.put("amp","&");
+    mapLookup.put("lt","<");
+    mapLookup.put("gt",">");
+    mapLookup.put("quot","\"");
+  }
+
+  public BasicParseState()
+  {
+  }
+
+  /** Deal with a character.  No exceptions are allowed, since those would represent syntax errors, and we don't want those to cause difficulty.
+  * Malformed input is therefore tolerated silently; the only exceptions that can arise are those thrown by noteTag()/noteEndTag()
+  * overrides, or the one thrown if the state variable itself has been corrupted.
+  *@param thisChar is the next character of the document.
+  */
+  public void dealWithCharacter(char thisChar)
+    throws ManifoldCFException
+  {
+    // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it.
+    char thisCharLower = Character.toLowerCase(thisChar);
+    switch (currentState)
+    {
+    case BASICPARSESTATE_NORMAL:
+      if (thisChar == '<')
+        currentState = BASICPARSESTATE_SAWLEFTBRACKET;
+      break;
+    case BASICPARSESTATE_SAWLEFTBRACKET:
+      if (thisChar == '!')
+        currentState = BASICPARSESTATE_SAWEXCLAMATION;
+      else if (thisChar == '/')
+      {
+        currentState = BASICPARSESTATE_IN_END_TAG_NAME;
+        currentTagNameBuffer = new StringBuffer();
+      }
+      else
+      {
+        currentState = BASICPARSESTATE_IN_TAG_NAME;
+        currentTagNameBuffer = new StringBuffer();
+        // Leading whitespace after '<' is skipped rather than becoming part of the name.
+        if (!isHTMLWhitespace(thisChar))
+          currentTagNameBuffer.append(thisCharLower);
+      }
+      break;
+    case BASICPARSESTATE_SAWEXCLAMATION:
+      // "<!" followed by anything but '-' (e.g. a doctype) is simply dropped back to normal text.
+      if (thisChar == '-')
+        currentState = BASICPARSESTATE_SAWDASH;
+      else
+        currentState = BASICPARSESTATE_NORMAL;
+      break;
+    case BASICPARSESTATE_SAWDASH:
+      if (thisChar == '-')
+        currentState = BASICPARSESTATE_IN_COMMENT;
+      else
+        currentState = BASICPARSESTATE_NORMAL;
+      break;
+    case BASICPARSESTATE_IN_COMMENT:
+      // We're in a comment.  All we should look for is the end of the comment.
+      if (thisChar == '-')
+        currentState = BASICPARSESTATE_SAWCOMMENTDASH;
+      break;
+    case BASICPARSESTATE_SAWCOMMENTDASH:
+      if (thisChar == '-')
+        currentState = BASICPARSESTATE_SAWSECONDCOMMENTDASH;
+      else
+        currentState = BASICPARSESTATE_IN_COMMENT;
+      break;
+    case BASICPARSESTATE_SAWSECONDCOMMENTDASH:
+      // Additional dashes keep us in this state, so "--->" still closes the comment.
+      if (thisChar == '>')
+        currentState = BASICPARSESTATE_NORMAL;
+      else if (thisChar != '-')
+        currentState = BASICPARSESTATE_IN_COMMENT;
+      break;
+    case BASICPARSESTATE_IN_TAG_NAME:
+      if (isHTMLWhitespace(thisChar))
+      {
+        if (currentTagNameBuffer.length() > 0)
+        {
+          // Done with the tag name!
+          currentTagName = currentTagNameBuffer.toString();
+          currentTagNameBuffer = null;
+          currentAttrMap = new HashMap();
+          currentState = BASICPARSESTATE_IN_ATTR_NAME;
+          currentAttrNameBuffer = new StringBuffer();
+        }
+      }
+      else if (thisChar == '/')
+      {
+        if (currentTagNameBuffer.length() > 0)
+        {
+          // Self-closing tag with no attributes: report the start tag now, the end tag when '>' arrives.
+          currentTagName = currentTagNameBuffer.toString();
+          currentTagNameBuffer = null;
+          currentAttrMap = new HashMap();
+          currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
+          noteTag(currentTagName,currentAttrMap);
+        }
+        else
+        {
+          currentState = BASICPARSESTATE_NORMAL;
+          currentTagNameBuffer = null;
+        }
+      }
+      else if (thisChar == '>')
+      {
+        if (currentTagNameBuffer.length() > 0)
+        {
+          currentTagName = currentTagNameBuffer.toString();
+          currentTagNameBuffer = null;
+          currentAttrMap = new HashMap();
+        }
+        if (currentTagName != null)
+        {
+          noteTag(currentTagName,currentAttrMap);
+        }
+        currentState = BASICPARSESTATE_NORMAL;
+        currentTagName = null;
+        currentAttrMap = null;
+      }
+      else
+        currentTagNameBuffer.append(thisCharLower);
+      break;
+    case BASICPARSESTATE_IN_ATTR_NAME:
+      if (isHTMLWhitespace(thisChar))
+      {
+        if (currentAttrNameBuffer.length() > 0)
+        {
+          // Done with attr name!
+          currentAttrName = currentAttrNameBuffer.toString();
+          currentAttrNameBuffer = null;
+          currentState = BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE;
+        }
+      }
+      else if (thisChar == '=')
+      {
+        if (currentAttrNameBuffer.length() > 0)
+        {
+          currentAttrName = currentAttrNameBuffer.toString();
+          currentAttrNameBuffer = null;
+          currentState = BASICPARSESTATE_IN_ATTR_VALUE;
+          currentValueBuffer = new StringBuffer();
+        }
+      }
+      else if (thisChar == '/')
+      {
+        if (currentAttrNameBuffer.length() > 0)
+        {
+          currentAttrName = currentAttrNameBuffer.toString();
+          currentAttrNameBuffer = null;
+        }
+        if (currentAttrName != null)
+        {
+          // An attribute with no value is recorded with an empty string value.
+          currentAttrMap.put(currentAttrName,"");
+          currentAttrName = null;
+        }
+        noteTag(currentTagName,currentAttrMap);
+        currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
+      }
+      else if (thisChar == '>')
+      {
+        if (currentAttrNameBuffer.length() > 0)
+        {
+          currentAttrName = currentAttrNameBuffer.toString();
+          currentAttrNameBuffer = null;
+        }
+        if (currentAttrName != null)
+        {
+          currentAttrMap.put(currentAttrName,"");
+          currentAttrName = null;
+        }
+        currentState = BASICPARSESTATE_NORMAL;
+        noteTag(currentTagName,currentAttrMap);
+        currentTagName = null;
+        currentAttrMap = null;
+      }
+      else
+        currentAttrNameBuffer.append(thisCharLower);
+      break;
+    case BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE:
+      // We have a complete attribute name followed by whitespace; an '=' may or may not follow.
+      if (thisChar == '=')
+      {
+        currentState = BASICPARSESTATE_IN_ATTR_VALUE;
+        currentValueBuffer = new StringBuffer();
+      }
+      else if (thisChar == '>')
+      {
+        // The pending attribute turned out to have no value.  Record it (as the '/' branch below does) and
+        // clear the name, so that it is neither lost from this tag nor carried over into the next one.
+        currentAttrMap.put(currentAttrName,"");
+        currentAttrName = null;
+        currentState = BASICPARSESTATE_NORMAL;
+        noteTag(currentTagName,currentAttrMap);
+        currentTagName = null;
+        currentAttrMap = null;
+      }
+      else if (thisChar == '/')
+      {
+        currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
+        currentAttrMap.put(currentAttrName,"");
+        currentAttrName = null;
+        noteTag(currentTagName,currentAttrMap);
+      }
+      else if (!isHTMLWhitespace(thisChar))
+      {
+        // Start of the next attribute name; the previous attribute had no value.
+        currentAttrMap.put(currentAttrName,"");
+        currentState = BASICPARSESTATE_IN_ATTR_NAME;
+        currentAttrNameBuffer = new StringBuffer();
+        currentAttrNameBuffer.append(thisCharLower);
+        currentAttrName = null;
+      }
+      break;
+    case BASICPARSESTATE_IN_ATTR_VALUE:
+      // Just past the '='; whitespace is skipped until the value (quoted or not) begins.
+      if (thisChar == '\'')
+        currentState = BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE;
+      else if (thisChar == '"')
+        currentState = BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE;
+      else if (!isHTMLWhitespace(thisChar))
+      {
+        currentState = BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE;
+        currentValueBuffer.append(thisChar);
+      }
+      break;
+    case BASICPARSESTATE_IN_TAG_SAW_SLASH:
+      // The start tag has already been reported; everything up to the closing '>' is ignored.
+      if (thisChar == '>')
+      {
+        noteEndTag(currentTagName);
+        currentState = BASICPARSESTATE_NORMAL;
+        currentTagName = null;
+        currentAttrMap = null;
+      }
+      break;
+    case BASICPARSESTATE_IN_END_TAG_NAME:
+      if (isHTMLWhitespace(thisChar))
+      {
+        if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
+        {
+          // Done with the tag name!
+          currentTagName = currentTagNameBuffer.toString();
+          currentTagNameBuffer = null;
+        }
+      }
+      else if (thisChar == '>')
+      {
+        if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
+        {
+          currentTagName = currentTagNameBuffer.toString();
+          currentTagNameBuffer = null;
+        }
+        if (currentTagName != null)
+        {
+          noteEndTag(currentTagName);
+        }
+        currentTagName = null;
+        currentState = BASICPARSESTATE_NORMAL;
+      }
+      else if (currentTagNameBuffer != null)
+        // Once the name is complete the buffer is null, so trailing junk before '>' is discarded.
+        currentTagNameBuffer.append(thisCharLower);
+      break;
+    case BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE:
+      // A line break also terminates the value, which limits the damage done by an unbalanced quote.
+      if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
+      {
+        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        currentAttrName = null;
+        currentValueBuffer = null;
+        currentState = BASICPARSESTATE_IN_ATTR_NAME;
+        currentAttrNameBuffer = new StringBuffer();
+      }
+      else
+        currentValueBuffer.append(thisChar);
+      break;
+    case BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE:
+      // A line break also terminates the value, which limits the damage done by an unbalanced quote.
+      if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
+      {
+        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        currentAttrName = null;
+        currentValueBuffer = null;
+        currentState = BASICPARSESTATE_IN_ATTR_NAME;
+        currentAttrNameBuffer = new StringBuffer();
+      }
+      else
+        currentValueBuffer.append(thisChar);
+      break;
+    case BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
+      if (isHTMLWhitespace(thisChar))
+      {
+        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        currentAttrName = null;
+        currentValueBuffer = null;
+        currentState = BASICPARSESTATE_IN_ATTR_NAME;
+        currentAttrNameBuffer = new StringBuffer();
+      }
+      else if (thisChar == '/')
+      {
+        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        // Clear the per-attribute state, as every other branch does, so that the attribute name cannot
+        // be carried over into the next tag.
+        currentAttrName = null;
+        currentValueBuffer = null;
+        noteTag(currentTagName,currentAttrMap);
+        currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
+      }
+      else if (thisChar == '>')
+      {
+        currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+        currentAttrName = null;
+        currentValueBuffer = null;
+        currentState = BASICPARSESTATE_NORMAL;
+        noteTag(currentTagName,currentAttrMap);
+        currentTagName = null;
+        currentAttrMap = null;
+      }
+      else
+        currentValueBuffer.append(thisChar);
+      break;
+    default:
+      throw new ManifoldCFException("Invalid state: "+Integer.toString(currentState));
+    }
+  }
+
+  /** Called once for each start tag (or self-closing tag) recognized.  This implementation only writes a debug log line.
+  *@param tagName is the lower-cased tag name.
+  *@param attributes maps lower-cased attribute names to decoded attribute values; valueless attributes map to the empty string.
+  */
+  protected void noteTag(String tagName, Map attributes)
+    throws ManifoldCFException
+  {
+    Logging.connectors.debug(" Saw tag '"+tagName+"'");
+  }
+
+  /** Called once for each end tag recognized, and also when a self-closing tag is closed.  This implementation only writes a debug log line.
+  *@param tagName is the lower-cased tag name.
+  */
+  protected void noteEndTag(String tagName)
+    throws ManifoldCFException
+  {
+    Logging.connectors.debug(" Saw end tag '"+tagName+"'");
+  }
+
+  /** Hook intended to be called after the last character has been supplied.  This implementation does nothing. */
+  public void finishUp()
+    throws ManifoldCFException
+  {
+    // Does nothing
+  }
+
+  /** Decode an html attribute.
+  * Every "&amp;name;" sequence that mapChunk() recognizes is replaced by the text it stands for; anything else, including an
+  * ampersand that does not start a recognized reference, is copied through unchanged.
+  *@param input is the raw attribute value.
+  *@return the decoded value.
+  */
+  protected static String htmlAttributeDecode(String input)
+  {
+    StringBuffer output = new StringBuffer();
+    int i = 0;
+    while (i < input.length())
+    {
+      char x = input.charAt(i++);
+      if (x == '&')
+      {
+        int index = input.indexOf(";",i);
+        if (index != -1)
+        {
+          String chunk = input.substring(i,index);
+          String replacement = mapChunk(chunk);
+          if (replacement != null)
+          {
+            output.append(replacement);
+            // Resume scanning just past the terminating semicolon.
+            i = index + 1;
+            continue;
+          }
+        }
+      }
+      output.append(x);
+    }
+    return output.toString();
+  }
+
+  /** Map an entity reference back to a character.
+  *@param input is the text between the '&amp;' and the ';' of the reference: either a name found in mapLookup, a decimal
+  * character reference ("#38"), or a hexadecimal character reference ("#x26").
+  *@return the replacement text, or null if the reference is not recognized or does not denote a valid Unicode code point.
+  */
+  protected static String mapChunk(String input)
+  {
+    if (input.startsWith("#"))
+    {
+      try
+      {
+        int value;
+        if (input.length() > 1 && (input.charAt(1) == 'x' || input.charAt(1) == 'X'))
+          // Treat as a hexadecimal value
+          value = Integer.parseInt(input.substring(2),16);
+        else
+          // Treat as a decimal value
+          value = Integer.parseInt(input.substring(1));
+        // Refuse anything that is not a real code point rather than silently truncating it to 16 bits.
+        if (!Character.isValidCodePoint(value))
+          return null;
+        // toChars() yields a surrogate pair for code points beyond the basic multilingual plane.
+        return new String(Character.toChars(value));
+      }
+      catch (NumberFormatException e)
+      {
+        return null;
+      }
+    }
+    else
+      return (String)mapLookup.get(input);
+  }
+
+  /** Is a character HTML whitespace?  Here that means the space character and every character below it. */
+  protected static boolean isHTMLWhitespace(char x)
+  {
+    return x <= ' ';
+  }
+
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,251 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+import java.util.regex.*;
+
+/**
+ * This class accumulates form data and allows overrides.
+ *
+ * Form controls are added one at a time through {@link #addElement(Map)}; each becomes a
+ * FormItem carrying a name, a value, a category, and an enabled flag.  Login parameter rules can
+ * then be applied through {@link #applyOverrides(LoginParameters)}, and the items that are still
+ * enabled are exposed through {@link #getElementIterator()}.
+ */
+public class FormDataAccumulator implements FormData
+{
+  // Note well: We don't handle multipart posts at this time!!
+
+  // Element categorization
+  /** The control takes an arbitrary value (e.g. text, hidden, password, submit). */
+  protected final static int ELEMENTCATEGORY_FREEFORM = 0;
+  /** The control has fixed values of which one is submitted (e.g. radio button). */
+  protected final static int ELEMENTCATEGORY_FIXEDEXCLUSIVE = 1;
+  /** The control has fixed values of which any number are submitted (e.g. checkbox). */
+  protected final static int ELEMENTCATEGORY_FIXEDINCLUSIVE = 2;
+
+  /** The form's action URI */
+  protected String actionURI;
+  /** The form's submit method */
+  protected int submitMethod;
+
+  /** The set of elements */
+  protected ArrayList elementList = new ArrayList();
+
+  /** Constructor.
+  *@param actionURI is the form's action URI.
+  *@param submitMethod is the form's submit method.
+  */
+  public FormDataAccumulator(String actionURI, int submitMethod)
+  {
+    this.actionURI = actionURI;
+    this.submitMethod = submitMethod;
+  }
+
+  /** Add a form control, described by its attributes, to the accumulated data.
+  * Controls lacking a "type" or a "name" attribute, and controls of an unrecognized type, are ignored.
+  *@param attributes maps attribute names to attribute values (both Strings).
+  */
+  public void addElement(Map attributes)
+  {
+    // Interpret the input tag, and make a list of the potential elements we'll want to submit
+    String type = (String)attributes.get("type");
+    if (type == null)
+      return;
+    String name = (String)attributes.get("name");
+    if (name == null)
+      return;
+
+    String lowerType = type.toLowerCase();
+    if (lowerType.equals("submit"))
+    {
+      elementList.add(new FormItem(name,attributeOrDefault(attributes,"value","Submit Form"),ELEMENTCATEGORY_FREEFORM,true));
+    }
+    else if (lowerType.equals("hidden") || lowerType.equals("text") || lowerType.equals("password"))
+    {
+      elementList.add(new FormItem(name,attributeOrDefault(attributes,"value",""),ELEMENTCATEGORY_FREEFORM,true));
+    }
+    else if (lowerType.equals("select"))
+    {
+      // The mere presence of "selected"/"multiple" counts; their values are not examined.
+      boolean isSelected = attributes.get("selected") != null;
+      boolean isMultiple = attributes.get("multiple") != null;
+      elementList.add(new FormItem(name,attributeOrDefault(attributes,"value",""),
+        isMultiple?ELEMENTCATEGORY_FIXEDINCLUSIVE:ELEMENTCATEGORY_FIXEDEXCLUSIVE,isSelected));
+    }
+    else if (lowerType.equals("radio"))
+    {
+      boolean isSelected = attributes.get("checked") != null;
+      elementList.add(new FormItem(name,attributeOrDefault(attributes,"value",""),ELEMENTCATEGORY_FIXEDEXCLUSIVE,isSelected));
+    }
+    else if (lowerType.equals("checkbox"))
+    {
+      boolean isSelected = attributes.get("checked") != null;
+      elementList.add(new FormItem(name,attributeOrDefault(attributes,"value",""),ELEMENTCATEGORY_FIXEDINCLUSIVE,isSelected));
+    }
+    else if (lowerType.equals("textarea"))
+    {
+      elementList.add(new FormItem(name,"",ELEMENTCATEGORY_FREEFORM,true));
+    }
+  }
+
+  /** Look up a String attribute, substituting a default when it is absent.
+  *@param attributes is the attribute map.
+  *@param attributeName is the attribute to look up.
+  *@param defaultValue is the value to return when the attribute is not present.
+  *@return the attribute's value, or the default.
+  */
+  private static String attributeOrDefault(Map attributes, String attributeName, String defaultValue)
+  {
+    String value = (String)attributes.get(attributeName);
+    return (value != null) ? value : defaultValue;
+  }
+
+  /** Apply the given login parameter rules to the accumulated elements.
+  * Each rule is a name pattern plus a value, and applies to every element whose name the pattern finds a match in.
+  *@param lp supplies the rules, in order.
+  */
+  public void applyOverrides(LoginParameters lp)
+  {
+    // This map contains the control names we have ALREADY wiped clean.
+    Map overrideMap = new HashMap();
+
+    // Override the specified elements with the specified values
+    int i = 0;
+    while (i < lp.getParameterCount())
+    {
+      Pattern namePattern = lp.getParameterNamePattern(i);
+      String value = lp.getParameterValue(i);
+      i++;
+
+      // For each parameter specified, go through the element list and do the right thing.  This will require us to keep some state around about
+      // what exactly we've done to the element list so far, so that each parameter rule in turn applies properly.
+      //
+      // Each rule regular expression will be deemed to apply to all matching controls.  If the rule matches the control name, then the precise behavior
+      // will depend on the type of the control.
+      //
+      // Controls can be categorized in the following way:
+      // - free-form value
+      // - specified exclusive value (e.g. radio button)
+      // - specified inclusive value (e.g. checkbox)
+      //
+      // For free-form values, the value given will simply override the value of the element.
+      // For exclusive controls, all values in the family will be disabled, and the value matching the one specified will be enabled.
+      // For inclusive controls, all values in the family will be cleared ONCE, and then subsequently the value matching the one specified will be enabled.
+      //
+      int j = 0;
+      while (j < elementList.size())
+      {
+        FormItem fi = (FormItem)elementList.get(j++);
+        Matcher m = namePattern.matcher(fi.getElementName());
+        if (m.find())
+        {
+          // Hey, it seems to apply!
+          switch (fi.getType())
+          {
+          case ELEMENTCATEGORY_FREEFORM:
+            // Override immediately
+            fi.setValue(value);
+            break;
+          case ELEMENTCATEGORY_FIXEDEXCLUSIVE:
+            // If it doesn't match the value, disable.
+            fi.setEnabled(fi.getElementValue().equals(value));
+            break;
+          case ELEMENTCATEGORY_FIXEDINCLUSIVE:
+            // Make sure we clear the entire control ONCE (and only once).
+            if (overrideMap.get(fi.getElementName()) == null)
+            {
+              // Zip through the entire list, disabling every member of this control's family.
+              int k = 0;
+              while (k < elementList.size())
+              {
+                FormItem fi2 = (FormItem)elementList.get(k++);
+                // It is the family member (fi2) that must be cleared, not the element that triggered the sweep.
+                if (fi2.getElementName().equals(fi.getElementName()))
+                  fi2.setEnabled(false);
+              }
+              overrideMap.put(fi.getElementName(),fi.getElementName());
+            }
+            if (fi.getElementValue().equals(value))
+              fi.setEnabled(true);
+            break;
+          default:
+            break;
+          }
+        }
+      }
+    }
+  }
+
+  /** Get the full action URI for this form. */
+  public String getActionURI()
+  {
+    return actionURI;
+  }
+
+  /** Get the submit method for this form. */
+  public int getSubmitMethod()
+  {
+    return submitMethod;
+  }
+
+  /** Iterate over the active form data elements.  The returned iterator returns FormDataElement objects. */
+  public Iterator getElementIterator()
+  {
+    return new FormItemIterator(elementList);
+  }
+
+  /** Iterator over FormItems.  Only items whose enabled flag is set are returned; removal is not supported. */
+  protected static class FormItemIterator implements Iterator
+  {
+    protected ArrayList elementList;
+    protected int currentIndex = 0;
+
+    public FormItemIterator(ArrayList elementList)
+    {
+      this.elementList = elementList;
+    }
+
+    /** Advance currentIndex past any disabled items.
+    *@return true if currentIndex now designates an enabled item, false if the end of the list was reached.
+    */
+    protected boolean skipDisabled()
+    {
+      while (currentIndex < elementList.size())
+      {
+        if (((FormItem)elementList.get(currentIndex)).getEnabled())
+          return true;
+        currentIndex++;
+      }
+      return false;
+    }
+
+    public boolean hasNext()
+    {
+      return skipDisabled();
+    }
+
+    public Object next()
+    {
+      if (!skipDisabled())
+        throw new NoSuchElementException("No such element");
+      return elementList.get(currentIndex++);
+    }
+
+    public void remove()
+    {
+      throw new UnsupportedOperationException("Unsupported operation");
+    }
+  }
+
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,69 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+/**
+ * This class provides an individual data item: a single name/value pair, together with a
+ * category code and a flag saying whether the item is currently enabled.
+ */
+public class FormItem implements FormDataElement
+{
+  /** The element name. */
+  protected String name;
+  /** The element value. */
+  protected String value;
+  /** Whether this item is currently enabled. */
+  protected boolean isEnabled;
+  /** The category code supplied at construction time. */
+  protected int type;
+
+  /** Constructor.
+  *@param name is the element name.
+  *@param value is the initial element value.
+  *@param type is the category code for the element.
+  *@param isEnabled is the initial state of the enabled flag.
+  */
+  public FormItem(String name, String value, int type, boolean isEnabled)
+  {
+    this.name = name;
+    this.value = value;
+    this.type = type;
+    this.isEnabled = isEnabled;
+  }
+
+  /** Set the enabled flag. */
+  public void setEnabled(boolean enabled)
+  {
+    this.isEnabled = enabled;
+  }
+
+  /** Get the enabled flag. */
+  public boolean getEnabled()
+  {
+    return this.isEnabled;
+  }
+
+  /** Replace the element value. */
+  public void setValue(String value)
+  {
+    this.value = value;
+  }
+
+  /** Get the category code. */
+  public int getType()
+  {
+    return this.type;
+  }
+
+  /** Get the element name */
+  public String getElementName()
+  {
+    return this.name;
+  }
+
+  /** Get the element value */
+  public String getElementValue()
+  {
+    return this.value;
+  }
+
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,145 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class interprets the tag stream generated by the BasicParseState class, and keeps track of the form tags.
+* Form boundaries and the form elements that can transmit data are reported to the IHTMLHandler passed to the constructor.
+* Tag names are compared against lowercase literals, so this class assumes they arrive already lowercased
+* (NOTE(review): confirm against BasicParseState).
+*/
+public class FormParseState extends LinkParseState
+{
+  // States for form handling.
+  protected final static int FORMPARSESTATE_NORMAL = 0;
+  protected final static int FORMPARSESTATE_IN_FORM = 1;
+  protected final static int FORMPARSESTATE_IN_SELECT = 2;
+  protected final static int FORMPARSESTATE_IN_TEXTAREA = 3;
+
+  /** Current form-tracking state; one of the FORMPARSESTATE_ values. */
+  protected int formParseState = FORMPARSESTATE_NORMAL;
+  /** Value of the "name" attribute of the select currently being parsed, or null. */
+  protected String selectName = null;
+  /** Value of the "multiple" attribute of the select currently being parsed, or null. */
+  protected String selectMultiple = null;
+
+  /** Constructor.
+  *@param handler is the handler that receives the form notifications (and, via the superclass, the link notifications).
+  */
+  public FormParseState(IHTMLHandler handler)
+  {
+    super(handler);
+  }
+
+  // Override methods having to do with notification of tag discovery
+
+  /** Handle a start tag found outside of any script section.
+  * The tag is first passed to the superclass, and then interpreted according to the current form state.
+  *@param tagName is the name of the tag.
+  *@param attributes is the map of attribute name to attribute value for the tag.
+  *@throws ManifoldCFException if the form state is unrecognized, or if the handler or superclass throws.
+  */
+  protected void noteNonscriptTag(String tagName, Map attributes)
+    throws ManifoldCFException
+  {
+    super.noteNonscriptTag(tagName,attributes);
+    switch (formParseState)
+    {
+    case FORMPARSESTATE_NORMAL:
+      if (tagName.equals("form"))
+      {
+        formParseState = FORMPARSESTATE_IN_FORM;
+        handler.noteFormStart(attributes);
+      }
+      break;
+    case FORMPARSESTATE_IN_FORM:
+      if (tagName.equals("input"))
+      {
+        String type = (String)attributes.get("type");
+        // We're only interested in form elements that can actually transmit data.
+        // equalsIgnoreCase is used so the comparison does not depend on the default locale.
+        if (type != null && !"button".equalsIgnoreCase(type) && !"reset".equalsIgnoreCase(type) && !"image".equalsIgnoreCase(type))
+          handler.noteFormInput(attributes);
+      }
+      else if (tagName.equals("select"))
+      {
+        // Remember the select's attributes; they are attached to every option reported later.
+        selectName = (String)attributes.get("name");
+        selectMultiple = (String)attributes.get("multiple");
+        formParseState = FORMPARSESTATE_IN_SELECT;
+      }
+      else if (tagName.equals("textarea"))
+      {
+        formParseState = FORMPARSESTATE_IN_TEXTAREA;
+        Map textareaMap = new HashMap();
+        textareaMap.put("type","textarea");
+        // Default value is too tough to meaningfully compute because of the embedded tags etc.  Known limitation.
+        textareaMap.put("value","");
+        handler.noteFormInput(textareaMap);
+      }
+      else if (tagName.equals("button"))
+      {
+        String type = (String)attributes.get("type");
+        if (type == null || "submit".equalsIgnoreCase(type))
+        {
+          // Same as input type="submit"
+          handler.noteFormInput(attributes);
+        }
+      }
+      else if (tagName.equals("isindex"))
+      {
+        // isindex is recognized, but nothing is reported to the handler for it.
+        // NOTE(review): confirm whether it should be reported as a text input.
+      }
+      break;
+    case FORMPARSESTATE_IN_SELECT:
+      if (tagName.equals("option"))
+      {
+        // Each option is reported as its own "select" input carrying the enclosing select's name and multiple attributes.
+        Map optionMap = new HashMap();
+        optionMap.put("type","select");
+        optionMap.put("name",selectName);
+        optionMap.put("multiple",selectMultiple);
+        optionMap.put("value",(String)attributes.get("value"));
+        optionMap.put("selected",(String)attributes.get("selected"));
+        handler.noteFormInput(optionMap);
+      }
+      break;
+    case FORMPARSESTATE_IN_TEXTAREA:
+      // Tags seen while inside a textarea are ignored.
+      break;
+    default:
+      throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
+    }
+  }
+
+  /** Handle an end tag found outside of any script section.
+  * A select or textarea is left only when its own end tag is seen, so that other end tags inside it
+  * (notably the end tag of an option) do not cause the remaining options to be dropped.
+  * An end-of-form tag seen inside an unclosed select or textarea ends the form as well.
+  *@param tagName is the name of the tag being closed.
+  *@throws ManifoldCFException if the form state is unrecognized, or if the handler or superclass throws.
+  */
+  protected void noteNonscriptEndTag(String tagName)
+    throws ManifoldCFException
+  {
+    super.noteNonscriptEndTag(tagName);
+    switch (formParseState)
+    {
+    case FORMPARSESTATE_NORMAL:
+      break;
+    case FORMPARSESTATE_IN_FORM:
+      if (tagName.equals("form"))
+        endForm();
+      break;
+    case FORMPARSESTATE_IN_SELECT:
+      if (tagName.equals("select"))
+      {
+        formParseState = FORMPARSESTATE_IN_FORM;
+        selectName = null;
+        selectMultiple = null;
+      }
+      else if (tagName.equals("form"))
+        endForm();
+      break;
+    case FORMPARSESTATE_IN_TEXTAREA:
+      if (tagName.equals("textarea"))
+        formParseState = FORMPARSESTATE_IN_FORM;
+      else if (tagName.equals("form"))
+        endForm();
+      break;
+    default:
+      throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
+    }
+  }
+
+  /** Tell the handler that the current form has ended, and return to the state outside of any form. */
+  private void endForm()
+    throws ManifoldCFException
+  {
+    handler.noteFormEnd();
+    formParseState = FORMPARSESTATE_NORMAL;
+    selectName = null;
+    selectMultiple = null;
+  }
+
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,32 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+
+/** This interface describes the functionality needed by a link extractor to note a discovered link.
+*/
+public interface IDiscoveredLinkHandler
+{
+  /** Inform the world of a discovered link.
+  *@param rawURL is the raw discovered url.  This may be relative, malformed, or otherwise unsuitable for use until final form is achieved.
+  */
+  public void noteDiscoveredLink(String rawURL)
+    throws ManifoldCFException;
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,55 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This interface describes the functionality needed by an HTML processor in order to handle an HTML document.
+* In addition to the inherited link notification, implementations are told about forms, form inputs,
+* and the individual kinds of link-bearing tags.
+*/
+public interface IHTMLHandler extends IDiscoveredLinkHandler
+{
+  /** Note the start of a form.
+  *@param formAttributes is the map of attributes found on the form tag.
+  */
+  public void noteFormStart(Map formAttributes)
+    throws ManifoldCFException;
+
+  /** Note an input tag, or another form element that is reported in the same way.
+  *@param inputAttributes is the map of attributes describing the element.  Callers may pass the tag's own
+  * attributes or a map they have built themselves (FormParseState does the latter for textareas and select options).
+  */
+  public void noteFormInput(Map inputAttributes)
+    throws ManifoldCFException;
+
+  /** Note the end of a form */
+  public void noteFormEnd()
+    throws ManifoldCFException;
+
+  /** Note discovered href of an "a" tag.
+  *@param rawURL is the href attribute value as found in the document.
+  */
+  public void noteAHREF(String rawURL)
+    throws ManifoldCFException;
+
+  /** Note discovered href of a "link" tag.
+  *@param rawURL is the href attribute value as found in the document.
+  */
+  public void noteLINKHREF(String rawURL)
+    throws ManifoldCFException;
+
+  /** Note discovered IMG SRC.
+  *@param rawURL is the src attribute value as found in the document.
+  */
+  public void noteIMGSRC(String rawURL)
+    throws ManifoldCFException;
+
+  /** Note discovered FRAME SRC.
+  *@param rawURL is the src attribute value as found in the document.
+  */
+  public void noteFRAMESRC(String rawURL)
+    throws ManifoldCFException;
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,25 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+/** This interface describes the functionality needed by a redirection processor in order to handle a redirection.
+* It declares no methods of its own; everything it offers is inherited from IDiscoveredLinkHandler.
+*/
+public interface IRedirectionHandler extends IDiscoveredLinkHandler
+{
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,33 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+ 
+import org.apache.manifoldcf.core.interfaces.*;
+
+/** This interface describes the functionality needed by an XML processor in order to handle an XML document.
+* It adds notification of a discovered ttl value to the inherited link notification.
+*/
+public interface IXMLHandler extends IDiscoveredLinkHandler
+{
+  /** Inform the world of a discovered ttl value.
+  *@param rawTtlValue is the raw discovered ttl value, as an unparsed string.
+  */
+  public void noteDiscoveredTtlValue(String rawTtlValue)
+    throws ManifoldCFException;
+
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,67 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class recognizes and interprets links.
+* For each non-script "a", "link", "img" and "frame" tag, the href or src attribute is reported to the
+* IHTMLHandler, provided that the attribute is present and non-empty.
+*/
+public class LinkParseState extends ScriptParseState
+{
+
+  /** The handler to which discoveries are reported. */
+  protected IHTMLHandler handler;
+
+  /** Constructor.
+  *@param handler is the handler that receives the link notifications.
+  */
+  public LinkParseState(IHTMLHandler handler)
+  {
+    super();
+    this.handler = handler;
+  }
+
+  /** Handle a start tag found outside of any script section, reporting any link it carries.
+  *@param tagName is the name of the tag; it is compared against lowercase literals.
+  *@param attributes is the map of attribute name to attribute value for the tag.
+  *@throws ManifoldCFException if the handler or superclass throws.
+  */
+  protected void noteNonscriptTag(String tagName, Map attributes)
+    throws ManifoldCFException
+  {
+    super.noteNonscriptTag(tagName,attributes);
+    if (tagName.equals("a"))
+    {
+      String hrefValue = nonEmptyAttribute(attributes,"href");
+      if (hrefValue != null)
+        handler.noteAHREF(hrefValue);
+    }
+    else if (tagName.equals("link"))
+    {
+      String hrefValue = nonEmptyAttribute(attributes,"href");
+      if (hrefValue != null)
+        handler.noteLINKHREF(hrefValue);
+    }
+    else if (tagName.equals("img"))
+    {
+      String srcValue = nonEmptyAttribute(attributes,"src");
+      if (srcValue != null)
+        handler.noteIMGSRC(srcValue);
+    }
+    else if (tagName.equals("frame"))
+    {
+      String srcValue = nonEmptyAttribute(attributes,"src");
+      if (srcValue != null)
+        handler.noteFRAMESRC(srcValue);
+    }
+  }
+
+  /** Look up an attribute, treating an empty value the same as a missing one.
+  *@param attributes is the attribute map of the tag.
+  *@param attributeName is the name of the attribute to look up.
+  *@return the attribute value, or null if it is absent or empty.
+  */
+  private static String nonEmptyAttribute(Map attributes, String attributeName)
+  {
+    String attributeValue = (String)attributes.get(attributeName);
+    if (attributeValue == null || attributeValue.length() == 0)
+      return null;
+    return attributeValue;
+  }
+
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id

Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java Fri Oct  8 00:27:46 2010
@@ -0,0 +1,89 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class interprets the tag stream generated by the BasicParseState class, and causes script sections to be skipped.
+* Tags seen outside of a script section are forwarded to noteNonscriptTag() and noteNonscriptEndTag(), which
+* subclasses override; tags seen between a script start tag and the matching script end tag are not forwarded.
+*/
+public class ScriptParseState extends BasicParseState
+{
+  // Script tag parsing states
+  protected static final int SCRIPTPARSESTATE_NORMAL = 0;
+  protected static final int SCRIPTPARSESTATE_INSCRIPT = 1;
+
+  /** Current script-tracking state; one of the SCRIPTPARSESTATE_ values. */
+  protected int scriptParseState = SCRIPTPARSESTATE_NORMAL;
+
+  /** Constructor. */
+  public ScriptParseState()
+  {
+    super();
+  }
+
+  // Override methods having to do with notification of tag discovery
+
+  /** Handle a start tag.
+  * A script tag switches to the in-script state; any other tag seen outside of a script is forwarded to noteNonscriptTag().
+  *@param tagName is the name of the tag.
+  *@param attributes is the map of attributes for the tag.
+  *@throws ManifoldCFException if the script state is unrecognized, or if a callee throws.
+  */
+  protected void noteTag(String tagName, Map attributes)
+    throws ManifoldCFException
+  {
+    super.noteTag(tagName,attributes);
+    switch (scriptParseState)
+    {
+    case SCRIPTPARSESTATE_NORMAL:
+      if (tagName.equals("script"))
+        scriptParseState = SCRIPTPARSESTATE_INSCRIPT;
+      else
+        noteNonscriptTag(tagName,attributes);
+      break;
+    case SCRIPTPARSESTATE_INSCRIPT:
+      // Skip all tags until we see the end script one.
+      break;
+    default:
+      throw new ManifoldCFException("Unknown script parse state: "+Integer.toString(scriptParseState));
+    }
+  }
+
+  /** Handle an end tag.
+  * Outside of a script the tag is forwarded to noteNonscriptEndTag(); inside a script only the script end tag
+  * has any effect, which is to return to the normal state.
+  *@param tagName is the name of the tag being closed.
+  *@throws ManifoldCFException if the script state is unrecognized, or if a callee throws.
+  */
+  protected void noteEndTag(String tagName)
+    throws ManifoldCFException
+  {
+    super.noteEndTag(tagName);
+    switch (scriptParseState)
+    {
+    case SCRIPTPARSESTATE_NORMAL:
+      noteNonscriptEndTag(tagName);
+      break;
+    case SCRIPTPARSESTATE_INSCRIPT:
+      // Skip all tags until we see the end script one.
+      if (tagName.equals("script"))
+        scriptParseState = SCRIPTPARSESTATE_NORMAL;
+      break;
+    default:
+      // Report a corrupted state the same way noteTag() does, rather than ignoring it silently.
+      throw new ManifoldCFException("Unknown script parse state: "+Integer.toString(scriptParseState));
+    }
+  }
+
+  /** Called for each start tag found outside of a script section.  Does nothing here; subclasses override it.
+  *@param tagName is the name of the tag.
+  *@param attributes is the map of attributes for the tag.
+  */
+  protected void noteNonscriptTag(String tagName, Map attributes)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Called for each end tag found outside of a script section.  Does nothing here; subclasses override it.
+  *@param tagName is the name of the tag being closed.
+  */
+  protected void noteNonscriptEndTag(String tagName)
+    throws ManifoldCFException
+  {
+  }
+
+}

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
------------------------------------------------------------------------------
    svn:keywords = Id