You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2010/10/08 02:27:46 UTC
svn commit: r1005681 [2/2] -
/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Modified: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1005681&r1=1005680&r2=1005681&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Oct 8 00:27:46 2010
@@ -120,14 +120,6 @@ public class WebcrawlerConnector extends
understoodProtocols.put("https","https");
}
- protected static final Map mapLookup = new HashMap();
- static
- {
- mapLookup.put("amp","&");
- mapLookup.put("lt","<");
- mapLookup.put("gt",">");
- mapLookup.put("quot","\"");
- }
// Usage flag values
protected static final int ROBOTS_NONE = 0;
@@ -5343,283 +5335,6 @@ public class WebcrawlerConnector extends
}
- /** This class accumulates form data and allows overrides */
- protected static class FormDataAccumulator implements FormData
- {
- // Note well: We don't handle multipart posts at this time!!
-
- /** The form's action URI */
- protected String actionURI;
- /** The form's submit method */
- protected int submitMethod;
-
- /** The set of elements */
- protected ArrayList elementList = new ArrayList();
-
- public FormDataAccumulator(String actionURI, int submitMethod)
- {
- this.actionURI = actionURI;
- this.submitMethod = submitMethod;
- }
-
- public void addElement(Map attributes)
- {
- // Interpret the input tag, and make a list of the potential elements we'll want to submit
- String type = (String)attributes.get("type");
- if (type != null)
- {
- String name = (String)attributes.get("name");
- if (name != null)
- {
- String lowerType = type.toLowerCase();
- if (lowerType.equals("submit"))
- {
- String value = (String)attributes.get("value");
- if (value == null)
- value = "Submit Form";
- elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FREEFORM,true));
- }
- else if (lowerType.equals("hidden") || lowerType.equals("text") || lowerType.equals("password"))
- {
- String value = (String)attributes.get("value");
- if (value == null)
- value = "";
- elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FREEFORM,true));
- }
- else if (lowerType.equals("select"))
- {
- String value = (String)attributes.get("value");
- if (value == null)
- value = "";
- String selected = (String)attributes.get("selected");
- boolean isSelected = false;
- if (selected != null)
- isSelected = true;
- String multiple = (String)attributes.get("multiple");
- boolean isMultiple = false;
- if (multiple != null)
- isMultiple = true;
- elementList.add(new FormItem(name,value,isMultiple?ELEMENTCATEGORY_FIXEDINCLUSIVE:ELEMENTCATEGORY_FIXEDEXCLUSIVE,isSelected));
- }
- else if (lowerType.equals("radio"))
- {
- String value = (String)attributes.get("value");
- if (value == null)
- value = "";
- String selected = (String)attributes.get("checked");
- boolean isSelected = false;
- if (selected != null)
- isSelected = true;
- elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FIXEDEXCLUSIVE,isSelected));
- }
- else if (lowerType.equals("checkbox"))
- {
- String value = (String)attributes.get("value");
- if (value == null)
- value = "";
- String selected = (String)attributes.get("checked");
- boolean isSelected = false;
- if (selected != null)
- isSelected = true;
- elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FIXEDINCLUSIVE,isSelected));
- }
- else if (lowerType.equals("textarea"))
- {
- elementList.add(new FormItem(name,"",ELEMENTCATEGORY_FREEFORM,true));
- }
- }
- }
- }
-
- public void applyOverrides(LoginParameters lp)
- {
- // This map contains the control names we have ALREADY wiped clean.
- Map overrideMap = new HashMap();
-
- // Override the specified elements with the specified values
- int i = 0;
- while (i < lp.getParameterCount())
- {
- Pattern namePattern = lp.getParameterNamePattern(i);
- String value = lp.getParameterValue(i);
- i++;
-
- // For each parameter specified, go through the element list and do the right thing. This will require us to keep some state around about
- // what exactly we've done to the element list so far, so that each parameter rule in turn applies properly.
- //
- // Each rule regular expression will be deemed to apply to all matching controls. If the rule matches the control name, then the precise behavior
- // will depend on the type of the control.
- //
- // Controls can be categorized in the following way:
- // - free-form value
- // - specified exclusive value (e.g. radio button)
- // - specified inclusive value (e.g. checkbox)
- //
- // For free-form values, the value given will simply override the value of the element.
- // For exclusive controls, all values in the family will be disabled, and the value matching the one specified will be enabled.
- // For inclusive controls, all values in the family will be cleared ONCE, and then subsequently the value matching the one specified will be enabled.
- //
- int j = 0;
- while (j < elementList.size())
- {
- FormItem fi = (FormItem)elementList.get(j++);
- Matcher m = namePattern.matcher(fi.getElementName());
- if (m.find())
- {
- // Hey, it seems to apply!
- switch (fi.getType())
- {
- case ELEMENTCATEGORY_FREEFORM:
- // Override immediately
- fi.setValue(value);
- break;
- case ELEMENTCATEGORY_FIXEDEXCLUSIVE:
- // If it doesn't match the value, disable.
- fi.setEnabled(fi.getElementValue().equals(value));
- break;
- case ELEMENTCATEGORY_FIXEDINCLUSIVE:
- // Make sure we clear the entire control ONCE (and only once).
- if (overrideMap.get(fi.getElementName()) == null)
- {
- // Zip through the entire list
- int k = 0;
- while (k < elementList.size())
- {
- FormItem fi2 = (FormItem)elementList.get(k++);
- if (fi2.getElementName().equals(fi.getElementName()))
- fi.setEnabled(false);
- }
- overrideMap.put(fi.getElementName(),fi.getElementName());
- }
- if (fi.getElementValue().equals(value))
- fi.setEnabled(true);
- default:
- break;
- }
- }
- }
- }
- }
-
- /** Get the full action URI for this form. */
- public String getActionURI()
- {
- return actionURI;
- }
-
- /** Get the submit method for this form. */
- public int getSubmitMethod()
- {
- return submitMethod;
- }
-
- /** Iterate over the active form data elements. The returned iterator returns FormDataElement objects. */
- public Iterator getElementIterator()
- {
- return new FormItemIterator(elementList);
- }
-
- }
-
- /** Iterator over FormItems */
- protected static class FormItemIterator implements Iterator
- {
- protected ArrayList elementList;
- protected int currentIndex = 0;
-
- public FormItemIterator(ArrayList elementList)
- {
- this.elementList = elementList;
- }
-
- public boolean hasNext()
- {
- while (true)
- {
- if (currentIndex == elementList.size())
- return false;
- if (((FormItem)elementList.get(currentIndex)).getEnabled() == false)
- currentIndex++;
- else
- break;
- }
- return true;
- }
-
- public Object next()
- {
- while (true)
- {
- if (currentIndex == elementList.size())
- throw new NoSuchElementException("No such element");
- if (((FormItem)elementList.get(currentIndex)).getEnabled() == false)
- currentIndex++;
- else
- break;
- }
- return elementList.get(currentIndex++);
- }
-
- public void remove()
- {
- throw new UnsupportedOperationException("Unsupported operation");
- }
- }
-
- // Element categorization
- protected final static int ELEMENTCATEGORY_FREEFORM = 0;
- protected final static int ELEMENTCATEGORY_FIXEDEXCLUSIVE = 1;
- protected final static int ELEMENTCATEGORY_FIXEDINCLUSIVE = 2;
-
- /** This class provides an individual data item */
- protected static class FormItem implements FormDataElement
- {
- protected String name;
- protected String value;
- protected boolean isEnabled;
- protected int type;
-
- public FormItem(String name, String value, int type, boolean isEnabled)
- {
- this.name = name;
- this.value = value;
- this.isEnabled = isEnabled;
- this.type = type;
- }
-
- public void setEnabled(boolean enabled)
- {
- isEnabled = enabled;
- }
-
- public boolean getEnabled()
- {
- return isEnabled;
- }
-
- public void setValue(String value)
- {
- this.value = value;
- }
-
- public int getType()
- {
- return type;
- }
-
- /** Get the element name */
- public String getElementName()
- {
- return name;
- }
-
- /** Get the element value */
- public String getElementValue()
- {
- return value;
- }
-
- }
/** This class is the handler for HTML parsing during state transitions */
protected class FindHTMLHrefHandler extends FindHandler implements IHTMLHandler
@@ -7429,713 +7144,6 @@ public class WebcrawlerConnector extends
}
- /** This interface describes the functionality needed by a link extractor to note a discovered link.
- */
- protected static interface IDiscoveredLinkHandler
- {
- /** Inform the world of a discovered link.
- *@param rawURL is the raw discovered url. This may be relative, malformed, or otherwise unsuitable for use until final form is acheived.
- */
- public void noteDiscoveredLink(String rawURL)
- throws ManifoldCFException;
- }
-
- /** This interface describes the functionality needed by an redirection processor in order to handle a redirection.
- */
- protected static interface IRedirectionHandler extends IDiscoveredLinkHandler
- {
- }
-
- /** This interface describes the functionality needed by an XML processor in order to handle an XML document.
- */
- protected static interface IXMLHandler extends IDiscoveredLinkHandler
- {
- /** Inform the world of a discovered ttl value.
- *@param rawTtlValue is the raw discovered ttl value.
- */
- public void noteDiscoveredTtlValue(String rawTtlValue)
- throws ManifoldCFException;
-
- }
-
- /** This interface describes the functionality needed by an HTML processor in order to handle an HTML document.
- */
- protected static interface IHTMLHandler extends IDiscoveredLinkHandler
- {
- /** Note the start of a form */
- public void noteFormStart(Map formAttributes)
- throws ManifoldCFException;
-
- /** Note an input tag */
- public void noteFormInput(Map inputAttributes)
- throws ManifoldCFException;
-
- /** Note the end of a form */
- public void noteFormEnd()
- throws ManifoldCFException;
-
- /** Note discovered href */
- public void noteAHREF(String rawURL)
- throws ManifoldCFException;
-
- /** Note discovered href */
- public void noteLINKHREF(String rawURL)
- throws ManifoldCFException;
-
- /** Note discovered IMG SRC */
- public void noteIMGSRC(String rawURL)
- throws ManifoldCFException;
-
- /** Note discovered FRAME SRC */
- public void noteFRAMESRC(String rawURL)
- throws ManifoldCFException;
- }
-
- // HTML parsing classes and constants
-
- /** Is a character HTML whitespace? */
- protected static boolean isHTMLWhitespace(char x)
- {
- return x <= ' ';
- }
-
- /** Decode an html attribute */
- protected static String htmlAttributeDecode(String input)
- {
- StringBuffer output = new StringBuffer();
- int i = 0;
- while (i < input.length())
- {
- char x = input.charAt(i++);
- if (x == '&')
- {
- int index = input.indexOf(";",i);
- if (index != -1)
- {
- String chunk = input.substring(i,index);
- String replacement = mapChunk(chunk);
- if (replacement != null)
- {
- output.append(replacement);
- i = index + 1;
- continue;
- }
- }
- }
- output.append(x);
- }
- return output.toString();
- }
-
- /** Map an entity reference back to a character */
- protected static String mapChunk(String input)
- {
- if (input.startsWith("#"))
- {
- // Treat as a decimal value
- try
- {
- int value = Integer.parseInt(input.substring(1));
- StringBuffer sb = new StringBuffer();
- sb.append((char)value);
- return sb.toString();
- }
- catch (NumberFormatException e)
- {
- return null;
- }
- }
- else
- return (String)mapLookup.get(input);
- }
-
- // Basic parse states (lexical analysis)
-
- protected static final int BASICPARSESTATE_NORMAL = 0;
- protected static final int BASICPARSESTATE_SAWLEFTBRACKET = 1;
- protected static final int BASICPARSESTATE_SAWEXCLAMATION = 2;
- protected static final int BASICPARSESTATE_SAWDASH = 3;
- protected static final int BASICPARSESTATE_IN_COMMENT = 4;
- protected static final int BASICPARSESTATE_SAWCOMMENTDASH = 5;
- protected static final int BASICPARSESTATE_SAWSECONDCOMMENTDASH = 6;
- protected static final int BASICPARSESTATE_IN_TAG_NAME = 7;
- protected static final int BASICPARSESTATE_IN_ATTR_NAME = 8;
- protected static final int BASICPARSESTATE_IN_ATTR_VALUE = 9;
- protected static final int BASICPARSESTATE_IN_TAG_SAW_SLASH = 10;
- protected static final int BASICPARSESTATE_IN_END_TAG_NAME = 11;
- protected static final int BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE = 12;
- protected static final int BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE = 13;
- protected static final int BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE = 14;
- protected static final int BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE = 15;
-
-
- /** This class represents the basic, outermost parse state. */
- protected static class BasicParseState
- {
- protected int currentState = BASICPARSESTATE_NORMAL;
-
- protected StringBuffer currentTagNameBuffer = null;
- protected StringBuffer currentAttrNameBuffer = null;
- protected StringBuffer currentValueBuffer = null;
-
- protected String currentTagName = null;
- protected String currentAttrName = null;
- protected Map currentAttrMap = null;
-
- public BasicParseState()
- {
- }
-
- /** Deal with a character. No exceptions are allowed, since those would represent syntax errors, and we don't want those to cause difficulty. */
- public void dealWithCharacter(char thisChar)
- throws ManifoldCFException
- {
- // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it.
- char thisCharLower = Character.toLowerCase(thisChar);
- switch (currentState)
- {
- case BASICPARSESTATE_NORMAL:
- if (thisChar == '<')
- currentState = BASICPARSESTATE_SAWLEFTBRACKET;
- break;
- case BASICPARSESTATE_SAWLEFTBRACKET:
- if (thisChar == '!')
- currentState = BASICPARSESTATE_SAWEXCLAMATION;
- else if (thisChar == '/')
- {
- currentState = BASICPARSESTATE_IN_END_TAG_NAME;
- currentTagNameBuffer = new StringBuffer();
- }
- else
- {
- currentState = BASICPARSESTATE_IN_TAG_NAME;
- currentTagNameBuffer = new StringBuffer();
- if (!isHTMLWhitespace(thisChar))
- currentTagNameBuffer.append(thisCharLower);
- }
- break;
- case BASICPARSESTATE_SAWEXCLAMATION:
- if (thisChar == '-')
- currentState = BASICPARSESTATE_SAWDASH;
- else
- currentState = BASICPARSESTATE_NORMAL;
- break;
- case BASICPARSESTATE_SAWDASH:
- if (thisChar == '-')
- currentState = BASICPARSESTATE_IN_COMMENT;
- else
- currentState = BASICPARSESTATE_NORMAL;
- break;
- case BASICPARSESTATE_IN_COMMENT:
- // We're in a comment. All we should look for is the end of the comment.
- if (thisChar == '-')
- currentState = BASICPARSESTATE_SAWCOMMENTDASH;
- break;
- case BASICPARSESTATE_SAWCOMMENTDASH:
- if (thisChar == '-')
- currentState = BASICPARSESTATE_SAWSECONDCOMMENTDASH;
- else
- currentState = BASICPARSESTATE_IN_COMMENT;
- break;
- case BASICPARSESTATE_SAWSECONDCOMMENTDASH:
- if (thisChar == '>')
- currentState = BASICPARSESTATE_NORMAL;
- else if (thisChar != '-')
- currentState = BASICPARSESTATE_IN_COMMENT;
- break;
- case BASICPARSESTATE_IN_TAG_NAME:
- if (isHTMLWhitespace(thisChar))
- {
- if (currentTagNameBuffer.length() > 0)
- {
- // Done with the tag name!
- currentTagName = currentTagNameBuffer.toString();
- currentTagNameBuffer = null;
- currentAttrMap = new HashMap();
- currentState = BASICPARSESTATE_IN_ATTR_NAME;
- currentAttrNameBuffer = new StringBuffer();
- }
- }
- else if (thisChar == '/')
- {
- if (currentTagNameBuffer.length() > 0)
- {
- currentTagName = currentTagNameBuffer.toString();
- currentTagNameBuffer = null;
- currentAttrMap = new HashMap();
- currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
- noteTag(currentTagName,currentAttrMap);
- }
- else
- {
- currentState = BASICPARSESTATE_NORMAL;
- currentTagNameBuffer = null;
- }
- }
- else if (thisChar == '>')
- {
- if (currentTagNameBuffer.length() > 0)
- {
- currentTagName = currentTagNameBuffer.toString();
- currentTagNameBuffer = null;
- currentAttrMap = new HashMap();
- }
- if (currentTagName != null)
- {
- noteTag(currentTagName,currentAttrMap);
- }
- currentState = BASICPARSESTATE_NORMAL;
- currentTagName = null;
- currentAttrMap = null;
- }
- else
- currentTagNameBuffer.append(thisCharLower);
- break;
- case BASICPARSESTATE_IN_ATTR_NAME:
- if (isHTMLWhitespace(thisChar))
- {
- if (currentAttrNameBuffer.length() > 0)
- {
- // Done with attr name!
- currentAttrName = currentAttrNameBuffer.toString();
- currentAttrNameBuffer = null;
- currentState = BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE;
- }
- }
- else if (thisChar == '=')
- {
- if (currentAttrNameBuffer.length() > 0)
- {
- currentAttrName = currentAttrNameBuffer.toString();
- currentAttrNameBuffer = null;
- currentState = BASICPARSESTATE_IN_ATTR_VALUE;
- currentValueBuffer = new StringBuffer();
- }
- }
- else if (thisChar == '/')
- {
- if (currentAttrNameBuffer.length() > 0)
- {
- currentAttrName = currentAttrNameBuffer.toString();
- currentAttrNameBuffer = null;
- }
- if (currentAttrName != null)
- {
- currentAttrMap.put(currentAttrName,"");
- currentAttrName = null;
- }
- noteTag(currentTagName,currentAttrMap);
- currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
- }
- else if (thisChar == '>')
- {
- if (currentAttrNameBuffer.length() > 0)
- {
- currentAttrName = currentAttrNameBuffer.toString();
- currentAttrNameBuffer = null;
- }
- if (currentAttrName != null)
- {
- currentAttrMap.put(currentAttrName,"");
- currentAttrName = null;
- }
- currentState = BASICPARSESTATE_NORMAL;
- noteTag(currentTagName,currentAttrMap);
- currentTagName = null;
- currentAttrMap = null;
- }
- else
- currentAttrNameBuffer.append(thisCharLower);
- break;
- case BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE:
- if (thisChar == '=')
- {
- currentState = BASICPARSESTATE_IN_ATTR_VALUE;
- currentValueBuffer = new StringBuffer();
- }
- else if (thisChar == '>')
- {
- currentState = BASICPARSESTATE_NORMAL;
- noteTag(currentTagName,currentAttrMap);
- currentTagName = null;
- currentAttrMap = null;
- }
- else if (thisChar == '/')
- {
- currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
- currentAttrMap.put(currentAttrName,"");
- currentAttrName = null;
- noteTag(currentTagName,currentAttrMap);
- }
- else if (!isHTMLWhitespace(thisChar))
- {
- currentAttrMap.put(currentAttrName,"");
- currentState = BASICPARSESTATE_IN_ATTR_NAME;
- currentAttrNameBuffer = new StringBuffer();
- currentAttrNameBuffer.append(thisCharLower);
- currentAttrName = null;
- }
- break;
- case BASICPARSESTATE_IN_ATTR_VALUE:
- if (thisChar == '\'')
- currentState = BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE;
- else if (thisChar == '"')
- currentState = BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE;
- else if (!isHTMLWhitespace(thisChar))
- {
- currentState = BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE;
- currentValueBuffer.append(thisChar);
- }
- break;
- case BASICPARSESTATE_IN_TAG_SAW_SLASH:
- if (thisChar == '>')
- {
- noteEndTag(currentTagName);
- currentState = BASICPARSESTATE_NORMAL;
- currentTagName = null;
- currentAttrMap = null;
- }
- break;
- case BASICPARSESTATE_IN_END_TAG_NAME:
- if (isHTMLWhitespace(thisChar))
- {
- if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
- {
- // Done with the tag name!
- currentTagName = currentTagNameBuffer.toString();
- currentTagNameBuffer = null;
- }
- }
- else if (thisChar == '>')
- {
- if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
- {
- currentTagName = currentTagNameBuffer.toString();
- currentTagNameBuffer = null;
- }
- if (currentTagName != null)
- {
- noteEndTag(currentTagName);
- }
- currentTagName = null;
- currentState = BASICPARSESTATE_NORMAL;
- }
- else if (currentTagNameBuffer != null)
- currentTagNameBuffer.append(thisCharLower);
- break;
- case BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE:
- if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
- {
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
- currentAttrName = null;
- currentValueBuffer = null;
- currentState = BASICPARSESTATE_IN_ATTR_NAME;
- currentAttrNameBuffer = new StringBuffer();
- }
- else
- currentValueBuffer.append(thisChar);
- break;
- case BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE:
- if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
- {
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
- currentAttrName = null;
- currentValueBuffer = null;
- currentState = BASICPARSESTATE_IN_ATTR_NAME;
- currentAttrNameBuffer = new StringBuffer();
- }
- else
- currentValueBuffer.append(thisChar);
- break;
- case BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
- if (isHTMLWhitespace(thisChar))
- {
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
- currentAttrName = null;
- currentValueBuffer = null;
- currentState = BASICPARSESTATE_IN_ATTR_NAME;
- currentAttrNameBuffer = new StringBuffer();
- }
- else if (thisChar == '/')
- {
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
- noteTag(currentTagName,currentAttrMap);
- currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
- }
- else if (thisChar == '>')
- {
- currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
- currentAttrName = null;
- currentValueBuffer = null;
- currentState = BASICPARSESTATE_NORMAL;
- noteTag(currentTagName,currentAttrMap);
- currentTagName = null;
- currentAttrMap = null;
- }
- else
- currentValueBuffer.append(thisChar);
- break;
- default:
- throw new ManifoldCFException("Invalid state: "+Integer.toString(currentState));
- }
- }
-
- protected void noteTag(String tagName, Map attributes)
- throws ManifoldCFException
- {
- Logging.connectors.debug(" Saw tag '"+tagName+"'");
- }
-
- protected void noteEndTag(String tagName)
- throws ManifoldCFException
- {
- Logging.connectors.debug(" Saw end tag '"+tagName+"'");
- }
-
- public void finishUp()
- throws ManifoldCFException
- {
- // Does nothing
- }
-
- }
-
- // Script tag parsing states
-
- protected static final int SCRIPTPARSESTATE_NORMAL = 0;
- protected static final int SCRIPTPARSESTATE_INSCRIPT = 1;
-
- /** This class interprets the tag stream generated by the BasicParseState class, and causes script sections to be skipped */
- protected static class ScriptParseState extends BasicParseState
- {
- protected int scriptParseState = SCRIPTPARSESTATE_NORMAL;
-
- public ScriptParseState()
- {
- super();
- }
-
- // Override methods having to do with notification of tag discovery
-
- protected void noteTag(String tagName, Map attributes)
- throws ManifoldCFException
- {
- super.noteTag(tagName,attributes);
- switch (scriptParseState)
- {
- case SCRIPTPARSESTATE_NORMAL:
- if (tagName.equals("script"))
- scriptParseState = SCRIPTPARSESTATE_INSCRIPT;
- else
- noteNonscriptTag(tagName,attributes);
- break;
- case SCRIPTPARSESTATE_INSCRIPT:
- // Skip all tags until we see the end script one.
- break;
- default:
- throw new ManifoldCFException("Unknown script parse state: "+Integer.toString(scriptParseState));
- }
- }
-
- protected void noteEndTag(String tagName)
- throws ManifoldCFException
- {
- super.noteEndTag(tagName);
- switch (scriptParseState)
- {
- case SCRIPTPARSESTATE_NORMAL:
- noteNonscriptEndTag(tagName);
- break;
- case SCRIPTPARSESTATE_INSCRIPT:
- // Skip all tags until we see the end script one.
- if (tagName.equals("script"))
- scriptParseState = SCRIPTPARSESTATE_NORMAL;
- break;
- default:
- break;
- }
- }
-
- protected void noteNonscriptTag(String tagName, Map attributes)
- throws ManifoldCFException
- {
- }
-
- protected void noteNonscriptEndTag(String tagName)
- throws ManifoldCFException
- {
- }
-
- }
-
- /** This class recognizes and interprets all links */
- protected static class LinkParseState extends ScriptParseState
- {
-
- protected IHTMLHandler handler;
-
- public LinkParseState(IHTMLHandler handler)
- {
- super();
- this.handler = handler;
- }
-
- protected void noteNonscriptTag(String tagName, Map attributes)
- throws ManifoldCFException
- {
- super.noteNonscriptTag(tagName,attributes);
- String lowerTagName = tagName.toLowerCase();
- if (tagName.equals("a"))
- {
- String hrefValue = (String)attributes.get("href");
- if (hrefValue != null && hrefValue.length() > 0)
- handler.noteAHREF(hrefValue);
- }
- else if (tagName.equals("link"))
- {
- String hrefValue = (String)attributes.get("href");
- if (hrefValue != null && hrefValue.length() > 0)
- handler.noteLINKHREF(hrefValue);
- }
- else if (tagName.equals("img"))
- {
- String srcValue = (String)attributes.get("src");
- if (srcValue != null && srcValue.length() > 0)
- handler.noteIMGSRC(srcValue);
- }
- else if (tagName.equals("frame"))
- {
- String srcValue = (String)attributes.get("src");
- if (srcValue != null && srcValue.length() > 0)
- handler.noteFRAMESRC(srcValue);
- }
- }
-
- }
-
- // States for form handling.
-
- protected final static int FORMPARSESTATE_NORMAL = 0;
- protected final static int FORMPARSESTATE_IN_FORM = 1;
- protected final static int FORMPARSESTATE_IN_SELECT = 2;
- protected final static int FORMPARSESTATE_IN_TEXTAREA = 3;
-
- /** This class interprets the tag stream generated by the BasicParseState class, and keeps track of the form tags. */
- protected static class FormParseState extends LinkParseState
- {
- protected int formParseState = FORMPARSESTATE_NORMAL;
- protected String selectName = null;
- protected String selectMultiple = null;
-
- public FormParseState(IHTMLHandler handler)
- {
- super(handler);
- }
-
- // Override methods having to do with notification of tag discovery
-
- protected void noteNonscriptTag(String tagName, Map attributes)
- throws ManifoldCFException
- {
- super.noteNonscriptTag(tagName,attributes);
- switch (formParseState)
- {
- case FORMPARSESTATE_NORMAL:
- if (tagName.equals("form"))
- {
- formParseState = FORMPARSESTATE_IN_FORM;
- handler.noteFormStart(attributes);
- }
- break;
- case FORMPARSESTATE_IN_FORM:
- if (tagName.equals("input"))
- {
- String type = (String)attributes.get("type");
- // We're only interested in form elements that can actually transmit data
- if (type != null && !type.toLowerCase().equals("button") && !type.toLowerCase().equals("reset") && !type.toLowerCase().equals("image"))
- handler.noteFormInput(attributes);
- }
- else if (tagName.equals("select"))
- {
- selectName = (String)attributes.get("name");
- selectMultiple = (String)attributes.get("multiple");
- formParseState = FORMPARSESTATE_IN_SELECT;
- }
- else if (tagName.equals("textarea"))
- {
- formParseState = FORMPARSESTATE_IN_TEXTAREA;
- Map textareaMap = new HashMap();
- textareaMap.put("type","textarea");
- // Default value is too tough to meaningfully compute because of the embedded tags etc. Known limitation.
- textareaMap.put("value","");
- handler.noteFormInput(textareaMap);
- }
- else if (tagName.equals("button"))
- {
- String type = (String)attributes.get("type");
- if (type == null || type.toLowerCase().equals("submit"))
- {
- // Same as input type="submit"
- handler.noteFormInput(attributes);
- }
- }
- else if (tagName.equals("isindex"))
- {
- Map indexMap = new HashMap();
- indexMap.put("type","text");
- }
- break;
- case FORMPARSESTATE_IN_SELECT:
- if (tagName.equals("option"))
- {
- String optionValue = (String)attributes.get("value");
- String optionSelected = (String)attributes.get("selected");
- Map optionMap = new HashMap();
- optionMap.put("type","select");
- optionMap.put("name",selectName);
- optionMap.put("multiple",selectMultiple);
- optionMap.put("value",optionValue);
- optionMap.put("selected",optionSelected);
- handler.noteFormInput(optionMap);
- }
- break;
- case FORMPARSESTATE_IN_TEXTAREA:
- break;
- default:
- throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
- }
- }
-
- protected void noteNonscriptEndTag(String tagName)
- throws ManifoldCFException
- {
- super.noteNonscriptEndTag(tagName);
- switch (formParseState)
- {
- case FORMPARSESTATE_NORMAL:
- break;
- case FORMPARSESTATE_IN_FORM:
- if (tagName.equals("form"))
- {
- handler.noteFormEnd();
- formParseState = FORMPARSESTATE_NORMAL;
- }
- break;
- case FORMPARSESTATE_IN_SELECT:
- formParseState = FORMPARSESTATE_IN_FORM;
- selectName = null;
- selectMultiple = null;
- break;
- case FORMPARSESTATE_IN_TEXTAREA:
- formParseState = FORMPARSESTATE_IN_FORM;
- break;
- default:
- throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
- }
- }
-
- }
-
}