You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2010/10/08 02:27:46 UTC
svn commit: r1005681 [1/2] -
/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Author: kwright
Date: Fri Oct 8 00:27:46 2010
New Revision: 1005681
URL: http://svn.apache.org/viewvc?rev=1005681&view=rev
Log:
Separate out a good chunk of the inner classes, to reduce the size of the main connector class to some degree.
Added:
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java (with props)
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java (with props)
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java (with props)
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java (with props)
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java (with props)
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java (with props)
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java (with props)
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java (with props)
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (with props)
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java (with props)
Modified:
incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,437 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.crawler.system.Logging;
+import java.util.*;
+
+/** This class represents the basic, outermost parse state. */
+public class BasicParseState
+{
+ // Lexer states. dealWithCharacter() switches on currentState, which always holds one of these values.
+ // Outside of any tag or comment; only '<' is significant.
+ protected static final int BASICPARSESTATE_NORMAL = 0;
+ // Just consumed '<'.
+ protected static final int BASICPARSESTATE_SAWLEFTBRACKET = 1;
+ // Consumed "<!".
+ protected static final int BASICPARSESTATE_SAWEXCLAMATION = 2;
+ // Consumed "<!-".
+ protected static final int BASICPARSESTATE_SAWDASH = 3;
+ // Inside a comment, after "<!--".
+ protected static final int BASICPARSESTATE_IN_COMMENT = 4;
+ // Inside a comment, just consumed one '-'.
+ protected static final int BASICPARSESTATE_SAWCOMMENTDASH = 5;
+ // Inside a comment, just consumed "--"; a '>' now ends the comment.
+ protected static final int BASICPARSESTATE_SAWSECONDCOMMENTDASH = 6;
+ // Accumulating the name of a start tag.
+ protected static final int BASICPARSESTATE_IN_TAG_NAME = 7;
+ // Inside a start tag, accumulating (or waiting for) an attribute name.
+ protected static final int BASICPARSESTATE_IN_ATTR_NAME = 8;
+ // Consumed '=' after an attribute name; the next non-whitespace character decides the quoting style.
+ protected static final int BASICPARSESTATE_IN_ATTR_VALUE = 9;
+ // Consumed '/' inside a start tag; the tag has already been reported and a '>' will report its end.
+ protected static final int BASICPARSESTATE_IN_TAG_SAW_SLASH = 10;
+ // Accumulating the name of an end tag, after "</".
+ protected static final int BASICPARSESTATE_IN_END_TAG_NAME = 11;
+ // Attribute name finished by whitespace; waiting to see whether an '=' follows.
+ protected static final int BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE = 12;
+ // Inside a '...' attribute value.
+ protected static final int BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE = 13;
+ // Inside a "..." attribute value.
+ protected static final int BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE = 14;
+ // Inside an attribute value that is not quoted.
+ protected static final int BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE = 15;
+
+
+ // The current lexer state; one of the BASICPARSESTATE_* constants.
+ protected int currentState = BASICPARSESTATE_NORMAL;
+
+ // Accumulators for the token currently being read; each is null when no such token is in progress.
+ protected StringBuffer currentTagNameBuffer = null;
+ protected StringBuffer currentAttrNameBuffer = null;
+ protected StringBuffer currentValueBuffer = null;
+
+ // Completed pieces of the tag currently being read: its lower-cased name, the attribute name
+ // awaiting a value, and the attribute name -> value map (String keys and String values).
+ protected String currentTagName = null;
+ protected String currentAttrName = null;
+ protected Map currentAttrMap = null;
+
+ // Named entity references understood by mapChunk(), keyed by the text between '&' and ';'.
+ protected static final Map mapLookup = new HashMap();
+ static
+ {
+ mapLookup.put("amp","&");
+ mapLookup.put("lt","<");
+ mapLookup.put("gt",">");
+ mapLookup.put("quot","\"");
+ }
+
+ /** Construct a lexer that starts in BASICPARSESTATE_NORMAL with no tag in progress. */
+ public BasicParseState()
+ {
+ }
+
+ /** Feed the next character of the document to the lexer.
+ * Malformed markup is never reported as an error; the state machine just resynchronizes as best it can.
+ * The only exceptions that can escape are those thrown by the noteTag()/noteEndTag() hooks, and the one
+ * raised when currentState does not hold one of the BASICPARSESTATE_* constants.
+ *@param thisChar is the character to process.
+ *@throws ManifoldCFException if a hook fails or the state variable is corrupt.
+ */
+ public void dealWithCharacter(char thisChar)
+   throws ManifoldCFException
+ {
+   // At this level we want basic lexical analysis - that is, we deal with identifying tags and comments, that's it.
+   // Tag and attribute names are accumulated in lower case; attribute values keep their original case.
+   char thisCharLower = Character.toLowerCase(thisChar);
+   switch (currentState)
+   {
+   case BASICPARSESTATE_NORMAL:
+     if (thisChar == '<')
+       currentState = BASICPARSESTATE_SAWLEFTBRACKET;
+     break;
+   case BASICPARSESTATE_SAWLEFTBRACKET:
+     if (thisChar == '!')
+       currentState = BASICPARSESTATE_SAWEXCLAMATION;
+     else if (thisChar == '/')
+     {
+       currentState = BASICPARSESTATE_IN_END_TAG_NAME;
+       currentTagNameBuffer = new StringBuffer();
+     }
+     else
+     {
+       currentState = BASICPARSESTATE_IN_TAG_NAME;
+       currentTagNameBuffer = new StringBuffer();
+       // Whitespace directly after '<' is skipped rather than becoming part of the name.
+       if (!isHTMLWhitespace(thisChar))
+         currentTagNameBuffer.append(thisCharLower);
+     }
+     break;
+   case BASICPARSESTATE_SAWEXCLAMATION:
+     if (thisChar == '-')
+       currentState = BASICPARSESTATE_SAWDASH;
+     else
+       currentState = BASICPARSESTATE_NORMAL;
+     break;
+   case BASICPARSESTATE_SAWDASH:
+     if (thisChar == '-')
+       currentState = BASICPARSESTATE_IN_COMMENT;
+     else
+       currentState = BASICPARSESTATE_NORMAL;
+     break;
+   case BASICPARSESTATE_IN_COMMENT:
+     // We're in a comment.  All we should look for is the end of the comment.
+     if (thisChar == '-')
+       currentState = BASICPARSESTATE_SAWCOMMENTDASH;
+     break;
+   case BASICPARSESTATE_SAWCOMMENTDASH:
+     if (thisChar == '-')
+       currentState = BASICPARSESTATE_SAWSECONDCOMMENTDASH;
+     else
+       currentState = BASICPARSESTATE_IN_COMMENT;
+     break;
+   case BASICPARSESTATE_SAWSECONDCOMMENTDASH:
+     if (thisChar == '>')
+       currentState = BASICPARSESTATE_NORMAL;
+     else if (thisChar != '-')
+       currentState = BASICPARSESTATE_IN_COMMENT;
+     // A further '-' keeps us here, so "--->" still closes the comment.
+     break;
+   case BASICPARSESTATE_IN_TAG_NAME:
+     if (isHTMLWhitespace(thisChar))
+     {
+       if (currentTagNameBuffer.length() > 0)
+       {
+         // Done with the tag name!
+         currentTagName = currentTagNameBuffer.toString();
+         currentTagNameBuffer = null;
+         currentAttrMap = new HashMap();
+         currentState = BASICPARSESTATE_IN_ATTR_NAME;
+         currentAttrNameBuffer = new StringBuffer();
+       }
+     }
+     else if (thisChar == '/')
+     {
+       if (currentTagNameBuffer.length() > 0)
+       {
+         currentTagName = currentTagNameBuffer.toString();
+         currentTagNameBuffer = null;
+         currentAttrMap = new HashMap();
+         currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
+         noteTag(currentTagName,currentAttrMap);
+       }
+       else
+       {
+         currentState = BASICPARSESTATE_NORMAL;
+         currentTagNameBuffer = null;
+       }
+     }
+     else if (thisChar == '>')
+     {
+       if (currentTagNameBuffer.length() > 0)
+       {
+         currentTagName = currentTagNameBuffer.toString();
+         currentTagNameBuffer = null;
+         currentAttrMap = new HashMap();
+       }
+       if (currentTagName != null)
+       {
+         noteTag(currentTagName,currentAttrMap);
+       }
+       currentState = BASICPARSESTATE_NORMAL;
+       currentTagName = null;
+       currentAttrMap = null;
+     }
+     else
+       currentTagNameBuffer.append(thisCharLower);
+     break;
+   case BASICPARSESTATE_IN_ATTR_NAME:
+     if (isHTMLWhitespace(thisChar))
+     {
+       if (currentAttrNameBuffer.length() > 0)
+       {
+         // Done with attr name!
+         currentAttrName = currentAttrNameBuffer.toString();
+         currentAttrNameBuffer = null;
+         currentState = BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE;
+       }
+     }
+     else if (thisChar == '=')
+     {
+       if (currentAttrNameBuffer.length() > 0)
+       {
+         currentAttrName = currentAttrNameBuffer.toString();
+         currentAttrNameBuffer = null;
+         currentState = BASICPARSESTATE_IN_ATTR_VALUE;
+         currentValueBuffer = new StringBuffer();
+       }
+     }
+     else if (thisChar == '/')
+     {
+       if (currentAttrNameBuffer.length() > 0)
+       {
+         currentAttrName = currentAttrNameBuffer.toString();
+         currentAttrNameBuffer = null;
+       }
+       if (currentAttrName != null)
+       {
+         // An attribute with no value is recorded with the empty string.
+         currentAttrMap.put(currentAttrName,"");
+         currentAttrName = null;
+       }
+       noteTag(currentTagName,currentAttrMap);
+       currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
+     }
+     else if (thisChar == '>')
+     {
+       if (currentAttrNameBuffer.length() > 0)
+       {
+         currentAttrName = currentAttrNameBuffer.toString();
+         currentAttrNameBuffer = null;
+       }
+       if (currentAttrName != null)
+       {
+         currentAttrMap.put(currentAttrName,"");
+         currentAttrName = null;
+       }
+       currentState = BASICPARSESTATE_NORMAL;
+       noteTag(currentTagName,currentAttrMap);
+       currentTagName = null;
+       currentAttrMap = null;
+     }
+     else
+       currentAttrNameBuffer.append(thisCharLower);
+     break;
+   case BASICPARSESTATE_IN_ATTR_LOOKING_FOR_VALUE:
+     if (thisChar == '=')
+     {
+       currentState = BASICPARSESTATE_IN_ATTR_VALUE;
+       currentValueBuffer = new StringBuffer();
+     }
+     else if (thisChar == '>')
+     {
+       // The pending attribute had no value (e.g. <input checked >).  Record it, as the '/' branch
+       // below does, and clear the name so it cannot leak into the next tag.
+       currentAttrMap.put(currentAttrName,"");
+       currentAttrName = null;
+       currentState = BASICPARSESTATE_NORMAL;
+       noteTag(currentTagName,currentAttrMap);
+       currentTagName = null;
+       currentAttrMap = null;
+     }
+     else if (thisChar == '/')
+     {
+       currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
+       currentAttrMap.put(currentAttrName,"");
+       currentAttrName = null;
+       noteTag(currentTagName,currentAttrMap);
+     }
+     else if (!isHTMLWhitespace(thisChar))
+     {
+       // A new attribute name starts; the previous attribute had no value.
+       currentAttrMap.put(currentAttrName,"");
+       currentState = BASICPARSESTATE_IN_ATTR_NAME;
+       currentAttrNameBuffer = new StringBuffer();
+       currentAttrNameBuffer.append(thisCharLower);
+       currentAttrName = null;
+     }
+     break;
+   case BASICPARSESTATE_IN_ATTR_VALUE:
+     if (thisChar == '\'')
+       currentState = BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE;
+     else if (thisChar == '"')
+       currentState = BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE;
+     else if (!isHTMLWhitespace(thisChar))
+     {
+       currentState = BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE;
+       currentValueBuffer.append(thisChar);
+     }
+     break;
+   case BASICPARSESTATE_IN_TAG_SAW_SLASH:
+     if (thisChar == '>')
+     {
+       // Self-closing tag: the start was reported when the '/' was seen, now report the end.
+       noteEndTag(currentTagName);
+       currentState = BASICPARSESTATE_NORMAL;
+       currentTagName = null;
+       currentAttrMap = null;
+     }
+     break;
+   case BASICPARSESTATE_IN_END_TAG_NAME:
+     if (isHTMLWhitespace(thisChar))
+     {
+       if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
+       {
+         // Done with the tag name!
+         currentTagName = currentTagNameBuffer.toString();
+         currentTagNameBuffer = null;
+       }
+     }
+     else if (thisChar == '>')
+     {
+       if (currentTagNameBuffer != null && currentTagNameBuffer.length() > 0)
+       {
+         currentTagName = currentTagNameBuffer.toString();
+         currentTagNameBuffer = null;
+       }
+       if (currentTagName != null)
+       {
+         noteEndTag(currentTagName);
+       }
+       currentTagName = null;
+       currentState = BASICPARSESTATE_NORMAL;
+     }
+     else if (currentTagNameBuffer != null)
+       currentTagNameBuffer.append(thisCharLower);
+     break;
+   case BASICPARSESTATE_IN_SINGLE_QUOTES_ATTR_VALUE:
+     // A line break also ends the value, which limits the damage of an unterminated quote.
+     if (thisChar == '\'' || thisChar == '\n' || thisChar == '\r')
+     {
+       currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+       currentAttrName = null;
+       currentValueBuffer = null;
+       currentState = BASICPARSESTATE_IN_ATTR_NAME;
+       currentAttrNameBuffer = new StringBuffer();
+     }
+     else
+       currentValueBuffer.append(thisChar);
+     break;
+   case BASICPARSESTATE_IN_DOUBLE_QUOTES_ATTR_VALUE:
+     if (thisChar == '"' || thisChar == '\n' || thisChar == '\r')
+     {
+       currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+       currentAttrName = null;
+       currentValueBuffer = null;
+       currentState = BASICPARSESTATE_IN_ATTR_NAME;
+       currentAttrNameBuffer = new StringBuffer();
+     }
+     else
+       currentValueBuffer.append(thisChar);
+     break;
+   case BASICPARSESTATE_IN_UNQUOTED_ATTR_VALUE:
+     if (isHTMLWhitespace(thisChar))
+     {
+       currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+       currentAttrName = null;
+       currentValueBuffer = null;
+       currentState = BASICPARSESTATE_IN_ATTR_NAME;
+       currentAttrNameBuffer = new StringBuffer();
+     }
+     else if (thisChar == '/')
+     {
+       currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+       // Clear the finished attribute, as every other branch does; a stale name would otherwise
+       // be added (with an empty value) to the attribute map of the next tag.
+       currentAttrName = null;
+       currentValueBuffer = null;
+       noteTag(currentTagName,currentAttrMap);
+       currentState = BASICPARSESTATE_IN_TAG_SAW_SLASH;
+     }
+     else if (thisChar == '>')
+     {
+       currentAttrMap.put(currentAttrName,htmlAttributeDecode(currentValueBuffer.toString()));
+       currentAttrName = null;
+       currentValueBuffer = null;
+       currentState = BASICPARSESTATE_NORMAL;
+       noteTag(currentTagName,currentAttrMap);
+       currentTagName = null;
+       currentAttrMap = null;
+     }
+     else
+       currentValueBuffer.append(thisChar);
+     break;
+   default:
+     throw new ManifoldCFException("Invalid state: "+Integer.toString(currentState));
+   }
+ }
+
+ protected void noteTag(String tagName, Map attributes)
+ throws ManifoldCFException
+ {
+ Logging.connectors.debug(" Saw tag '"+tagName+"'");
+ }
+
+ protected void noteEndTag(String tagName)
+ throws ManifoldCFException
+ {
+ Logging.connectors.debug(" Saw end tag '"+tagName+"'");
+ }
+
+ /** Signal that no more characters will be supplied.
+ * This base implementation has nothing to flush and does nothing.
+ */
+ public void finishUp()
+ throws ManifoldCFException
+ {
+ // Does nothing
+ }
+
+ /** Decode an html attribute */
+ protected static String htmlAttributeDecode(String input)
+ {
+ StringBuffer output = new StringBuffer();
+ int i = 0;
+ while (i < input.length())
+ {
+ char x = input.charAt(i++);
+ if (x == '&')
+ {
+ int index = input.indexOf(";",i);
+ if (index != -1)
+ {
+ String chunk = input.substring(i,index);
+ String replacement = mapChunk(chunk);
+ if (replacement != null)
+ {
+ output.append(replacement);
+ i = index + 1;
+ continue;
+ }
+ }
+ }
+ output.append(x);
+ }
+ return output.toString();
+ }
+
+ /** Map the body of an entity reference (the text between '&amp;' and ';') back to the text it represents.
+ * Numeric references may be decimal ("#65") or hexadecimal ("#x41" / "#X41").  Code points above
+ * U+FFFF are returned as a surrogate pair rather than being truncated to a single char.
+ *@param input is the reference body, without the leading '&amp;' or trailing ';'.
+ *@return the replacement text, or null if the reference is not recognized (unknown name, malformed
+ * number, or a value that is not a Unicode code point), in which case the caller leaves it as is.
+ */
+ protected static String mapChunk(String input)
+ {
+   if (!input.startsWith("#"))
+     return (String)mapLookup.get(input);
+
+   try
+   {
+     int codePoint;
+     if (input.length() > 1 && (input.charAt(1) == 'x' || input.charAt(1) == 'X'))
+       codePoint = Integer.parseInt(input.substring(2),16);
+     else
+       codePoint = Integer.parseInt(input.substring(1));
+     // Negative or too-large values do not name a character; report them as unrecognized.
+     if (!Character.isValidCodePoint(codePoint))
+       return null;
+     return new String(Character.toChars(codePoint));
+   }
+   catch (NumberFormatException e)
+   {
+     // Not a number at all; the reference is left undecoded.
+     return null;
+   }
+ }
+
+ /** Is a character HTML whitespace? */
+ protected static boolean isHTMLWhitespace(char x)
+ {
+ return x <= ' ';
+ }
+
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,251 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+import java.util.regex.*;
+
+/** This class accumulates form data and allows overrides */
+public class FormDataAccumulator implements FormData
+{
+ // Note well: We don't handle multipart posts at this time!!
+
+ // Element categorization
+ // FREEFORM: an override replaces the element's value.
+ protected final static int ELEMENTCATEGORY_FREEFORM = 0;
+ // FIXEDEXCLUSIVE: an override enables only the element whose value equals the override (e.g. radio button).
+ protected final static int ELEMENTCATEGORY_FIXEDEXCLUSIVE = 1;
+ // FIXEDINCLUSIVE: an override clears the whole family once, then enables each matching value (e.g. checkbox).
+ protected final static int ELEMENTCATEGORY_FIXEDINCLUSIVE = 2;
+
+ /** The form's action URI */
+ protected String actionURI;
+ /** The form's submit method */
+ protected int submitMethod;
+
+ /** The set of elements; holds FormItem objects in the order addElement() recorded them. */
+ protected ArrayList elementList = new ArrayList();
+
+ /** Create an accumulator with no elements.
+ *@param actionURI is the form's action URI, returned as is by getActionURI().
+ *@param submitMethod is the form's submit method code, returned as is by getSubmitMethod().
+ */
+ public FormDataAccumulator(String actionURI, int submitMethod)
+ {
+ this.actionURI = actionURI;
+ this.submitMethod = submitMethod;
+ }
+
+ /** Interpret one form control and, if it can carry data, append a FormItem for it.
+ * Controls lacking a "type" or a "name" attribute, and controls of an unrecognized type, are ignored.
+ * The type is matched case-insensitively using Locale.ROOT, so matching does not depend on the
+ * default locale of the JVM.
+ *@param attributes maps attribute names to String values.  The keys read are "type", "name", "value",
+ * "selected", "multiple" and "checked"; for the last three only presence matters.
+ */
+ public void addElement(Map attributes)
+ {
+   String type = (String)attributes.get("type");
+   String name = (String)attributes.get("name");
+   if (type == null || name == null)
+     return;
+
+   String lowerType = type.toLowerCase(Locale.ROOT);
+   String rawValue = (String)attributes.get("value");
+   String value = (rawValue == null) ? "" : rawValue;
+
+   if (lowerType.equals("submit"))
+   {
+     // A submit button with no value posts a default label.
+     String label = (rawValue == null) ? "Submit Form" : rawValue;
+     elementList.add(new FormItem(name,label,ELEMENTCATEGORY_FREEFORM,true));
+   }
+   else if (lowerType.equals("hidden") || lowerType.equals("text") || lowerType.equals("password"))
+   {
+     elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FREEFORM,true));
+   }
+   else if (lowerType.equals("select"))
+   {
+     // One call per option: a multiple-select behaves like checkboxes, a single-select like radio buttons.
+     boolean isSelected = attributes.get("selected") != null;
+     boolean isMultiple = attributes.get("multiple") != null;
+     int category = isMultiple ? ELEMENTCATEGORY_FIXEDINCLUSIVE : ELEMENTCATEGORY_FIXEDEXCLUSIVE;
+     elementList.add(new FormItem(name,value,category,isSelected));
+   }
+   else if (lowerType.equals("radio"))
+   {
+     boolean isChecked = attributes.get("checked") != null;
+     elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FIXEDEXCLUSIVE,isChecked));
+   }
+   else if (lowerType.equals("checkbox"))
+   {
+     boolean isChecked = attributes.get("checked") != null;
+     elementList.add(new FormItem(name,value,ELEMENTCATEGORY_FIXEDINCLUSIVE,isChecked));
+   }
+   else if (lowerType.equals("textarea"))
+   {
+     // The default text of a textarea is not tracked; it always starts out empty.
+     elementList.add(new FormItem(name,"",ELEMENTCATEGORY_FREEFORM,true));
+   }
+ }
+
+ /** Apply the configured parameter overrides to the accumulated form elements.
+ * Each rule is a name pattern plus a value, applied in order to every element whose name the pattern
+ * matches (using Matcher.find(), so a partial match is enough).
+ *@param lp supplies the rules through getParameterCount(), getParameterNamePattern() and getParameterValue().
+ */
+ public void applyOverrides(LoginParameters lp)
+ {
+   // The names of the inclusive controls we have ALREADY wiped clean.
+   Set clearedNames = new HashSet();
+
+   // Override the specified elements with the specified values
+   int i = 0;
+   while (i < lp.getParameterCount())
+   {
+     Pattern namePattern = lp.getParameterNamePattern(i);
+     String value = lp.getParameterValue(i);
+     i++;
+
+     // For each parameter specified, go through the element list and do the right thing.  This will require us to keep some state around about
+     // what exactly we've done to the element list so far, so that each parameter rule in turn applies properly.
+     //
+     // Each rule regular expression will be deemed to apply to all matching controls.  If the rule matches the control name, then the precise behavior
+     // will depend on the type of the control.
+     //
+     // Controls can be categorized in the following way:
+     // - free-form value
+     // - specified exclusive value (e.g. radio button)
+     // - specified inclusive value (e.g. checkbox)
+     //
+     // For free-form values, the value given will simply override the value of the element.
+     // For exclusive controls, all values in the family will be disabled, and the value matching the one specified will be enabled.
+     // For inclusive controls, all values in the family will be cleared ONCE, and then subsequently the value matching the one specified will be enabled.
+     //
+     int j = 0;
+     while (j < elementList.size())
+     {
+       FormItem fi = (FormItem)elementList.get(j++);
+       Matcher m = namePattern.matcher(fi.getElementName());
+       if (!m.find())
+         continue;
+
+       // Hey, it seems to apply!
+       switch (fi.getType())
+       {
+       case ELEMENTCATEGORY_FREEFORM:
+         // Override immediately
+         fi.setValue(value);
+         break;
+       case ELEMENTCATEGORY_FIXEDEXCLUSIVE:
+         // If it doesn't match the value, disable.
+         fi.setEnabled(fi.getElementValue().equals(value));
+         break;
+       case ELEMENTCATEGORY_FIXEDINCLUSIVE:
+         // Make sure we clear the entire control ONCE (and only once).
+         if (clearedNames.add(fi.getElementName()))
+         {
+           // Zip through the entire list, disabling every member of the family.  (This used to
+           // disable only the current item over and over, leaving its siblings untouched.)
+           int k = 0;
+           while (k < elementList.size())
+           {
+             FormItem fi2 = (FormItem)elementList.get(k++);
+             if (fi2.getElementName().equals(fi.getElementName()))
+               fi2.setEnabled(false);
+           }
+         }
+         if (fi.getElementValue().equals(value))
+           fi.setEnabled(true);
+         break;
+       default:
+         break;
+       }
+     }
+   }
+ }
+
+ /** Get the full action URI for this form.
+ *@return the action URI passed to the constructor.
+ */
+ public String getActionURI()
+ {
+ return actionURI;
+ }
+
+ /** Get the submit method for this form.
+ *@return the submit method code passed to the constructor.
+ */
+ public int getSubmitMethod()
+ {
+ return submitMethod;
+ }
+
+ /** Iterate over the active form data elements. The returned iterator returns FormDataElement objects.
+ * Elements that are disabled at the time the iterator reaches them are skipped.
+ *@return a new iterator reading this accumulator's element list (the list is not copied).
+ */
+ public Iterator getElementIterator()
+ {
+ return new FormItemIterator(elementList);
+ }
+
+ /** Iterator over the enabled FormItems of an element list; disabled items are skipped. */
+ protected static class FormItemIterator implements Iterator
+ {
+   protected ArrayList elementList;
+   protected int currentIndex = 0;
+
+   /** Wrap the given list; the list is read in place, not copied. */
+   public FormItemIterator(ArrayList elementList)
+   {
+     this.elementList = elementList;
+   }
+
+   /** Advance currentIndex past disabled items, stopping at the first enabled item or the end of the list. */
+   private void skipDisabled()
+   {
+     while (currentIndex != elementList.size()
+       && !((FormItem)elementList.get(currentIndex)).getEnabled())
+     {
+       currentIndex++;
+     }
+   }
+
+   public boolean hasNext()
+   {
+     skipDisabled();
+     return currentIndex != elementList.size();
+   }
+
+   public Object next()
+   {
+     skipDisabled();
+     if (currentIndex == elementList.size())
+       throw new NoSuchElementException("No such element");
+     Object item = elementList.get(currentIndex);
+     currentIndex++;
+     return item;
+   }
+
+   /** Removal is not supported. */
+   public void remove()
+   {
+     throw new UnsupportedOperationException("Unsupported operation");
+   }
+ }
+
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormDataAccumulator.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,69 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+/** A single name/value pair collected from a form, together with its category and an enabled flag. */
+public class FormItem implements FormDataElement
+{
+ /** The control name. */
+ protected String name;
+ /** The value to submit for the control. */
+ protected String value;
+ /** Whether the item currently takes part in the submission. */
+ protected boolean isEnabled;
+ /** The element category code supplied by the creator. */
+ protected int type;
+
+ /** Create an item.
+ *@param name is the control name.
+ *@param value is the initial value.
+ *@param type is the element category code.
+ *@param isEnabled is the initial enabled state.
+ */
+ public FormItem(String name, String value, int type, boolean isEnabled)
+ {
+   this.name = name;
+   this.value = value;
+   this.type = type;
+   this.isEnabled = isEnabled;
+ }
+
+ /** Turn the item on or off. */
+ public void setEnabled(boolean enabled)
+ {
+   this.isEnabled = enabled;
+ }
+
+ /** Report whether the item is currently enabled. */
+ public boolean getEnabled()
+ {
+   return this.isEnabled;
+ }
+
+ /** Replace the item's value. */
+ public void setValue(String value)
+ {
+   this.value = value;
+ }
+
+ /** Get the element category code given at construction. */
+ public int getType()
+ {
+   return this.type;
+ }
+
+ /** Get the element name */
+ public String getElementName()
+ {
+   return this.name;
+ }
+
+ /** Get the element value */
+ public String getElementValue()
+ {
+   return this.value;
+ }
+
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormItem.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,145 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class interprets the tag stream generated by the BasicParseState class, and keeps track of the form tags. */
+public class FormParseState extends LinkParseState
+{
+ // States for form handling.
+ // Not inside any form.
+ protected final static int FORMPARSESTATE_NORMAL = 0;
+ // Inside a form, but not inside a select or textarea.
+ protected final static int FORMPARSESTATE_IN_FORM = 1;
+ // Inside a select within a form; option tags are reported as inputs.
+ protected final static int FORMPARSESTATE_IN_SELECT = 2;
+ // Inside a textarea within a form; start tags are ignored.
+ protected final static int FORMPARSESTATE_IN_TEXTAREA = 3;
+
+
+ // Current form-tracking state; one of the FORMPARSESTATE_* constants.
+ protected int formParseState = FORMPARSESTATE_NORMAL;
+ // The "name" and "multiple" attributes of the enclosing select tag, copied onto each option reported.
+ // Both are null outside a select, and either may be null when the select lacks the attribute.
+ protected String selectName = null;
+ protected String selectMultiple = null;
+
+ /** Create a form parse state.
+ *@param handler is passed to the superclass constructor; this class calls its noteFormStart(),
+ * noteFormInput() and noteFormEnd() methods.
+ */
+ public FormParseState(IHTMLHandler handler)
+ {
+ super(handler);
+ }
+
+ // Override methods having to do with notification of tag discovery
+
+ /** Track form structure as start tags arrive, reporting forms and their data-carrying controls to the handler.
+ * The tag is first passed to the superclass.  Then, depending on the current form state:
+ * a "form" tag opens a form; inside a form, "input", "select", "textarea" and "button" tags are
+ * interpreted; inside a select, each "option" is reported as an input of type "select" carrying the
+ * name and "multiple" attribute of the enclosing select.
+ *@param tagName is the tag name, compared against lower-case literals.
+ *@param attributes maps attribute names to String values.
+ *@throws ManifoldCFException if the superclass or the handler fails, or the form state is invalid.
+ */
+ protected void noteNonscriptTag(String tagName, Map attributes)
+   throws ManifoldCFException
+ {
+   super.noteNonscriptTag(tagName,attributes);
+   switch (formParseState)
+   {
+   case FORMPARSESTATE_NORMAL:
+     if (tagName.equals("form"))
+     {
+       formParseState = FORMPARSESTATE_IN_FORM;
+       handler.noteFormStart(attributes);
+     }
+     break;
+   case FORMPARSESTATE_IN_FORM:
+     if (tagName.equals("input"))
+     {
+       String type = (String)attributes.get("type");
+       // We're only interested in form elements that can actually transmit data
+       if (type != null)
+       {
+         // Locale.ROOT keeps the comparison independent of the JVM's default locale.
+         String lowerType = type.toLowerCase(Locale.ROOT);
+         if (!lowerType.equals("button") && !lowerType.equals("reset") && !lowerType.equals("image"))
+           handler.noteFormInput(attributes);
+       }
+     }
+     else if (tagName.equals("select"))
+     {
+       selectName = (String)attributes.get("name");
+       selectMultiple = (String)attributes.get("multiple");
+       formParseState = FORMPARSESTATE_IN_SELECT;
+     }
+     else if (tagName.equals("textarea"))
+     {
+       formParseState = FORMPARSESTATE_IN_TEXTAREA;
+       Map textareaMap = new HashMap();
+       textareaMap.put("type","textarea");
+       // Carry the control name along; an input without a name cannot be submitted.
+       textareaMap.put("name",attributes.get("name"));
+       // Default value is too tough to meaningfully compute because of the embedded tags etc.  Known limitation.
+       textareaMap.put("value","");
+       handler.noteFormInput(textareaMap);
+     }
+     else if (tagName.equals("button"))
+     {
+       String type = (String)attributes.get("type");
+       if (type == null || type.toLowerCase(Locale.ROOT).equals("submit"))
+       {
+         // Same as input type="submit".  The type is made explicit on a copy of the attributes so
+         // that a button relying on the default type is still recognized as a submit control.
+         Map buttonMap = new HashMap(attributes);
+         buttonMap.put("type","submit");
+         handler.noteFormInput(buttonMap);
+       }
+     }
+     // NOTE: "isindex" tags are not reported.  The code here once built an attribute map for them
+     // but never passed it to the handler, so dropping that dead code changes nothing.
+     break;
+   case FORMPARSESTATE_IN_SELECT:
+     if (tagName.equals("option"))
+     {
+       String optionValue = (String)attributes.get("value");
+       String optionSelected = (String)attributes.get("selected");
+       Map optionMap = new HashMap();
+       optionMap.put("type","select");
+       optionMap.put("name",selectName);
+       optionMap.put("multiple",selectMultiple);
+       optionMap.put("value",optionValue);
+       optionMap.put("selected",optionSelected);
+       handler.noteFormInput(optionMap);
+     }
+     break;
+   case FORMPARSESTATE_IN_TEXTAREA:
+     // Markup inside a textarea is content, not controls.
+     break;
+   default:
+     throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
+   }
+ }
+
+ /** Track form structure as end tags arrive.
+ * The tag is first passed to the superclass.  A select or textarea is left only by its own end tag,
+ * so that e.g. an "option" end tag does not terminate the enclosing select and hide the remaining
+ * options.  A "form" end tag always ends the current form, even if a select or textarea inside it
+ * was never closed.
+ *@param tagName is the tag name, compared against lower-case literals.
+ *@throws ManifoldCFException if the superclass or the handler fails, or the form state is invalid.
+ */
+ protected void noteNonscriptEndTag(String tagName)
+   throws ManifoldCFException
+ {
+   super.noteNonscriptEndTag(tagName);
+   switch (formParseState)
+   {
+   case FORMPARSESTATE_NORMAL:
+     break;
+   case FORMPARSESTATE_IN_FORM:
+     if (tagName.equals("form"))
+     {
+       handler.noteFormEnd();
+       formParseState = FORMPARSESTATE_NORMAL;
+     }
+     break;
+   case FORMPARSESTATE_IN_SELECT:
+     if (tagName.equals("select"))
+     {
+       formParseState = FORMPARSESTATE_IN_FORM;
+       selectName = null;
+       selectMultiple = null;
+     }
+     else if (tagName.equals("form"))
+     {
+       // Unclosed select: the end of the form ends it too.
+       selectName = null;
+       selectMultiple = null;
+       handler.noteFormEnd();
+       formParseState = FORMPARSESTATE_NORMAL;
+     }
+     break;
+   case FORMPARSESTATE_IN_TEXTAREA:
+     if (tagName.equals("textarea"))
+     {
+       formParseState = FORMPARSESTATE_IN_FORM;
+     }
+     else if (tagName.equals("form"))
+     {
+       // Unclosed textarea: the end of the form ends it too.
+       handler.noteFormEnd();
+       formParseState = FORMPARSESTATE_NORMAL;
+     }
+     break;
+   default:
+     throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
+   }
+ }
+
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,32 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+
+/** Callback interface through which a link extractor reports each link it discovers.
+*/
+public interface IDiscoveredLinkHandler
+{
+  /** Report a discovered link.
+  *@param rawURL is the url exactly as it was discovered.  It may be relative, malformed, or
+  * otherwise unsuitable for use until it has been converted to its final form.
+  *@throws ManifoldCFException as declared; the conditions are up to the implementation.
+  */
+  void noteDiscoveredLink(String rawURL)
+    throws ManifoldCFException;
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IDiscoveredLinkHandler.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,55 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** Callback interface that an HTML processor uses to report what it finds in an HTML document:
+* form boundaries and inputs, plus the links carried by A, LINK, IMG and FRAME tags.
+* Generic link reporting is inherited from IDiscoveredLinkHandler.
+*/
+public interface IHTMLHandler extends IDiscoveredLinkHandler
+{
+  /** Report the start of a form.
+  *@param formAttributes are the attributes of the form tag (presumably attribute name to
+  * attribute value - confirm against the caller).
+  *@throws ManifoldCFException as declared; the conditions are up to the implementation.
+  */
+  void noteFormStart(Map formAttributes)
+    throws ManifoldCFException;
+
+  /** Report an input belonging to the current form.
+  *@param inputAttributes are the attributes of the input (presumably attribute name to
+  * attribute value - confirm against the caller).
+  *@throws ManifoldCFException as declared; the conditions are up to the implementation.
+  */
+  void noteFormInput(Map inputAttributes)
+    throws ManifoldCFException;
+
+  /** Report the end of the current form.
+  *@throws ManifoldCFException as declared; the conditions are up to the implementation.
+  */
+  void noteFormEnd()
+    throws ManifoldCFException;
+
+  /** Report the href discovered on an A tag.
+  *@param rawURL is the href value as found in the document.
+  *@throws ManifoldCFException as declared; the conditions are up to the implementation.
+  */
+  void noteAHREF(String rawURL)
+    throws ManifoldCFException;
+
+  /** Report the href discovered on a LINK tag.
+  *@param rawURL is the href value as found in the document.
+  *@throws ManifoldCFException as declared; the conditions are up to the implementation.
+  */
+  void noteLINKHREF(String rawURL)
+    throws ManifoldCFException;
+
+  /** Report the src discovered on an IMG tag.
+  *@param rawURL is the src value as found in the document.
+  *@throws ManifoldCFException as declared; the conditions are up to the implementation.
+  */
+  void noteIMGSRC(String rawURL)
+    throws ManifoldCFException;
+
+  /** Report the src discovered on a FRAME tag.
+  *@param rawURL is the src value as found in the document.
+  *@throws ManifoldCFException as declared; the conditions are up to the implementation.
+  */
+  void noteFRAMESRC(String rawURL)
+    throws ManifoldCFException;
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IHTMLHandler.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,25 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+/** This interface describes the functionality needed by a redirection processor in order to handle a redirection.
+* It declares no methods of its own; everything it offers is inherited from IDiscoveredLinkHandler.
+*/
+public interface IRedirectionHandler extends IDiscoveredLinkHandler
+{
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IRedirectionHandler.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,33 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+
+/** Callback interface that an XML processor uses to report what it finds in an XML document.
+* Link reporting is inherited from IDiscoveredLinkHandler; this interface adds ttl reporting.
+*/
+public interface IXMLHandler extends IDiscoveredLinkHandler
+{
+  /** Report a discovered ttl value.
+  *@param rawTtlValue is the ttl value exactly as it was discovered.
+  *@throws ManifoldCFException as declared; the conditions are up to the implementation.
+  */
+  void noteDiscoveredTtlValue(String rawTtlValue)
+    throws ManifoldCFException;
+
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/IXMLHandler.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,67 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class recognizes and interprets all links.
+* It examines each non-script tag and passes the href value of a and link tags, and the src
+* value of img and frame tags, to the supplied IHTMLHandler.
+*/
+public class LinkParseState extends ScriptParseState
+{
+
+  /** The handler that is told about each discovered link. */
+  protected IHTMLHandler handler;
+
+  /** Constructor.
+  *@param handler is the handler that is told about each discovered link.
+  */
+  public LinkParseState(IHTMLHandler handler)
+  {
+    super();
+    this.handler = handler;
+  }
+
+  /** Examine a non-script tag and report any link it carries.
+  * The superclass is notified first.  A link is reported only when the relevant attribute is
+  * present and non-empty.
+  * NOTE(review): tag names are compared against lower-case literals; presumably they arrive
+  * already lower-cased (the script parse state compares the same way) - confirm against the
+  * caller.  The previously computed but never used lower-cased copy of the name is gone.
+  *@param tagName is the name of the tag.
+  *@param attributes are the tag's attributes; values are read as strings keyed by attribute name.
+  *@throws ManifoldCFException if the superclass or the handler throws.
+  */
+  protected void noteNonscriptTag(String tagName, Map attributes)
+    throws ManifoldCFException
+  {
+    super.noteNonscriptTag(tagName,attributes);
+    if (tagName.equals("a"))
+    {
+      String hrefValue = nonEmptyAttribute(attributes,"href");
+      if (hrefValue != null)
+        handler.noteAHREF(hrefValue);
+    }
+    else if (tagName.equals("link"))
+    {
+      String hrefValue = nonEmptyAttribute(attributes,"href");
+      if (hrefValue != null)
+        handler.noteLINKHREF(hrefValue);
+    }
+    else if (tagName.equals("img"))
+    {
+      String srcValue = nonEmptyAttribute(attributes,"src");
+      if (srcValue != null)
+        handler.noteIMGSRC(srcValue);
+    }
+    else if (tagName.equals("frame"))
+    {
+      String srcValue = nonEmptyAttribute(attributes,"src");
+      if (srcValue != null)
+        handler.noteFRAMESRC(srcValue);
+    }
+  }
+
+  /** Look up an attribute value, treating an empty value the same as a missing one.
+  *@param attributes are the tag's attributes.
+  *@param attributeName is the name of the attribute to look up.
+  *@return the attribute value, or null if it is absent or empty.
+  */
+  private static String nonEmptyAttribute(Map attributes, String attributeName)
+  {
+    String value = (String)attributes.get(attributeName);
+    if (value == null || value.length() == 0)
+      return null;
+    return value;
+  }
+
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
------------------------------------------------------------------------------
svn:keywords = Id
Added: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java?rev=1005681&view=auto
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java (added)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java Fri Oct 8 00:27:46 2010
@@ -0,0 +1,89 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.crawler.connectors.webcrawler;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.util.*;
+
+/** This class interprets the tag stream generated by the BasicParseState class, and causes script sections to be skipped.
+* Tags and end tags seen outside of a script section are forwarded to noteNonscriptTag() and
+* noteNonscriptEndTag(), which subclasses override; everything between a script tag and the
+* matching script end tag is dropped.
+*/
+public class ScriptParseState extends BasicParseState
+{
+  // Script tag parsing states
+
+  /** Not inside a script section; tags are forwarded. */
+  protected static final int SCRIPTPARSESTATE_NORMAL = 0;
+  /** Inside a script section; tags are dropped until the script end tag. */
+  protected static final int SCRIPTPARSESTATE_INSCRIPT = 1;
+
+  /** The current script parse state; one of the SCRIPTPARSESTATE_ values. */
+  protected int scriptParseState = SCRIPTPARSESTATE_NORMAL;
+
+  /** Constructor. */
+  public ScriptParseState()
+  {
+    super();
+  }
+
+  // Override methods having to do with notification of tag discovery
+
+  /** Handle a tag.  The superclass is notified first.  Outside of a script section, a script
+  * tag starts a script section and any other tag is forwarded to noteNonscriptTag(); inside a
+  * script section the tag is dropped.
+  *@param tagName is the name of the tag.
+  *@param attributes are the tag's attributes.
+  *@throws ManifoldCFException if the script parse state has an unknown value, or if the
+  * superclass or noteNonscriptTag() throws.
+  */
+  protected void noteTag(String tagName, Map attributes)
+    throws ManifoldCFException
+  {
+    super.noteTag(tagName,attributes);
+    switch (scriptParseState)
+    {
+    case SCRIPTPARSESTATE_NORMAL:
+      if (tagName.equals("script"))
+        scriptParseState = SCRIPTPARSESTATE_INSCRIPT;
+      else
+        noteNonscriptTag(tagName,attributes);
+      break;
+    case SCRIPTPARSESTATE_INSCRIPT:
+      // Skip all tags until we see the end script one.
+      break;
+    default:
+      throw new ManifoldCFException("Unknown script parse state: "+Integer.toString(scriptParseState));
+    }
+  }
+
+  /** Handle an end tag.  The superclass is notified first.  Outside of a script section the end
+  * tag is forwarded to noteNonscriptEndTag(); inside a script section only the script end tag
+  * has an effect, which is to return to the normal state.
+  *@param tagName is the name of the end tag.
+  *@throws ManifoldCFException if the script parse state has an unknown value (reported the same
+  * way noteTag() reports it), or if the superclass or noteNonscriptEndTag() throws.
+  */
+  protected void noteEndTag(String tagName)
+    throws ManifoldCFException
+  {
+    super.noteEndTag(tagName);
+    switch (scriptParseState)
+    {
+    case SCRIPTPARSESTATE_NORMAL:
+      noteNonscriptEndTag(tagName);
+      break;
+    case SCRIPTPARSESTATE_INSCRIPT:
+      // Skip all tags until we see the end script one.
+      if (tagName.equals("script"))
+        scriptParseState = SCRIPTPARSESTATE_NORMAL;
+      break;
+    default:
+      // A corrupted state is reported rather than silently ignored, matching noteTag().
+      throw new ManifoldCFException("Unknown script parse state: "+Integer.toString(scriptParseState));
+    }
+  }
+
+  /** Called for every tag seen outside of a script section.  Does nothing here; subclasses
+  * override it.
+  *@param tagName is the name of the tag.
+  *@param attributes are the tag's attributes.
+  */
+  protected void noteNonscriptTag(String tagName, Map attributes)
+    throws ManifoldCFException
+  {
+  }
+
+  /** Called for every end tag seen outside of a script section.  Does nothing here; subclasses
+  * override it.
+  *@param tagName is the name of the end tag.
+  */
+  protected void noteNonscriptEndTag(String tagName)
+    throws ManifoldCFException
+  {
+  }
+
+}
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: incubator/lcf/trunk/modules/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
------------------------------------------------------------------------------
svn:keywords = Id