You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2012/08/27 08:14:21 UTC
svn commit: r1377575 - in /manifoldcf/trunk: ./
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Author: kwright
Date: Mon Aug 27 06:14:20 2012
New Revision: 1377575
URL: http://svn.apache.org/viewvc?rev=1377575&view=rev
Log:
Fix OPTION operation when VALUE is missing; part of CONNECTORS-513.
Modified:
manifoldcf/trunk/CHANGES.txt
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1377575&r1=1377574&r2=1377575&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Mon Aug 27 06:14:20 2012
@@ -3,6 +3,11 @@ $Id$
======================= 0.7-dev =====================
+CONNECTORS-513: Fix HTML parsing in WebConnector so that we
+recognize a default input type to be "text", and deal with missing
+"value" attributes in "option" tags.
+(Karl Wright)
+
CONNECTORS-514: Revamp ManifoldCF.initializeEnvironment and
ManifoldCF.cleanUpEnvironment to use reference counting so that more
than one user of these methods can coexist in the same VM.
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java?rev=1377575&r1=1377574&r2=1377575&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java Mon Aug 27 06:14:20 2012
@@ -77,6 +77,8 @@ public class BasicParseState
case BASICPARSESTATE_NORMAL:
if (thisChar == '<')
currentState = BASICPARSESTATE_SAWLEFTBRACKET;
+ else
+ noteNormalCharacter(thisChar);
break;
case BASICPARSESTATE_SAWLEFTBRACKET:
if (thisChar == '!')
@@ -372,12 +374,23 @@ public class BasicParseState
Logging.connectors.debug(" Saw end tag '"+tagName+"'");
}
+ protected void noteNormalCharacter(char thisChar)
+ throws ManifoldCFException
+ {
+ }
+
public void finishUp()
throws ManifoldCFException
{
// Does nothing
}
+ /** Decode html body text */
+ protected static String htmlBodyDecode(String input)
+ {
+ return htmlAttributeDecode(input);
+ }
+
/** Decode an html attribute */
protected static String htmlAttributeDecode(String input)
{
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java?rev=1377575&r1=1377574&r2=1377575&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java Mon Aug 27 06:14:20 2012
@@ -29,12 +29,15 @@ public class FormParseState extends Link
protected final static int FORMPARSESTATE_IN_FORM = 1;
protected final static int FORMPARSESTATE_IN_SELECT = 2;
protected final static int FORMPARSESTATE_IN_TEXTAREA = 3;
-
+ protected final static int FORMPARSESTATE_IN_OPTION = 4;
protected int formParseState = FORMPARSESTATE_NORMAL;
protected String selectName = null;
protected String selectMultiple = null;
-
+ protected String optionValue = null;
+ protected String optionSelected = null;
+ protected StringBuilder optionValueText = null;
+
public FormParseState(IHTMLHandler handler)
{
super(handler);
@@ -42,6 +45,7 @@ public class FormParseState extends Link
// Override methods having to do with notification of tag discovery
+ @Override
protected void noteNonscriptTag(String tagName, Map attributes)
throws ManifoldCFException
{
@@ -96,17 +100,26 @@ public class FormParseState extends Link
case FORMPARSESTATE_IN_SELECT:
if (tagName.equals("option"))
{
- String optionValue = (String)attributes.get("value");
- String optionSelected = (String)attributes.get("selected");
- Map optionMap = new HashMap();
- optionMap.put("type","select");
- optionMap.put("name",selectName);
- optionMap.put("multiple",selectMultiple);
- optionMap.put("value",optionValue);
- optionMap.put("selected",optionSelected);
- handler.noteFormInput(optionMap);
+ optionValue = (String)attributes.get("value");
+ optionSelected = (String)attributes.get("selected");
+ formParseState = FORMPARSESTATE_IN_OPTION;
+ // In case there's no end tag, if we have everything we need, do it now.
+ if (optionValue != null)
+ {
+ Map optionMap = new HashMap();
+ optionMap.put("type","select");
+ optionMap.put("name",selectName);
+ optionMap.put("multiple",selectMultiple);
+ optionMap.put("value",optionValue);
+ optionMap.put("selected",optionSelected);
+ handler.noteFormInput(optionMap);
+ }
+ else
+ optionValueText = new StringBuilder();
}
break;
+ case FORMPARSESTATE_IN_OPTION:
+ break;
case FORMPARSESTATE_IN_TEXTAREA:
break;
default:
@@ -114,6 +127,7 @@ public class FormParseState extends Link
}
}
+ @Override
protected void noteNonscriptEndTag(String tagName)
throws ManifoldCFException
{
@@ -134,12 +148,45 @@ public class FormParseState extends Link
selectName = null;
selectMultiple = null;
break;
+ case FORMPARSESTATE_IN_OPTION:
+ if (tagName.equals("option"))
+ {
+ // If we haven't already emitted the option, emit it now.
+ if (optionValueText != null)
+ {
+ Map optionMap = new HashMap();
+ optionMap.put("type","select");
+ optionMap.put("name",selectName);
+ optionMap.put("multiple",selectMultiple);
+ optionMap.put("value",htmlBodyDecode(optionValueText.toString()));
+ optionMap.put("selected",optionSelected);
+ handler.noteFormInput(optionMap);
+ }
+ formParseState = FORMPARSESTATE_IN_SELECT;
+ optionSelected = null;
+ optionValue = null;
+ optionValueText = null;
+ }
+ break;
case FORMPARSESTATE_IN_TEXTAREA:
- formParseState = FORMPARSESTATE_IN_FORM;
+ if (tagName.equals("textarea"))
+ formParseState = FORMPARSESTATE_IN_FORM;
break;
default:
throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
}
}
+ @Override
+ protected void noteNormalCharacter(char thisChar)
+ throws ManifoldCFException
+ {
+ super.noteNormalCharacter(thisChar);
+ if (formParseState == FORMPARSESTATE_IN_OPTION)
+ {
+ if (optionValueText != null)
+ optionValueText.append(thisChar);
+ }
+ }
+
}