You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2012/08/27 08:14:21 UTC

svn commit: r1377575 - in /manifoldcf/trunk: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/

Author: kwright
Date: Mon Aug 27 06:14:20 2012
New Revision: 1377575

URL: http://svn.apache.org/viewvc?rev=1377575&view=rev
Log:
Fix OPTION operation when VALUE is missing; part of CONNECTORS-513.

Modified:
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1377575&r1=1377574&r2=1377575&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Mon Aug 27 06:14:20 2012
@@ -3,6 +3,11 @@ $Id$
 
 ======================= 0.7-dev =====================
 
+CONNECTORS-513: Fix HTML parsing in WebConnector so that we
+recognize a default input type to be "text", and deal with missing
+"value" attributes in "option" tags.
+(Karl Wright)
+
 CONNECTORS-514: Revamp ManifoldCF.initializeEnvironment and
 ManifoldCF.cleanUpEnvironment to use reference counting so that more
 than one user of these methods can coexist in the same VM.

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java?rev=1377575&r1=1377574&r2=1377575&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java Mon Aug 27 06:14:20 2012
@@ -77,6 +77,8 @@ public class BasicParseState
     case BASICPARSESTATE_NORMAL:
       if (thisChar == '<')
         currentState = BASICPARSESTATE_SAWLEFTBRACKET;
+      else
+        noteNormalCharacter(thisChar);
       break;
     case BASICPARSESTATE_SAWLEFTBRACKET:
       if (thisChar == '!')
@@ -372,12 +374,23 @@ public class BasicParseState
     Logging.connectors.debug(" Saw end tag '"+tagName+"'");
   }
 
+  protected void noteNormalCharacter(char thisChar)
+    throws ManifoldCFException
+  {
+  }
+  
   public void finishUp()
     throws ManifoldCFException
   {
     // Does nothing
   }
 
+  /** Decode html body text */
+  protected static String htmlBodyDecode(String input)
+  {
+    return htmlAttributeDecode(input);
+  }
+  
   /** Decode an html attribute */
   protected static String htmlAttributeDecode(String input)
   {

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java?rev=1377575&r1=1377574&r2=1377575&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java Mon Aug 27 06:14:20 2012
@@ -29,12 +29,15 @@ public class FormParseState extends Link
   protected final static int FORMPARSESTATE_IN_FORM = 1;
   protected final static int FORMPARSESTATE_IN_SELECT = 2;
   protected final static int FORMPARSESTATE_IN_TEXTAREA = 3;
-
+  protected final static int FORMPARSESTATE_IN_OPTION = 4;
   
   protected int formParseState = FORMPARSESTATE_NORMAL;
   protected String selectName = null;
   protected String selectMultiple = null;
-
+  protected String optionValue = null;
+  protected String optionSelected = null;
+  protected StringBuilder optionValueText = null;
+  
   public FormParseState(IHTMLHandler handler)
   {
     super(handler);
@@ -42,6 +45,7 @@ public class FormParseState extends Link
 
   // Override methods having to do with notification of tag discovery
 
+  @Override
   protected void noteNonscriptTag(String tagName, Map attributes)
     throws ManifoldCFException
   {
@@ -96,17 +100,26 @@ public class FormParseState extends Link
     case FORMPARSESTATE_IN_SELECT:
       if (tagName.equals("option"))
       {
-        String optionValue = (String)attributes.get("value");
-        String optionSelected = (String)attributes.get("selected");
-        Map optionMap = new HashMap();
-        optionMap.put("type","select");
-        optionMap.put("name",selectName);
-        optionMap.put("multiple",selectMultiple);
-        optionMap.put("value",optionValue);
-        optionMap.put("selected",optionSelected);
-        handler.noteFormInput(optionMap);
+        optionValue = (String)attributes.get("value");
+        optionSelected = (String)attributes.get("selected");
+        formParseState = FORMPARSESTATE_IN_OPTION;
+        // In case there's no end tag, if we have everything we need, do it now.
+        if (optionValue != null)
+        {
+          Map optionMap = new HashMap();
+          optionMap.put("type","select");
+          optionMap.put("name",selectName);
+          optionMap.put("multiple",selectMultiple);
+          optionMap.put("value",optionValue);
+          optionMap.put("selected",optionSelected);
+          handler.noteFormInput(optionMap);
+        }
+        else
+          optionValueText = new StringBuilder();
       }
       break;
+    case FORMPARSESTATE_IN_OPTION:
+      break;
     case FORMPARSESTATE_IN_TEXTAREA:
       break;
     default:
@@ -114,6 +127,7 @@ public class FormParseState extends Link
     }
   }
 
+  @Override
   protected void noteNonscriptEndTag(String tagName)
     throws ManifoldCFException
   {
@@ -134,12 +148,45 @@ public class FormParseState extends Link
       selectName = null;
       selectMultiple = null;
       break;
+    case FORMPARSESTATE_IN_OPTION:
+      if (tagName.equals("option"))
+      {
+        // If we haven't already emitted the option, emit it now.
+        if (optionValueText != null)
+        {
+          Map optionMap = new HashMap();
+          optionMap.put("type","select");
+          optionMap.put("name",selectName);
+          optionMap.put("multiple",selectMultiple);
+          optionMap.put("value",htmlBodyDecode(optionValueText.toString()));
+          optionMap.put("selected",optionSelected);
+          handler.noteFormInput(optionMap);
+        }
+        formParseState = FORMPARSESTATE_IN_SELECT;
+        optionSelected = null;
+        optionValue = null;
+        optionValueText = null;
+      }
+      break;
     case FORMPARSESTATE_IN_TEXTAREA:
-      formParseState = FORMPARSESTATE_IN_FORM;
+      if (tagName.equals("textarea"))
+        formParseState = FORMPARSESTATE_IN_FORM;
       break;
     default:
       throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
     }
   }
 
+  @Override
+  protected void noteNormalCharacter(char thisChar)
+    throws ManifoldCFException
+  {
+    super.noteNormalCharacter(thisChar);
+    if (formParseState == FORMPARSESTATE_IN_OPTION)
+    {
+      if (optionValueText != null)
+        optionValueText.append(thisChar);
+    }
+  }
+
 }