You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/10 23:55:31 UTC

svn commit: r1444628 [1/2] - in /manifoldcf/trunk: ./ connectors/rss/ connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler...

Author: kwright
Date: Sun Feb 10 22:55:30 2013
New Revision: 1444628

URL: http://svn.apache.org/r1444628
Log:
Fix for CONNECTORS-633.  Remove dependency on patched xerces parser, and instead use our own parser for fuzzy parsing of XML.

Added:
    manifoldcf/trunk/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/
      - copied from r1444626, manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/
Removed:
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/BasicParseState.java
Modified:
    manifoldcf/trunk/   (props changed)
    manifoldcf/trunk/CHANGES.txt
    manifoldcf/trunk/build.xml
    manifoldcf/trunk/connectors/rss/   (props changed)
    manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
    manifoldcf/trunk/connectors/wiki/   (props changed)
    manifoldcf/trunk/connectors/wiki/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/wiki/WikiConnector.java
    manifoldcf/trunk/mvn-bootstrap.bat
    manifoldcf/trunk/mvn-bootstrap.sh
    manifoldcf/trunk/pom.xml

Propchange: manifoldcf/trunk/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-633:r1442813-1444626

Modified: manifoldcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/CHANGES.txt?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/CHANGES.txt (original)
+++ manifoldcf/trunk/CHANGES.txt Sun Feb 10 22:55:30 2013
@@ -3,6 +3,11 @@ $Id$
 
 ======================= 1.2-dev =====================
 
+CONNECTORS-633: Remove dependency on custom version of xerces;
+extend the simple tag parser to be able to handle XML, and move it into
+core/fuzzyml for general use.
+(Karl Wright)
+
 CONNECTORS-639: Maven execute of jetty-runner fails.
 (Maciej Liżewski)
 

Modified: manifoldcf/trunk/build.xml
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/build.xml?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/build.xml (original)
+++ manifoldcf/trunk/build.xml Sun Feb 10 22:55:30 2013
@@ -3153,29 +3153,12 @@ Use Apache Forrest version forrest-0.9-d
     
     <target name="download-xerces">
         <mkdir dir="lib"/>
-        <!-- Download and build patched version of xerces 2.9.1 -->
-        <mkdir dir="build/download"/>
-        <delete dir="build/download/xerces2-j"/>
-        <antcall target="checkout-source-via-svn">
-            <param name="root-dir" value="build/download"/>
-            <param name="svn-url" value="http://svn.apache.org/repos/asf/xerces/java/tags/Xerces-J_2_9_1"/>
-            <param name="dir-name" value="xerces2-j"/>
-        </antcall>
-        <!-- Apply mcf-specific features and fixes patch -->
-        <antcall target="patch-source">
-            <param name="root-dir" value="build/download"/>
-            <param name="diff-file" value="../../upstream-diffs/xerces2-j-2.9.1.mcf.patch"/>
-            <param name="dir-name" value="xerces2-j"/>
-        </antcall>
-        <!-- Build it -->
-        <exec dir="build/download/xerces2-j" executable="cmd" osfamily="windows" failifexecutionfails="true" failonerror="true">
-            <arg line="/c build.bat jar"/>
-        </exec>
-        <exec dir="build/download/xerces2-j" executable="/bin/sh" osfamily="unix" failifexecutionfails="true" failonerror="true">
-            <arg value="build.sh" />
-            <arg value="jar" />
-        </exec>
-        <copy todir="lib" file="build/download/xerces2-j/build/xercesImpl.jar"/>
+        <property name="xerces-version" value="2.10.0"/>
+        <property name="xerces-package" value="xerces"/>
+        <antcall target="download-via-maven"><param name="project-path" value="${xerces-package}"/><param name="artifact-version" value="${xerces-version}"/><param name="target" value="lib"/>
+            <param name="artifact-name" value="xercesImpl"/>
+            <param name="artifact-type" value="jar"/>
+        </antcall>
     </target>
     
     <target name="download-xalan">

Propchange: manifoldcf/trunk/connectors/rss/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-633/connectors/rss:r1442813-1444626

Modified: manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Sun Feb 10 22:55:30 2013
@@ -24,13 +24,7 @@ import org.apache.manifoldcf.crawler.int
 import org.apache.manifoldcf.crawler.system.Logging;
 import org.apache.manifoldcf.crawler.system.ManifoldCF;
 
-import org.xml.sax.Attributes;
-
-import org.apache.manifoldcf.core.common.XMLDoc;
-import org.apache.manifoldcf.agents.common.XMLStream;
-import org.apache.manifoldcf.agents.common.XMLContext;
-import org.apache.manifoldcf.agents.common.XMLStringContext;
-import org.apache.manifoldcf.agents.common.XMLFileContext;
+import org.apache.manifoldcf.core.fuzzyml.*;
 
 import org.apache.http.conn.ConnectTimeoutException;
 import org.apache.http.client.RedirectException;
@@ -3473,31 +3467,15 @@ public class RSSConnector extends org.ap
       }
       try
       {
-        // Parse the document.  This will cause various things to occur, within the instantiated XMLContext class.
-        XMLStream x = new XMLStream();
+        Parser p = new Parser();
+        // Parse the document.  This will cause various things to occur, within the instantiated XMLParsingContext class.
+        XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState();
         OuterContextClass c = new OuterContextClass(x,documentIdentifier,activities,filter);
         x.setContext(c);
         try
         {
-          try
-          {
-            x.parse(is);
-          }
-          catch (ManifoldCFException e)
-          {
-            // Ignore XML parsing errors.
-            if (e.getMessage().indexOf("pars") >= 0)
-            {
-              if (Logging.connectors.isDebugEnabled())
-                Logging.connectors.debug("RSS: XML document '"+documentIdentifier+"' was unparseable ("+e.getMessage()+"), skipping");
-
-              c.setDefaultRescanTimeIfNeeded();
-
-              return;
-            }
-            throw e;
-
-          }
+          // Believe it or not, there are no parsing errors we can get back now.
+          p.parseWithCharsetDetection(null,is,x);
           c.checkIfValidFeed();
           c.setDefaultRescanTimeIfNeeded();
         }
@@ -3531,7 +3509,7 @@ public class RSSConnector extends org.ap
   }
 
   /** This class handles the outermost XML context for the feed document. */
-  protected class OuterContextClass extends XMLContext
+  protected class OuterContextClass extends XMLParsingContext
   {
     /** Keep track of the number of valid feed signals we saw */
     protected int outerTagCount = 0;
@@ -3544,7 +3522,7 @@ public class RSSConnector extends org.ap
     /** Flag indicating the the rescan time was set for this feed */
     protected boolean rescanTimeSet = false;
 
-    public OuterContextClass(XMLStream theStream, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public OuterContextClass(XMLFuzzyHierarchicalParseState theStream, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
       super(theStream);
       this.documentIdentifier = documentIdentifier;
@@ -3585,8 +3563,9 @@ public class RSSConnector extends org.ap
     }
 
     /** Handle the tag beginning to set the correct second-level parsing context */
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       if (localName.equals("rss"))
       {
@@ -3594,36 +3573,37 @@ public class RSSConnector extends org.ap
         outerTagCount++;
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("RSS: Parsed bottom-level XML for RSS document '"+documentIdentifier+"'");
-        return new RSSContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new RSSContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
       else if (localName.equals("RDF"))
       {
         // RDF/Atom feed detected
         outerTagCount++;
-        return new RDFContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new RDFContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
       else if (localName.equals("feed"))
       {
         // Basic feed detected
         outerTagCount++;
-        return new FeedContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new FeedContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
       else if (localName.equals("urlset") || localName.equals("sitemapindex"))
       {
         // Sitemap detected
         outerTagCount++;
-        return new UrlsetContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new UrlsetContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
       
       // The default action is to establish a new default context.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
     /** Handle the tag ending */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext context = theStream.getContext();
+      XMLParsingContext context = theStream.getContext();
       String tagName = context.getLocalname();
       if (tagName.equals("rss"))
       {
@@ -3647,7 +3627,7 @@ public class RSSConnector extends org.ap
 
   }
 
-  protected class RSSContextClass extends XMLContext
+  protected class RSSContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -3658,33 +3638,35 @@ public class RSSConnector extends org.ap
     /** Rescan time set flag */
     protected boolean rescanTimeSet = false;
 
-    public RSSContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public RSSContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // Handle each channel
       if (localName.equals("channel"))
       {
         // Channel detected
-        return new RSSChannelContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new RSSChannelContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
 
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
       // If it's our channel tag, process global channel information
-      XMLContext context = theStream.getContext();
+      XMLParsingContext context = theStream.getContext();
       String tagName = context.getLocalname();
       if (tagName.equals("channel"))
       {
@@ -3703,7 +3685,7 @@ public class RSSConnector extends org.ap
 
   }
 
-  protected class RSSChannelContextClass extends XMLContext
+  protected class RSSChannelContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -3715,40 +3697,42 @@ public class RSSConnector extends org.ap
     /** TTL value is set on a per-channel basis */
     protected String ttlValue = null;
 
-    public RSSChannelContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public RSSChannelContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("ttl"))
       {
         // TTL value seen.  Prepare to record it, as a string.
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("item"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new RSSItemContextClass(theStream,namespaceURI,localName,qName,atts,filter.getDechromedContentMode());
+        return new RSSItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("ttl"))
         // If the current context must be the TTL one, record its data value.
-        ttlValue = ((XMLStringContext)theContext).getValue();
+        ttlValue = ((XMLStringParsingContext)theContext).getValue();
       else if (theTag.equals("item"))
       {
         // It's an item.
@@ -3813,7 +3797,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class RSSItemContextClass extends XMLContext
+  protected class RSSItemContextClass extends XMLParsingContext
   {
     protected int dechromedContentMode;
     protected String guidField = null;
@@ -3824,40 +3808,41 @@ public class RSSConnector extends org.ap
     protected ArrayList categoryField = new ArrayList();
     protected File contentsFile = null;
 
-    public RSSItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, int dechromedContentMode)
+    public RSSItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.dechromedContentMode = dechromedContentMode;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("link"))
       {
         // "link" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("guid"))
       {
         // "guid" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("pubDate"))
       {
         // "pubDate" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("title"))
       {
         // "title" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("category"))
       {
         // "category" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else
       {
@@ -3869,7 +3854,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (localName.equals("description"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -3878,7 +3863,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -3900,7 +3885,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -3917,42 +3902,43 @@ public class RSSConnector extends org.ap
           }
           else if (localName.equals("description"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         default:
           break;
         }
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("link"))
       {
-        linkField = ((XMLStringContext)theContext).getValue();
+        linkField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("guid"))
       {
-        guidField = ((XMLStringContext)theContext).getValue();
+        guidField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("pubDate"))
       {
-        pubDateField = ((XMLStringContext)theContext).getValue();
+        pubDateField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("title"))
       {
-        titleField = ((XMLStringContext)theContext).getValue();
+        titleField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("category"))
       {
-        categoryField.add(((XMLStringContext)theContext).getValue());
+        categoryField.add(((XMLStringParsingContext)theContext).getValue());
       }
       else
       {
@@ -3964,7 +3950,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (theTag.equals("description"))
           {
-            descriptionField = ((XMLStringContext)theContext).getValue();
+            descriptionField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -3972,7 +3958,7 @@ public class RSSConnector extends org.ap
           {
             // Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           break;
@@ -3981,12 +3967,12 @@ public class RSSConnector extends org.ap
           {
             tagCleanup();
             // Retrieve content file
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           else if (theTag.equals("description"))
           {
-            descriptionField = ((XMLStringContext)theContext).getValue();
+            descriptionField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         default:
@@ -4139,7 +4125,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class RDFContextClass extends XMLContext
+  protected class RDFContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -4151,40 +4137,42 @@ public class RSSConnector extends org.ap
     /** ttl value */
     protected String ttlValue = null;
 
-    public RDFContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public RDFContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("ttl"))
       {
         // TTL value seen.  Prepare to record it, as a string.
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("item"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new RDFItemContextClass(theStream,namespaceURI,localName,qName,atts,filter.getDechromedContentMode());
+        return new RDFItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("ttl"))
         // If the current context must be the TTL one, record its data value.
-        ttlValue = ((XMLStringContext)theContext).getValue();
+        ttlValue = ((XMLStringParsingContext)theContext).getValue();
       else if (theTag.equals("item"))
       {
         // It's an item.
@@ -4249,7 +4237,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class RDFItemContextClass extends XMLContext
+  protected class RDFItemContextClass extends XMLParsingContext
   {
     protected int dechromedContentMode;
     protected String linkField = null;
@@ -4258,30 +4246,31 @@ public class RSSConnector extends org.ap
     protected String descriptionField = null;
     protected File contentsFile = null;
 
-    public RDFItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, int dechromedContentMode)
+    public RDFItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.dechromedContentMode = dechromedContentMode;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("link"))
       {
         // "link" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("date"))
       {
         // "dc:date" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("title"))
       {
         // "title" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else
       {
@@ -4290,7 +4279,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (localName.equals("description"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -4299,7 +4288,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -4321,7 +4310,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -4338,34 +4327,35 @@ public class RSSConnector extends org.ap
           }
           else if (localName.equals("description"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         default:
           break;
         }
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("link"))
       {
-        linkField = ((XMLStringContext)theContext).getValue();
+        linkField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("date"))
       {
-        pubDateField = ((XMLStringContext)theContext).getValue();
+        pubDateField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("title"))
       {
-        titleField = ((XMLStringContext)theContext).getValue();
+        titleField = ((XMLStringParsingContext)theContext).getValue();
       }
       else
       {
@@ -4374,7 +4364,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (theTag.equals("description"))
           {
-            descriptionField = ((XMLStringContext)theContext).getValue();
+            descriptionField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -4382,7 +4372,7 @@ public class RSSConnector extends org.ap
           {
             // Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           break;
@@ -4391,12 +4381,12 @@ public class RSSConnector extends org.ap
           {
             // Retrieve content file
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           else if (theTag.equals("description"))
           {
-            descriptionField = ((XMLStringContext)theContext).getValue();
+            descriptionField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         default:
@@ -4525,7 +4515,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class FeedContextClass extends XMLContext
+  protected class FeedContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -4537,40 +4527,42 @@ public class RSSConnector extends org.ap
     /** ttl value */
     protected String ttlValue = null;
 
-    public FeedContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public FeedContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "entry", nothing else.
       if (localName.equals("ttl"))
       {
         // TTL value seen.  Prepare to record it, as a string.
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("entry"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new FeedItemContextClass(theStream,namespaceURI,localName,qName,atts,filter.getDechromedContentMode());
+        return new FeedItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("ttl"))
         // The current context must be the TTL one; record its data value.
-        ttlValue = ((XMLStringContext)theContext).getValue();
+        ttlValue = ((XMLStringParsingContext)theContext).getValue();
       else if (theTag.equals("entry"))
       {
         // It's an item.
@@ -4635,7 +4627,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class FeedItemContextClass extends XMLContext
+  protected class FeedItemContextClass extends XMLParsingContext
   {
     protected int dechromedContentMode;
     protected List<String> linkField = new ArrayList<String>();
@@ -4645,40 +4637,41 @@ public class RSSConnector extends org.ap
     protected File contentsFile = null;
     protected String descriptionField = null;
 
-    public FeedItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, int dechromedContentMode)
+    public FeedItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.dechromedContentMode = dechromedContentMode;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "link", "published"/"updated", "title", "category", and the mode-dependent tags handled in the switch below.
       if (localName.equals("link"))
       {
         // "link" tag
-        String ref = atts.getValue("href");
+        String ref = atts.get("href");
         if (ref != null && ref.length() > 0)
           linkField.add(ref);
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
       else if (localName.equals("published") || localName.equals("updated"))
       {
         // "published" or "updated" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("title"))
       {
         // "title" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("category"))
       {
-        String category = atts.getValue("term");
+        String category = atts.get("term");
         if (category != null && category.length() > 0)
           categoryField.add(category);
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
       else
       {
@@ -4687,7 +4680,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (localName.equals("subtitle"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -4696,7 +4689,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -4718,7 +4711,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -4735,30 +4728,31 @@ public class RSSConnector extends org.ap
           }
           else if (localName.equals("subtitle"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         default:
           break;
         }
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("published") || theTag.equals("updated"))
       {
-        pubDateField = ((XMLStringContext)theContext).getValue();
+        pubDateField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("title"))
       {
-        titleField = ((XMLStringContext)theContext).getValue();
+        titleField = ((XMLStringParsingContext)theContext).getValue();
       }
       else
       {
@@ -4767,7 +4761,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (theTag.equals("subtitle"))
           {
-            titleField = ((XMLStringContext)theContext).getValue();
+            titleField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -4775,7 +4769,7 @@ public class RSSConnector extends org.ap
           {
             // Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           break;
@@ -4784,12 +4778,12 @@ public class RSSConnector extends org.ap
           {
             // Retrieve content file
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           else if (theTag.equals("subtitle"))
           {
-            titleField = ((XMLStringContext)theContext).getValue();
+            titleField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         default:
@@ -4936,7 +4930,7 @@ public class RSSConnector extends org.ap
     }
   }
   
-  protected class UrlsetContextClass extends XMLContext
+  protected class UrlsetContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -4948,31 +4942,33 @@ public class RSSConnector extends org.ap
     /** ttl value */
     protected String ttlValue = null;
 
-    public UrlsetContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public UrlsetContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "url" and "sitemap", nothing else.
       if (localName.equals("url") || localName.equals("sitemap"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new UrlsetItemContextClass(theStream,namespaceURI,localName,qName,atts);
+        return new UrlsetItemContextClass(theStream,namespace,localName,qName,atts);
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("url") || theTag.equals("sitemap"))
       {
@@ -5038,50 +5034,52 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class UrlsetItemContextClass extends XMLContext
+  protected class UrlsetItemContextClass extends XMLParsingContext
   {
     protected String linkField = null;
     protected String pubDateField = null;
 
-    public UrlsetItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+    public UrlsetItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "loc" and "lastmod", nothing else.
       if (localName.equals("loc"))
       {
         // "loc" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("lastmod"))
       {
         // "lastmod" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else
       {
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("loc"))
       {
-        linkField = ((XMLStringContext)theContext).getValue();
+        linkField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("lastmod"))
       {
-        pubDateField = ((XMLStringContext)theContext).getValue();
+        pubDateField = ((XMLStringParsingContext)theContext).getValue();
       }
       else
       {

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/FormParseState.java Sun Feb 10 22:55:30 2013
@@ -19,6 +19,7 @@
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
 import java.util.*;
 
 /** This class interprets the tag stream generated by the HTMLParseState class, and keeps track of the form tags. */
@@ -46,10 +47,11 @@ public class FormParseState extends Link
   // Override methods having to do with notification of tag discovery
 
   @Override
-  protected void noteNonscriptTag(String tagName, Map attributes)
+  protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    super.noteNonscriptTag(tagName,attributes);
+    if (super.noteNonscriptTag(tagName,attributes))
+      return true;
     switch (formParseState)
     {
     case FORMPARSESTATE_NORMAL:
@@ -125,13 +127,15 @@ public class FormParseState extends Link
     default:
       throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
     }
+    return false;
   }
 
   @Override
-  protected void noteNonscriptEndTag(String tagName)
+  protected boolean noteNonscriptEndTag(String tagName)
     throws ManifoldCFException
   {
-    super.noteNonscriptEndTag(tagName);
+    if (super.noteNonscriptEndTag(tagName))
+      return true;
     switch (formParseState)
     {
     case FORMPARSESTATE_NORMAL:
@@ -158,7 +162,7 @@ public class FormParseState extends Link
           optionMap.put("type","select");
           optionMap.put("name",selectName);
           optionMap.put("multiple",selectMultiple);
-          optionMap.put("value",htmlBodyDecode(optionValueText.toString()));
+          optionMap.put("value",optionValueText.toString());
           optionMap.put("selected",optionSelected);
           handler.noteFormInput(optionMap);
         }
@@ -175,13 +179,15 @@ public class FormParseState extends Link
     default:
       throw new ManifoldCFException("Unknown form parse state: "+Integer.toString(formParseState));
     }
+    return false;
   }
 
   @Override
-  protected void noteNormalCharacter(char thisChar)
+  protected boolean noteNormalCharacter(char thisChar)
     throws ManifoldCFException
   {
-    super.noteNormalCharacter(thisChar);
+    if (super.noteNormalCharacter(thisChar))
+      return true;
     if (formParseState == FORMPARSESTATE_IN_OPTION)
     {
       if (optionValueText != null)
@@ -189,6 +195,7 @@ public class FormParseState extends Link
     }
     else
       handler.noteTextCharacter(thisChar);
+    return false;
   }
 
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/LinkParseState.java Sun Feb 10 22:55:30 2013
@@ -19,6 +19,7 @@
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
 import java.util.*;
 
 /** This class recognizes and interprets all links */
@@ -34,10 +35,11 @@ public class LinkParseState extends Meta
   }
 
   @Override
-  protected void noteNonscriptTag(String tagName, Map attributes)
+  protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    super.noteNonscriptTag(tagName,attributes);
+    if (super.noteNonscriptTag(tagName,attributes))
+      return true;
     if (tagName.equals("a"))
     {
       String hrefValue = (String)attributes.get("href");
@@ -62,6 +64,7 @@ public class LinkParseState extends Meta
       if (srcValue != null && srcValue.length() > 0)
         handler.noteFRAMESRC(srcValue);
     }
+    return false;
   }
 
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/MetaParseState.java Sun Feb 10 22:55:30 2013
@@ -19,6 +19,7 @@
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
 import java.util.*;
 
 /** This class recognizes and interprets all meta tags */
@@ -33,14 +34,16 @@ public class MetaParseState extends Scri
   }
 
   @Override
-  protected void noteNonscriptTag(String tagName, Map attributes)
+  protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    super.noteNonscriptTag(tagName,attributes);
+    if (super.noteNonscriptTag(tagName,attributes))
+      return true;
     if (tagName.equals("meta"))
     {
       handler.noteMetaTag(attributes);
     }
+    return false;
   }
 
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ScriptParseState.java Sun Feb 10 22:55:30 2013
@@ -19,10 +19,11 @@
 package org.apache.manifoldcf.crawler.connectors.webcrawler;
 
 import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.fuzzyml.*;
 import java.util.*;
 
-/** This class interprets the tag stream generated by the BasicParseState class, and causes script sections to be skipped */
-public class ScriptParseState extends BasicParseState
+/** This class interprets the tag stream generated by the HTMLParseState class, and causes script sections to be skipped */
+public class ScriptParseState extends HTMLParseState
 {
   // Script tag parsing states
   protected static final int SCRIPTPARSESTATE_NORMAL = 0;
@@ -38,17 +39,19 @@ public class ScriptParseState extends Ba
   // Override methods having to do with notification of tag discovery
 
   @Override
-  protected void noteTag(String tagName, Map attributes)
+  protected boolean noteTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
-    super.noteTag(tagName,attributes);
+    if (super.noteTag(tagName,attributes))
+      return true;
     switch (scriptParseState)
     {
     case SCRIPTPARSESTATE_NORMAL:
       if (tagName.equals("script"))
         scriptParseState = SCRIPTPARSESTATE_INSCRIPT;
       else
-        noteNonscriptTag(tagName,attributes);
+        if (noteNonscriptTag(tagName,attributes))
+          return true;
       break;
     case SCRIPTPARSESTATE_INSCRIPT:
       // Skip all tags until we see the end script one.
@@ -56,17 +59,20 @@ public class ScriptParseState extends Ba
     default:
       throw new ManifoldCFException("Unknown script parse state: "+Integer.toString(scriptParseState));
     }
+    return false;
   }
 
   @Override
-  protected void noteEndTag(String tagName)
+  protected boolean noteTagEnd(String tagName)
     throws ManifoldCFException
   {
-    super.noteEndTag(tagName);
+    if (super.noteTagEnd(tagName))
+      return true;
     switch (scriptParseState)
     {
     case SCRIPTPARSESTATE_NORMAL:
-      noteNonscriptEndTag(tagName);
+      if (noteNonscriptEndTag(tagName))
+        return true;
       break;
     case SCRIPTPARSESTATE_INSCRIPT:
       // Skip all tags until we see the end script one.
@@ -76,16 +82,19 @@ public class ScriptParseState extends Ba
     default:
       break;
     }
+    return false;
   }
 
-  protected void noteNonscriptTag(String tagName, Map attributes)
+  protected boolean noteNonscriptTag(String tagName, Map<String,String> attributes)
     throws ManifoldCFException
   {
+    return false;
   }
 
-  protected void noteNonscriptEndTag(String tagName)
+  protected boolean noteNonscriptEndTag(String tagName)
     throws ManifoldCFException
   {
+    return false;
   }
 
 }

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1444628&r1=1444627&r2=1444628&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Sun Feb 10 22:55:30 2013
@@ -24,13 +24,7 @@ import org.apache.manifoldcf.crawler.int
 import org.apache.manifoldcf.crawler.system.Logging;
 import org.apache.manifoldcf.crawler.system.ManifoldCF;
 
-import org.xml.sax.Attributes;
-
-import org.apache.manifoldcf.core.common.XMLDoc;
-import org.apache.manifoldcf.agents.common.XMLStream;
-import org.apache.manifoldcf.agents.common.XMLContext;
-import org.apache.manifoldcf.agents.common.XMLStringContext;
-import org.apache.manifoldcf.agents.common.XMLFileContext;
+import org.apache.manifoldcf.core.fuzzyml.*;
 
 import org.apache.http.conn.ConnectTimeoutException;
 import org.apache.http.client.RedirectException;
@@ -6023,8 +6017,6 @@ public class WebcrawlerConnector extends
       // from it presuming it is an RSS feed.
 
       String encoding = extractEncoding(contentType);
-      if (encoding == null)
-        encoding = "utf-8";
 
       InputStream is = cache.getData(documentURI);
       if (is == null)
@@ -6034,13 +6026,14 @@ public class WebcrawlerConnector extends
       }
       try
       {
-        // Parse the document.  This will cause various things to occur, within the instantiated XMLContext class.
-        XMLStream x = new XMLStream();
+        // Parse the document.  This will cause various things to occur, within the instantiated XMLParsingContext class.
+        Parser p = new Parser();
+        XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState();
         OuterContextClass c = new OuterContextClass(x,documentURI,handler);
         x.setContext(c);
         try
         {
-          x.parse(is);
+          p.parseWithCharsetDetection(encoding,is,x);
           c.checkIfValidFeed();
         }
         finally
@@ -6048,17 +6041,6 @@ public class WebcrawlerConnector extends
           x.cleanup();
         }
       }
-      catch (ManifoldCFException e)
-      {
-        // Ignore XML parsing errors.  These should probably have their own error code, but that requires a core change.
-        if (e.getMessage().indexOf("pars") >= 0)
-        {
-          if (Logging.connectors.isDebugEnabled())
-            Logging.connectors.debug("WEB: XML document '"+documentURI+"' was unparseable ("+e.getMessage()+"), skipping");
-          return;
-        }
-        throw e;
-      }
       finally
       {
         is.close();
@@ -6085,7 +6067,7 @@ public class WebcrawlerConnector extends
   }
 
   /** This class handles the outermost XML context for the feed document. */
-  protected class OuterContextClass extends XMLContext
+  protected class OuterContextClass extends XMLParsingContext
   {
     /** Keep track of the number of valid feed signals we saw */
     protected int outerTagCount = 0;
@@ -6094,7 +6076,7 @@ public class WebcrawlerConnector extends
     /** The link handler */
     protected IXMLHandler handler;
 
-    public OuterContextClass(XMLStream theStream, String documentURI, IXMLHandler handler)
+    public OuterContextClass(XMLFuzzyHierarchicalParseState theStream, String documentURI, IXMLHandler handler)
     {
       super(theStream);
       this.documentURI = documentURI;
@@ -6112,8 +6094,9 @@ public class WebcrawlerConnector extends
     }
 
     /** Handle the tag beginning to set the correct second-level parsing context */
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       if (localName.equals("rss"))
       {
@@ -6121,36 +6104,37 @@ public class WebcrawlerConnector extends
         outerTagCount++;
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("WEB: Parsed bottom-level XML for RSS document '"+documentURI+"'");
-        return new RSSContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+        return new RSSContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
       }
       else if (localName.equals("RDF"))
       {
         // RDF/Atom feed detected
         outerTagCount++;
-        return new RDFContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+        return new RDFContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
       }
       else if (localName.equals("feed"))
       {
         // Basic feed detected
         outerTagCount++;
-        return new FeedContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+        return new FeedContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
       }
       else if (localName.equals("urlset") || localName.equals("sitemapindex"))
       {
         // Sitemap detected
         outerTagCount++;
-        return new UrlsetContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+        return new UrlsetContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
       }
 
       // The default action is to establish a new default context.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
     /** Handle the tag ending */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext context = theStream.getContext();
+      XMLParsingContext context = theStream.getContext();
       String tagName = context.getLocalname();
       if (tagName.equals("RDF"))
       {
@@ -6170,39 +6154,41 @@ public class WebcrawlerConnector extends
 
   }
 
-  protected class RSSContextClass extends XMLContext
+  protected class RSSContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentURI;
     /** Link notification interface */
     protected IXMLHandler handler;
 
-    public RSSContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+    public RSSContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentURI = documentURI;
       this.handler = handler;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // Handle each channel
       if (localName.equals("channel"))
       {
         // Channel detected
-        return new RSSChannelContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+        return new RSSChannelContextClass(theStream,namespace,localName,qName,atts,documentURI,handler);
       }
 
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
       // If it's our channel tag, process global channel information
-      XMLContext context = theStream.getContext();
+      XMLParsingContext context = theStream.getContext();
       String tagName = context.getLocalname();
       if (tagName.equals("channel"))
       {
@@ -6213,7 +6199,7 @@ public class WebcrawlerConnector extends
     }
   }
 
-  protected class RSSChannelContextClass extends XMLContext
+  protected class RSSChannelContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentURI;
@@ -6223,39 +6209,41 @@ public class WebcrawlerConnector extends
     /** TTL value is set on a per-channel basis */
     protected String ttlValue = null;
 
-    public RSSChannelContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+    public RSSChannelContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentURI = documentURI;
       this.handler = handler;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("ttl"))
       {
         // TTL value seen.  Prepare to record it, as a string.
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("item"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new RSSItemContextClass(theStream,namespaceURI,localName,qName,atts);
+        return new RSSItemContextClass(theStream,namespace,localName,qName,atts);
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("ttl"))
         // If the current context must be the TTL one, record its data value.
-        ttlValue = ((XMLStringContext)theContext).getValue();
+        ttlValue = ((XMLStringParsingContext)theContext).getValue();
       else if (theTag.equals("item"))
       {
         // It's an item.
@@ -6287,50 +6275,52 @@ public class WebcrawlerConnector extends
     }
   }
 
-  protected class RSSItemContextClass extends XMLContext
+  protected class RSSItemContextClass extends XMLParsingContext
   {
     protected String guidField = null;
     protected String linkField = null;
 
-    public RSSItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+    public RSSItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("link"))
       {
         // "link" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("guid"))
       {
         // "guid" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else
       {
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("link"))
       {
-        linkField = ((XMLStringContext)theContext).getValue();
+        linkField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("guid"))
       {
-        guidField = ((XMLStringContext)theContext).getValue();
+        guidField = ((XMLStringParsingContext)theContext).getValue();
       }
       else
       {
@@ -6359,7 +6349,7 @@ public class WebcrawlerConnector extends
     }
   }
 
-  protected class RDFContextClass extends XMLContext
+  protected class RDFContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentURI;
@@ -6369,39 +6359,41 @@ public class WebcrawlerConnector extends
     /** ttl value */
     protected String ttlValue = null;
 
-    public RDFContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+    public RDFContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentURI = documentURI;
       this.handler = handler;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("ttl"))
       {
         // TTL value seen.  Prepare to record it, as a string.
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("item"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new RDFItemContextClass(theStream,namespaceURI,localName,qName,atts);
+        return new RDFItemContextClass(theStream,namespace,localName,qName,atts);
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("ttl"))
         // If the current context must be the TTL one, record its data value.
-        ttlValue = ((XMLStringContext)theContext).getValue();
+        ttlValue = ((XMLStringParsingContext)theContext).getValue();
       else if (theTag.equals("item"))
       {
         // It's an item.
@@ -6429,40 +6421,42 @@ public class WebcrawlerConnector extends
     }
   }
 
-  protected class RDFItemContextClass extends XMLContext
+  protected class RDFItemContextClass extends XMLParsingContext
   {
     protected String linkField = null;
 
-    public RDFItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+    public RDFItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("link"))
       {
         // "link" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else
       {
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("link"))
       {
-        linkField = ((XMLStringContext)theContext).getValue();
+        linkField = ((XMLStringParsingContext)theContext).getValue();
       }
       else
       {
@@ -6488,7 +6482,7 @@ public class WebcrawlerConnector extends
     }
   }
 
-  protected class FeedContextClass extends XMLContext
+  protected class FeedContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentURI;
@@ -6498,39 +6492,41 @@ public class WebcrawlerConnector extends
     /** ttl value */
     protected String ttlValue = null;
 
-    public FeedContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+    public FeedContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentURI = documentURI;
       this.handler = handler;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("ttl"))
       {
         // TTL value seen.  Prepare to record it, as a string.
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("entry"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new FeedItemContextClass(theStream,namespaceURI,localName,qName,atts);
+        return new FeedItemContextClass(theStream,namespace,localName,qName,atts);
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("ttl"))
         // If the current context must be the TTL one, record its data value.
-        ttlValue = ((XMLStringContext)theContext).getValue();
+        ttlValue = ((XMLStringParsingContext)theContext).getValue();
       else if (theTag.equals("entry"))
       {
         // It's an item.
@@ -6559,31 +6555,32 @@ public class WebcrawlerConnector extends
     }
   }
 
-  protected class FeedItemContextClass extends XMLContext
+  protected class FeedItemContextClass extends XMLParsingContext
   {
     protected List<String> linkField = new ArrayList<String>();
 
-    public FeedItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+    public FeedItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("link"))
       {
         // "link" tag
-        String ref = atts.getValue("href");
+        String ref = atts.get("href");
         if (ref != null && ref.length() > 0)
           linkField.add(ref);
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
       else
       {
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
@@ -6608,7 +6605,7 @@ public class WebcrawlerConnector extends
     }
   }
 
-  protected class UrlsetContextClass extends XMLContext
+  protected class UrlsetContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentURI;
@@ -6618,30 +6615,32 @@ public class WebcrawlerConnector extends
     /** ttl value */
     protected String ttlValue = null;
 
-    public UrlsetContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+    public UrlsetContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentURI, IXMLHandler handler)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentURI = documentURI;
       this.handler = handler;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "url", nothing else.
       if (localName.equals("url") || localName.equals("sitemap"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new UrlsetItemContextClass(theStream,namespaceURI,localName,qName,atts);
+        return new UrlsetItemContextClass(theStream,namespace,localName,qName,atts);
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("url") || theTag.equals("sitemap"))
       {
@@ -6674,40 +6673,42 @@ public class WebcrawlerConnector extends
     }
   }
 
-  protected class UrlsetItemContextClass extends XMLContext
+  protected class UrlsetItemContextClass extends XMLParsingContext
   {
     protected String linkField = null;
 
-    public UrlsetItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+    public UrlsetItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "loc", nothing else.
       if (localName.equals("loc"))
       {
         // "loc" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else
       {
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("loc"))
       {
-        linkField = ((XMLStringContext)theContext).getValue();
+        linkField = ((XMLStringParsingContext)theContext).getValue();
       }
       else
       {
@@ -6768,33 +6769,14 @@ public class WebcrawlerConnector extends
         return;
       }
 
-      if (Logging.connectors.isDebugEnabled())
-        Logging.connectors.debug("WEB: Document '"+documentURI+"' is text, with encoding '"+encoding+"'; link extraction starting");
-
       try
       {
-        // Create a reader for the described encoding, if that's possible
-        Reader r = new InputStreamReader(is,encoding);
-        try
-        {
-          // We read characters at a time, understanding the basic form of html.
-          // This code represents a basic bottom-up parser, which is the best thing since we really don't want to code up all the context we'd need
-          // to do a top-down parse.  So, there is a parse state, and the code walks through the document recognizing symbols and modifying the state.
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("WEB: Document '"+documentURI+"' is text, with encoding '"+encoding+"'; link extraction starting");
 
-          FormParseState currentParseState = new FormParseState(handler);
-          while (true)
-          {
-            int x = r.read();
-            if (x == -1)
-              break;
-            currentParseState.dealWithCharacter((char)x);
-          }
-          currentParseState.finishUp();
-        }
-        finally
-        {
-          r.close();
-        }
+        // Instantiate the parser, and call the right method
+        Parser p = new Parser();
+        p.parseWithoutCharsetDetection(encoding,is,new FormParseState(handler));
       }
       catch (UnsupportedEncodingException e)
       {

Propchange: manifoldcf/trunk/connectors/wiki/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-633/connectors/wiki:r1442813-1444626