You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/02/10 10:11:48 UTC

svn commit: r1444516 - in /manifoldcf/branches/CONNECTORS-633: connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/ framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/

Author: kwright
Date: Sun Feb 10 09:11:48 2013
New Revision: 1444516

URL: http://svn.apache.org/r1444516
Log:
Port RSS connector to new parser.

Added:
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFileParsingContext.java   (with props)
Modified:
    manifoldcf/branches/CONNECTORS-633/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
    manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java

Modified: manifoldcf/branches/CONNECTORS-633/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1444516&r1=1444515&r2=1444516&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-633/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Sun Feb 10 09:11:48 2013
@@ -24,13 +24,7 @@ import org.apache.manifoldcf.crawler.int
 import org.apache.manifoldcf.crawler.system.Logging;
 import org.apache.manifoldcf.crawler.system.ManifoldCF;
 
-import org.xml.sax.Attributes;
-
-import org.apache.manifoldcf.core.common.XMLDoc;
-import org.apache.manifoldcf.agents.common.XMLStream;
-import org.apache.manifoldcf.agents.common.XMLContext;
-import org.apache.manifoldcf.agents.common.XMLStringContext;
-import org.apache.manifoldcf.agents.common.XMLFileContext;
+import org.apache.manifoldcf.core.fuzzyml.*;
 
 import org.apache.http.conn.ConnectTimeoutException;
 import org.apache.http.client.RedirectException;
@@ -3473,31 +3467,15 @@ public class RSSConnector extends org.ap
       }
       try
       {
-        // Parse the document.  This will cause various things to occur, within the instantiated XMLContext class.
-        XMLStream x = new XMLStream();
+        Parser p = new Parser();
+        // Parse the document.  This will cause various things to occur, within the instantiated XMLParsingContext class.
+        XMLFuzzyHierarchicalParseState x = new XMLFuzzyHierarchicalParseState();
         OuterContextClass c = new OuterContextClass(x,documentIdentifier,activities,filter);
         x.setContext(c);
         try
         {
-          try
-          {
-            x.parse(is);
-          }
-          catch (ManifoldCFException e)
-          {
-            // Ignore XML parsing errors.
-            if (e.getMessage().indexOf("pars") >= 0)
-            {
-              if (Logging.connectors.isDebugEnabled())
-                Logging.connectors.debug("RSS: XML document '"+documentIdentifier+"' was unparseable ("+e.getMessage()+"), skipping");
-
-              c.setDefaultRescanTimeIfNeeded();
-
-              return;
-            }
-            throw e;
-
-          }
+          // Believe it or not, there are no parsing errors we can get back now.
+          p.parseWithCharsetDetection(null,is,x);
           c.checkIfValidFeed();
           c.setDefaultRescanTimeIfNeeded();
         }
@@ -3531,7 +3509,7 @@ public class RSSConnector extends org.ap
   }
 
   /** This class handles the outermost XML context for the feed document. */
-  protected class OuterContextClass extends XMLContext
+  protected class OuterContextClass extends XMLParsingContext
   {
     /** Keep track of the number of valid feed signals we saw */
     protected int outerTagCount = 0;
@@ -3544,7 +3522,7 @@ public class RSSConnector extends org.ap
     /** Flag indicating the the rescan time was set for this feed */
     protected boolean rescanTimeSet = false;
 
-    public OuterContextClass(XMLStream theStream, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public OuterContextClass(XMLFuzzyHierarchicalParseState theStream, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
       super(theStream);
       this.documentIdentifier = documentIdentifier;
@@ -3585,8 +3563,9 @@ public class RSSConnector extends org.ap
     }
 
     /** Handle the tag beginning to set the correct second-level parsing context */
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       if (localName.equals("rss"))
       {
@@ -3594,36 +3573,37 @@ public class RSSConnector extends org.ap
         outerTagCount++;
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("RSS: Parsed bottom-level XML for RSS document '"+documentIdentifier+"'");
-        return new RSSContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new RSSContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
       else if (localName.equals("RDF"))
       {
         // RDF/Atom feed detected
         outerTagCount++;
-        return new RDFContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new RDFContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
       else if (localName.equals("feed"))
       {
         // Basic feed detected
         outerTagCount++;
-        return new FeedContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new FeedContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
       else if (localName.equals("urlset") || localName.equals("sitemapindex"))
       {
         // Sitemap detected
         outerTagCount++;
-        return new UrlsetContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new UrlsetContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
       
       // The default action is to establish a new default context.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
     /** Handle the tag ending */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext context = theStream.getContext();
+      XMLParsingContext context = theStream.getContext();
       String tagName = context.getLocalname();
       if (tagName.equals("rss"))
       {
@@ -3647,7 +3627,7 @@ public class RSSConnector extends org.ap
 
   }
 
-  protected class RSSContextClass extends XMLContext
+  protected class RSSContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -3658,33 +3638,35 @@ public class RSSConnector extends org.ap
     /** Rescan time set flag */
     protected boolean rescanTimeSet = false;
 
-    public RSSContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public RSSContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // Handle each channel
       if (localName.equals("channel"))
       {
         // Channel detected
-        return new RSSChannelContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+        return new RSSChannelContextClass(theStream,namespace,localName,qName,atts,documentIdentifier,activities,filter);
       }
 
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
       // If it's our channel tag, process global channel information
-      XMLContext context = theStream.getContext();
+      XMLParsingContext context = theStream.getContext();
       String tagName = context.getLocalname();
       if (tagName.equals("channel"))
       {
@@ -3703,7 +3685,7 @@ public class RSSConnector extends org.ap
 
   }
 
-  protected class RSSChannelContextClass extends XMLContext
+  protected class RSSChannelContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -3715,40 +3697,42 @@ public class RSSConnector extends org.ap
     /** TTL value is set on a per-channel basis */
     protected String ttlValue = null;
 
-    public RSSChannelContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public RSSChannelContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("ttl"))
       {
         // TTL value seen.  Prepare to record it, as a string.
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("item"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new RSSItemContextClass(theStream,namespaceURI,localName,qName,atts,filter.getDechromedContentMode());
+        return new RSSItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("ttl"))
         // If the current context must be the TTL one, record its data value.
-        ttlValue = ((XMLStringContext)theContext).getValue();
+        ttlValue = ((XMLStringParsingContext)theContext).getValue();
       else if (theTag.equals("item"))
       {
         // It's an item.
@@ -3813,7 +3797,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class RSSItemContextClass extends XMLContext
+  protected class RSSItemContextClass extends XMLParsingContext
   {
     protected int dechromedContentMode;
     protected String guidField = null;
@@ -3824,40 +3808,41 @@ public class RSSConnector extends org.ap
     protected ArrayList categoryField = new ArrayList();
     protected File contentsFile = null;
 
-    public RSSItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, int dechromedContentMode)
+    public RSSItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.dechromedContentMode = dechromedContentMode;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("link"))
       {
         // "link" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("guid"))
       {
         // "guid" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("pubDate"))
       {
         // "pubDate" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("title"))
       {
         // "title" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("category"))
       {
         // "category" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else
       {
@@ -3869,7 +3854,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (localName.equals("description"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -3878,7 +3863,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -3900,7 +3885,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -3917,42 +3902,43 @@ public class RSSConnector extends org.ap
           }
           else if (localName.equals("description"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         default:
           break;
         }
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("link"))
       {
-        linkField = ((XMLStringContext)theContext).getValue();
+        linkField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("guid"))
       {
-        guidField = ((XMLStringContext)theContext).getValue();
+        guidField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("pubDate"))
       {
-        pubDateField = ((XMLStringContext)theContext).getValue();
+        pubDateField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("title"))
       {
-        titleField = ((XMLStringContext)theContext).getValue();
+        titleField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("category"))
       {
-        categoryField.add(((XMLStringContext)theContext).getValue());
+        categoryField.add(((XMLStringParsingContext)theContext).getValue());
       }
       else
       {
@@ -3964,7 +3950,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (theTag.equals("description"))
           {
-            descriptionField = ((XMLStringContext)theContext).getValue();
+            descriptionField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -3972,7 +3958,7 @@ public class RSSConnector extends org.ap
           {
             // Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           break;
@@ -3981,12 +3967,12 @@ public class RSSConnector extends org.ap
           {
             tagCleanup();
             // Retrieve content file
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           else if (theTag.equals("description"))
           {
-            descriptionField = ((XMLStringContext)theContext).getValue();
+            descriptionField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         default:
@@ -4139,7 +4125,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class RDFContextClass extends XMLContext
+  protected class RDFContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -4151,40 +4137,42 @@ public class RSSConnector extends org.ap
     /** ttl value */
     protected String ttlValue = null;
 
-    public RDFContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public RDFContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("ttl"))
       {
         // TTL value seen.  Prepare to record it, as a string.
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("item"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new RDFItemContextClass(theStream,namespaceURI,localName,qName,atts,filter.getDechromedContentMode());
+        return new RDFItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("ttl"))
         // If the current context must be the TTL one, record its data value.
-        ttlValue = ((XMLStringContext)theContext).getValue();
+        ttlValue = ((XMLStringParsingContext)theContext).getValue();
       else if (theTag.equals("item"))
       {
         // It's an item.
@@ -4249,7 +4237,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class RDFItemContextClass extends XMLContext
+  protected class RDFItemContextClass extends XMLParsingContext
   {
     protected int dechromedContentMode;
     protected String linkField = null;
@@ -4258,30 +4246,31 @@ public class RSSConnector extends org.ap
     protected String descriptionField = null;
     protected File contentsFile = null;
 
-    public RDFItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, int dechromedContentMode)
+    public RDFItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.dechromedContentMode = dechromedContentMode;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("link"))
       {
         // "link" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("date"))
       {
         // "dc:date" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("title"))
       {
         // "title" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else
       {
@@ -4290,7 +4279,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (localName.equals("description"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -4299,7 +4288,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -4321,7 +4310,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -4338,34 +4327,35 @@ public class RSSConnector extends org.ap
           }
           else if (localName.equals("description"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         default:
           break;
         }
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("link"))
       {
-        linkField = ((XMLStringContext)theContext).getValue();
+        linkField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("date"))
       {
-        pubDateField = ((XMLStringContext)theContext).getValue();
+        pubDateField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("title"))
       {
-        titleField = ((XMLStringContext)theContext).getValue();
+        titleField = ((XMLStringParsingContext)theContext).getValue();
       }
       else
       {
@@ -4374,7 +4364,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (theTag.equals("description"))
           {
-            descriptionField = ((XMLStringContext)theContext).getValue();
+            descriptionField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -4382,7 +4372,7 @@ public class RSSConnector extends org.ap
           {
             // Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           break;
@@ -4391,12 +4381,12 @@ public class RSSConnector extends org.ap
           {
             // Retrieve content file
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           else if (theTag.equals("description"))
           {
-            descriptionField = ((XMLStringContext)theContext).getValue();
+            descriptionField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         default:
@@ -4525,7 +4515,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class FeedContextClass extends XMLContext
+  protected class FeedContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -4537,40 +4527,42 @@ public class RSSConnector extends org.ap
     /** ttl value */
     protected String ttlValue = null;
 
-    public FeedContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public FeedContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("ttl"))
       {
         // TTL value seen.  Prepare to record it, as a string.
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("entry"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new FeedItemContextClass(theStream,namespaceURI,localName,qName,atts,filter.getDechromedContentMode());
+        return new FeedItemContextClass(theStream,namespace,localName,qName,atts,filter.getDechromedContentMode());
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("ttl"))
         // If the current context must be the TTL one, record its data value.
-        ttlValue = ((XMLStringContext)theContext).getValue();
+        ttlValue = ((XMLStringParsingContext)theContext).getValue();
       else if (theTag.equals("entry"))
       {
         // It's an item.
@@ -4635,7 +4627,7 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class FeedItemContextClass extends XMLContext
+  protected class FeedItemContextClass extends XMLParsingContext
   {
     protected int dechromedContentMode;
     protected List<String> linkField = new ArrayList<String>();
@@ -4645,40 +4637,41 @@ public class RSSConnector extends org.ap
     protected File contentsFile = null;
     protected String descriptionField = null;
 
-    public FeedItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, int dechromedContentMode)
+    public FeedItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, int dechromedContentMode)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.dechromedContentMode = dechromedContentMode;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "ttl" and "item", nothing else.
       if (localName.equals("link"))
       {
         // "link" tag
-        String ref = atts.getValue("href");
+        String ref = atts.get("href");
         if (ref != null && ref.length() > 0)
           linkField.add(ref);
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
       else if (localName.equals("published") || localName.equals("updated"))
       {
         // "published" pr "updated" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("title"))
       {
         // "title" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("category"))
       {
-        String category = atts.getValue("term");
+        String category = atts.get("term");
         if (category != null && category.length() > 0)
           categoryField.add(category);
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
       else
       {
@@ -4687,7 +4680,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (localName.equals("subtitle"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -4696,7 +4689,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -4718,7 +4711,7 @@ public class RSSConnector extends org.ap
             try
             {
               File tempFile = File.createTempFile("_rssdata_","tmp");
-              return new XMLFileContext(theStream,namespaceURI,localName,qName,atts,tempFile);
+              return new XMLFileParsingContext(theStream,namespace,localName,qName,atts,tempFile);
             }
             catch (java.net.SocketTimeoutException e)
             {
@@ -4735,30 +4728,31 @@ public class RSSConnector extends org.ap
           }
           else if (localName.equals("subtitle"))
           {
-            return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+            return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
           }
           break;
         default:
           break;
         }
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("published") || theTag.equals("updated"))
       {
-        pubDateField = ((XMLStringContext)theContext).getValue();
+        pubDateField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("title"))
       {
-        titleField = ((XMLStringContext)theContext).getValue();
+        titleField = ((XMLStringParsingContext)theContext).getValue();
       }
       else
       {
@@ -4767,7 +4761,7 @@ public class RSSConnector extends org.ap
         case DECHROMED_NONE:
           if (theTag.equals("subtitle"))
           {
-            titleField = ((XMLStringContext)theContext).getValue();
+            titleField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         case DECHROMED_DESCRIPTION:
@@ -4775,7 +4769,7 @@ public class RSSConnector extends org.ap
           {
             // Content file has been written; retrieve it (being sure not to leak any files already hanging around!)
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           break;
@@ -4784,12 +4778,12 @@ public class RSSConnector extends org.ap
           {
             // Retrieve content file
             tagCleanup();
-            contentsFile = ((XMLFileContext)theContext).getCompletedFile();
+            contentsFile = ((XMLFileParsingContext)theContext).getCompletedFile();
             return;
           }
           else if (theTag.equals("subtitle"))
           {
-            titleField = ((XMLStringContext)theContext).getValue();
+            titleField = ((XMLStringParsingContext)theContext).getValue();
           }
           break;
         default:
@@ -4936,7 +4930,7 @@ public class RSSConnector extends org.ap
     }
   }
   
-  protected class UrlsetContextClass extends XMLContext
+  protected class UrlsetContextClass extends XMLParsingContext
   {
     /** The document identifier */
     protected String documentIdentifier;
@@ -4948,31 +4942,33 @@ public class RSSConnector extends org.ap
     /** ttl value */
     protected String ttlValue = null;
 
-    public UrlsetContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    public UrlsetContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts, String documentIdentifier, IProcessActivity activities, Filter filter)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
       this.documentIdentifier = documentIdentifier;
       this.activities = activities;
       this.filter = filter;
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "url", nothing else.
       if (localName.equals("url") || localName.equals("sitemap"))
       {
         // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
-        return new UrlsetItemContextClass(theStream,namespaceURI,localName,qName,atts);
+        return new UrlsetItemContextClass(theStream,namespace,localName,qName,atts);
       }
       // Skip everything else.
-      return super.beginTag(namespaceURI,localName,qName,atts);
+      return super.beginTag(namespace,localName,qName,atts);
     }
 
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("url") || theTag.equals("sitemap"))
       {
@@ -5038,50 +5034,52 @@ public class RSSConnector extends org.ap
     }
   }
 
-  protected class UrlsetItemContextClass extends XMLContext
+  protected class UrlsetItemContextClass extends XMLParsingContext
   {
     protected String linkField = null;
     protected String pubDateField = null;
 
-    public UrlsetItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+    public UrlsetItemContextClass(XMLFuzzyHierarchicalParseState theStream, String namespace, String localName, String qName, Map<String,String> atts)
     {
-      super(theStream,namespaceURI,localName,qName,atts);
+      super(theStream,namespace,localName,qName,atts);
     }
 
-    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
-      throws ManifoldCFException, ServiceInterruption
+    @Override
+    protected XMLParsingContext beginTag(String namespace, String localName, String qName, Map<String,String> atts)
+      throws ManifoldCFException
     {
       // The tags we care about are "loc" and "lastmod", nothing else.
       if (localName.equals("loc"))
       {
         // "loc" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else if (localName.equals("lastmod"))
       {
         // "lastmod" tag
-        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+        return new XMLStringParsingContext(theStream,namespace,localName,qName,atts);
       }
       else
       {
         // Skip everything else.
-        return super.beginTag(namespaceURI,localName,qName,atts);
+        return super.beginTag(namespace,localName,qName,atts);
       }
     }
 
     /** Convert the individual sub-fields of the item context into their final forms */
+    @Override
     protected void endTag()
-      throws ManifoldCFException, ServiceInterruption
+      throws ManifoldCFException
     {
-      XMLContext theContext = theStream.getContext();
+      XMLParsingContext theContext = theStream.getContext();
       String theTag = theContext.getLocalname();
       if (theTag.equals("loc"))
       {
-        linkField = ((XMLStringContext)theContext).getValue();
+        linkField = ((XMLStringParsingContext)theContext).getValue();
       }
       else if (theTag.equals("lastmod"))
       {
-        pubDateField = ((XMLStringContext)theContext).getValue();
+        pubDateField = ((XMLStringParsingContext)theContext).getValue();
       }
       else
       {

Added: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFileParsingContext.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFileParsingContext.java?rev=1444516&view=auto
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFileParsingContext.java (added)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFileParsingContext.java Sun Feb 10 09:11:48 2013
@@ -0,0 +1,64 @@
+/* $Id$ */
+
+/**
+* Licensed to the Apache Software Foundation (ASF) under one or more
+* contributor license agreements. See the NOTICE file distributed with
+* this work for additional information regarding copyright ownership.
+* The ASF licenses this file to You under the Apache License, Version 2.0
+* (the "License"); you may not use this file except in compliance with
+* the License. You may obtain a copy of the License at
+*
+* http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing, software
+* distributed under the License is distributed on an "AS IS" BASIS,
+* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+* See the License for the specific language governing permissions and
+* limitations under the License.
+*/
+package org.apache.manifoldcf.core.fuzzyml;
+
+import org.apache.manifoldcf.core.interfaces.*;
+import java.io.*;
+import java.util.*;
+
+/** An instance of this class represents a parsing context within a node.  Data is written to the supplied file in utf-8 format.
+* The file is deleted by tagCleanup() unless the caller has first claimed it through getCompletedFile(). */
+public class XMLFileParsingContext extends XMLOutputStreamParsingContext
+{
+  /** The output file; null once it has been handed out by getCompletedFile() or removed by tagCleanup(). */
+  protected File outputFile;
+
+  /** Full constructor.  Used for individual tags.  Opens a FileOutputStream on f, passes it to the superclass, and remembers f. */
+  public XMLFileParsingContext(XMLFuzzyHierarchicalParseState theStream, String namespace, String localname, String qname, Map<String,String> theseAttributes, File f)
+    throws ManifoldCFException, UnsupportedEncodingException, FileNotFoundException
+  {
+    // Open an output stream on the file and hand it to the superclass
+    super(theStream,namespace,localname,qname,theseAttributes,new FileOutputStream(f));
+    // Remember the file so it can later be returned to the caller or deleted
+    outputFile = f;
+  }
+
+  /** Flush and close the output, then return the file and clear the outputFile field so that a later tagCleanup() will not delete it.  Returns null if the file was already claimed or cleaned up. */
+  public File getCompletedFile()
+    throws ManifoldCFException
+  {
+    flush();
+    close();
+    File rval = outputFile;
+    outputFile = null;
+    return rval;
+  }
+
+  /** Clean up whatever is left over: if the file was never claimed, close the output and delete the file.  NOTE(review): if close() throws, the delete is skipped -- confirm whether that is acceptable. */
+  public void tagCleanup()
+    throws ManifoldCFException
+  {
+    if (outputFile != null)
+    {
+      close();
+      outputFile.delete();
+      outputFile = null;
+    }
+  }
+}

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFileParsingContext.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFileParsingContext.java
------------------------------------------------------------------------------
    svn:keywords = Id

Modified: manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java?rev=1444516&r1=1444515&r2=1444516&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java (original)
+++ manifoldcf/branches/CONNECTORS-633/framework/core/src/main/java/org/apache/manifoldcf/core/fuzzyml/XMLFuzzyHierarchicalParseState.java Sun Feb 10 09:11:48 2013
@@ -48,6 +48,13 @@ public class XMLFuzzyHierarchicalParseSt
   /** Whether we're capturing escaped characters */
   protected boolean captureEscaped = false;
   
+  /** Constructor with default properties: delegates to the full constructor with all six boolean options set to true.
+  */
+  public XMLFuzzyHierarchicalParseState()
+  {
+    this(true,true,true,true,true,true);
+  }
+  
   /** Constructor.
   */
   public XMLFuzzyHierarchicalParseState(boolean lowerCaseAttributes, boolean lowerCaseTags,