You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/09/16 13:53:33 UTC

svn commit: r1171519 - in /incubator/lcf/trunk: ./ connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/

Author: kwright
Date: Fri Sep 16 11:53:32 2011
New Revision: 1171519

URL: http://svn.apache.org/viewvc?rev=1171519&view=rev
Log:
Fix for CONNECTORS-255.  Add ability to process site map documents in both web connector and rss connector.

Modified:
    incubator/lcf/trunk/CHANGES.txt
    incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
    incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java
    incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: incubator/lcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/CHANGES.txt?rev=1171519&r1=1171518&r2=1171519&view=diff
==============================================================================
--- incubator/lcf/trunk/CHANGES.txt (original)
+++ incubator/lcf/trunk/CHANGES.txt Fri Sep 16 11:53:32 2011
@@ -3,6 +3,10 @@ $Id$
 
 ======================= 0.4-dev =====================
 
+CONNECTORS-255: Add ability to process site map documents, in both
+RSS connector and Web connector.
+(Karl Wright)
+
 CONNECTORS-202: Add commit-within parameter to Solr output connector.
 (Jan Høydahl, Karl Wright)
 

Modified: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1171519&r1=1171518&r2=1171519&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Fri Sep 16 11:53:32 2011
@@ -3493,7 +3493,13 @@ public class RSSConnector extends org.ap
         outerTagCount++;
         return new FeedContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
       }
-
+      else if (qName.equals("urlset") || qName.equals("sitemapindex"))
+      {
+        // Sitemap detected
+        outerTagCount++;
+        return new UrlsetContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+      }
+      
       // The default action is to establish a new default context.
       return super.beginTag(namespaceURI,localName,qName,atts);
     }
@@ -3516,6 +3522,10 @@ public class RSSConnector extends org.ap
       {
         rescanTimeSet = ((FeedContextClass)context).process();
       }
+      else if (tagName.equals("urlset") || tagName.equals("sitemapindex"))
+      {
+        rescanTimeSet = ((UrlsetContextClass)context).process();
+      }
       else
         super.endTag();
     }
@@ -4750,6 +4760,214 @@ public class RSSConnector extends org.ap
       }
     }
   }
+  
+  protected class UrlsetContextClass extends XMLContext
+  {
+    /** The document identifier */
+    protected String documentIdentifier;
+    /** Activities interface */
+    protected IProcessActivity activities;
+    /** Filter */
+    protected Filter filter;
+
+    /** ttl value */
+    protected String ttlValue = null;
+
+    public UrlsetContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+    {
+      super(theStream,namespaceURI,localName,qName,atts);
+      this.documentIdentifier = documentIdentifier;
+      this.activities = activities;
+      this.filter = filter;
+    }
+
+    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      // The tags we care about are "url" and "sitemap", nothing else.
+      if (qName.equals("url") || qName.equals("sitemap"))
+      {
+        // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
+        return new UrlsetItemContextClass(theStream,namespaceURI,localName,qName,atts);
+      }
+      // Skip everything else.
+      return super.beginTag(namespaceURI,localName,qName,atts);
+    }
+
+    protected void endTag()
+      throws ManifoldCFException, ServiceInterruption
+    {
+      XMLContext theContext = theStream.getContext();
+      String theTag = theContext.getQname();
+      if (theTag.equals("url") || theTag.equals("sitemap"))
+      {
+        // It's an item.
+        UrlsetItemContextClass itemContext = (UrlsetItemContextClass)theContext;
+        // Presumably, since we are done parsing, we've recorded all the information we need in the context object, including:
+        // (1) Link name(s), from the "loc" tag
+        // (2) Pubdate, from the "lastmod" tag
+        // Unlike an RSS feed item, a sitemap entry carries no file of dechromed
+        // content and no title.
+        // The job now is to pull this info out and call the activities interface appropriately.
+
+        // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context.  This should clean up
+        // all dangling files etc. that need to be removed.
+        // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup()
+        // method will be called regardless.
+        itemContext.process(documentIdentifier,activities,filter);
+      }
+      else
+        super.endTag();
+    }
+
+    /** Process this data */
+    protected boolean process()
+      throws ManifoldCFException
+    {
+      // Deal with the ttl value, if it was found.  (Nothing in this class sets ttlValue, so unless a subclass does, the default applies.)
+      // Use the ttl value as a signal for when we ought to look at this sitemap again.  If not present, use the default.
+      long currentTime = System.currentTimeMillis();
+      Long rescanTime = filter.getDefaultRescanTime(currentTime);
+      if (ttlValue != null)
+      {
+        try
+        {
+          int minutes = Integer.parseInt(ttlValue);
+          long nextTime = currentTime + minutes * 60000L;
+          rescanTime = new Long(nextTime);
+          // Set the upper bound time; we want to scan the feeds aggressively.
+          if (Logging.connectors.isDebugEnabled())
+            Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly");
+        }
+        catch (NumberFormatException e)
+        {
+          Logging.connectors.warn("RSS: SiteMap document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'");
+        }
+      }
+
+      if (rescanTime != null)
+      {
+        Long minimumTime = filter.getMinimumRescanTime(currentTime);
+        if (minimumTime != null)
+        {
+          if (rescanTime.longValue() < minimumTime.longValue())
+            rescanTime = minimumTime;
+        }
+      }
+
+      if (Logging.connectors.isDebugEnabled())
+        Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"' setting rescan time to "+((rescanTime==null)?"null":rescanTime.toString()));
+
+      activities.setDocumentScheduleBounds(documentIdentifier,rescanTime,rescanTime,null,null);
+      return true;
+    }
+  }
+
+  protected class UrlsetItemContextClass extends XMLContext
+  {
+    protected String linkField = null;
+    protected String pubDateField = null;
+
+    public UrlsetItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+    {
+      super(theStream,namespaceURI,localName,qName,atts);
+    }
+
+    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      // The tags we care about are "loc" and "lastmod", nothing else.
+      if (qName.equals("loc"))
+      {
+        // "loc" tag
+        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+      }
+      else if (qName.equals("lastmod"))
+      {
+        // "lastmod" tag
+        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+      }
+      else
+      {
+        // Skip everything else.
+        return super.beginTag(namespaceURI,localName,qName,atts);
+      }
+    }
+
+    /** Convert the individual sub-fields of the item context into their final forms */
+    protected void endTag()
+      throws ManifoldCFException, ServiceInterruption
+    {
+      XMLContext theContext = theStream.getContext();
+      String theTag = theContext.getQname();
+      if (theTag.equals("loc"))
+      {
+        linkField = ((XMLStringContext)theContext).getValue();
+      }
+      else if (theTag.equals("lastmod"))
+      {
+        pubDateField = ((XMLStringContext)theContext).getValue();
+      }
+      else
+      {
+        super.endTag();
+      }
+    }
+
+    protected void tagCleanup()
+      throws ManifoldCFException
+    {
+    }
+
+    /** Process the data accumulated for this item */
+    public void process(String documentIdentifier, IProcessActivity activities, Filter filter)
+      throws ManifoldCFException
+    {
+      if (linkField != null && linkField.length() > 0)
+      {
+        Long origDate = null;
+        if (pubDateField != null && pubDateField.length() > 0)
+          origDate = parseZuluDate(pubDateField);
+
+        String[] links = linkField.split(", ");
+        int l = 0;
+        while (l < links.length)
+        {
+          String rawURL = links[l++].trim();
+          // Process the link
+          String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
+          if (newIdentifier != null)
+          {
+            if (Logging.connectors.isDebugEnabled())
+              Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
+              ((origDate==null)?"null":origDate.toString()));
+            if (filter.isLegalURL(newIdentifier))
+            {
+              // It's a reference!  Add it.
+              String[] dataNames = new String[]{"pubdate","source"};
+              String[][] dataValues = new String[dataNames.length][];
+              if (origDate != null)
+                dataValues[0] = new String[]{origDate.toString()};
+              dataValues[1] = new String[]{documentIdentifier};
+                  
+              // Add document reference, including the data to pass down
+              activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+            }
+            else
+            {
+              if (Logging.connectors.isDebugEnabled())
+                Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
+            }
+          }
+          else
+          {
+            if (Logging.connectors.isDebugEnabled())
+              Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
+          }
+        }
+      }
+    }
+  }
 
   // Month map
   protected static HashMap monthMap = new HashMap();

Modified: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java?rev=1171519&r1=1171518&r2=1171519&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java (original)
+++ incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java Fri Sep 16 11:53:32 2011
@@ -649,6 +649,14 @@ public class Robots
           {
             // We don't complain about this, but right now we don't listen to it either.
           }
+          else if (lowercaseLine.startsWith("sitemap:"))
+          {
+            // We don't complain about this, but right now we don't listen to it either.
+          }
+          else if (lowercaseLine.startsWith("sitemap"))
+          {
+            // We don't complain about this, but right now we don't listen to it either.
+          }
           else
           {
             // If it's not just a blank line, complain

Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1171519&r1=1171518&r2=1171519&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Sep 16 11:53:32 2011
@@ -6183,6 +6183,12 @@ public class WebcrawlerConnector extends
         outerTagCount++;
         return new FeedContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
       }
+      else if (qName.equals("urlset") || qName.equals("sitemapindex"))
+      {
+        // Sitemap detected
+        outerTagCount++;
+        return new UrlsetContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+      }
 
       // The default action is to establish a new default context.
       return super.beginTag(namespaceURI,localName,qName,atts);
@@ -6202,6 +6208,10 @@ public class WebcrawlerConnector extends
       {
         ((FeedContextClass)context).process();
       }
+      else if (tagName.equals("urlset") || tagName.equals("sitemapindex"))
+      {
+        ((UrlsetContextClass)context).process();
+      }
       else
         super.endTag();
     }
@@ -6641,6 +6651,136 @@ public class WebcrawlerConnector extends
     }
   }
 
+  protected class UrlsetContextClass extends XMLContext
+  {
+    /** The document URI */
+    protected String documentURI;
+    /** XML handler */
+    protected IXMLHandler handler;
+
+    /** ttl value */
+    protected String ttlValue = null;
+
+    public UrlsetContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+    {
+      super(theStream,namespaceURI,localName,qName,atts);
+      this.documentURI = documentURI;
+      this.handler = handler;
+    }
+
+    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      // The tags we care about are "url" and "sitemap", nothing else.
+      if (qName.equals("url") || qName.equals("sitemap"))
+      {
+        // Item seen.  We don't need any of the attributes etc., but we need to start a new context.
+        return new UrlsetItemContextClass(theStream,namespaceURI,localName,qName,atts);
+      }
+      // Skip everything else.
+      return super.beginTag(namespaceURI,localName,qName,atts);
+    }
+
+    protected void endTag()
+      throws ManifoldCFException, ServiceInterruption
+    {
+      XMLContext theContext = theStream.getContext();
+      String theTag = theContext.getQname();
+      if (theTag.equals("url") || theTag.equals("sitemap"))
+      {
+        // It's an item.
+        UrlsetItemContextClass itemContext = (UrlsetItemContextClass)theContext;
+        // Presumably, since we are done parsing, we've recorded all the information we need in the context object, including:
+        // (1) Link name(s), from the "loc" tag
+        // That is the only field this item context keeps.  Unlike an RSS feed item,
+        // a sitemap entry here has no file of dechromed content, no pubdate, and
+        // no title.
+        // The job now is to pull this info out and call the XML handler appropriately.
+
+        // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context.  This should clean up
+        // all dangling files etc. that need to be removed.
+        // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup()
+        // method will be called regardless.
+        itemContext.process(handler);
+      }
+      else
+        super.endTag();
+    }
+
+    /** Process this data */
+    protected void process()
+      throws ManifoldCFException
+    {
+      // Deal with the ttlvalue, if it was found
+      // Use the ttl value as a signal for when we ought to look at this feed again.  If not present, use the default.
+      handler.noteDiscoveredTtlValue(ttlValue);
+    }
+  }
+
+  protected class UrlsetItemContextClass extends XMLContext
+  {
+    protected String linkField = null;
+
+    public UrlsetItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+    {
+      super(theStream,namespaceURI,localName,qName,atts);
+    }
+
+    protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+      throws ManifoldCFException, ServiceInterruption
+    {
+      // The tags we care about are "loc", nothing else.
+      if (qName.equals("loc"))
+      {
+        // "loc" tag
+        return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+      }
+      else
+      {
+        // Skip everything else.
+        return super.beginTag(namespaceURI,localName,qName,atts);
+      }
+    }
+
+    /** Convert the individual sub-fields of the item context into their final forms */
+    protected void endTag()
+      throws ManifoldCFException, ServiceInterruption
+    {
+      XMLContext theContext = theStream.getContext();
+      String theTag = theContext.getQname();
+      if (theTag.equals("loc"))
+      {
+        linkField = ((XMLStringContext)theContext).getValue();
+      }
+      else
+      {
+        super.endTag();
+      }
+    }
+
+    protected void tagCleanup()
+      throws ManifoldCFException
+    {
+    }
+
+    /** Process the data accumulated for this item */
+    public void process(IXMLHandler handler)
+      throws ManifoldCFException
+    {
+      if (linkField != null && linkField.length() > 0)
+      {
+        String[] links = linkField.split(", ");
+        int l = 0;
+        while (l < links.length)
+        {
+          String rawURL = links[l++].trim();
+          // Process the link
+          handler.noteDiscoveredLink(rawURL);
+        }
+      }
+    }
+  }
+
   /** Handle document references from HTML */
   protected void handleHTML(String documentURI, IHTMLHandler handler)
     throws ManifoldCFException