You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2011/09/16 13:53:33 UTC
svn commit: r1171519 - in /incubator/lcf/trunk: ./
connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/
connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/
Author: kwright
Date: Fri Sep 16 11:53:32 2011
New Revision: 1171519
URL: http://svn.apache.org/viewvc?rev=1171519&view=rev
Log:
Fix for CONNECTORS-255. Add ability to process site map documents in both web connector and rss connector.
Modified:
incubator/lcf/trunk/CHANGES.txt
incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java
incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: incubator/lcf/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/CHANGES.txt?rev=1171519&r1=1171518&r2=1171519&view=diff
==============================================================================
--- incubator/lcf/trunk/CHANGES.txt (original)
+++ incubator/lcf/trunk/CHANGES.txt Fri Sep 16 11:53:32 2011
@@ -3,6 +3,10 @@ $Id$
======================= 0.4-dev =====================
+CONNECTORS-255: Add ability to process site map documents, in both
+RSS connector and Web connector.
+(Karl Wright)
+
CONNECTORS-202: Add commit-within parameter to Solr output connector.
(Jan Høydahl, Karl Wright)
Modified: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1171519&r1=1171518&r2=1171519&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Fri Sep 16 11:53:32 2011
@@ -3493,7 +3493,13 @@ public class RSSConnector extends org.ap
outerTagCount++;
return new FeedContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
}
-
+ else if (qName.equals("urlset") || qName.equals("sitemapindex"))
+ {
+ // Sitemap detected
+ outerTagCount++;
+ return new UrlsetContextClass(theStream,namespaceURI,localName,qName,atts,documentIdentifier,activities,filter);
+ }
+
// The default action is to establish a new default context.
return super.beginTag(namespaceURI,localName,qName,atts);
}
@@ -3516,6 +3522,10 @@ public class RSSConnector extends org.ap
{
rescanTimeSet = ((FeedContextClass)context).process();
}
+ else if (tagName.equals("urlset") || tagName.equals("sitemapindex"))
+ {
+ rescanTimeSet = ((UrlsetContextClass)context).process();
+ }
else
super.endTag();
}
@@ -4750,6 +4760,214 @@ public class RSSConnector extends org.ap
}
}
}
+ /** Context for a sitemap's outer "urlset" or "sitemapindex" tag: hands each "url"/"sitemap" entry to an item context and sets the document's rescan schedule. */
+ protected class UrlsetContextClass extends XMLContext
+ {
+ /** The document identifier of the sitemap document; passed to each item's process() and used in log messages */
+ protected String documentIdentifier;
+ /** Activities interface, used here to set the schedule bounds and handed to each item for adding references */
+ protected IProcessActivity activities;
+ /** Filter supplying the default and minimum rescan times; also handed to each item */
+ protected Filter filter;
+
+ /** ttl value, parsed as minutes. NOTE(review): nothing in this class assigns it, so process() sees null unless it is set elsewhere - confirm. */
+ protected String ttlValue = null;
+
+ public UrlsetContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentIdentifier, IProcessActivity activities, Filter filter)
+ {
+ super(theStream,namespaceURI,localName,qName,atts);
+ this.documentIdentifier = documentIdentifier;
+ this.activities = activities;
+ this.filter = filter;
+ }
+
+ protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // The tags we care about are "url" and "sitemap", nothing else.
+ if (qName.equals("url") || qName.equals("sitemap"))
+ {
+ // Item seen. We don't need any of the attributes etc., but we need to start a new context.
+ return new UrlsetItemContextClass(theStream,namespaceURI,localName,qName,atts);
+ }
+ // Everything else gets the superclass's default handling.
+ return super.beginTag(namespaceURI,localName,qName,atts);
+ }
+
+ protected void endTag()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ XMLContext theContext = theStream.getContext();
+ String theTag = theContext.getQname();
+ if (theTag.equals("url") || theTag.equals("sitemap"))
+ {
+ // It's an item.
+ UrlsetItemContextClass itemContext = (UrlsetItemContextClass)theContext;
+ // By the time we get here the item context has recorded what it collected while parsing, namely:
+ // (1) The "loc" value (the link)
+ // (2) The "lastmod" value (the date), if present
+ // Nothing else is kept by the item context: its only fields are the
+ // link and the date.
+ // The job now is to pull this info out and call the activities interface appropriately.
+
+ // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. For this item
+ // context tagCleanup() is an empty method, so there is nothing for it to remove.
+ // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup()
+ // method will be called regardless.
+ itemContext.process(documentIdentifier,activities,filter);
+ }
+ else
+ super.endTag();
+ }
+
+ /** Compute the rescan time for this document and set its schedule bounds; always returns true. */
+ protected boolean process()
+ throws ManifoldCFException
+ {
+ // Deal with the ttlvalue, if it was found
+ // Use the ttl value as a signal for when we ought to look at this document again. If not present, use the filter's default.
+ long currentTime = System.currentTimeMillis();
+ Long rescanTime = filter.getDefaultRescanTime(currentTime);
+ if (ttlValue != null)
+ {
+ try
+ {
+ int minutes = Integer.parseInt(ttlValue);
+ long nextTime = currentTime + minutes * 60000L;
+ rescanTime = new Long(nextTime);
+ // The ttl-derived time replaces the default rescan time.
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found a ttl value of "+ttlValue+"; setting refetch time accordingly");
+ }
+ catch (NumberFormatException e)
+ {
+ Logging.connectors.warn("RSS: SiteMap document '"+documentIdentifier+"' has illegal ttl value '"+ttlValue+"'");
+ }
+ }
+ // Never rescan sooner than the filter's minimum rescan time, when one is given.
+ if (rescanTime != null)
+ {
+ Long minimumTime = filter.getMinimumRescanTime(currentTime);
+ if (minimumTime != null)
+ {
+ if (rescanTime.longValue() < minimumTime.longValue())
+ rescanTime = minimumTime;
+ }
+ }
+
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"' setting rescan time to "+((rescanTime==null)?"null":rescanTime.toString()));
+
+ activities.setDocumentScheduleBounds(documentIdentifier,rescanTime,rescanTime,null,null);
+ return true;
+ }
+ }
+ /** Context for one "url" or "sitemap" entry of a sitemap: records the entry's "loc" and "lastmod" values and turns them into document references. */
+ protected class UrlsetItemContextClass extends XMLContext
+ {
+ protected String linkField = null;
+ protected String pubDateField = null;
+
+ public UrlsetItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ super(theStream,namespaceURI,localName,qName,atts);
+ }
+
+ protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // The tags we care about are "loc" and "lastmod", nothing else.
+ if (qName.equals("loc"))
+ {
+ // "loc" tag; its value is read back in endTag()
+ return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ }
+ else if (qName.equals("lastmod"))
+ {
+ // "lastmod" tag; its value is read back in endTag()
+ return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ }
+ else
+ {
+ // Skip everything else.
+ return super.beginTag(namespaceURI,localName,qName,atts);
+ }
+ }
+
+ /** Record the value of a just-closed "loc" or "lastmod" child tag; any other tag goes to the superclass. */
+ protected void endTag()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ XMLContext theContext = theStream.getContext();
+ String theTag = theContext.getQname();
+ if (theTag.equals("loc"))
+ {
+ linkField = ((XMLStringContext)theContext).getValue();
+ }
+ else if (theTag.equals("lastmod"))
+ {
+ pubDateField = ((XMLStringContext)theContext).getValue();
+ }
+ else
+ {
+ super.endTag();
+ }
+ }
+
+ protected void tagCleanup()
+ throws ManifoldCFException
+ {
+ }
+
+ /** Process the data accumulated for this item: add a document reference for each legal link found. Does nothing when no link was recorded. */
+ public void process(String documentIdentifier, IProcessActivity activities, Filter filter)
+ throws ManifoldCFException
+ {
+ if (linkField != null && linkField.length() > 0)
+ {
+ Long origDate = null;
+ if (pubDateField != null && pubDateField.length() > 0)
+ origDate = parseZuluDate(pubDateField);
+ // NOTE(review): linkField holds the value of a single "loc" tag, yet it is split on ", " - confirm several links per entry are expected.
+ String[] links = linkField.split(", ");
+ int l = 0;
+ while (l < links.length)
+ {
+ String rawURL = links[l++].trim();
+ // Process the link; a null identifier is logged below as an unincluded URL
+ String newIdentifier = makeDocumentIdentifier(filter.getCanonicalizationPolicies(),documentIdentifier,rawURL);
+ if (newIdentifier != null)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found a link to '"+newIdentifier+"', which has origination date "+
+ ((origDate==null)?"null":origDate.toString()));
+ if (filter.isLegalURL(newIdentifier))
+ {
+ // It's a reference! Add it. The "pubdate" entry (dataValues[0]) stays null when there is no origination date.
+ String[] dataNames = new String[]{"pubdate","source"};
+ String[][] dataValues = new String[dataNames.length][];
+ if (origDate != null)
+ dataValues[0] = new String[]{origDate.toString()};
+ dataValues[1] = new String[]{documentIdentifier};
+
+ // Add document reference, including the data to pass down
+ activities.addDocumentReference(newIdentifier,documentIdentifier,null,dataNames,dataValues,origDate);
+ }
+ else
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Identifier '"+newIdentifier+"' is excluded");
+ }
+ }
+ else
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: In SiteMap document '"+documentIdentifier+"', found an unincluded URL '"+rawURL+"'");
+ }
+ }
+ }
+ }
+ }
// Month map
protected static HashMap monthMap = new HashMap();
Modified: incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java?rev=1171519&r1=1171518&r2=1171519&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java (original)
+++ incubator/lcf/trunk/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/Robots.java Fri Sep 16 11:53:32 2011
@@ -649,6 +649,14 @@ public class Robots
{
// We don't complain about this, but right now we don't listen to it either.
}
+ else if (lowercaseLine.startsWith("sitemap:"))
+ {
+ // We don't complain about this, but right now we don't listen to it either.
+ }
+ else if (lowercaseLine.startsWith("sitemap"))
+ {
+ // Same treatment when the colon is missing: we don't complain about this, but right now we don't listen to it either.
+ }
else
{
// If it's not just a blank line, complain
Modified: incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1171519&r1=1171518&r2=1171519&view=diff
==============================================================================
--- incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ incubator/lcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Sep 16 11:53:32 2011
@@ -6183,6 +6183,12 @@ public class WebcrawlerConnector extends
outerTagCount++;
return new FeedContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
}
+ else if (qName.equals("urlset") || qName.equals("sitemapindex"))
+ {
+ // Sitemap detected
+ outerTagCount++;
+ return new UrlsetContextClass(theStream,namespaceURI,localName,qName,atts,documentURI,handler);
+ }
// The default action is to establish a new default context.
return super.beginTag(namespaceURI,localName,qName,atts);
@@ -6202,6 +6208,10 @@ public class WebcrawlerConnector extends
{
((FeedContextClass)context).process();
}
+ else if (tagName.equals("urlset") || tagName.equals("sitemapindex"))
+ {
+ ((UrlsetContextClass)context).process();
+ }
else
super.endTag();
}
@@ -6641,6 +6651,136 @@ public class WebcrawlerConnector extends
}
}
+ protected class UrlsetContextClass extends XMLContext
+ {
+ /** The document URI handed to the constructor (stored, but not otherwise used inside this class) */
+ protected String documentURI;
+ /** XML handler that receives the discovered links and the ttl value */
+ protected IXMLHandler handler;
+
+ /** ttl value. NOTE(review): nothing in this class assigns it, so process() reports null unless it is set elsewhere - confirm. */
+ protected String ttlValue = null;
+
+ public UrlsetContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts, String documentURI, IXMLHandler handler)
+ {
+ super(theStream,namespaceURI,localName,qName,atts);
+ this.documentURI = documentURI;
+ this.handler = handler;
+ }
+
+ protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // The tags we care about are "url" and "sitemap", nothing else.
+ if (qName.equals("url") || qName.equals("sitemap"))
+ {
+ // Item seen. We don't need any of the attributes etc., but we need to start a new context.
+ return new UrlsetItemContextClass(theStream,namespaceURI,localName,qName,atts);
+ }
+ // Everything else gets the superclass's default handling.
+ return super.beginTag(namespaceURI,localName,qName,atts);
+ }
+
+ protected void endTag()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ XMLContext theContext = theStream.getContext();
+ String theTag = theContext.getQname();
+ if (theTag.equals("url") || theTag.equals("sitemap"))
+ {
+ // It's an item.
+ UrlsetItemContextClass itemContext = (UrlsetItemContextClass)theContext;
+ // By the time we get here the item context has finished parsing and has recorded the one thing
+ // it collects:
+ // (1) The "loc" value (the link)
+ // Nothing else is kept by the item context; the link is its
+ // only field.
+ // The job now is to pull this info out and call the handler appropriately.
+
+ // NOTE: After this endTag() method is called, tagCleanup() will be called for the item context. For this item
+ // context tagCleanup() is an empty method, so there is nothing for it to remove.
+ // If an exception or error is thrown during the parse, this endTag() method will NOT be called, but the tagCleanup()
+ // method will be called regardless.
+ itemContext.process(handler);
+ }
+ else
+ super.endTag();
+ }
+
+ /** Report the ttl value (possibly null) to the handler. */
+ protected void process()
+ throws ManifoldCFException
+ {
+ // Hand the ttl value to the handler, whether or not one was found.
+ // NOTE(review): presumably the handler uses it to decide when to look at this document again - confirm.
+ handler.noteDiscoveredTtlValue(ttlValue);
+ }
+ }
+ /** Context for one "url" or "sitemap" entry of a sitemap: records the entry's "loc" value and reports it to the handler as a discovered link. */
+ protected class UrlsetItemContextClass extends XMLContext
+ {
+ protected String linkField = null;
+
+ public UrlsetItemContextClass(XMLStream theStream, String namespaceURI, String localName, String qName, Attributes atts)
+ {
+ super(theStream,namespaceURI,localName,qName,atts);
+ }
+
+ protected XMLContext beginTag(String namespaceURI, String localName, String qName, Attributes atts)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ // The tags we care about are "loc", nothing else.
+ if (qName.equals("loc"))
+ {
+ // "loc" tag; its value is read back in endTag()
+ return new XMLStringContext(theStream,namespaceURI,localName,qName,atts);
+ }
+ else
+ {
+ // Skip everything else.
+ return super.beginTag(namespaceURI,localName,qName,atts);
+ }
+ }
+
+ /** Record the value of a just-closed "loc" child tag; any other tag goes to the superclass. */
+ protected void endTag()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ XMLContext theContext = theStream.getContext();
+ String theTag = theContext.getQname();
+ if (theTag.equals("loc"))
+ {
+ linkField = ((XMLStringContext)theContext).getValue();
+ }
+ else
+ {
+ super.endTag();
+ }
+ }
+
+ protected void tagCleanup()
+ throws ManifoldCFException
+ {
+ }
+
+ /** Process the data accumulated for this item: the recorded link is split on ", " and each trimmed piece is reported to the handler. Does nothing when no link was recorded. */
+ public void process(IXMLHandler handler)
+ throws ManifoldCFException
+ {
+ if (linkField != null && linkField.length() > 0)
+ {
+ String[] links = linkField.split(", ");
+ int l = 0;
+ while (l < links.length)
+ {
+ String rawURL = links[l++].trim();
+ // Report the link to the handler
+ handler.noteDiscoveredLink(rawURL);
+ }
+ }
+ }
+ }
+
/** Handle document references from HTML */
protected void handleHTML(String documentURI, IHTMLHandler handler)
throws ManifoldCFException