Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/23 02:29:46 UTC
svn commit: r1633729 - in /manifoldcf/branches/dev_1x: ./ connectors/rss/
connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Author: kwright
Date: Thu Oct 23 00:29:46 2014
New Revision: 1633729
URL: http://svn.apache.org/r1633729
Log:
Pull up more CONNECTORS-1077-related changes
Modified:
manifoldcf/branches/dev_1x/ (props changed)
manifoldcf/branches/dev_1x/connectors/rss/ (props changed)
manifoldcf/branches/dev_1x/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
Merged /manifoldcf/trunk:r1633727
Propchange: manifoldcf/branches/dev_1x/connectors/rss/
------------------------------------------------------------------------------
Merged /manifoldcf/trunk/connectors/rss:r1633727
Modified: manifoldcf/branches/dev_1x/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java?rev=1633729&r1=1633728&r2=1633729&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/rss/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/rss/RSSConnector.java Thu Oct 23 00:29:46 2014
@@ -129,7 +129,8 @@ public class RSSConnector extends org.ap
// Activity types
public final static String ACTIVITY_FETCH = "fetch";
public final static String ACTIVITY_ROBOTSPARSE = "robots parse";
-
+ public final static String ACTIVITY_PROCESS = "process";
+
/** Deny access token for default authority */
private final static String defaultAuthorityDenyToken = "DEAD_AUTHORITY";
@@ -701,6 +702,21 @@ public class RSSConnector extends org.ap
return rval;
}
+ protected static Set<String> xmlContentTypes;
+ static
+ {
+ xmlContentTypes = new HashSet<String>();
+ xmlContentTypes.add("text/xml");
+ xmlContentTypes.add("application/rss+xml");
+ xmlContentTypes.add("application/xml");
+ xmlContentTypes.add("application/atom+xml");
+ xmlContentTypes.add("application/xhtml+xml");
+ xmlContentTypes.add("text/XML");
+ xmlContentTypes.add("application/rdf+xml");
+ xmlContentTypes.add("text/application");
+ xmlContentTypes.add("XML");
+ }
+
/** Process a set of documents.
* This is the method that should cause each document to be fetched, processed, and the results either added
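
For readers following the diff: the static xmlContentTypes set introduced above replaces the long chain of contentType.startsWith(...) comparisons that the large hunk below removes. A minimal standalone sketch of the lookup it enables is given here; the class and method names are invented for the example and are not part of the connector. The Content-Type header may list several types; the LAST entry is used, and any ";charset=..." parameter is stripped before the exact set lookup.

  import java.util.HashSet;
  import java.util.Set;

  // Illustrative sketch only (not part of this commit).
  public class ContentTypeCheckSketch
  {
    static final Set<String> xmlContentTypes = new HashSet<String>();
    static
    {
      xmlContentTypes.add("text/xml");
      xmlContentTypes.add("application/rss+xml");
      xmlContentTypes.add("application/xml");
      xmlContentTypes.add("application/atom+xml");
      xmlContentTypes.add("application/xhtml+xml");
      xmlContentTypes.add("text/XML");
      xmlContentTypes.add("application/rdf+xml");
      xmlContentTypes.add("text/application");
      xmlContentTypes.add("XML");
    }

    static boolean looksLikeXml(String contentTypeHeader)
    {
      if (contentTypeHeader == null)
        return false;
      // Some sites send multiple content types; take the last one.
      String[] contentTypes = contentTypeHeader.split(",");
      String contentType = contentTypes[contentTypes.length - 1].trim();
      // Strip any ";charset=..." parameter before the set lookup.
      int pos = contentType.indexOf(";");
      if (pos != -1)
        contentType = contentType.substring(0, pos).trim();
      return xmlContentTypes.contains(contentType);
    }

    public static void main(String[] args)
    {
      System.out.println(looksLikeXml("application/rss+xml; charset=UTF-8")); // true
      System.out.println(looksLikeXml("text/html, application/xhtml+xml"));   // true (last entry wins)
      System.out.println(looksLikeXml("text/html"));                          // false
    }
  }
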
@@ -929,456 +945,512 @@ public class RSSConnector extends org.ap
"' and a previous Last-Modified value of '"+((lastModifiedValue==null)?"null":lastModifiedValue)+"'");
// Robots check. First, we need to separate the url into its components
+ URL url;
try
{
- URL url = new URL(urlValue);
- String protocol = url.getProtocol();
- int port = url.getPort();
- String hostName = url.getHost();
- String pathPart = url.getFile();
-
- // Check with robots to see if it's allowed
- if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext,throttleGroupName,
- protocol,port,hostName,url.getPath(),
- userAgent,from,
- proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword,
- activities, connectionLimit))
- {
- activities.recordActivity(null,ACTIVITY_FETCH,
- null,urlValue,Integer.toString(-2),"Robots exclusion",null);
+ url = new URL(urlValue);
+ }
+ catch (MalformedURLException e)
+ {
+ Logging.connectors.debug("RSS: URL '"+urlValue+"' is malformed; skipping",e);
+ activities.deleteDocument(documentIdentifier);
+ continue;
+ }
+
+ String protocol = url.getProtocol();
+ int port = url.getPort();
+ String hostName = url.getHost();
+ String pathPart = url.getFile();
+
+ // Check with robots to see if it's allowed
+ if (robotsUsage >= ROBOTS_DATA && !robots.isFetchAllowed(currentContext,throttleGroupName,
+ protocol,port,hostName,url.getPath(),
+ userAgent,from,
+ proxyHost, proxyPort, proxyAuthDomain, proxyAuthUsername, proxyAuthPassword,
+ activities, connectionLimit))
+ {
+ activities.recordActivity(null,ACTIVITY_FETCH,
+ null,urlValue,Integer.toString(-2),"Robots exclusion",null);
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Skipping url '"+urlValue+"' because robots.txt says to");
- activities.deleteDocument(documentIdentifier);
- continue;
- }
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Skipping url '"+urlValue+"' because robots.txt says to");
+ activities.deleteDocument(documentIdentifier);
+ continue;
+ }
- // Now, use the fetcher, and get the file.
- IThrottledConnection connection = fetcher.createConnection(currentContext,
- throttleGroupName,
- hostName,
- connectionLimit,
- feedTimeout,
- proxyHost,
- proxyPort,
- proxyAuthDomain,
- proxyAuthUsername,
- proxyAuthPassword);
+ // Now, use the fetcher, and get the file.
+ IThrottledConnection connection = fetcher.createConnection(currentContext,
+ throttleGroupName,
+ hostName,
+ connectionLimit,
+ feedTimeout,
+ proxyHost,
+ proxyPort,
+ proxyAuthDomain,
+ proxyAuthUsername,
+ proxyAuthPassword);
+ try
+ {
+ // Begin the fetch
+ connection.beginFetch("Data");
try
{
- // Begin the fetch
- connection.beginFetch("Data");
- try
+ // Execute the request.
+ // Use the connect timeout from the document specification!
+ int status = connection.executeFetch(protocol,port,pathPart,userAgent,from,
+ lastETagValue,lastModifiedValue);
+ switch (status)
{
- // Execute the request.
- // Use the connect timeout from the document specification!
- int status = connection.executeFetch(protocol,port,pathPart,userAgent,from,
- lastETagValue,lastModifiedValue);
- switch (status)
+ case IThrottledConnection.STATUS_NOCHANGE:
+ versionString = oldVersionString;
+ break;
+ case IThrottledConnection.STATUS_OK:
+ try
{
- case IThrottledConnection.STATUS_NOCHANGE:
- versionString = oldVersionString;
- break;
- case IThrottledConnection.STATUS_OK:
- try
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Successfully fetched "+urlValue);
+ // Document successfully fetched!
+ // If its content is xml, presume it's a feed...
+ String contentType = connection.getResponseHeader("Content-Type");
+ // Some sites have multiple content types. We just look at the LAST one in that case.
+ if (contentType != null)
{
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Successfully fetched "+urlValue);
- // Document successfully fetched!
- // If its content is xml, presume it's a feed...
- String contentType = connection.getResponseHeader("Content-Type");
- // Some sites have multiple content types. We just look at the LAST one in that case.
- if (contentType != null)
+ String[] contentTypes = contentType.split(",");
+ if (contentTypes.length > 0)
+ contentType = contentTypes[contentTypes.length-1].trim();
+ else
+ contentType = null;
+ }
+ String strippedContentType = contentType;
+ if (strippedContentType != null)
+ {
+ int pos = strippedContentType.indexOf(";");
+ if (pos != -1)
+ strippedContentType = strippedContentType.substring(0,pos).trim();
+ }
+ boolean isXML = (strippedContentType != null && xmlContentTypes.contains(strippedContentType));
+ ingestURL = null;
+ if (!isXML)
+ {
+ // If the chromed content mode is set to "skip", and we got here, it means
+ // we should not include the content.
+ if (f.getChromedContentMode() == CHROMED_SKIP)
{
- String[] contentTypes = contentType.split(",");
- if (contentTypes.length > 0)
- contentType = contentTypes[contentTypes.length-1].trim();
- else
- contentType = null;
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Removing url '"+urlValue+"' because it no longer has dechromed content available");
+ versionString = null;
+ break;
}
- boolean isXML = (contentType != null &&
- (contentType.startsWith("text/xml") ||
- contentType.startsWith("application/rss+xml") ||
- contentType.startsWith("application/xml") ||
- contentType.startsWith("application/atom+xml") ||
- contentType.startsWith("application/xhtml+xml") ||
- contentType.startsWith("text/XML") ||
- contentType.startsWith("application/rdf+xml") ||
- contentType.startsWith("text/application") ||
- contentType.startsWith("XML") ));
- ingestURL = null;
- if (!isXML)
+
+ // Decide whether to exclude this document based on what we see here.
+ // Basically, we want to get rid of everything that we don't know what
+ // to do with in the ingestion system.
+ if (!activities.checkMimeTypeIndexable(contentType))
{
- // If the chromed content mode is set to "skip", and we got here, it means
- // we should not include the content.
- if (f.getChromedContentMode() == CHROMED_SKIP)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Removing url '"+urlValue+"' because it no longer has dechromed content available");
- versionString = null;
- break;
- }
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Removing url '"+urlValue+"' because it had the wrong content type: "+((contentType==null)?"null":"'"+contentType+"'"));
+ versionString = null;
+ break;
+ }
- // Decide whether to exclude this document based on what we see here.
- // Basically, we want to get rid of everything that we don't know what
- // to do with in the ingestion system.
- if (!isContentInteresting(activities,contentType))
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Removing url '"+urlValue+"' because it had the wrong content type: "+((contentType==null)?"null":"'"+contentType+"'"));
- versionString = null;
- break;
- }
+ ingestURL = f.mapDocumentURL(urlValue);
+ }
+ else
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: The url '"+urlValue+"' is a feed");
- ingestURL = f.mapDocumentURL(urlValue);
- }
- else
+ if (!f.isSeed(urlValue))
{
+ // Remove the feed from consideration, since it has left the list of seeds
if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: The url '"+urlValue+"' is a feed");
+ Logging.connectors.debug("RSS: Removing feed url '"+urlValue+"' because it is not a seed.");
+ versionString = null;
+ break;
+ }
+ }
- if (!f.isSeed(urlValue))
+ InputStream is = connection.getResponseBodyStream();
+ try
+ {
+ long checkSum = cache.addData(activities,urlValue,contentType,is);
+ StringBuilder sb = new StringBuilder();
+ if (ingestURL != null)
+ {
+ // We think it is ingestable. The version string accordingly starts with a "+".
+
+ // Grab what we need from the passed-down data for the document. These will all become part
+ // of the version string.
+ pubDates = activities.retrieveParentData(urlValue,"pubdate");
+ sources = activities.retrieveParentData(urlValue,"source");
+ titles = activities.retrieveParentData(urlValue,"title");
+ authorNames = activities.retrieveParentData(urlValue,"authorname");
+ authorEmails = activities.retrieveParentData(urlValue,"authoremail");
+ categories = activities.retrieveParentData(urlValue,"category");
+ descriptions = activities.retrieveParentData(urlValue,"description");
+ java.util.Arrays.sort(pubDates);
+ java.util.Arrays.sort(sources);
+ java.util.Arrays.sort(titles);
+ java.util.Arrays.sort(authorNames);
+ java.util.Arrays.sort(authorEmails);
+ java.util.Arrays.sort(categories);
+ java.util.Arrays.sort(descriptions);
+
+ if (sources.length == 0)
{
- // Remove the feed from consideration, since it has left the list of seeds
if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Removing feed url '"+urlValue+"' because it is not a seed.");
- versionString = null;
- break;
+ Logging.connectors.debug("RSS: Warning; URL '"+ingestURL+"' doesn't seem to have any RSS feed source!");
}
- }
- InputStream is = connection.getResponseBodyStream();
- try
- {
- long checkSum = cache.addData(activities,urlValue,contentType,is);
- StringBuilder sb = new StringBuilder();
- if (ingestURL != null)
+ sb.append('+');
+ packList(sb,acls,'+');
+ if (acls.length > 0)
{
- // We think it is ingestable. The version string accordingly starts with a "+".
-
- // Grab what we need from the passed-down data for the document. These will all become part
- // of the version string.
- pubDates = activities.retrieveParentData(urlValue,"pubdate");
- sources = activities.retrieveParentData(urlValue,"source");
- titles = activities.retrieveParentData(urlValue,"title");
- authorNames = activities.retrieveParentData(urlValue,"authorname");
- authorEmails = activities.retrieveParentData(urlValue,"authoremail");
- categories = activities.retrieveParentData(urlValue,"category");
- descriptions = activities.retrieveParentData(urlValue,"description");
- java.util.Arrays.sort(pubDates);
- java.util.Arrays.sort(sources);
- java.util.Arrays.sort(titles);
- java.util.Arrays.sort(authorNames);
- java.util.Arrays.sort(authorEmails);
- java.util.Arrays.sort(categories);
- java.util.Arrays.sort(descriptions);
-
- if (sources.length == 0)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Warning; URL '"+ingestURL+"' doesn't seem to have any RSS feed source!");
- }
-
sb.append('+');
- packList(sb,acls,'+');
- if (acls.length > 0)
- {
- sb.append('+');
- pack(sb,defaultAuthorityDenyToken,'+');
- }
- else
- sb.append('-');
- // Now, do the metadata
- packList(sb,metadata,'+');
- // The ingestion URL
- pack(sb,ingestURL,'+');
- // The pub dates
- packList(sb,pubDates,'+');
- // The titles
- packList(sb,titles,'+');
- // The sources
- packList(sb,sources,'+');
- // The categories
- packList(sb,categories,'+');
- // The descriptions
- packList(sb,descriptions,'+');
- // The author names
- packList(sb,authorNames,'+');
- // The author emails
- packList(sb,authorEmails,'+');
+ pack(sb,defaultAuthorityDenyToken,'+');
}
else
- {
sb.append('-');
- String etag = connection.getResponseHeader("ETag");
- if (etag == null)
- pack(sb,"",'+');
- else
- pack(sb,etag,'+');
- String lastModified = connection.getResponseHeader("Last-Modified");
- if (lastModified == null)
- pack(sb,"",'+');
- else
- pack(sb,lastModified,'+');
-
- }
-
- // Do the checksum part, which does not need to be parseable.
- sb.append(new Long(checkSum).toString());
-
- versionString = sb.toString();
+ // The ingestion URL
+ pack(sb,ingestURL,'+');
+ // The pub dates
+ packList(sb,pubDates,'+');
+ // The titles
+ packList(sb,titles,'+');
+ // The sources
+ packList(sb,sources,'+');
+ // The categories
+ packList(sb,categories,'+');
+ // The descriptions
+ packList(sb,descriptions,'+');
+ // The author names
+ packList(sb,authorNames,'+');
+ // The author emails
+ packList(sb,authorEmails,'+');
}
- finally
+ else
{
- is.close();
+ sb.append('-');
+ String etag = connection.getResponseHeader("ETag");
+ if (etag == null)
+ pack(sb,"",'+');
+ else
+ pack(sb,etag,'+');
+ String lastModified = connection.getResponseHeader("Last-Modified");
+ if (lastModified == null)
+ pack(sb,"",'+');
+ else
+ pack(sb,lastModified,'+');
+
}
+
+ // Do the checksum part, which does not need to be parseable.
+ sb.append(new Long(checkSum).toString());
+
+ versionString = sb.toString();
}
- catch (java.net.SocketTimeoutException e)
- {
- Logging.connectors.warn("RSS: Socket timeout exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e);
- versionString = null;
- }
- catch (ConnectTimeoutException e)
- {
- Logging.connectors.warn("RSS: Connecto timeout exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e);
- versionString = null;
- }
- catch (InterruptedIOException e)
- {
- throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
- }
- catch (IOException e)
+ finally
{
- Logging.connectors.warn("RSS: IO exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e);
- versionString = null;
+ is.close();
}
+ }
+ catch (java.net.SocketTimeoutException e)
+ {
+ Logging.connectors.warn("RSS: Socket timeout exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e);
+ versionString = null;
+ }
+ catch (ConnectTimeoutException e)
+ {
+ Logging.connectors.warn("RSS: Connecto timeout exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e);
+ versionString = null;
+ }
+ catch (InterruptedIOException e)
+ {
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (IOException e)
+ {
+ Logging.connectors.warn("RSS: IO exception fetching document contents '"+urlValue+"' - skipping: "+e.getMessage(), e);
+ versionString = null;
+ }
- break;
+ break;
- case IThrottledConnection.STATUS_SITEERROR:
- case IThrottledConnection.STATUS_PAGEERROR:
- default:
- // Record an *empty* version.
- // This signals the processDocuments() method that we really don't want to ingest this document, but we also don't
- // want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times.
- versionString = "";
- break;
- }
- }
- finally
- {
- connection.doneFetch(activities);
+ case IThrottledConnection.STATUS_SITEERROR:
+ case IThrottledConnection.STATUS_PAGEERROR:
+ default:
+ // Record an *empty* version.
+ // This signals the processDocuments() method that we really don't want to ingest this document, but we also don't
+ // want to blow the document out of the queue, since then we'd wind up perhaps fetching it multiple times.
+ versionString = "";
+ break;
}
}
finally
{
- connection.close();
+ connection.doneFetch(activities);
}
+ }
+ finally
+ {
+ connection.close();
+ }
- if (versionString == null)
- {
- activities.deleteDocument(documentIdentifier);
- continue;
- }
+ if (versionString == null)
+ {
+ activities.deleteDocument(documentIdentifier);
+ continue;
+ }
- if (!(versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier,versionString)))
- continue;
+ if (!(versionString.length() == 0 || activities.checkDocumentNeedsReindexing(documentIdentifier,versionString)))
+ continue;
- // Process document!
+ // Process document!
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Processing '"+urlValue+"'");
+
+ // The only links we extract come from documents that we think are RSS feeds.
+ // When we think that's the case, we attempt to parse it as RSS XML.
+ if (ingestURL == null)
+ {
if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Processing '"+urlValue+"'");
+ Logging.connectors.debug("RSS: Interpreting document '"+urlValue+"' as a feed");
- // The only links we extract come from documents that we think are RSS feeds.
- // When we think that's the case, we attempt to parse it as RSS XML.
- if (ingestURL == null)
+ // We think it is a feed.
+ // If this is a continuous job, AND scanonly is true, it means that the document was either identical to the
+ // previous fetch, or was not fetched at all. In that case, it may not even be there, and we *certainly* don't
+ // want to attempt to process it in any case.
+ //
+
+ // NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost. If the
+ // TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds.
+ if (true || jobMode != JOBMODE_CONTINUOUS)
{
+ handleRSSFeedSAX(urlValue,activities,f);
if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Interpreting document '"+urlValue+"' as a feed");
-
- // We think it is a feed.
- // If this is a continuous job, AND scanonly is true, it means that the document was either identical to the
- // previous fetch, or was not fetched at all. In that case, it may not even be there, and we *certainly* don't
- // want to attempt to process it in any case.
- //
-
- // NOTE: I re-enabled the scan permanently because we need the TTL value to be set whatever the cost. If the
- // TTL value is not set, we default to the specified job's feed-rescan time, which is not going to be current enough for some feeds.
- if (true || jobMode != JOBMODE_CONTINUOUS)
- {
- handleRSSFeedSAX(urlValue,activities,f);
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Extraction of feed '"+urlValue+"' complete");
+ Logging.connectors.debug("RSS: Extraction of feed '"+urlValue+"' complete");
- // Record the feed's version string, so we won't refetch unless needed.
- // This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to
- // keep track of the adaptive parameters.
- activities.recordDocument(documentIdentifier,versionString);
- }
- else
- {
- // The problem here is that we really do need to set the rescan time to something reasonable.
- // But we might not even have read the feed! So what to do??
- // One answer is to build a connector-specific table that carries the last value of every feed around.
- // Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified).
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Feed '"+urlValue+"' does not appear to differ from previous fetch for a continuous job; not extracting!");
-
- long currentTime = System.currentTimeMillis();
+ // Record the feed's version string, so we won't refetch unless needed.
+ // This functionality is required for the last ETag and Last-Modified fields to be sent to the rss server, and to
+ // keep track of the adaptive parameters.
+ activities.recordDocument(documentIdentifier,versionString);
+ }
+ else
+ {
+ // The problem here is that we really do need to set the rescan time to something reasonable.
+ // But we might not even have read the feed! So what to do??
+ // One answer is to build a connector-specific table that carries the last value of every feed around.
+ // Another answer is to change the version code to always read the feed (and the heck with ETag and Last-Modified).
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Feed '"+urlValue+"' does not appear to differ from previous fetch for a continuous job; not extracting!");
- Long defaultRescanTime = f.getDefaultRescanTime(currentTime);
+ long currentTime = System.currentTimeMillis();
+
+ Long defaultRescanTime = f.getDefaultRescanTime(currentTime);
- if (defaultRescanTime != null)
+ if (defaultRescanTime != null)
+ {
+ Long minimumTime = f.getMinimumRescanTime(currentTime);
+ if (minimumTime != null)
{
- Long minimumTime = f.getMinimumRescanTime(currentTime);
- if (minimumTime != null)
- {
- if (defaultRescanTime.longValue() < minimumTime.longValue())
- defaultRescanTime = minimumTime;
- }
+ if (defaultRescanTime.longValue() < minimumTime.longValue())
+ defaultRescanTime = minimumTime;
}
+ }
- activities.setDocumentScheduleBounds(urlValue,defaultRescanTime,defaultRescanTime,null,null);
+ activities.setDocumentScheduleBounds(urlValue,defaultRescanTime,defaultRescanTime,null,null);
- }
}
- else
+ }
+ else
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Interpreting '"+urlValue+"' as a document");
+
+ String errorCode = null;
+ String errorDesc = null;
+ long startTime = System.currentTimeMillis();
+ Long fileLengthLong = null;
+ try
{
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Interpreting '"+urlValue+"' as a document");
-
- if (isDataIngestable(activities,urlValue))
+ long documentLength = cache.getDataLength(documentIdentifier);
+ if (!activities.checkLengthIndexable(documentLength))
{
- // Treat it as an ingestable document.
-
- long dataSize = cache.getDataLength(urlValue);
- RepositoryDocument rd = new RepositoryDocument();
+ activities.noDocument(documentIdentifier,versionString);
+ errorCode = activities.EXCLUDED_LENGTH;
+ errorDesc = "Document rejected because of length ("+documentLength+")";
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because its length was rejected ("+documentLength+")");
+ continue;
+ }
- // Set content type
- rd.setMimeType(cache.getContentType(urlValue));
+ if (!activities.checkURLIndexable(documentIdentifier))
+ {
+ activities.noDocument(documentIdentifier,versionString);
+ errorCode = activities.EXCLUDED_URL;
+ errorDesc = "Document rejected because of URL ('"+documentIdentifier+"')";
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because its URL was rejected ('"+documentIdentifier+"')");
+ continue;
+ }
- // Turn into acls and add into description
- String[] denyAcls;
- if (acls == null)
- denyAcls = null;
- else if (acls.length == 0)
- denyAcls = new String[0];
+ // Check if it's a recognized content type
+ String contentType = cache.getContentType(documentIdentifier);
+ // Some sites have multiple content types. We just look at the LAST one in that case.
+ if (contentType != null)
+ {
+ String[] contentTypes = contentType.split(",");
+ if (contentTypes.length > 0)
+ contentType = contentTypes[contentTypes.length-1].trim();
else
- denyAcls = new String[]{defaultAuthorityDenyToken};
-
- if (acls != null && denyAcls != null)
- rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,acls,denyAcls);
-
- // Grab metadata
- Map<String,Set<String>> metaHash = new HashMap<String,Set<String>>();
- for (int k = 0; k < namesAndValues.size(); k++)
+ contentType = null;
+ }
+ if (!activities.checkMimeTypeIndexable(contentType))
+ {
+ activities.noDocument(documentIdentifier,versionString);
+ errorCode = activities.EXCLUDED_MIMETYPE;
+ errorDesc = "Document rejected because of mime type ("+contentType+")";
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because its mime type was rejected ('"+contentType+"')");
+ continue;
+ }
+
+ // Treat it as an ingestable document.
+
+ long dataSize = cache.getDataLength(urlValue);
+ RepositoryDocument rd = new RepositoryDocument();
+
+ // Grab metadata
+ Map<String,Set<String>> metaHash = new HashMap<String,Set<String>>();
+ for (int k = 0; k < namesAndValues.size(); k++)
+ {
+ NameValue nv = (NameValue)namesAndValues.get(k);
+ Set<String> hashValues = metaHash.get(nv.getName());
+ if (hashValues == null)
{
- NameValue nv = (NameValue)namesAndValues.get(k);
- Set<String> hashValues = metaHash.get(nv.getName());
- if (hashValues == null)
- {
- hashValues = new HashSet<String>();
- metaHash.put(nv.getName(),hashValues);
- }
- hashValues.add(nv.getValue());
+ hashValues = new HashSet<String>();
+ metaHash.put(nv.getName(),hashValues);
}
- for (String key : metaHash.keySet())
+ hashValues.add(nv.getValue());
+ }
+ for (String key : metaHash.keySet())
+ {
+ Set<String> metaList = metaHash.get(key);
+ String[] values = new String[metaList.size()];
+ int k = 0;
+ for (String value : metaList)
{
- Set<String> metaList = metaHash.get(key);
- String[] values = new String[metaList.size()];
- int k = 0;
- for (String value : metaList)
- {
- values[k++] = value;
- }
- rd.addField(key,values);
+ values[k++] = value;
}
+ rd.addField(key,values);
+ }
- if (titles != null && titles.length > 0)
- rd.addField("title",titles);
- if (authorNames != null && authorNames.length > 0)
- rd.addField("authorname",authorNames);
- if (authorEmails != null && authorEmails.length > 0)
- rd.addField("authoremail",authorEmails);
- if (descriptions != null && descriptions.length > 0)
- rd.addField("summary",descriptions);
- if (sources != null && sources.length > 0)
- rd.addField("source",sources);
- if (categories != null && categories.length > 0)
- rd.addField("category",categories);
-
- // The pubdates are a ms since epoch value; we want the minimum one for the origination time.
- Long minimumOrigTime = null;
- if (pubDates != null && pubDates.length > 0)
+ // Set content type
+ if (contentType != null)
+ rd.setMimeType(contentType);
+
+ // Turn into acls and add into description
+ String[] denyAcls;
+ if (acls == null)
+ denyAcls = null;
+ else if (acls.length == 0)
+ denyAcls = new String[0];
+ else
+ denyAcls = new String[]{defaultAuthorityDenyToken};
+
+ if (acls != null && denyAcls != null)
+ rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,acls,denyAcls);
+
+ if (titles != null && titles.length > 0)
+ rd.addField("title",titles);
+ if (authorNames != null && authorNames.length > 0)
+ rd.addField("authorname",authorNames);
+ if (authorEmails != null && authorEmails.length > 0)
+ rd.addField("authoremail",authorEmails);
+ if (descriptions != null && descriptions.length > 0)
+ rd.addField("summary",descriptions);
+ if (sources != null && sources.length > 0)
+ rd.addField("source",sources);
+ if (categories != null && categories.length > 0)
+ rd.addField("category",categories);
+
+ // The pubdates are a ms since epoch value; we want the minimum one for the origination time.
+ Long minimumOrigTime = null;
+ if (pubDates != null && pubDates.length > 0)
+ {
+ String[] pubDateValuesISO = new String[pubDates.length];
+ TimeZone tz = TimeZone.getTimeZone("UTC");
+ DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'");
+ df.setTimeZone(tz);
+ for (int k = 0; k < pubDates.length; k++)
{
- String[] pubDateValuesISO = new String[pubDates.length];
- TimeZone tz = TimeZone.getTimeZone("UTC");
- DateFormat df = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm'Z'");
- df.setTimeZone(tz);
- for (int k = 0; k < pubDates.length; k++)
+ String pubDate = pubDates[k];
+ try
{
- String pubDate = pubDates[k];
- try
- {
- Long pubDateLong = new Long(pubDate);
- if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue())
- minimumOrigTime = pubDateLong;
- pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue()));
- }
- catch (NumberFormatException e)
- {
- // Do nothing; the version string seems to not mean anything
- pubDateValuesISO[k] = "";
- }
+ Long pubDateLong = new Long(pubDate);
+ if (minimumOrigTime == null || pubDateLong.longValue() < minimumOrigTime.longValue())
+ minimumOrigTime = pubDateLong;
+ pubDateValuesISO[k] = df.format(new Date(pubDateLong.longValue()));
+ }
+ catch (NumberFormatException e)
+ {
+ // Do nothing; the version string seems to not mean anything
+ pubDateValuesISO[k] = "";
}
- rd.addField("pubdate",pubDates);
- rd.addField("pubdateiso",pubDateValuesISO);
}
+ rd.addField("pubdate",pubDates);
+ rd.addField("pubdateiso",pubDateValuesISO);
+ }
- if (minimumOrigTime != null)
- activities.setDocumentOriginationTime(urlValue,minimumOrigTime);
+ if (minimumOrigTime != null)
+ activities.setDocumentOriginationTime(urlValue,minimumOrigTime);
- InputStream is = cache.getData(urlValue);
- if (is != null)
+ InputStream is = cache.getData(urlValue);
+ if (is != null)
+ {
+ try
{
+ rd.setBinary(is,dataSize);
try
{
- rd.setBinary(is,dataSize);
- try
- {
- activities.ingestDocumentWithException(documentIdentifier,versionString,ingestURL,rd);
- }
- catch (IOException e)
- {
- handleIOException(e,"reading data");
- }
+ activities.ingestDocumentWithException(documentIdentifier,versionString,ingestURL,rd);
+ errorCode = "OK";
+ fileLengthLong = new Long(dataSize);
}
- finally
+ catch (IOException e)
{
- try
- {
- is.close();
- }
- catch (IOException e)
- {
- handleIOException(e,"closing stream");
- }
+ errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+ errorDesc = e.getMessage();
+ handleIOException(e,"reading data");
+ }
+ }
+ finally
+ {
+ try
+ {
+ is.close();
+ }
+ catch (IOException e)
+ {
+ errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+ errorDesc = e.getMessage();
+ handleIOException(e,"closing stream");
}
}
- }
- else
- {
- activities.noDocument(documentIdentifier,versionString);
-
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("RSS: Skipping document '"+urlValue+"' because it cannot be indexed");
}
}
- }
- catch (MalformedURLException e)
- {
- Logging.connectors.debug("RSS: URL '"+urlValue+"' is malformed; skipping",e);
- activities.deleteDocument(documentIdentifier);
- continue;
+ catch (ManifoldCFException e)
+ {
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+ errorCode = null;
+ throw e;
+ }
+ finally
+ {
+ if (errorCode != null)
+ activities.recordActivity(new Long(startTime),ACTIVITY_PROCESS,
+ null,urlValue,errorCode,errorDesc,null);
+ }
}
}
}
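
The hunk above also threads a new errorCode/errorDesc pair through the document-processing path and records an ACTIVITY_PROCESS entry in a finally block, so every rejection, success, or I/O failure leaves a trace, while interruptions are deliberately not logged. Below is a minimal standalone sketch of that pattern; the ActivityRecorder interface and the processBody() call are invented stand-ins for the connector's activities object, which in the real code is invoked as activities.recordActivity(startTime, ACTIVITY_PROCESS, null, urlValue, errorCode, errorDesc, null).

  import java.util.Locale;

  // Illustrative sketch only (not part of this commit).
  public class ProcessActivitySketch
  {
    interface ActivityRecorder
    {
      void record(long startTime, String activityType, String url, String code, String description);
    }

    /** Placeholder for the per-document work (length/URL/mime checks, ingestion). */
    static void processBody(String url) throws InterruptedException, java.io.IOException
    {
      // ... real work would happen here ...
    }

    static void processOne(String url, ActivityRecorder recorder)
      throws InterruptedException
    {
      String errorCode = null;
      String errorDesc = null;
      long startTime = System.currentTimeMillis();
      try
      {
        processBody(url);
        errorCode = "OK";
      }
      catch (InterruptedException e)
      {
        errorCode = null;   // interruptions are not recorded as activity
        throw e;
      }
      catch (java.io.IOException e)
      {
        errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
        errorDesc = e.getMessage();
      }
      finally
      {
        if (errorCode != null)
          recorder.record(startTime, "process", url, errorCode, errorDesc);
      }
    }
  }
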
@@ -5230,62 +5302,6 @@ public class RSSConnector extends org.ap
// Protected methods and classes
- /** Code to check if data is interesting, based on response code and content type.
- */
- protected boolean isContentInteresting(IFingerprintActivity activities, String contentType)
- throws ServiceInterruption, ManifoldCFException
- {
- // Look at the content type and decide if it's a kind we want. This is defined
- // as something we think we can either ingest, or extract links from.
-
- // For now, we're only going to attempt to extract links from html. This will change eventually.
- // But the check here is just what the content type is.
- if (contentType == null)
- return false;
-
- int pos = contentType.indexOf(";");
- if (pos != -1)
- contentType = contentType.substring(0,pos);
- contentType = contentType.trim();
-
- return activities.checkMimeTypeIndexable(contentType);
- }
-
- /** Code to check if an already-fetched document should be ingested.
- */
- protected boolean isDataIngestable(IFingerprintActivity activities, String documentIdentifier)
- throws ServiceInterruption, ManifoldCFException
- {
- if (activities.checkLengthIndexable(cache.getDataLength(documentIdentifier)) == false)
- return false;
-
- if (activities.checkURLIndexable(documentIdentifier) == false)
- return false;
-
- // Check if it's a recognized content type
- String contentType = cache.getContentType(documentIdentifier);
-
- // Some sites have multiple content types. We just look at the LAST one in that case.
- if (contentType != null)
- {
- String[] contentTypes = contentType.split(",");
- if (contentTypes.length > 0)
- contentType = contentTypes[contentTypes.length-1].trim();
- else
- contentType = null;
- }
-
- if (contentType == null)
- return false;
-
- int pos = contentType.indexOf(";");
- if (pos != -1)
- contentType = contentType.substring(0,pos);
- contentType = contentType.trim();
-
- return activities.checkMimeTypeIndexable(contentType);
- }
-
/** Given the current parameters, find the correct throttled fetcher object
* (or create one if not there).
*/
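
The deleted isContentInteresting() and isDataIngestable() helpers are not lost functionality: the same length, URL, and mime-type checks now run inline in processDocuments() (see the large hunk above), where each failed check can record its own rejection reason instead of returning a bare boolean. A rough standalone sketch of that decision chain follows; the DocumentChecker interface and the string return codes are invented for the example, whereas the real code uses activities.checkLengthIndexable(), checkURLIndexable(), checkMimeTypeIndexable() and the activities.EXCLUDED_* constants.

  // Illustrative sketch only (not part of this commit).
  public class InlineGateSketch
  {
    interface DocumentChecker
    {
      boolean checkLengthIndexable(long length) throws Exception;
      boolean checkURLIndexable(String url) throws Exception;
      boolean checkMimeTypeIndexable(String mimeType) throws Exception;
    }

    /** Returns null when the document is acceptable, or a rejection code otherwise. */
    static String firstRejection(DocumentChecker checker, long length, String url, String mimeType)
      throws Exception
    {
      if (!checker.checkLengthIndexable(length))
        return "EXCLUDED_LENGTH";
      if (!checker.checkURLIndexable(url))
        return "EXCLUDED_URL";
      if (!checker.checkMimeTypeIndexable(mimeType))
        return "EXCLUDED_MIMETYPE";
      return null;
    }
  }
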