You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/06/13 17:55:55 UTC
svn commit: r1492719 -
/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Author: kwright
Date: Thu Jun 13 15:55:55 2013
New Revision: 1492719
URL: http://svn.apache.org/r1492719
Log:
Add logging output describing why a document was rejected. Part of CONNECTORS-715.
Modified:
manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1492719&r1=1492718&r2=1492719&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Thu Jun 13 15:55:55 2013
@@ -5604,7 +5604,10 @@ public class WebcrawlerConnector extends
if (interestingMimeTypeMap.get(contentType) != null)
return true;
- return activities.checkMimeTypeIndexable(contentType);
+ boolean rval = activities.checkMimeTypeIndexable(contentType);
+ if (rval == false && Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not fetching because output connector does not want mimetype '"+contentType+"'");
+ return rval;
}
/** Code to check if an already-fetched document should be ingested.
@@ -5616,13 +5619,25 @@ public class WebcrawlerConnector extends
return false;
if (activities.checkLengthIndexable(cache.getDataLength(documentIdentifier)) == false)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector thinks length "+cache.getDataLength(documentIdentifier)+" is too long");
return false;
-
+ }
+
if (activities.checkURLIndexable(documentIdentifier) == false)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want URL");
return false;
+ }
if (filter.isDocumentIndexable(documentIdentifier) == false)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because document does not match web job constraints");
return false;
+ }
// Check if it's a recognized content type
String contentType = cache.getContentType(documentIdentifier);
@@ -5645,7 +5660,10 @@ public class WebcrawlerConnector extends
contentType = contentType.substring(0,pos);
contentType = contentType.trim();
- return activities.checkMimeTypeIndexable(contentType);
+ boolean rval = activities.checkMimeTypeIndexable(contentType);
+ if (rval == false && Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want mime type '"+contentType+"'");
+ return rval;
}
/** Find a redirection URI, if it exists */