You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/06/13 17:55:55 UTC

svn commit: r1492719 - /manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Author: kwright
Date: Thu Jun 13 15:55:55 2013
New Revision: 1492719

URL: http://svn.apache.org/r1492719
Log:
Add logging output describing why a document was rejected.  Part of CONNECTORS-715.

Modified:
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1492719&r1=1492718&r2=1492719&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Thu Jun 13 15:55:55 2013
@@ -5604,7 +5604,10 @@ public class WebcrawlerConnector extends
     if (interestingMimeTypeMap.get(contentType) != null)
       return true;
     
-    return activities.checkMimeTypeIndexable(contentType);
+    boolean rval = activities.checkMimeTypeIndexable(contentType);
+    if (rval == false && Logging.connectors.isDebugEnabled())
+      Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not fetching because output connector does not want mimetype '"+contentType+"'");
+    return rval;
   }
   
   /** Code to check if an already-fetched document should be ingested.
@@ -5616,13 +5619,25 @@ public class WebcrawlerConnector extends
       return false;
 
     if (activities.checkLengthIndexable(cache.getDataLength(documentIdentifier)) == false)
+    {
+      if (Logging.connectors.isDebugEnabled())
+        Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector thinks length "+cache.getDataLength(documentIdentifier)+" is too long");
       return false;
-
+    }
+    
     if (activities.checkURLIndexable(documentIdentifier) == false)
+    {
+      if (Logging.connectors.isDebugEnabled())
+        Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want URL");
       return false;
+    }
 
     if (filter.isDocumentIndexable(documentIdentifier) == false)
+    {
+      if (Logging.connectors.isDebugEnabled())
+        Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because document does not match web job constraints");
       return false;
+    }
     
     // Check if it's a recognized content type
     String contentType = cache.getContentType(documentIdentifier);
@@ -5645,7 +5660,10 @@ public class WebcrawlerConnector extends
       contentType = contentType.substring(0,pos);
     contentType = contentType.trim();
 
-    return activities.checkMimeTypeIndexable(contentType);
+    boolean rval = activities.checkMimeTypeIndexable(contentType);
+    if (rval == false && Logging.connectors.isDebugEnabled())
+      Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want mime type '"+contentType+"'");
+    return rval;
   }
 
   /** Find a redirection URI, if it exists */