You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/24 10:14:32 UTC

svn commit: r1634022 - in /manifoldcf/branches/dev_1x: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Author: kwright
Date: Fri Oct 24 08:14:32 2014
New Revision: 1634022

URL: http://svn.apache.org/r1634022
Log:
Pull up more CONNECTORS-1077-related code from trunk

Modified:
    manifoldcf/branches/dev_1x/   (props changed)
    manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Propchange: manifoldcf/branches/dev_1x/
------------------------------------------------------------------------------
  Merged /manifoldcf/trunk:r1634021

Modified: manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1634022&r1=1634021&r2=1634022&view=diff
==============================================================================
--- manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/branches/dev_1x/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Fri Oct 24 08:14:32 2014
@@ -777,8 +777,6 @@ public class WebcrawlerConnector extends
               continue;
             
             processDocument(activities,documentIdentifier,versionString,indexDocument,metaHash,metaHash2,acls,filter);
-              //continue;
-            
             break;
           case RESULT_RETRY_DOCUMENT:
             // Document could not be processed right now.
@@ -1291,7 +1289,7 @@ public class WebcrawlerConnector extends
     
   }
 
-  protected boolean processDocument(IProcessActivity activities, String documentIdentifier, String versionString,
+  protected void processDocument(IProcessActivity activities, String documentIdentifier, String versionString,
     boolean indexDocument, Map<String,Set<String>> metaHash, Map<String,Set<String>> metaHash2, String[] acls, DocumentURLFilter filter)
     throws ManifoldCFException, ServiceInterruption
   {
@@ -1311,7 +1309,7 @@ public class WebcrawlerConnector extends
         errorCode = "CONTENTNOTINDEXABLE";
         errorDesc = "Content not indexable";
         activities.noDocument(documentIdentifier,versionString);
-        return true;
+        return;
       }
       
       int responseCode = cache.getResponseCode(documentIdentifier);
@@ -1322,7 +1320,7 @@ public class WebcrawlerConnector extends
         errorCode = "RESPONSECODENOTINDEXABLE";
         errorDesc = "HTTP response code not indexable ("+responseCode+")";
         activities.noDocument(documentIdentifier,versionString);
-        return true;
+        return;
       }
 
       long dataLength = cache.getDataLength(documentIdentifier);
@@ -1333,7 +1331,7 @@ public class WebcrawlerConnector extends
         errorCode = activities.EXCLUDED_LENGTH;
         errorDesc = "Rejected due to length ("+dataLength+")";
         activities.noDocument(documentIdentifier,versionString);
-        return true;
+        return;
       }
       
       if (activities.checkURLIndexable(documentIdentifier) == false)
@@ -1343,7 +1341,7 @@ public class WebcrawlerConnector extends
         errorCode = activities.EXCLUDED_URL;
         errorDesc = "Rejected due to URL ('"+documentIdentifier+"')";
         activities.noDocument(documentIdentifier,versionString);
-        return true;
+        return;
       }
 
       String ingestURL = filter.isDocumentIndexable(documentIdentifier);
@@ -1354,7 +1352,7 @@ public class WebcrawlerConnector extends
         errorCode = "JOBRESTRICTION";
         errorDesc = "Rejected because job excludes this URL ('"+documentIdentifier+"')";
         activities.noDocument(documentIdentifier,versionString);
-        return true;
+        return;
       }
       
       // Check if it's a recognized content type
@@ -1385,7 +1383,7 @@ public class WebcrawlerConnector extends
         errorCode = activities.EXCLUDED_MIMETYPE;
         errorDesc = "Rejected because of mime type ("+contentType+")";
         activities.noDocument(documentIdentifier,versionString);
-        return true;
+        return;
       }
       
       // Ingest the document
@@ -1488,7 +1486,6 @@ public class WebcrawlerConnector extends
       else
         Logging.connectors.error("WEB: Expected a cached document for '"+documentIdentifier+"', but none present!");
       
-      return false;
     }
     catch (ManifoldCFException e)
     {