You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/23 16:00:17 UTC

svn commit: r1633815 - /manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Author: kwright
Date: Thu Oct 23 14:00:16 2014
New Revision: 1633815

URL: http://svn.apache.org/r1633815
Log:
Web connector reorganization, which causes a classloader verification failure under both JDK 7 and 8.

Modified:
    manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1633815&r1=1633814&r2=1633815&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Thu Oct 23 14:00:16 2014
@@ -132,6 +132,7 @@ public class WebcrawlerConnector extends
 
   // Activity types
   public final static String ACTIVITY_FETCH = "fetch";
+  public final static String ACTIVITY_PROCESS = "process";
   public final static String ACTIVITY_ROBOTSPARSE = "robots parse";
   public final static String ACTIVITY_LOGON_START = "begin logon";
   public final static String ACTIVITY_LOGON_END = "end logon";
@@ -321,7 +322,7 @@ public class WebcrawlerConnector extends
   @Override
   public String[] getActivitiesList()
   {
-    return new String[]{ACTIVITY_FETCH, ACTIVITY_ROBOTSPARSE, ACTIVITY_LOGON_START, ACTIVITY_LOGON_END};
+    return new String[]{ACTIVITY_FETCH, ACTIVITY_PROCESS, ACTIVITY_ROBOTSPARSE, ACTIVITY_LOGON_START, ACTIVITY_LOGON_END};
   }
 
 
@@ -673,454 +674,456 @@ public class WebcrawlerConnector extends
             
           while (true)
           {
+            URL url;
             try
             {
               // Do the mapping from the current host name to the IP address
-              URL url = new URL(currentURI);
-              String hostName = url.getHost();
-              StringBuilder ipAddressBuffer = new StringBuilder();
-              int ipAddressStatus = lookupIPAddress(currentURI,activities,hostName,currentTime,ipAddressBuffer);
-              if (ipAddressStatus == RESULTSTATUS_TRUE)
+              url = new URL(currentURI);
+            }
+            catch (MalformedURLException e)
+            {
+              // currentURI is malformed.
+              // If the document was the primary, we should remove it from the queue.  But if it's part of a login sequence, we'd better just retry later.
+              contextMessage = "was not a valid URL: "+e.getMessage();
+              contextException = e;
+              activityResultCode = "-12";
+              resultSignal = RESULT_NO_DOCUMENT;
+              break;
+            }
+
+            String hostName = url.getHost();
+            StringBuilder ipAddressBuffer = new StringBuilder();
+            int ipAddressStatus = lookupIPAddress(currentURI,activities,hostName,currentTime,ipAddressBuffer);
+            if (ipAddressStatus == RESULTSTATUS_TRUE)
+            {
+              String ipAddress = ipAddressBuffer.toString();
+              String protocol = url.getProtocol();
+              int port = url.getPort();
+              if (port == -1)
+                port = url.getDefaultPort();
+
+              // Try to fetch the document.  We'll need its bin names first.
+              String[] binNames = getBinNames(currentURI);
+
+              // Get the credentials for this document (if any)
+              PageCredentials credential = getPageCredential(currentURI);
+              IKeystoreManager trustStore;
+              // Save effort - only bother to get a trust store if this is https
+              if (protocol.equalsIgnoreCase("https"))
+                // null return is possible here; indicates "trust everything"
+                trustStore = getTrustStore(currentURI);
+              else
+                trustStore = KeystoreManagerFactory.make("");
+              // Check robots, if enabled, and if we're fetching the primary document identifier.  See comment above.
+              int robotsStatus = RESULTSTATUS_TRUE;
+              if (!documentIdentifier.equals(currentURI) || robotsUsage < ROBOTS_DATA || (robotsStatus = checkFetchAllowed(documentIdentifier,protocol,ipAddress,port,credential,trustStore,hostName,binNames,currentTime,
+                url.getFile(),activities,connectionLimit,proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword)) == RESULTSTATUS_TRUE)
               {
-                String ipAddress = ipAddressBuffer.toString();
-                String protocol = url.getProtocol();
-                int port = url.getPort();
-                if (port == -1)
-                  port = url.getDefaultPort();
-
-                // Try to fetch the document.  We'll need its bin names first.
-                String[] binNames = getBinNames(currentURI);
-
-                // Get the credentials for this document (if any)
-                PageCredentials credential = getPageCredential(currentURI);
-                IKeystoreManager trustStore;
-                // Save effort - only bother to get a trust store if this is https
-                if (protocol.equalsIgnoreCase("https"))
-                  // null return is possible here; indicates "trust everything"
-                  trustStore = getTrustStore(currentURI);
-                else
-                  trustStore = KeystoreManagerFactory.make("");
-                // Check robots, if enabled, and if we're fetching the primary document identifier.  See comment above.
-                int robotsStatus = RESULTSTATUS_TRUE;
-                if (!documentIdentifier.equals(currentURI) || robotsUsage < ROBOTS_DATA || (robotsStatus = checkFetchAllowed(documentIdentifier,protocol,ipAddress,port,credential,trustStore,hostName,binNames,currentTime,
-                  url.getFile(),activities,connectionLimit,proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword)) == RESULTSTATUS_TRUE)
-                {
-                  // Passed the robots check!
+                // Passed the robots check!
 
-                  // Find whatever login parameters apply.  This will be null if currentURI is not a login page, and will contain
-                  // interesting information if it is.
-                  LoginCookies lc = null;
-                  if (sessionCredential != null)
-                  {
-                    lc = cookieManager.readCookies(sessionCredential.getSequenceKey());
-                  }
+                // Find whatever login parameters apply.  This will be null if currentURI is not a login page, and will contain
+                // interesting information if it is.
+                LoginCookies lc = null;
+                if (sessionCredential != null)
+                {
+                  lc = cookieManager.readCookies(sessionCredential.getSequenceKey());
+                }
 
-                  // Prepare to perform the fetch, and decide what to do with the document.
-                  //
-                  IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
-                    throttleGroupName,
-                    protocol,ipAddress,port,
-                    credential,trustStore,throttleDescription,binNames,connectionLimit,
-                    proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
+                // Prepare to perform the fetch, and decide what to do with the document.
+                //
+                IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
+                  throttleGroupName,
+                  protocol,ipAddress,port,
+                  credential,trustStore,throttleDescription,binNames,connectionLimit,
+                  proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
+                try
+                {
+                  connection.beginFetch((sessionState == SESSIONSTATE_LOGIN)?FETCH_LOGIN:FETCH_STANDARD);
                   try
                   {
-                    connection.beginFetch((sessionState == SESSIONSTATE_LOGIN)?FETCH_LOGIN:FETCH_STANDARD);
-                    try
-                    {
-
-                      // Execute the fetch!
-                      connection.executeFetch(url.getFile(),userAgent,from,connectionTimeoutMilliseconds,
-                        socketTimeoutMilliseconds,false,hostName,formData,lc);
-                      int response = connection.getResponseCode();
+                    // Execute the fetch!
+                    connection.executeFetch(url.getFile(),userAgent,from,connectionTimeoutMilliseconds,
+                      socketTimeoutMilliseconds,false,hostName,formData,lc);
+                    int response = connection.getResponseCode();
 
-                      if (response == 200 || response == 302 || response == 301)
+                    if (response == 200 || response == 302 || response == 301)
+                    {
+                      // If this was part of the login sequence, update the cookies regardless of what else happens
+                      if (sessionState == SESSIONSTATE_LOGIN)
                       {
-                        // If this was part of the login sequence, update the cookies regardless of what else happens
-                        if (sessionState == SESSIONSTATE_LOGIN)
-                        {
-                          // Update the cookies
-                          LoginCookies lastFetchCookies = connection.getLastFetchCookies();
-                          cookieManager.updateCookies(sessionCredential.getSequenceKey(),lastFetchCookies);
-                        }
+                        // Update the cookies
+                        LoginCookies lastFetchCookies = connection.getLastFetchCookies();
+                        cookieManager.updateCookies(sessionCredential.getSequenceKey(),lastFetchCookies);
+                      }
 
-                        // Decide whether to exclude this document based on what we see here.
-                        // Basically, we want to get rid of everything that we (a) don't know what
-                        // to do with in the ingestion system, and (b) we can't get useful links from.
+                      // Decide whether to exclude this document based on what we see here.
+                      // Basically, we want to get rid of everything that we (a) don't know what
+                      // to do with in the ingestion system, and (b) we can't get useful links from.
 
-                        String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
+                      String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
 
-                        if (isContentInteresting(activities,currentURI,response,contentType))
-                        {
-                          // Treat it as real, and cache it.
-                          checkSum = cache.addData(activities,currentURI,connection);
-                          headerData = connection.getResponseHeaders();
-                          resultSignal = RESULT_VERSION_NEEDED;
-                          activityResultCode = null;
-                        }
-                        else
-                        {
-                          contextMessage = "it had the wrong content type";
-                          resultSignal = RESULT_NO_DOCUMENT;
-                          activityResultCode = null;
-                        }
+                      if (isContentInteresting(activities,currentURI,response,contentType))
+                      {
+                        // Treat it as real, and cache it.
+                        checkSum = cache.addData(activities,currentURI,connection);
+                        headerData = connection.getResponseHeaders();
+                        resultSignal = RESULT_VERSION_NEEDED;
+                        activityResultCode = null;
                       }
                       else
                       {
-                        // We got some kind of http error code.
-                        // We don't want to remove it from the queue entirely, because that would cause us to lose track of the item, and therefore lose
-                        // control of all scheduling around it.  Instead, we leave it on the queue and give it an empty version string; that will lead it to be
-                        // reprocessed without fail on the next scheduled check.
-                        // Decode response body to the extent we can
-                        String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
-                        String encoding = extractEncoding(contentType);
-                        if (encoding == null)
-                          encoding = StandardCharsets.UTF_8.name();
-                        String decodedResponse = "undecodable";
-                        try
-                        {
-                          decodedResponse = "'"+connection.getLimitedResponseBody(1024,encoding)+"'";
-                        }
-                        catch (ManifoldCFException e)
-                        {
-                          // Eat this exception unless it is an interrupt
-                          if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
-                            throw e;
-                          connection.noteInterrupted(e);
-                        }
-                        catch (ServiceInterruption e)
-                        {
-                          // Eat this exception too
-                          connection.noteInterrupted(e);
-                        }
-                        contextMessage = "it failed to fetch (status="+Integer.toString(response)+", message="+decodedResponse+")";
-                        resultSignal = RESULT_NO_VERSION;
+                        contextMessage = "it had the wrong content type ('"+contentType+"')";
+                        resultSignal = RESULT_NO_DOCUMENT;
                         activityResultCode = null;
                       }
                     }
-                    catch (ManifoldCFException e)
-                    {
-                      connection.noteInterrupted(e);
-                      throw e;
-                    }
-                    catch (ServiceInterruption e)
+                    else
                     {
-                      connection.noteInterrupted(e);
-                      throw e;
-                    }
-                    finally
-                    {
-                      connection.doneFetch(activities);
+                      // We got some kind of http error code.
+                      // We don't want to remove it from the queue entirely, because that would cause us to lose track of the item, and therefore lose
+                      // control of all scheduling around it.  Instead, we leave it on the queue and give it an empty version string; that will lead it to be
+                      // reprocessed without fail on the next scheduled check.
+                      // Decode response body to the extent we can
+                      String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
+                      String encoding = extractEncoding(contentType);
+                      if (encoding == null)
+                        encoding = StandardCharsets.UTF_8.name();
+                      String decodedResponse = "undecodable";
+                      try
+                      {
+                        decodedResponse = "'"+connection.getLimitedResponseBody(1024,encoding)+"'";
+                      }
+                      catch (ManifoldCFException e)
+                      {
+                        // Eat this exception unless it is an interrupt
+                        if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+                          throw e;
+                        connection.noteInterrupted(e);
+                      }
+                      catch (ServiceInterruption e)
+                      {
+                        // Eat this exception too
+                        connection.noteInterrupted(e);
+                      }
+                      contextMessage = "it failed to fetch (status="+Integer.toString(response)+", message="+decodedResponse+")";
+                      resultSignal = RESULT_NO_VERSION;
+                      activityResultCode = null;
                     }
                   }
+                  catch (ManifoldCFException e)
+                  {
+                    connection.noteInterrupted(e);
+                    throw e;
+                  }
+                  catch (ServiceInterruption e)
+                  {
+                    connection.noteInterrupted(e);
+                    throw e;
+                  }
                   finally
                   {
-                    connection.close();
+                    connection.doneFetch(activities);
                   }
+                }
+                finally
+                {
+                  connection.close();
+                }
 
-                  // State transition logic.  If the result indicates a successful fetch so far, we need to decide where to go next.
-                  // This happens AFTER we've released all the connections, because it's conceivable that processing here might be
-                  // significant, and we don't want to tie things up unnecessarily.
-                  String preferredLink = null;
-                  String preferredRedirection = null;
-                  formData = null;
-                  String contentLink = null;
-                  if (resultSignal == RESULT_VERSION_NEEDED)
+                // State transition logic.  If the result indicates a successful fetch so far, we need to decide where to go next.
+                // This happens AFTER we've released all the connections, because it's conceivable that processing here might be
+                // significant, and we don't want to tie things up unnecessarily.
+                String preferredLink = null;
+                String preferredRedirection = null;
+                formData = null;
+                String contentLink = null;
+                if (resultSignal == RESULT_VERSION_NEEDED)
+                {
+                  // If we get here, we know:
+                  // (a) There's a cached version of the page on disk we can read as many times as necessary;
+                  // (b) The saved cookies have not been updated yet, so we'll need to do that where appropriate.
+
+                  // The way we determine if we're in the login sequence for a site is by TWO criteria:
+                  // (1) The URI must match the specified regular expression, and
+                  // (2) The data from that URI must contain the specified form or link information.
+                  // We use the same criteria to look for the exit from a sequence.  So, in essence, we're *always* going to need to know whether we're
+                  // officially in the sequence, or not, so we evaluate it always.
+                  boolean isLoginPage = false;
+                  if (sessionCredential != null)
                   {
-                    // If we get here, we know:
-                    // (a) There's a cached version of the page on disk we can read as many times as necessary;
-                    // (b) The saved cookies have not been updated yet, so we'll need to do that where appropriate.
-
-                    // The way we determine if we're in the login sequence for a site is by TWO criteria:
-                    // (1) The URI must match the specified regular expression, and
-                    // (2) The data from that URI must contain the specified form or link information.
-                    // We use the same criteria to look for the exit from a sequence.  So, in essence, we're *always* going to need to know whether we're
-                    // officially in the sequence, or not, so we evaluate it always.
-                    boolean isLoginPage = false;
-                    if (sessionCredential != null)
+                    Iterator iterMatches = sessionCredential.findLoginParameters(currentURI);
+                    boolean seenAnything = false;
+                    boolean seenFormError = false;
+                    boolean seenLinkError = false;
+                    boolean seenRedirectionError = false;
+                    boolean seenContentError = false;
+                    while (iterMatches.hasNext())
                     {
-                      Iterator iterMatches = sessionCredential.findLoginParameters(currentURI);
-                      boolean seenAnything = false;
-                      boolean seenFormError = false;
-                      boolean seenLinkError = false;
-                      boolean seenRedirectionError = false;
-                      boolean seenContentError = false;
-                      while (iterMatches.hasNext())
+                      seenAnything = true;
+                      LoginParameters lp = (LoginParameters)iterMatches.next();
+                      // Note that more than one of the rules may match.
+                      // In that case, a clear order of precedence applies between form-style rules and link-style: form has priority.
+                      // If more than one of the same kind of rule is seen, then all bets are off, a warning is displayed, and nothing is
+                      // matched.
+
+                      // Parse the page; it had better match up!  Otherwise we get null back.
+                      FormData newFormData = findHTMLForm(currentURI,lp);
+                      if (newFormData != null)
                       {
-                        seenAnything = true;
-                        LoginParameters lp = (LoginParameters)iterMatches.next();
-                        // Note that more than one of the rules may match.
-                        // In that case, a clear order of precedence applies between form-style rules and link-style: form has priority.
-                        // If more than one of the same kind of rule is seen, then all bets are off, a warning is displayed, and nothing is
-                        // matched.
-
-                        // Parse the page; it had better match up!  Otherwise we get null back.
-                        FormData newFormData = findHTMLForm(currentURI,lp);
-                        if (newFormData != null)
+                        if (formData != null)
                         {
-                          if (formData != null)
+                          // Oops, more than one matching form rule.  Complain.
+                          seenFormError = true;
+                          formData = null;
+                        }
+                        else if (!seenFormError)
+                        {
+                          // A form overrides links, redirection, or content
+                          formData = newFormData;
+                          preferredLink = null;
+                          preferredRedirection = null;
+                        }
+                      }
+                      else
+                      {
+                        // Look for the preferred link instead.
+                        String newPreferredLink = findHTMLLinkURI(currentURI,lp);
+                        if (newPreferredLink != null)
+                        {
+                          if (preferredLink != null)
                           {
-                            // Oops, more than one matching form rule.  Complain.
-                            seenFormError = true;
-                            formData = null;
+                            // Oops
+                            seenLinkError = true;
+                            preferredLink = null;
                           }
-                          else if (!seenFormError)
+                          else if (!seenLinkError && !seenFormError && formData == null)
                           {
-                            // A form overrides links, redirection, or content
-                            formData = newFormData;
-                            preferredLink = null;
+                            // Link overrides redirection and content
+                            preferredLink = newPreferredLink;
                             preferredRedirection = null;
                           }
                         }
                         else
                         {
-                          // Look for the preferred link instead.
-                          String newPreferredLink = findHTMLLinkURI(currentURI,lp);
-                          if (newPreferredLink != null)
+                          // Look for the preferred redirection.
+                          String newPreferredRedirection = findPreferredRedirectionURI(currentURI,lp);
+                          if (newPreferredRedirection != null)
                           {
-                            if (preferredLink != null)
+                            if (preferredRedirection != null)
                             {
-                              // Oops
-                              seenLinkError = true;
-                              preferredLink = null;
+                              seenRedirectionError = true;
+                              preferredRedirection = null;
                             }
-                            else if (!seenLinkError && !seenFormError && formData == null)
+                            else if (!seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null)
                             {
-                              // Link overrides redirection and content
-                              preferredLink = newPreferredLink;
-                              preferredRedirection = null;
+                              preferredRedirection = newPreferredRedirection;
                             }
                           }
                           else
                           {
-                            // Look for the preferred redirection.
-                            String newPreferredRedirection = findPreferredRedirectionURI(currentURI,lp);
-                            if (newPreferredRedirection != null)
+                            // Look for the content in the page.  The link returned may be an empty string, if matching content
+                            // is discovered but there is no override.  It will be null if the content is not found.
+                            String newContentLink = findSpecifiedContent(currentURI,lp);
+                            if (newContentLink != null)
                             {
-                              if (preferredRedirection != null)
+                              if (contentLink != null)
                               {
-                                seenRedirectionError = true;
-                                preferredRedirection = null;
+                                seenContentError = true;
+                                contentLink = null;
                               }
-                              else if (!seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null)
+                              else if (!seenContentError && !seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null && preferredRedirection == null)
                               {
-                                preferredRedirection = newPreferredRedirection;
-                              }
-                            }
-                            else
-                            {
-                              // Look for the content in the page.  The link returned may be an empty string, if matching content
-                              // is discovered but there is no override.  It will be null of the content is not found.
-                              String newContentLink = findSpecifiedContent(currentURI,lp);
-                              if (newContentLink != null)
-                              {
-                                if (contentLink != null)
-                                {
-                                  seenContentError = true;
-                                  contentLink = null;
-                                }
-                                else if (!seenContentError && !seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null && preferredRedirection == null)
-                                {
-                                  contentLink = newContentLink;
-                                }
+                                contentLink = newContentLink;
                               }
                             }
                           }
                         }
                       }
-
-                      // Now, evaluate all the data and pick the right rule
-                      if (formData != null)
-                      {
-                        // We found the right form!  And, we filled it in.  So now we enter the "login sequence".
-                        if (Logging.connectors.isDebugEnabled())
-                          Logging.connectors.debug("WEB: Document '"+currentURI+"' matches form, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
-                        isLoginPage = true;
-                      }
-                      else if (preferredLink != null)
-                      {
-                        if (Logging.connectors.isDebugEnabled())
-                          Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred link, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
-                        isLoginPage = true;
-                      }
-                      else if (preferredRedirection != null)
-                      {
-                        if (Logging.connectors.isDebugEnabled())
-                          Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred redirection, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
-                        isLoginPage = true;
-                      }
-                      else if (contentLink != null)
-                      {
-                        if (Logging.connectors.isDebugEnabled())
-                          Logging.connectors.debug("WEB: Document '"+currentURI+"' matches content, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
-                        isLoginPage = true;
-                      }
-                      else
-                      {
-                        if (seenAnything && Logging.connectors.isDebugEnabled())
-                          Logging.connectors.debug("WEB: Document '"+currentURI+"' did not match expected form, link, redirection, or content for sequence '"+sessionCredential.getSequenceKey()+"'");
-                      }
                     }
 
-                    // Should we do a state transition into the "logging in" state?
-                    if (sessionState == SESSIONSTATE_NORMAL && isLoginPage)
+                    // Now, evaluate all the data and pick the right rule
+                    if (formData != null)
                     {
-                      // Entering the login sequence.  Make sure we actually can do this...
-                      if (activities.beginEventSequence(globalSequenceEvent))
-                      {
-                        if (Logging.connectors.isDebugEnabled())
-                          Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', beginning login sequence '"+sessionCredential.getSequenceKey()+"'");
+                      // We found the right form!  And, we filled it in.  So now we enter the "login sequence".
+                      if (Logging.connectors.isDebugEnabled())
+                        Logging.connectors.debug("WEB: Document '"+currentURI+"' matches form, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
+                      isLoginPage = true;
+                    }
+                    else if (preferredLink != null)
+                    {
+                      if (Logging.connectors.isDebugEnabled())
+                        Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred link, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
+                      isLoginPage = true;
+                    }
+                    else if (preferredRedirection != null)
+                    {
+                      if (Logging.connectors.isDebugEnabled())
+                        Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred redirection, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
+                      isLoginPage = true;
+                    }
+                    else if (contentLink != null)
+                    {
+                      if (Logging.connectors.isDebugEnabled())
+                        Logging.connectors.debug("WEB: Document '"+currentURI+"' matches content, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
+                      isLoginPage = true;
+                    }
+                    else
+                    {
+                      if (seenAnything && Logging.connectors.isDebugEnabled())
+                        Logging.connectors.debug("WEB: Document '"+currentURI+"' did not match expected form, link, redirection, or content for sequence '"+sessionCredential.getSequenceKey()+"'");
+                    }
+                  }
 
-                        activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_START,
-                          null,sessionCredential.getSequenceKey(),"OK",null,null);
+                  // Should we do a state transition into the "logging in" state?
+                  if (sessionState == SESSIONSTATE_NORMAL && isLoginPage)
+                  {
+                    // Entering the login sequence.  Make sure we actually can do this...
+                    if (activities.beginEventSequence(globalSequenceEvent))
+                    {
+                      if (Logging.connectors.isDebugEnabled())
+                        Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', beginning login sequence '"+sessionCredential.getSequenceKey()+"'");
 
-                        // Transition to the right state, etc.
-                        sessionState = SESSIONSTATE_LOGIN;
-                      }
-                      else
-                      {
-                        if (Logging.connectors.isDebugEnabled())
-                          Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', login sequence '"+sessionCredential.getSequenceKey()+"' was already in progress.");
+                      activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_START,
+                        null,sessionCredential.getSequenceKey(),"OK",null,null);
 
-                        // Didn't make it in.  Retry the main URI when the proper conditions are met.
-                        // We don't want the cached data anymore.
-                        cache.deleteData(currentURI);
-                        contextMessage = "login sequence already in progress";
-                        resultSignal = RESULT_RETRY_DOCUMENT;
-                        activityResultCode = null;
-                      }
+                      // Transition to the right state, etc.
+                      sessionState = SESSIONSTATE_LOGIN;
                     }
-                    else if (sessionState == SESSIONSTATE_LOGIN && isLoginPage == false)
+                    else
                     {
-                      //== Exit login mode ==
-                      activities.completeEventSequence(globalSequenceEvent);
-                      activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
-                        null,sessionCredential.getSequenceKey(),"OK",null,null);
-                      sessionState = SESSIONSTATE_NORMAL;
-                      // Make sure we go back and try the original document again, if we happened to have been directed somewhere else
-                      if (!currentURI.equals(documentIdentifier))
-                      {
-                        cache.deleteData(currentURI);
-                        currentURI = documentIdentifier;
-                        continue;
-                      }
-                      // Otherwise, the last fetch stands on its own.  Fall through, and allow processing and link extraction
+                      if (Logging.connectors.isDebugEnabled())
+                        Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', login sequence '"+sessionCredential.getSequenceKey()+"' was already in progress.");
+
+                      // Didn't make it in.  Retry the main URI when the proper conditions are met.
+                      // We don't want the cached data anymore.
+                      cache.deleteData(currentURI);
+                      contextMessage = "login sequence already in progress";
+                      resultSignal = RESULT_RETRY_DOCUMENT;
+                      activityResultCode = null;
                     }
-                    
-                    // Now, based on the session state and the document contents, decide how to proceed
-                    if (resultSignal == RESULT_VERSION_NEEDED && sessionState == SESSIONSTATE_LOGIN)
+                  }
+                  else if (sessionState == SESSIONSTATE_LOGIN && isLoginPage == false)
+                  {
+                    //== Exit login mode ==
+                    activities.completeEventSequence(globalSequenceEvent);
+                    activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
+                      null,sessionCredential.getSequenceKey(),"OK",null,null);
+                    sessionState = SESSIONSTATE_NORMAL;
+                    // Make sure we go back and try the original document again, if we happened to have been directed somewhere else
+                    if (!currentURI.equals(documentIdentifier))
                     {
-                      // We are dealing with a login page!
-
-                      // We need to (a) figure out what the next URI should be, and (b) record form information that it might need.
-                      // This is a bit dicey because there's really
-                      // no good way to *guarantee* that we pick the right one, if there's more than one available.
-                      // What we do is the following:
-                      //
-                      // (a) We look for matching forms.  If we found one, we submit it.
-                      // (b) Look for redirections.
-                      // (c) If there are links that vector within the login sequence, we pick one of those preferentially.
-                      // (d) If there are no links that vector within the login sequence, we pick one of the other links.
-                      //
-                      // Note well that it's probably going to be pretty easy to get this code stuck in an infinite login sequence.
-                      // While that won't be a problem performance-wise (because everything is appropriately throttled), it
-                      // is obviously not ideal, and furthermore, it will not be possible to crawl a site for which this occurs.
-                      //
-                      // Longer time (and with higher complexity) we can solve this problem by allowing the user to *specify*
-                      // which link they want us to pick for a page.  Hopefully this would not be necessary.
-
-                      // Locate the next target URI.
-                      String targetURI;
-                      if (formData != null)
-                        targetURI = formData.getActionURI();
-                      else if (preferredLink != null)
-                        targetURI = preferredLink;
-                      else if (preferredRedirection != null)
-                        targetURI = preferredRedirection;
-                      else /* if (contentLink != null) */
-                        targetURI = contentLink;
-
-                      // Definitely we don't want the cached data anymore
                       cache.deleteData(currentURI);
-
-                      // If the target URI is null, it means we could not find a suitable link.  If target URI is "",
-                      // it means that we found a designated logon page but the description did not include a link we
-                      // could chase.  Either way, treat this exactly the same
-                      // way as if the link found exited login mode.
-                      if (targetURI == null || targetURI.length() == 0)
-                      {
-                        //== Exiting login mode ==
-                        activities.completeEventSequence(globalSequenceEvent);
-                        activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
-                          null,sessionCredential.getSequenceKey(),"NEXT LINK NOT FOUND",null,null);
-                        sessionState = SESSIONSTATE_NORMAL;
-                        // Make sure we go back and try the original document again, no matter where we got directed to
-                        currentURI = documentIdentifier;
-                      }
-                      else
-                      {
-                        currentURI = targetURI;
-                      }
+                      currentURI = documentIdentifier;
                       continue;
                     }
-                    else if (resultSignal != RESULT_VERSION_NEEDED && sessionState == SESSIONSTATE_LOGIN)
+                    // Otherwise, the last fetch stands on its own.  Fall through, and allow processing and link extraction
+                  }
+                    
+                  // Now, based on the session state and the document contents, decide how to proceed
+                  if (resultSignal == RESULT_VERSION_NEEDED && sessionState == SESSIONSTATE_LOGIN)
+                  {
+                    // We are dealing with a login page!
+
+                    // We need to (a) figure out what the next URI should be, and (b) record form information that it might need.
+                    // This is a bit dicey because there's really
+                    // no good way to *guarantee* that we pick the right one, if there's more than one available.
+                    // What we do is the following:
+                    //
+                    // (a) We look for matching forms.  If we found one, we submit it.
+                    // (b) Look for redirections.
+                    // (c) If there are links that vector within the login sequence, we pick one of those preferentially.
+                    // (d) If there are no links that vector within the login sequence, we pick one of the other links.
+                    //
+                    // Note well that it's probably going to be pretty easy to get this code stuck in an infinite login sequence.
+                    // While that won't be a problem performance-wise (because everything is appropriately throttled), it
+                    // is obviously not ideal, and furthermore, it will not be possible to crawl a site for which this occurs.
+                    //
+                    // Longer time (and with higher complexity) we can solve this problem by allowing the user to *specify*
+                    // which link they want us to pick for a page.  Hopefully this would not be necessary.
+
+                    // Locate the next target URI.
+                    String targetURI;
+                    if (formData != null)
+                      targetURI = formData.getActionURI();
+                    else if (preferredLink != null)
+                      targetURI = preferredLink;
+                    else if (preferredRedirection != null)
+                      targetURI = preferredRedirection;
+                    else /* if (contentLink != null) */
+                      targetURI = contentLink;
+
+                    // Definitely we don't want the cached data anymore
+                    cache.deleteData(currentURI);
+
+                    // If the target URI is null, it means we could not find a suitable link.  If target URI is "",
+                    // it means that we found a designated logon page but the description did not include a link we
+                    // could chase.  Either way, treat this exactly the same
+                    // way as if the link found exited login mode.
+                    if (targetURI == null || targetURI.length() == 0)
                     {
-                      // The next URL we fetched in the logon sequence turned out to be unsuitable.
-                      // That means that the logon sequence is fundamentally wrong.  The session thus ends,
-                      // and of course it will retry, but that's neither here nor there.
                       //== Exiting login mode ==
                       activities.completeEventSequence(globalSequenceEvent);
                       activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
-                        null,sessionCredential.getSequenceKey(),"LINK TARGET UNSUITABLE",null,null);
+                        null,sessionCredential.getSequenceKey(),"NEXTLINKNOTFOUND","Could not find a usable link to the next page: "+contextMessage,null);
                       sessionState = SESSIONSTATE_NORMAL;
-                      // Fall through, leaving everything else alone.
+                      // Make sure we go back and try the original document again, no matter where we got directed to
+                      currentURI = documentIdentifier;
                     }
+                    else
+                    {
+                      currentURI = targetURI;
+                    }
+                    continue;
                   }
-
-                }
-                else
-                {
-                  if (robotsStatus == RESULTSTATUS_FALSE)
-                  {
-                    activityResultCode = "-11";
-                    contextMessage = "robots.txt says so";
-                    resultSignal = RESULT_NO_DOCUMENT;
-                  }
-                  else
+                  else if (resultSignal != RESULT_VERSION_NEEDED && sessionState == SESSIONSTATE_LOGIN)
                   {
-                    // Robots prerequisite in progress
-                    activityResultCode = null;
-                    resultSignal = RESULT_RETRY_DOCUMENT;
-                    contextMessage = "robots prerequisite already in progress";
+                    // The next URL we fetched in the logon sequence turned out to be unsuitable.
+                    // That means that the logon sequence is fundamentally wrong.  The session thus ends,
+                    // and of course it will retry, but that's neither here nor there.
+                    //== Exiting login mode ==
+                    activities.completeEventSequence(globalSequenceEvent);
+                    activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
+                      null,sessionCredential.getSequenceKey(),"LINKTARGETUNSUITABLE","Page was unsuitable for a login sequence because: "+contextMessage,null);
+                    sessionState = SESSIONSTATE_NORMAL;
+                    // Fall through, leaving everything else alone.
                   }
                 }
+
               }
               else
               {
-                if (ipAddressStatus == RESULTSTATUS_FALSE)
+                if (robotsStatus == RESULTSTATUS_FALSE)
                 {
-                  activityResultCode = "-10";
-                  contextMessage = "ip address not found";
+                  activityResultCode = "-11";
+                  contextMessage = "robots.txt says so";
                   resultSignal = RESULT_NO_DOCUMENT;
                 }
                 else
                 {
-                  // DNS prerequisite in progress
+                  // Robots prerequisite in progress
                   activityResultCode = null;
-                  contextMessage = "dns prerequisite already in progress";
                   resultSignal = RESULT_RETRY_DOCUMENT;
+                  contextMessage = "robots prerequisite already in progress";
                 }
               }
             }
-            catch (MalformedURLException e)
+            else
             {
-              // currentURI is malformed.
-              // If the document was the primary, we should remove it from the queue.  But if it's part of a login sequence, we'd better just retry later.
-              contextMessage = "was not a valid URL: "+e.getMessage();
-              contextException = e;
-              activityResultCode = "-12";
-              resultSignal = RESULT_NO_DOCUMENT;
+              if (ipAddressStatus == RESULTSTATUS_FALSE)
+              {
+                activityResultCode = "-10";
+                contextMessage = "ip address not found";
+                resultSignal = RESULT_NO_DOCUMENT;
+              }
+              else
+              {
+                // DNS prerequisite in progress
+                activityResultCode = null;
+                contextMessage = "dns prerequisite already in progress";
+                resultSignal = RESULT_RETRY_DOCUMENT;
+              }
             }
-
+            
             // If we fail on a document that's not the primary, the result should be to retry the primary later.
             if (!currentURI.equals(documentIdentifier))
             {
@@ -1136,7 +1139,7 @@ public class WebcrawlerConnector extends
 
           // Now, look at the result signal, and set up the version appropriately.
           if (activityResultCode != null)
-            activities.recordActivity(null,ACTIVITY_FETCH,null,documentIdentifier,activityResultCode,((contextMessage!=null)?contextMessage:""),null);
+            activities.recordActivity(null,ACTIVITY_FETCH,null,documentIdentifier,activityResultCode,contextMessage,null);
 
           switch (resultSignal)
           {
@@ -1218,7 +1221,7 @@ public class WebcrawlerConnector extends
             sb.append(filterVersion);
               
             String versionString = sb.toString();
-              
+
             // Now, extract links.
             // We'll call the "link extractor" series, so we can plug more stuff in over time.
             boolean indexDocument = extractLinks(documentIdentifier,activities,filter);
@@ -1227,110 +1230,9 @@ public class WebcrawlerConnector extends
             if (!activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
               continue;
             
-            // Consider this document for ingestion.
-            // We can exclude it if it does not seem to be a kind of document that the ingestion system knows
-            // about.
-            String ingestURL;
-            if (indexDocument)
-              ingestURL = isDataIngestable(activities,documentIdentifier,filter);
-            else
-              ingestURL = null;
-
-            if (ingestURL == null)
-            {
-              // In case the indexability of the document changed, we still want to notify the incremental indexer.
-              // We do this by using a null url and a null repository document.  If a document with this identifier was
-              // previously indexed, it will be removed.
-                
-              activities.noDocument(documentIdentifier,versionString);
-                
-              if (Logging.connectors.isDebugEnabled())
-                Logging.connectors.debug("WEB: Decided not to ingest '"+documentIdentifier+"' because it did not match ingestability criteria");
+            if (processDocument(activities,documentIdentifier,versionString,indexDocument,metaHash,acls,filter))
               continue;
-            }
             
-            // Ingest the document
-            if (Logging.connectors.isDebugEnabled())
-              Logging.connectors.debug("WEB: Decided to ingest '"+documentIdentifier+"'");
-
-            RepositoryDocument rd = new RepositoryDocument();
-
-            // Set the file name
-            String fileName = "";
-            try {
-              fileName = documentIdentifiertoFileName(documentIdentifier);
-            } catch (URISyntaxException e1) {
-              fileName = "";
-            }
-            if (fileName.length() > 0){
-              rd.setFileName(fileName);
-            }
-                
-            // Set the content type
-            rd.setMimeType(cache.getContentType(documentIdentifier));
-                
-            // Turn into acls and add into description
-            String[] denyAcls;
-            if (acls == null)
-              denyAcls = null;
-            else
-            {
-              if (acls.length > 0)
-                denyAcls = new String[]{defaultAuthorityDenyToken};
-              else
-                denyAcls = new String[0];
-            }
-            
-            if (acls != null && denyAcls != null)
-              rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,acls,denyAcls);
-
-            // Grab metadata
-            for (String key : metaHash.keySet())
-            {
-              Set<String> metaList = metaHash.get(key);
-              String[] values = new String[metaList.size()];
-              int k = 0;
-              for (String value : metaList)
-              {
-                values[k++] = value;
-              }
-              rd.addField(key,values);
-            }
-
-            long length = cache.getDataLength(documentIdentifier);
-            InputStream is = cache.getData(documentIdentifier);
-
-            if (is != null)
-            {
-              try
-              {
-                rd.setBinary(is,length);
-                try
-                {
-                  activities.ingestDocumentWithException(documentIdentifier,versionString,ingestURL,rd);
-                }
-                catch (IOException e)
-                {
-                  handleIOException(e,"reading data");
-                }
-              }
-              finally
-              {
-                try
-                {
-                  is.close();
-                }
-                catch (IOException e)
-                {
-                  handleIOException(e,"closing stream");
-                }
-              }
-            }
-            else
-              Logging.connectors.error("WEB: Expected a cached document for '"+documentIdentifier+"', but none present!");
-
-            // MHL
-              
             break;
           case RESULT_RETRY_DOCUMENT:
             // Document could not be processed right now.
@@ -1359,6 +1261,208 @@ public class WebcrawlerConnector extends
     }
   }
 
+  protected boolean processDocument(IProcessActivity activities, String documentIdentifier, String versionString,
+    boolean indexDocument, Map<String,Set<String>> metaHash, String[] acls, DocumentURLFilter filter)
+    throws ManifoldCFException, ServiceInterruption
+  {
+    // Runs the indexability checks on a cached document and ingests it if all pass.  Returns true when the document was rejected (noDocument called), false otherwise.
+    String errorCode = null;
+    String errorDesc = null;
+    Long fileLengthLong = null;
+    long startTime = System.currentTimeMillis();
+    // errorCode, errorDesc and fileLengthLong feed the ACTIVITY_PROCESS record written in the outer finally; a null errorCode means no record is written.
+    try
+    {
+      // We can exclude it if it does not seem to be a kind of document that the ingestion system knows
+      // about.
+      
+      if (!indexDocument)
+      {
+        errorCode = "CONTENTNOTINDEXABLE";
+        errorDesc = "Content not indexable";
+        activities.noDocument(documentIdentifier,versionString);
+        return true;
+      }
+      // Only a cached response whose HTTP response code is 200 is considered for indexing.
+      int responseCode = cache.getResponseCode(documentIdentifier);
+      if (responseCode != 200)
+      {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because response code not indexable: "+responseCode);
+        errorCode = "RESPONSECODENOTINDEXABLE";
+        errorDesc = "HTTP response code not indexable ("+responseCode+")";
+        activities.noDocument(documentIdentifier,versionString);
+        return true;
+      }
+
+      long dataLength = cache.getDataLength(documentIdentifier);
+      if (!activities.checkLengthIndexable(dataLength))
+      {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because pipeline thinks length "+dataLength+" is not acceptable");
+        errorCode = activities.EXCLUDED_LENGTH;
+        errorDesc = "Rejected due to length ("+dataLength+")";
+        activities.noDocument(documentIdentifier,versionString);
+        return true;
+      }
+      
+      if (activities.checkURLIndexable(documentIdentifier) == false)
+      {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want URL");
+        errorCode = activities.EXCLUDED_URL;
+        errorDesc = "Rejected due to URL ('"+documentIdentifier+"')";
+        activities.noDocument(documentIdentifier,versionString);
+        return true;
+      }
+      // Apply the job's own URL filter; a non-null result is the URL the document is later ingested under.
+      String ingestURL = filter.isDocumentIndexable(documentIdentifier);
+      if (ingestURL == null)
+      {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because document does not match web job constraints");
+        errorCode = "JOBRESTRICTION";
+        errorDesc = "Rejected because job excludes this URL ('"+documentIdentifier+"')";
+        activities.noDocument(documentIdentifier,versionString);
+        return true;
+      }
+      
+      // Check if it's a recognized content type
+      String contentType = cache.getContentType(documentIdentifier);
+
+      // Some sites have multiple content types.  We just look at the LAST one in that case.
+      if (contentType != null)
+      {
+        String[] contentTypes = contentType.split(",");
+        if (contentTypes.length > 0)
+          contentType = contentTypes[contentTypes.length-1].trim();
+        else
+          contentType = null;
+      }
+      // Drop anything from the first ';' onward so that only the leading part of the content type remains.
+      if (contentType != null)
+      {
+        int pos = contentType.indexOf(";");
+        if (pos != -1)
+          contentType = contentType.substring(0,pos);
+        contentType = contentType.trim();
+      }
+      // contentType may still be null at this point; it is handed to checkMimeTypeIndexable unchanged.
+      if (!activities.checkMimeTypeIndexable(contentType))
+      {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want mime type '"+contentType+"'");
+        errorCode = activities.EXCLUDED_MIMETYPE;
+        errorDesc = "Rejected because of mime type ("+contentType+")";
+        activities.noDocument(documentIdentifier,versionString);
+        return true;
+      }
+      
+      // Ingest the document
+      if (Logging.connectors.isDebugEnabled())
+        Logging.connectors.debug("WEB: Decided to ingest '"+documentIdentifier+"'");
+
+      RepositoryDocument rd = new RepositoryDocument();
+
+      // Set the file name; a URISyntaxException leaves it empty, in which case no file name is set
+      String fileName = "";
+      try {
+        fileName = documentIdentifiertoFileName(documentIdentifier);
+      } catch (URISyntaxException e1) {
+        fileName = "";
+      }
+      if (fileName.length() > 0){
+        rd.setFileName(fileName);
+      }
+          
+      // Set the content type (the value as cached, not the stripped contentType checked above)
+      String mimeType = cache.getContentType(documentIdentifier);
+      if (mimeType != null)
+        rd.setMimeType(mimeType);
+          
+      // Derive deny ACLs: null acls -> null, non-empty acls -> the default authority deny token, empty acls -> empty array
+      String[] denyAcls;
+      if (acls == null)
+        denyAcls = null;
+      else
+      {
+        if (acls.length > 0)
+          denyAcls = new String[]{defaultAuthorityDenyToken};
+        else
+          denyAcls = new String[0];
+      }
+      // denyAcls is non-null whenever acls is non-null, so security is set exactly when acls were supplied.
+      if (acls != null && denyAcls != null)
+        rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,acls,denyAcls);
+
+      // Copy each metadata key and its set of values into the repository document as a field
+      for (String key : metaHash.keySet())
+      {
+        Set<String> metaList = metaHash.get(key);
+        String[] values = new String[metaList.size()];
+        int k = 0;
+        for (String value : metaList)
+        {
+          values[k++] = value;
+        }
+        rd.addField(key,values);
+      }
+      // dataLength was read from the cache for the length check above and is reused as the stream length below.
+      InputStream is = cache.getData(documentIdentifier);
+
+      if (is != null)
+      {
+        try
+        {
+          rd.setBinary(is,dataLength);
+          try
+          {
+            activities.ingestDocumentWithException(documentIdentifier,versionString,ingestURL,rd);
+            errorCode = "OK";
+            fileLengthLong = new Long(dataLength);
+          }
+          catch (IOException e)
+          {
+            errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+            errorDesc = e.getMessage();
+            handleIOException(e,"reading data");
+          }
+        }
+        finally
+        {
+          try
+          {
+            is.close();
+          }
+          catch (IOException e)
+          {
+            errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+            errorDesc = e.getMessage();
+            handleIOException(e,"closing stream");
+          }
+        }
+      }
+      else
+        Logging.connectors.error("WEB: Expected a cached document for '"+documentIdentifier+"', but none present!");
+      // Reached after the ingestion attempt, and also when no cached stream was found (errorCode is then still null).
+      return false;
+    }
+    catch (ManifoldCFException e)
+    {
+      if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+        errorCode = null;  // an interruption must not produce an activity record in the finally below
+      throw e;
+    }
+    finally
+    {
+      if (errorCode != null)
+        activities.recordActivity(new Long(startTime),ACTIVITY_PROCESS,
+          fileLengthLong,documentIdentifier,errorCode,errorDesc,null);
+    }
+
+
+  }
+  
   protected static String extractContentType(String contentType)
   {
     // Some sites have multiple content types.  We just look at the LAST one in that case.
@@ -5727,13 +5831,14 @@ public class WebcrawlerConnector extends
     if (contentType == null)
       return false;
 
-    int pos = contentType.indexOf(";");
+    String strippedContentType = contentType;
+    int pos = strippedContentType.indexOf(";");
     if (pos != -1)
-      contentType = contentType.substring(0,pos);
-    contentType = contentType.trim();
+      strippedContentType = strippedContentType.substring(0,pos);
+    strippedContentType = strippedContentType.trim();
 
     // There are presumably mime types we can extract links from that we can't index?
-    if (interestingMimeTypeMap.contains(contentType))
+    if (interestingMimeTypeMap.contains(strippedContentType))
       return true;
     
     boolean rval = activities.checkMimeTypeIndexable(contentType);
@@ -5742,69 +5847,6 @@ public class WebcrawlerConnector extends
     return rval;
   }
   
-  /** Code to check if an already-fetched document should be ingested.
-  *@return null if document should not be ingested, or the URL if it should.
-  */
-  protected String isDataIngestable(IFingerprintActivity activities, String documentIdentifier, DocumentURLFilter filter)
-    throws ServiceInterruption, ManifoldCFException
-  {
-    if (cache.getResponseCode(documentIdentifier) != 200)
-      return null;
-
-    if (activities.checkLengthIndexable(cache.getDataLength(documentIdentifier)) == false)
-    {
-      if (Logging.connectors.isDebugEnabled())
-        Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector thinks length "+cache.getDataLength(documentIdentifier)+" is too long");
-      return null;
-    }
-    
-    if (activities.checkURLIndexable(documentIdentifier) == false)
-    {
-      if (Logging.connectors.isDebugEnabled())
-        Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want URL");
-      return null;
-    }
-
-    String ingestURL = filter.isDocumentIndexable(documentIdentifier);
-    if (ingestURL == null)
-    {
-      if (Logging.connectors.isDebugEnabled())
-        Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because document does not match web job constraints");
-      return null;
-    }
-    
-    // Check if it's a recognized content type
-    String contentType = cache.getContentType(documentIdentifier);
-
-    // Some sites have multiple content types.  We just look at the LAST one in that case.
-    if (contentType != null)
-    {
-      String[] contentTypes = contentType.split(",");
-      if (contentTypes.length > 0)
-        contentType = contentTypes[contentTypes.length-1].trim();
-      else
-        contentType = null;
-    }
-
-    if (contentType == null)
-      return null;
-
-    int pos = contentType.indexOf(";");
-    if (pos != -1)
-      contentType = contentType.substring(0,pos);
-    contentType = contentType.trim();
-
-    boolean rval = activities.checkMimeTypeIndexable(contentType);
-    if (rval == false)
-    {
-      if (Logging.connectors.isDebugEnabled())
-        Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want mime type '"+contentType+"'");
-      return null;
-    }
-    
-    return ingestURL;
-  }
-
   /** Convert a document identifier to filename.
    * @param documentIdentifier
    * @return