You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/10/23 16:00:17 UTC
svn commit: r1633815 -
/manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Author: kwright
Date: Thu Oct 23 14:00:16 2014
New Revision: 1633815
URL: http://svn.apache.org/r1633815
Log:
web connector reorganization which causes classloader verification failure under both JDK 7 and 8.
Modified:
manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1633815&r1=1633814&r2=1633815&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-1077/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Thu Oct 23 14:00:16 2014
@@ -132,6 +132,7 @@ public class WebcrawlerConnector extends
// Activity types
public final static String ACTIVITY_FETCH = "fetch";
+ public final static String ACTIVITY_PROCESS = "process";
public final static String ACTIVITY_ROBOTSPARSE = "robots parse";
public final static String ACTIVITY_LOGON_START = "begin logon";
public final static String ACTIVITY_LOGON_END = "end logon";
@@ -321,7 +322,7 @@ public class WebcrawlerConnector extends
@Override
public String[] getActivitiesList()
{
- return new String[]{ACTIVITY_FETCH, ACTIVITY_ROBOTSPARSE, ACTIVITY_LOGON_START, ACTIVITY_LOGON_END};
+ return new String[]{ACTIVITY_FETCH, ACTIVITY_PROCESS, ACTIVITY_ROBOTSPARSE, ACTIVITY_LOGON_START, ACTIVITY_LOGON_END};
}
@@ -673,454 +674,456 @@ public class WebcrawlerConnector extends
while (true)
{
+ URL url;
try
{
// Do the mapping from the current host name to the IP address
- URL url = new URL(currentURI);
- String hostName = url.getHost();
- StringBuilder ipAddressBuffer = new StringBuilder();
- int ipAddressStatus = lookupIPAddress(currentURI,activities,hostName,currentTime,ipAddressBuffer);
- if (ipAddressStatus == RESULTSTATUS_TRUE)
+ url = new URL(currentURI);
+ }
+ catch (MalformedURLException e)
+ {
+ // currentURI is malformed.
+ // If the document was the primary, we should remove it from the queue. But if it's part of a login sequence, we'd better just retry later.
+ contextMessage = "was not a valid URL: "+e.getMessage();
+ contextException = e;
+ activityResultCode = "-12";
+ resultSignal = RESULT_NO_DOCUMENT;
+ break;
+ }
+
+ String hostName = url.getHost();
+ StringBuilder ipAddressBuffer = new StringBuilder();
+ int ipAddressStatus = lookupIPAddress(currentURI,activities,hostName,currentTime,ipAddressBuffer);
+ if (ipAddressStatus == RESULTSTATUS_TRUE)
+ {
+ String ipAddress = ipAddressBuffer.toString();
+ String protocol = url.getProtocol();
+ int port = url.getPort();
+ if (port == -1)
+ port = url.getDefaultPort();
+
+ // Try to fetch the document. We'll need its bin names first.
+ String[] binNames = getBinNames(currentURI);
+
+ // Get the credentials for this document (if any)
+ PageCredentials credential = getPageCredential(currentURI);
+ IKeystoreManager trustStore;
+ // Save effort - only bother to get a trust store if this is https
+ if (protocol.equalsIgnoreCase("https"))
+ // null return is possible here; indicates "trust everything"
+ trustStore = getTrustStore(currentURI);
+ else
+ trustStore = KeystoreManagerFactory.make("");
+ // Check robots, if enabled, and if we're fetching the primary document identifier. See comment above.
+ int robotsStatus = RESULTSTATUS_TRUE;
+ if (!documentIdentifier.equals(currentURI) || robotsUsage < ROBOTS_DATA || (robotsStatus = checkFetchAllowed(documentIdentifier,protocol,ipAddress,port,credential,trustStore,hostName,binNames,currentTime,
+ url.getFile(),activities,connectionLimit,proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword)) == RESULTSTATUS_TRUE)
{
- String ipAddress = ipAddressBuffer.toString();
- String protocol = url.getProtocol();
- int port = url.getPort();
- if (port == -1)
- port = url.getDefaultPort();
-
- // Try to fetch the document. We'll need its bin names first.
- String[] binNames = getBinNames(currentURI);
-
- // Get the credentials for this document (if any)
- PageCredentials credential = getPageCredential(currentURI);
- IKeystoreManager trustStore;
- // Save effort - only bother to get a trust store if this is https
- if (protocol.equalsIgnoreCase("https"))
- // null return is possible here; indicates "trust everything"
- trustStore = getTrustStore(currentURI);
- else
- trustStore = KeystoreManagerFactory.make("");
- // Check robots, if enabled, and if we're fetching the primary document identifier. See comment above.
- int robotsStatus = RESULTSTATUS_TRUE;
- if (!documentIdentifier.equals(currentURI) || robotsUsage < ROBOTS_DATA || (robotsStatus = checkFetchAllowed(documentIdentifier,protocol,ipAddress,port,credential,trustStore,hostName,binNames,currentTime,
- url.getFile(),activities,connectionLimit,proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword)) == RESULTSTATUS_TRUE)
- {
- // Passed the robots check!
+ // Passed the robots check!
- // Find whatever login parameters apply. This will be null if currentURI is not a login page, and will contain
- // interesting information if it is.
- LoginCookies lc = null;
- if (sessionCredential != null)
- {
- lc = cookieManager.readCookies(sessionCredential.getSequenceKey());
- }
+ // Find whatever login parameters apply. This will be null if currentURI is not a login page, and will contain
+ // interesting information if it is.
+ LoginCookies lc = null;
+ if (sessionCredential != null)
+ {
+ lc = cookieManager.readCookies(sessionCredential.getSequenceKey());
+ }
- // Prepare to perform the fetch, and decide what to do with the document.
- //
- IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
- throttleGroupName,
- protocol,ipAddress,port,
- credential,trustStore,throttleDescription,binNames,connectionLimit,
- proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
+ // Prepare to perform the fetch, and decide what to do with the document.
+ //
+ IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
+ throttleGroupName,
+ protocol,ipAddress,port,
+ credential,trustStore,throttleDescription,binNames,connectionLimit,
+ proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
+ try
+ {
+ connection.beginFetch((sessionState == SESSIONSTATE_LOGIN)?FETCH_LOGIN:FETCH_STANDARD);
try
{
- connection.beginFetch((sessionState == SESSIONSTATE_LOGIN)?FETCH_LOGIN:FETCH_STANDARD);
- try
- {
-
- // Execute the fetch!
- connection.executeFetch(url.getFile(),userAgent,from,connectionTimeoutMilliseconds,
- socketTimeoutMilliseconds,false,hostName,formData,lc);
- int response = connection.getResponseCode();
+ // Execute the fetch!
+ connection.executeFetch(url.getFile(),userAgent,from,connectionTimeoutMilliseconds,
+ socketTimeoutMilliseconds,false,hostName,formData,lc);
+ int response = connection.getResponseCode();
- if (response == 200 || response == 302 || response == 301)
+ if (response == 200 || response == 302 || response == 301)
+ {
+ // If this was part of the login sequence, update the cookies regardless of what else happens
+ if (sessionState == SESSIONSTATE_LOGIN)
{
- // If this was part of the login sequence, update the cookies regardless of what else happens
- if (sessionState == SESSIONSTATE_LOGIN)
- {
- // Update the cookies
- LoginCookies lastFetchCookies = connection.getLastFetchCookies();
- cookieManager.updateCookies(sessionCredential.getSequenceKey(),lastFetchCookies);
- }
+ // Update the cookies
+ LoginCookies lastFetchCookies = connection.getLastFetchCookies();
+ cookieManager.updateCookies(sessionCredential.getSequenceKey(),lastFetchCookies);
+ }
- // Decide whether to exclude this document based on what we see here.
- // Basically, we want to get rid of everything that we (a) don't know what
- // to do with in the ingestion system, and (b) we can't get useful links from.
+ // Decide whether to exclude this document based on what we see here.
+ // Basically, we want to get rid of everything that we (a) don't know what
+ // to do with in the ingestion system, and (b) we can't get useful links from.
- String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
+ String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
- if (isContentInteresting(activities,currentURI,response,contentType))
- {
- // Treat it as real, and cache it.
- checkSum = cache.addData(activities,currentURI,connection);
- headerData = connection.getResponseHeaders();
- resultSignal = RESULT_VERSION_NEEDED;
- activityResultCode = null;
- }
- else
- {
- contextMessage = "it had the wrong content type";
- resultSignal = RESULT_NO_DOCUMENT;
- activityResultCode = null;
- }
+ if (isContentInteresting(activities,currentURI,response,contentType))
+ {
+ // Treat it as real, and cache it.
+ checkSum = cache.addData(activities,currentURI,connection);
+ headerData = connection.getResponseHeaders();
+ resultSignal = RESULT_VERSION_NEEDED;
+ activityResultCode = null;
}
else
{
- // We got some kind of http error code.
- // We don't want to remove it from the queue entirely, because that would cause us to lose track of the item, and therefore lose
- // control of all scheduling around it. Instead, we leave it on the queue and give it an empty version string; that will lead it to be
- // reprocessed without fail on the next scheduled check.
- // Decode response body to the extent we can
- String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
- String encoding = extractEncoding(contentType);
- if (encoding == null)
- encoding = StandardCharsets.UTF_8.name();
- String decodedResponse = "undecodable";
- try
- {
- decodedResponse = "'"+connection.getLimitedResponseBody(1024,encoding)+"'";
- }
- catch (ManifoldCFException e)
- {
- // Eat this exception unless it is an interrupt
- if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
- throw e;
- connection.noteInterrupted(e);
- }
- catch (ServiceInterruption e)
- {
- // Eat this exception too
- connection.noteInterrupted(e);
- }
- contextMessage = "it failed to fetch (status="+Integer.toString(response)+", message="+decodedResponse+")";
- resultSignal = RESULT_NO_VERSION;
+ contextMessage = "it had the wrong content type ('"+contentType+"')";
+ resultSignal = RESULT_NO_DOCUMENT;
activityResultCode = null;
}
}
- catch (ManifoldCFException e)
- {
- connection.noteInterrupted(e);
- throw e;
- }
- catch (ServiceInterruption e)
+ else
{
- connection.noteInterrupted(e);
- throw e;
- }
- finally
- {
- connection.doneFetch(activities);
+ // We got some kind of http error code.
+ // We don't want to remove it from the queue entirely, because that would cause us to lose track of the item, and therefore lose
+ // control of all scheduling around it. Instead, we leave it on the queue and give it an empty version string; that will lead it to be
+ // reprocessed without fail on the next scheduled check.
+ // Decode response body to the extent we can
+ String contentType = extractContentType(connection.getResponseHeader("Content-Type"));
+ String encoding = extractEncoding(contentType);
+ if (encoding == null)
+ encoding = StandardCharsets.UTF_8.name();
+ String decodedResponse = "undecodable";
+ try
+ {
+ decodedResponse = "'"+connection.getLimitedResponseBody(1024,encoding)+"'";
+ }
+ catch (ManifoldCFException e)
+ {
+ // Eat this exception unless it is an interrupt
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+ throw e;
+ connection.noteInterrupted(e);
+ }
+ catch (ServiceInterruption e)
+ {
+ // Eat this exception too
+ connection.noteInterrupted(e);
+ }
+ contextMessage = "it failed to fetch (status="+Integer.toString(response)+", message="+decodedResponse+")";
+ resultSignal = RESULT_NO_VERSION;
+ activityResultCode = null;
}
}
+ catch (ManifoldCFException e)
+ {
+ connection.noteInterrupted(e);
+ throw e;
+ }
+ catch (ServiceInterruption e)
+ {
+ connection.noteInterrupted(e);
+ throw e;
+ }
finally
{
- connection.close();
+ connection.doneFetch(activities);
}
+ }
+ finally
+ {
+ connection.close();
+ }
- // State transition logic. If the result indicates a successful fetch so far, we need to decide where to go next.
- // This happens AFTER we've released all the connections, because it's conceivable that processing here might be
- // significant, and we don't want to tie things up unnecessarily.
- String preferredLink = null;
- String preferredRedirection = null;
- formData = null;
- String contentLink = null;
- if (resultSignal == RESULT_VERSION_NEEDED)
+ // State transition logic. If the result indicates a successful fetch so far, we need to decide where to go next.
+ // This happens AFTER we've released all the connections, because it's conceivable that processing here might be
+ // significant, and we don't want to tie things up unnecessarily.
+ String preferredLink = null;
+ String preferredRedirection = null;
+ formData = null;
+ String contentLink = null;
+ if (resultSignal == RESULT_VERSION_NEEDED)
+ {
+ // If we get here, we know:
+ // (a) There's a cached version of the page on disk we can read as many times as necessary;
+ // (b) The saved cookies have not been updated yet, so we'll need to do that where appropriate.
+
+ // The way we determine if we're in the login sequence for a site is by TWO criteria:
+ // (1) The URI must match the specified regular expression, and
+ // (2) The data from that URI must contain the specified form or link information.
+ // We use the same criteria to look for the exit from a sequence. So, in essence, we're *always* going to need to know whether we're
+ // officially in the sequence, or not, so we evaluate it always.
+ boolean isLoginPage = false;
+ if (sessionCredential != null)
{
- // If we get here, we know:
- // (a) There's a cached version of the page on disk we can read as many times as necessary;
- // (b) The saved cookies have not been updated yet, so we'll need to do that where appropriate.
-
- // The way we determine if we're in the login sequence for a site is by TWO criteria:
- // (1) The URI must match the specified regular expression, and
- // (2) The data from that URI must contain the specified form or link information.
- // We use the same criteria to look for the exit from a sequence. So, in essence, we're *always* going to need to know whether we're
- // officially in the sequence, or not, so we evaluate it always.
- boolean isLoginPage = false;
- if (sessionCredential != null)
+ Iterator iterMatches = sessionCredential.findLoginParameters(currentURI);
+ boolean seenAnything = false;
+ boolean seenFormError = false;
+ boolean seenLinkError = false;
+ boolean seenRedirectionError = false;
+ boolean seenContentError = false;
+ while (iterMatches.hasNext())
{
- Iterator iterMatches = sessionCredential.findLoginParameters(currentURI);
- boolean seenAnything = false;
- boolean seenFormError = false;
- boolean seenLinkError = false;
- boolean seenRedirectionError = false;
- boolean seenContentError = false;
- while (iterMatches.hasNext())
+ seenAnything = true;
+ LoginParameters lp = (LoginParameters)iterMatches.next();
+ // Note that more than one of the rules may match.
+ // In that case, a clear order of precedence applies between form-style rules and link-style: form has priority.
+ // If more than one of the same kind of rule is seen, then all bets are off, a warning is displayed, and nothing is
+ // matched.
+
+ // Parse the page; it had better match up! Otherwise we get null back.
+ FormData newFormData = findHTMLForm(currentURI,lp);
+ if (newFormData != null)
{
- seenAnything = true;
- LoginParameters lp = (LoginParameters)iterMatches.next();
- // Note that more than one of the rules may match.
- // In that case, a clear order of precedence applies between form-style rules and link-style: form has priority.
- // If more than one of the same kind of rule is seen, then all bets are off, a warning is displayed, and nothing is
- // matched.
-
- // Parse the page; it had better match up! Otherwise we get null back.
- FormData newFormData = findHTMLForm(currentURI,lp);
- if (newFormData != null)
+ if (formData != null)
{
- if (formData != null)
+ // Oops, more than one matching form rule. Complain.
+ seenFormError = true;
+ formData = null;
+ }
+ else if (!seenFormError)
+ {
+ // A form overrides links, redirection, or content
+ formData = newFormData;
+ preferredLink = null;
+ preferredRedirection = null;
+ }
+ }
+ else
+ {
+ // Look for the preferred link instead.
+ String newPreferredLink = findHTMLLinkURI(currentURI,lp);
+ if (newPreferredLink != null)
+ {
+ if (preferredLink != null)
{
- // Oops, more than one matching form rule. Complain.
- seenFormError = true;
- formData = null;
+ // Oops
+ seenLinkError = true;
+ preferredLink = null;
}
- else if (!seenFormError)
+ else if (!seenLinkError && !seenFormError && formData == null)
{
- // A form overrides links, redirection, or content
- formData = newFormData;
- preferredLink = null;
+ // Link overrides redirection and content
+ preferredLink = newPreferredLink;
preferredRedirection = null;
}
}
else
{
- // Look for the preferred link instead.
- String newPreferredLink = findHTMLLinkURI(currentURI,lp);
- if (newPreferredLink != null)
+ // Look for the preferred redirection.
+ String newPreferredRedirection = findPreferredRedirectionURI(currentURI,lp);
+ if (newPreferredRedirection != null)
{
- if (preferredLink != null)
+ if (preferredRedirection != null)
{
- // Oops
- seenLinkError = true;
- preferredLink = null;
+ seenRedirectionError = true;
+ preferredRedirection = null;
}
- else if (!seenLinkError && !seenFormError && formData == null)
+ else if (!seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null)
{
- // Link overrides redirection and content
- preferredLink = newPreferredLink;
- preferredRedirection = null;
+ preferredRedirection = newPreferredRedirection;
}
}
else
{
- // Look for the preferred redirection.
- String newPreferredRedirection = findPreferredRedirectionURI(currentURI,lp);
- if (newPreferredRedirection != null)
+ // Look for the content in the page. The link returned may be an empty string, if matching content
+ // is discovered but there is no override. It will be null if the content is not found.
+ String newContentLink = findSpecifiedContent(currentURI,lp);
+ if (newContentLink != null)
{
- if (preferredRedirection != null)
+ if (contentLink != null)
{
- seenRedirectionError = true;
- preferredRedirection = null;
+ seenContentError = true;
+ contentLink = null;
}
- else if (!seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null)
+ else if (!seenContentError && !seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null && preferredRedirection == null)
{
- preferredRedirection = newPreferredRedirection;
- }
- }
- else
- {
- // Look for the content in the page. The link returned may be an empty string, if matching content
- // is discovered but there is no override. It will be null of the content is not found.
- String newContentLink = findSpecifiedContent(currentURI,lp);
- if (newContentLink != null)
- {
- if (contentLink != null)
- {
- seenContentError = true;
- contentLink = null;
- }
- else if (!seenContentError && !seenRedirectionError && !seenLinkError && !seenFormError && formData == null && preferredLink == null && preferredRedirection == null)
- {
- contentLink = newContentLink;
- }
+ contentLink = newContentLink;
}
}
}
}
}
-
- // Now, evaluate all the data and pick the right rule
- if (formData != null)
- {
- // We found the right form! And, we filled it in. So now we enter the "login sequence".
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Document '"+currentURI+"' matches form, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
- isLoginPage = true;
- }
- else if (preferredLink != null)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred link, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
- isLoginPage = true;
- }
- else if (preferredRedirection != null)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred redirection, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
- isLoginPage = true;
- }
- else if (contentLink != null)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Document '"+currentURI+"' matches content, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
- isLoginPage = true;
- }
- else
- {
- if (seenAnything && Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Document '"+currentURI+"' did not match expected form, link, redirection, or content for sequence '"+sessionCredential.getSequenceKey()+"'");
- }
}
- // Should we do a state transition into the "logging in" state?
- if (sessionState == SESSIONSTATE_NORMAL && isLoginPage)
+ // Now, evaluate all the data and pick the right rule
+ if (formData != null)
{
- // Entering the login sequence. Make sure we actually can do this...
- if (activities.beginEventSequence(globalSequenceEvent))
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', beginning login sequence '"+sessionCredential.getSequenceKey()+"'");
+ // We found the right form! And, we filled it in. So now we enter the "login sequence".
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Document '"+currentURI+"' matches form, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
+ isLoginPage = true;
+ }
+ else if (preferredLink != null)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred link, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
+ isLoginPage = true;
+ }
+ else if (preferredRedirection != null)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Document '"+currentURI+"' matches preferred redirection, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
+ isLoginPage = true;
+ }
+ else if (contentLink != null)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Document '"+currentURI+"' matches content, so determined to be login page for sequence '"+sessionCredential.getSequenceKey()+"'");
+ isLoginPage = true;
+ }
+ else
+ {
+ if (seenAnything && Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Document '"+currentURI+"' did not match expected form, link, redirection, or content for sequence '"+sessionCredential.getSequenceKey()+"'");
+ }
+ }
- activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_START,
- null,sessionCredential.getSequenceKey(),"OK",null,null);
+ // Should we do a state transition into the "logging in" state?
+ if (sessionState == SESSIONSTATE_NORMAL && isLoginPage)
+ {
+ // Entering the login sequence. Make sure we actually can do this...
+ if (activities.beginEventSequence(globalSequenceEvent))
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', beginning login sequence '"+sessionCredential.getSequenceKey()+"'");
- // Transition to the right state, etc.
- sessionState = SESSIONSTATE_LOGIN;
- }
- else
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', login sequence '"+sessionCredential.getSequenceKey()+"' was already in progress.");
+ activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_START,
+ null,sessionCredential.getSequenceKey(),"OK",null,null);
- // Didn't make it in. Retry the main URI when the proper conditions are met.
- // We don't want the cached data anymore.
- cache.deleteData(currentURI);
- contextMessage = "login sequence already in progress";
- resultSignal = RESULT_RETRY_DOCUMENT;
- activityResultCode = null;
- }
+ // Transition to the right state, etc.
+ sessionState = SESSIONSTATE_LOGIN;
}
- else if (sessionState == SESSIONSTATE_LOGIN && isLoginPage == false)
+ else
{
- //== Exit login mode ==
- activities.completeEventSequence(globalSequenceEvent);
- activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
- null,sessionCredential.getSequenceKey(),"OK",null,null);
- sessionState = SESSIONSTATE_NORMAL;
- // Make sure we go back and try the original document again, if we happened to have been directed somewhere else
- if (!currentURI.equals(documentIdentifier))
- {
- cache.deleteData(currentURI);
- currentURI = documentIdentifier;
- continue;
- }
- // Otherwise, the last fetch stands on its own. Fall through, and allow processing and link extraction
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: For document '"+documentIdentifier+"', login sequence '"+sessionCredential.getSequenceKey()+"' was already in progress.");
+
+ // Didn't make it in. Retry the main URI when the proper conditions are met.
+ // We don't want the cached data anymore.
+ cache.deleteData(currentURI);
+ contextMessage = "login sequence already in progress";
+ resultSignal = RESULT_RETRY_DOCUMENT;
+ activityResultCode = null;
}
-
- // Now, based on the session state and the document contents, decide how to proceed
- if (resultSignal == RESULT_VERSION_NEEDED && sessionState == SESSIONSTATE_LOGIN)
+ }
+ else if (sessionState == SESSIONSTATE_LOGIN && isLoginPage == false)
+ {
+ //== Exit login mode ==
+ activities.completeEventSequence(globalSequenceEvent);
+ activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
+ null,sessionCredential.getSequenceKey(),"OK",null,null);
+ sessionState = SESSIONSTATE_NORMAL;
+ // Make sure we go back and try the original document again, if we happened to have been directed somewhere else
+ if (!currentURI.equals(documentIdentifier))
{
- // We are dealing with a login page!
-
- // We need to (a) figure out what the next URI should be, and (b) record form information that it might need.
- // This is a bit dicey because there's really
- // no good way to *guarantee* that we pick the right one, if there's more than one available.
- // What we do is the following:
- //
- // (a) We look for matching forms. If we found one, we submit it.
- // (b) Look for redirections.
- // (c) If there are links that vector within the login sequence, we pick one of those preferentially.
- // (d) If there are no links that vector within the login sequence, we pick one of the other links.
- //
- // Note well that it's probably going to be pretty easy to get this code stuck in an infinite login sequence.
- // While that won't be a problem performance-wise (because everything is appropriately throttled), it
- // is obviously not ideal, and furthermore, it will not be possible to crawl a site for which this occurs.
- //
- // Longer time (and with higher complexity) we can solve this problem by allowing the user to *specify*
- // which link they want us to pick for a page. Hopefully this would not be necessary.
-
- // Locate the next target URI.
- String targetURI;
- if (formData != null)
- targetURI = formData.getActionURI();
- else if (preferredLink != null)
- targetURI = preferredLink;
- else if (preferredRedirection != null)
- targetURI = preferredRedirection;
- else /* if (contentLink != null) */
- targetURI = contentLink;
-
- // Definitely we don't want the cached data anymore
cache.deleteData(currentURI);
-
- // If the target URI is null, it means we could not find a suitable link. If target URI is "",
- // it means that we found a designated logon page but the description did not include a link we
- // could chase. Either way, treat this exactly the same
- // way as if the link found exited login mode.
- if (targetURI == null || targetURI.length() == 0)
- {
- //== Exiting login mode ==
- activities.completeEventSequence(globalSequenceEvent);
- activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
- null,sessionCredential.getSequenceKey(),"NEXT LINK NOT FOUND",null,null);
- sessionState = SESSIONSTATE_NORMAL;
- // Make sure we go back and try the original document again, no matter where we got directed to
- currentURI = documentIdentifier;
- }
- else
- {
- currentURI = targetURI;
- }
+ currentURI = documentIdentifier;
continue;
}
- else if (resultSignal != RESULT_VERSION_NEEDED && sessionState == SESSIONSTATE_LOGIN)
+ // Otherwise, the last fetch stands on its own. Fall through, and allow processing and link extraction
+ }
+
+ // Now, based on the session state and the document contents, decide how to proceed
+ if (resultSignal == RESULT_VERSION_NEEDED && sessionState == SESSIONSTATE_LOGIN)
+ {
+ // We are dealing with a login page!
+
+ // We need to (a) figure out what the next URI should be, and (b) record form information that it might need.
+ // This is a bit dicey because there's really
+ // no good way to *guarantee* that we pick the right one, if there's more than one available.
+ // What we do is the following:
+ //
+ // (a) We look for matching forms. If we found one, we submit it.
+ // (b) Look for redirections.
+ // (c) If there are links that vector within the login sequence, we pick one of those preferentially.
+ // (d) If there are no links that vector within the login sequence, we pick one of the other links.
+ //
+ // Note well that it's probably going to be pretty easy to get this code stuck in an infinite login sequence.
+ // While that won't be a problem performance-wise (because everything is appropriately throttled), it
+ // is obviously not ideal, and furthermore, it will not be possible to crawl a site for which this occurs.
+ //
+ // Longer term (and with higher complexity) we can solve this problem by allowing the user to *specify*
+ // which link they want us to pick for a page. Hopefully this would not be necessary.
+
+ // Locate the next target URI.
+ String targetURI;
+ if (formData != null)
+ targetURI = formData.getActionURI();
+ else if (preferredLink != null)
+ targetURI = preferredLink;
+ else if (preferredRedirection != null)
+ targetURI = preferredRedirection;
+ else /* if (contentLink != null) */
+ targetURI = contentLink;
+
+ // Definitely we don't want the cached data anymore
+ cache.deleteData(currentURI);
+
+ // If the target URI is null, it means we could not find a suitable link. If target URI is "",
+ // it means that we found a designated logon page but the description did not include a link we
+ // could chase. Either way, treat this exactly the same
+ // way as if the link found exited login mode.
+ if (targetURI == null || targetURI.length() == 0)
{
- // The next URL we fetched in the logon sequence turned out to be unsuitable.
- // That means that the logon sequence is fundamentally wrong. The session thus ends,
- // and of course it will retry, but that's neither here nor there.
//== Exiting login mode ==
activities.completeEventSequence(globalSequenceEvent);
activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
- null,sessionCredential.getSequenceKey(),"LINK TARGET UNSUITABLE",null,null);
+ null,sessionCredential.getSequenceKey(),"NEXTLINKNOTFOUND","Could not find a usable link to the next page: "+contextMessage,null);
sessionState = SESSIONSTATE_NORMAL;
- // Fall through, leaving everything else alone.
+ // Make sure we go back and try the original document again, no matter where we got directed to
+ currentURI = documentIdentifier;
}
+ else
+ {
+ currentURI = targetURI;
+ }
+ continue;
}
-
- }
- else
- {
- if (robotsStatus == RESULTSTATUS_FALSE)
- {
- activityResultCode = "-11";
- contextMessage = "robots.txt says so";
- resultSignal = RESULT_NO_DOCUMENT;
- }
- else
+ else if (resultSignal != RESULT_VERSION_NEEDED && sessionState == SESSIONSTATE_LOGIN)
{
- // Robots prerequisite in progress
- activityResultCode = null;
- resultSignal = RESULT_RETRY_DOCUMENT;
- contextMessage = "robots prerequisite already in progress";
+ // The next URL we fetched in the logon sequence turned out to be unsuitable.
+ // That means that the logon sequence is fundamentally wrong. The session thus ends,
+ // and of course it will retry, but that's neither here nor there.
+ //== Exiting login mode ==
+ activities.completeEventSequence(globalSequenceEvent);
+ activities.recordActivity(null,WebcrawlerConnector.ACTIVITY_LOGON_END,
+ null,sessionCredential.getSequenceKey(),"LINKTARGETUNSUITABLE","Page was unsuitable for a login sequence because: "+contextMessage,null);
+ sessionState = SESSIONSTATE_NORMAL;
+ // Fall through, leaving everything else alone.
}
}
+
}
else
{
- if (ipAddressStatus == RESULTSTATUS_FALSE)
+ if (robotsStatus == RESULTSTATUS_FALSE)
{
- activityResultCode = "-10";
- contextMessage = "ip address not found";
+ activityResultCode = "-11";
+ contextMessage = "robots.txt says so";
resultSignal = RESULT_NO_DOCUMENT;
}
else
{
- // DNS prerequisite in progress
+ // Robots prerequisite in progress
activityResultCode = null;
- contextMessage = "dns prerequisite already in progress";
resultSignal = RESULT_RETRY_DOCUMENT;
+ contextMessage = "robots prerequisite already in progress";
}
}
}
- catch (MalformedURLException e)
+ else
{
- // currentURI is malformed.
- // If the document was the primary, we should remove it from the queue. But if it's part of a login sequence, we'd better just retry later.
- contextMessage = "was not a valid URL: "+e.getMessage();
- contextException = e;
- activityResultCode = "-12";
- resultSignal = RESULT_NO_DOCUMENT;
+ if (ipAddressStatus == RESULTSTATUS_FALSE)
+ {
+ activityResultCode = "-10";
+ contextMessage = "ip address not found";
+ resultSignal = RESULT_NO_DOCUMENT;
+ }
+ else
+ {
+ // DNS prerequisite in progress
+ activityResultCode = null;
+ contextMessage = "dns prerequisite already in progress";
+ resultSignal = RESULT_RETRY_DOCUMENT;
+ }
}
-
+
// If we fail on a document that's not the primary, the result should be to retry the primary later.
if (!currentURI.equals(documentIdentifier))
{
@@ -1136,7 +1139,7 @@ public class WebcrawlerConnector extends
// Now, look at the result signal, and set up the version appropriately.
if (activityResultCode != null)
- activities.recordActivity(null,ACTIVITY_FETCH,null,documentIdentifier,activityResultCode,((contextMessage!=null)?contextMessage:""),null);
+ activities.recordActivity(null,ACTIVITY_FETCH,null,documentIdentifier,activityResultCode,contextMessage,null);
switch (resultSignal)
{
@@ -1218,7 +1221,7 @@ public class WebcrawlerConnector extends
sb.append(filterVersion);
String versionString = sb.toString();
-
+
// Now, extract links.
// We'll call the "link extractor" series, so we can plug more stuff in over time.
boolean indexDocument = extractLinks(documentIdentifier,activities,filter);
@@ -1227,110 +1230,9 @@ public class WebcrawlerConnector extends
if (!activities.checkDocumentNeedsReindexing(documentIdentifier,versionString))
continue;
- // Consider this document for ingestion.
- // We can exclude it if it does not seem to be a kind of document that the ingestion system knows
- // about.
- String ingestURL;
- if (indexDocument)
- ingestURL = isDataIngestable(activities,documentIdentifier,filter);
- else
- ingestURL = null;
-
- if (ingestURL == null)
- {
- // In case the indexability of the document changed, we still want to notify the incremental indexer.
- // We do this by using a null url and a null repository document. If a document with this identifier was
- // previously indexed, it will be removed.
-
- activities.noDocument(documentIdentifier,versionString);
-
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Decided not to ingest '"+documentIdentifier+"' because it did not match ingestability criteria");
+ if (processDocument(activities,documentIdentifier,versionString,indexDocument,metaHash,acls,filter))
continue;
- }
- // Ingest the document
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Decided to ingest '"+documentIdentifier+"'");
-
- RepositoryDocument rd = new RepositoryDocument();
-
- // Set the file name
- String fileName = "";
- try {
- fileName = documentIdentifiertoFileName(documentIdentifier);
- } catch (URISyntaxException e1) {
- fileName = "";
- }
- if (fileName.length() > 0){
- rd.setFileName(fileName);
- }
-
- // Set the content type
- rd.setMimeType(cache.getContentType(documentIdentifier));
-
- // Turn into acls and add into description
- String[] denyAcls;
- if (acls == null)
- denyAcls = null;
- else
- {
- if (acls.length > 0)
- denyAcls = new String[]{defaultAuthorityDenyToken};
- else
- denyAcls = new String[0];
- }
-
- if (acls != null && denyAcls != null)
- rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,acls,denyAcls);
-
- // Grab metadata
- for (String key : metaHash.keySet())
- {
- Set<String> metaList = metaHash.get(key);
- String[] values = new String[metaList.size()];
- int k = 0;
- for (String value : metaList)
- {
- values[k++] = value;
- }
- rd.addField(key,values);
- }
-
- long length = cache.getDataLength(documentIdentifier);
- InputStream is = cache.getData(documentIdentifier);
-
- if (is != null)
- {
- try
- {
- rd.setBinary(is,length);
- try
- {
- activities.ingestDocumentWithException(documentIdentifier,versionString,ingestURL,rd);
- }
- catch (IOException e)
- {
- handleIOException(e,"reading data");
- }
- }
- finally
- {
- try
- {
- is.close();
- }
- catch (IOException e)
- {
- handleIOException(e,"closing stream");
- }
- }
- }
- else
- Logging.connectors.error("WEB: Expected a cached document for '"+documentIdentifier+"', but none present!");
-
- // MHL
-
break;
case RESULT_RETRY_DOCUMENT:
// Document could not be processed right now.
@@ -1359,6 +1261,208 @@ public class WebcrawlerConnector extends
}
}
+ protected boolean processDocument(IProcessActivity activities, String documentIdentifier, String versionString,
+ boolean indexDocument, Map<String,Set<String>> metaHash, String[] acls, DocumentURLFilter filter)
+ throws ManifoldCFException, ServiceInterruption
+ {
+ /* Decides whether an already-fetched document should be indexed and, if so, hands it to the indexer.
+  *
+  * The document's response code, length, content type and data are read from the connector's cache,
+  * keyed by documentIdentifier. Every rejection path below calls activities.noDocument() and returns
+  * true; the path that attempts ingestion returns false. The caller in this class continues its loop
+  * when true is returned.
+  *
+  * Whenever errorCode is non-null on exit, the finally block records one ACTIVITY_PROCESS activity
+  * carrying the start time, errorCode and errorDesc; the data length is included only once
+  * ingestDocumentWithException() has returned normally. Exits with errorCode still null (for example
+  * the missing-cached-stream branch, or an interrupted ManifoldCFException, which resets it) record nothing.
+  *
+  * indexDocument: when false the document is rejected immediately as "content not indexable".
+  * metaHash: metadata name to values; copied into the RepositoryDocument as fields.
+  * acls: allow tokens; null means no security is set on the document.
+  * filter: returns the URL to ingest under; a null result is treated as "excluded by the job".
+  */
+ String errorCode = null;
+ String errorDesc = null;
+ Long fileLengthLong = null;
+ long startTime = System.currentTimeMillis();
+
+ try
+ {
+ // Rejection checks. In each one errorCode/errorDesc are assigned before noDocument() is called,
+ // so the finally block still reports the reason if noDocument() itself throws (unless it is an interrupt).
+
+ if (!indexDocument)
+ {
+ errorCode = "CONTENTNOTINDEXABLE";
+ errorDesc = "Content not indexable";
+ activities.noDocument(documentIdentifier,versionString);
+ return true;
+ }
+
+ int responseCode = cache.getResponseCode(documentIdentifier);
+ if (responseCode != 200)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because response code not indexable: "+responseCode);
+ errorCode = "RESPONSECODENOTINDEXABLE";
+ errorDesc = "HTTP response code not indexable ("+responseCode+")";
+ activities.noDocument(documentIdentifier,versionString);
+ return true;
+ }
+
+ long dataLength = cache.getDataLength(documentIdentifier);
+ if (!activities.checkLengthIndexable(dataLength))
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because pipeline thinks length "+dataLength+" is not acceptable");
+ errorCode = activities.EXCLUDED_LENGTH;
+ errorDesc = "Rejected due to length ("+dataLength+")";
+ activities.noDocument(documentIdentifier,versionString);
+ return true;
+ }
+
+ if (activities.checkURLIndexable(documentIdentifier) == false)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want URL");
+ errorCode = activities.EXCLUDED_URL;
+ errorDesc = "Rejected due to URL ('"+documentIdentifier+"')";
+ activities.noDocument(documentIdentifier,versionString);
+ return true;
+ }
+
+ String ingestURL = filter.isDocumentIndexable(documentIdentifier);
+ if (ingestURL == null)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because document does not match web job constraints");
+ errorCode = "JOBRESTRICTION";
+ errorDesc = "Rejected because job excludes this URL ('"+documentIdentifier+"')";
+ activities.noDocument(documentIdentifier,versionString);
+ return true;
+ }
+
+ // Normalize the cached content type (last comma-separated entry, ';' parameters dropped, trimmed); a null value is passed through to checkMimeTypeIndexable() unchanged.
+ String contentType = cache.getContentType(documentIdentifier);
+
+ // Some sites send several comma-separated content types. We just look at the LAST one in that case.
+ if (contentType != null)
+ {
+ String[] contentTypes = contentType.split(",");
+ if (contentTypes.length > 0)
+ contentType = contentTypes[contentTypes.length-1].trim();
+ else
+ contentType = null;
+ }
+
+ if (contentType != null)
+ {
+ int pos = contentType.indexOf(";");
+ if (pos != -1)
+ contentType = contentType.substring(0,pos);
+ contentType = contentType.trim();
+ }
+
+ if (!activities.checkMimeTypeIndexable(contentType))
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want mime type '"+contentType+"'");
+ errorCode = activities.EXCLUDED_MIMETYPE;
+ errorDesc = "Rejected because of mime type ("+contentType+")";
+ activities.noDocument(documentIdentifier,versionString);
+ return true;
+ }
+
+ // All checks passed: build the RepositoryDocument and ingest it.
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Decided to ingest '"+documentIdentifier+"'");
+
+ RepositoryDocument rd = new RepositoryDocument();
+
+ // Set the file name; a URISyntaxException or an empty result simply leaves it unset.
+ String fileName = "";
+ try {
+ fileName = documentIdentifiertoFileName(documentIdentifier);
+ } catch (URISyntaxException e1) {
+ fileName = "";
+ }
+ if (fileName.length() > 0){
+ rd.setFileName(fileName);
+ }
+
+ // Set the content type. NOTE(review): this is the raw cached value, not the trimmed contentType checked above - confirm that is intended.
+ String mimeType = cache.getContentType(documentIdentifier);
+ if (mimeType != null)
+ rd.setMimeType(mimeType);
+
+ // Derive deny tokens: non-empty acls get defaultAuthorityDenyToken, empty acls get none, null acls leave security unset.
+ String[] denyAcls;
+ if (acls == null)
+ denyAcls = null;
+ else
+ {
+ if (acls.length > 0)
+ denyAcls = new String[]{defaultAuthorityDenyToken};
+ else
+ denyAcls = new String[0];
+ }
+
+ if (acls != null && denyAcls != null)
+ rd.setSecurity(RepositoryDocument.SECURITY_TYPE_DOCUMENT,acls,denyAcls);
+
+ // Copy every metadata name and its values into the document as a String[] field.
+ for (String key : metaHash.keySet())
+ {
+ Set<String> metaList = metaHash.get(key);
+ String[] values = new String[metaList.size()];
+ int k = 0;
+ for (String value : metaList)
+ {
+ values[k++] = value;
+ }
+ rd.addField(key,values);
+ }
+
+ InputStream is = cache.getData(documentIdentifier);
+
+ if (is != null)
+ {
+ try
+ {
+ rd.setBinary(is,dataLength);
+ try
+ {
+ activities.ingestDocumentWithException(documentIdentifier,versionString,ingestURL,rd);
+ errorCode = "OK";
+ fileLengthLong = new Long(dataLength);
+ }
+ catch (IOException e)
+ {
+ errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+ errorDesc = e.getMessage();
+ handleIOException(e,"reading data");
+ }
+ }
+ finally
+ {
+ try
+ {
+ is.close();
+ }
+ catch (IOException e)
+ {
+ errorCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
+ errorDesc = e.getMessage();
+ handleIOException(e,"closing stream");
+ }
+ }
+ }
+ else
+ Logging.connectors.error("WEB: Expected a cached document for '"+documentIdentifier+"', but none present!");
+
+ return false;
+ }
+ catch (ManifoldCFException e)
+ {
+ if (e.getErrorCode() == ManifoldCFException.INTERRUPTED)
+ errorCode = null;
+ throw e;
+ }
+ finally
+ {
+ if (errorCode != null)
+ activities.recordActivity(new Long(startTime),ACTIVITY_PROCESS,
+ fileLengthLong,documentIdentifier,errorCode,errorDesc,null);
+ }
+
+
+ }
+
protected static String extractContentType(String contentType)
{
// Some sites have multiple content types. We just look at the LAST one in that case.
@@ -5727,13 +5831,14 @@ public class WebcrawlerConnector extends
if (contentType == null)
return false;
- int pos = contentType.indexOf(";");
+ String strippedContentType = contentType;
+ int pos = strippedContentType.indexOf(";");
if (pos != -1)
- contentType = contentType.substring(0,pos);
- contentType = contentType.trim();
+ strippedContentType = strippedContentType.substring(0,pos);
+ strippedContentType = strippedContentType.trim();
// There are presumably mime types we can extract links from that we can't index?
- if (interestingMimeTypeMap.contains(contentType))
+ if (interestingMimeTypeMap.contains(strippedContentType))
return true;
boolean rval = activities.checkMimeTypeIndexable(contentType);
@@ -5742,69 +5847,6 @@ public class WebcrawlerConnector extends
return rval;
}
- /** Code to check if an already-fetched document should be ingested.
- *@return null if document should not be ingested, or the URL if it should.
- */
- protected String isDataIngestable(IFingerprintActivity activities, String documentIdentifier, DocumentURLFilter filter)
- throws ServiceInterruption, ManifoldCFException
- {
- if (cache.getResponseCode(documentIdentifier) != 200)
- return null;
-
- if (activities.checkLengthIndexable(cache.getDataLength(documentIdentifier)) == false)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector thinks length "+cache.getDataLength(documentIdentifier)+" is too long");
- return null;
- }
-
- if (activities.checkURLIndexable(documentIdentifier) == false)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want URL");
- return null;
- }
-
- String ingestURL = filter.isDocumentIndexable(documentIdentifier);
- if (ingestURL == null)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because document does not match web job constraints");
- return null;
- }
-
- // Check if it's a recognized content type
- String contentType = cache.getContentType(documentIdentifier);
-
- // Some sites have multiple content types. We just look at the LAST one in that case.
- if (contentType != null)
- {
- String[] contentTypes = contentType.split(",");
- if (contentTypes.length > 0)
- contentType = contentTypes[contentTypes.length-1].trim();
- else
- contentType = null;
- }
-
- if (contentType == null)
- return null;
-
- int pos = contentType.indexOf(";");
- if (pos != -1)
- contentType = contentType.substring(0,pos);
- contentType = contentType.trim();
-
- boolean rval = activities.checkMimeTypeIndexable(contentType);
- if (rval == false)
- {
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("Web: For document '"+documentIdentifier+"', not indexing because output connector does not want mime type '"+contentType+"'");
- return null;
- }
-
- return ingestURL;
- }
-
/** Convert a document identifier to filename.
* @param documentIdentifier
* @return