You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2012/12/03 12:19:51 UTC
svn commit: r1416452 [4/4] - in /manifoldcf/branches/CONNECTORS-120-1: ./
connectors/livelink/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/livelink/
connectors/meridio/connector/src/main/java/org/apache/manifoldcf/crawler/connectors...
Modified: manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1416452&r1=1416451&r2=1416452&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Mon Dec 3 11:19:48 2012
@@ -1,6 +1,6 @@
/* $Id: ThrottledFetcher.java 989847 2010-08-26 17:52:30Z kwright $ */
-/**
+/**`
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
@@ -19,6 +19,7 @@
package org.apache.manifoldcf.crawler.connectors.webcrawler;
import org.apache.manifoldcf.core.interfaces.*;
+import org.apache.manifoldcf.core.common.XThreadInputStream;
import org.apache.manifoldcf.agents.interfaces.*;
import org.apache.manifoldcf.crawler.interfaces.*;
import org.apache.manifoldcf.crawler.system.Logging;
@@ -26,13 +27,58 @@ import org.apache.manifoldcf.crawler.sys
import java.util.*;
import java.io.*;
import java.net.*;
+import java.util.concurrent.TimeUnit;
-import org.apache.commons.httpclient.*;
-import org.apache.commons.httpclient.methods.*;
-import org.apache.commons.httpclient.params.*;
-import org.apache.commons.httpclient.protocol.*;
-import org.apache.commons.httpclient.cookie.CookiePolicy;
-import org.apache.commons.httpclient.auth.AuthScope;
+import org.apache.http.conn.ClientConnectionManager;
+import org.apache.http.client.HttpClient;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpPost;
+import org.apache.http.NameValuePair;
+import org.apache.http.impl.conn.PoolingClientConnectionManager;
+import org.apache.http.conn.scheme.Scheme;
+import org.apache.http.conn.ssl.SSLSocketFactory;
+import org.apache.http.conn.ssl.AllowAllHostnameVerifier;
+import org.apache.http.conn.ssl.BrowserCompatHostnameVerifier;
+import org.apache.http.HttpEntity;
+import org.apache.http.HttpResponse;
+import org.apache.http.auth.AuthScope;
+import org.apache.http.auth.NTCredentials;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.client.methods.HttpRequestBase;
+import org.apache.http.impl.client.DefaultHttpClient;
+import org.apache.http.impl.client.AbstractHttpClient;
+import org.apache.http.impl.client.DefaultRedirectStrategy;
+import org.apache.http.util.EntityUtils;
+import org.apache.http.params.BasicHttpParams;
+import org.apache.http.params.HttpParams;
+import org.apache.http.params.CoreConnectionPNames;
+import org.apache.http.HttpStatus;
+import org.apache.http.HttpHost;
+import org.apache.http.Header;
+import org.apache.http.conn.params.ConnRoutePNames;
+import org.apache.http.message.BasicHeader;
+import org.apache.http.client.params.ClientPNames;
+import org.apache.http.client.params.HttpClientParams;
+import org.apache.http.client.params.CookiePolicy;
+import org.apache.http.cookie.params.CookieSpecPNames;
+import org.apache.http.impl.cookie.BasicClientCookie;
+import org.apache.http.message.BasicNameValuePair;
+import org.apache.http.protocol.HTTP;
+import org.apache.http.client.entity.UrlEncodedFormEntity;
+import org.apache.http.cookie.CookieOrigin;
+import org.apache.http.cookie.ClientCookie;
+import org.apache.http.cookie.Cookie;
+import org.apache.http.impl.cookie.BasicPathHandler;
+import org.apache.http.impl.cookie.BrowserCompatSpec;
+import org.apache.http.cookie.CookieSpecFactory;
+import org.apache.http.cookie.CookieSpec;
+
+import org.apache.http.cookie.MalformedCookieException;
+import org.apache.http.conn.ConnectTimeoutException;
+import org.apache.http.client.RedirectException;
+import org.apache.http.client.CircularRedirectException;
+import org.apache.http.NoHttpResponseException;
+import org.apache.http.HttpException;
/** This class uses httpclient to fetch stuff from webservers. However, it additionally controls the fetch
* rate in two ways: first, controlling the overall bandwidth used per server, and second, limiting the number
@@ -114,23 +160,20 @@ public class ThrottledFetcher
String proxyHost, int proxyPort, String proxyAuthDomain, String proxyAuthUsername, String proxyAuthPassword)
throws ManifoldCFException
{
- // First, create a protocol factory object, if we can
- ProtocolFactory myFactory = new ProtocolFactory();
+ // Create the https scheme for this connection
+ javax.net.ssl.SSLSocketFactory baseFactory;
String trustStoreString;
- ProtocolSocketFactory secureSocketFactory;
if (trustStore != null)
{
- secureSocketFactory = new WebSecureSocketFactory(trustStore.getSecureSocketFactory());
+ baseFactory = trustStore.getSecureSocketFactory();
trustStoreString = trustStore.getString();
}
else
{
+ baseFactory = KeystoreManagerFactory.getTrustingSecureSocketFactory();
trustStoreString = null;
- secureSocketFactory = new WebSecureSocketFactory(KeystoreManagerFactory.getTrustingSecureSocketFactory());
}
- Protocol myHttpsProtocol = new Protocol("https", (ProtocolSocketFactory)secureSocketFactory, 443);
- myFactory.registerProtocol("https",myHttpsProtocol);
ConnectionBin[] bins = new ConnectionBin[binNames.length];
@@ -370,7 +413,7 @@ public class ThrottledFetcher
// If we have a connection located, activate it.
if (connectionToReuse == null)
- connectionToReuse = new ThrottledConnection(protocol,server,port,authentication,myFactory,trustStoreString,bins,
+ connectionToReuse = new ThrottledConnection(protocol,server,port,authentication,baseFactory,trustStoreString,bins,
proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
connectionToReuse.setup(throttleDescription);
return connectionToReuse;
@@ -853,126 +896,6 @@ public class ThrottledFetcher
}
- // Where to record result info/file data
- protected final static String resultLogFile = "/common/web/resultlog";
- // Where to record the actual data
- protected final static String dataFileFolder = "/common/web/data/";
-
- // This is the one instance of the output class
- protected static DataRecorder dataRecorder = new DataRecorder();
-
- /** This class takes care of recording data and results for posterity */
- protected static class DataRecorder
- {
- protected int documentNumber = 0;
-
- public DataRecorder()
- {
- }
-
- public DataSession getSession(String url)
- throws ManifoldCFException
- {
- return new DataSession(this,url);
- }
-
- /** Atomically write resultlog record, returning data file name to use */
- public synchronized String writeResponseRecord(String url, int responseCode, ArrayList headerNames, ArrayList headerValues)
- throws ManifoldCFException
- {
- // Open log file
- try
- {
- OutputStream os = new FileOutputStream(resultLogFile,true);
- try
- {
- OutputStreamWriter writer = new OutputStreamWriter(os,"utf-8");
- try
- {
- String documentName = Integer.toString(documentNumber++);
- writer.write("URI: "+url+"\n");
- writer.write("File: "+documentName+"\n");
- writer.write("Code: "+Integer.toString(responseCode)+"\n");
- int i = 0;
- while (i < headerNames.size())
- {
- writer.write("Header: "+(String)headerNames.get(i)+":"+(String)headerValues.get(i)+"\n");
- i++;
- }
- return documentName;
- }
- finally
- {
- writer.close();
- }
- }
- finally
- {
- os.close();
- }
- }
- catch (IOException e)
- {
- throw new ManifoldCFException("Error recording file info: "+e.getMessage(),e);
- }
-
- }
-
-
- }
-
- /** Helper class for the above */
- protected static class DataSession
- {
- protected DataRecorder dr;
- protected String url;
- protected int responseCode = 0;
- protected ArrayList headerNames = new ArrayList();
- protected ArrayList headerValues = new ArrayList();
- protected String documentName = null;
-
- public DataSession(DataRecorder dr, String url)
- {
- this.dr = dr;
- this.url = url;
- }
-
- public void setResponseCode(int responseCode)
- {
- this.responseCode = responseCode;
- }
-
- public void addHeader(String headerName, String headerValue)
- {
- headerNames.add(headerName);
- headerValues.add(headerValue);
- }
-
- public void endHeader()
- throws ManifoldCFException
- {
- documentName = dr.writeResponseRecord(url,responseCode,headerNames,headerValues);
- }
-
- public void write(byte[] theBytes, int off, int length)
- throws IOException
- {
- if (documentName == null)
- throw new IOException("Must end header before reading data!");
- OutputStream os = new FileOutputStream(dataFileFolder+documentName,true);
- try
- {
- os.write(theBytes,off,length);
- }
- finally
- {
- os.close();
- }
- }
-
- }
-
-
/** Throttled connections. Each instance of a connection describes the bins to which it belongs,
* along with the actual open connection itself, and the last time the connection was used. */
protected static class ThrottledConnection implements IThrottledConnection
@@ -1004,9 +927,11 @@ public class ThrottledFetcher
protected String trustStoreString;
/** The http connection manager. The pool is of size 1. */
- protected MultiThreadedHttpConnectionManager connManager = null;
+ protected PoolingClientConnectionManager connManager = null;
+ /** The http client object. */
+ protected AbstractHttpClient httpClient = null;
/** The method object */
- protected HttpMethodBase fetchMethod = null;
+ protected HttpRequestBase fetchMethod = null;
/** The error trace, if any */
protected Throwable throwable = null;
/** The current URL being fetched */
@@ -1031,19 +956,19 @@ public class ThrottledFetcher
protected final String proxyAuthUsername;
/** Proxy auth password */
protected final String proxyAuthPassword;
-
- /** Protocol socket factory */
- protected ProtocolSocketFactory secureSocketFactory = null;
- protected ProtocolFactory myFactory = null;
+ /** Https protocol */
+ protected final javax.net.ssl.SSLSocketFactory httpsSocketFactory;
-
- /** Hack added to record all access data from current crawler */
- protected DataSession dataSession = null;
+ /** The thread that is actually doing the work */
+ protected ExecuteMethodThread methodThread = null;
+ /** Set if thread has been started */
+ protected boolean threadStarted = false;
+
/** Constructor. Create a connection with a specific server and port, and
* register it as active against all bins. */
public ThrottledConnection(String protocol, String server, int port, PageCredentials authentication,
- ProtocolFactory myFactory, String trustStoreString, ConnectionBin[] connectionBins,
+ javax.net.ssl.SSLSocketFactory httpsSocketFactory, String trustStoreString, ConnectionBin[] connectionBins,
String proxyHost, int proxyPort, String proxyAuthDomain, String proxyAuthUsername, String proxyAuthPassword)
{
this.proxyHost = proxyHost;
@@ -1055,7 +980,7 @@ public class ThrottledFetcher
this.server = server;
this.port = port;
this.authentication = authentication;
- this.myFactory = myFactory;
+ this.httpsSocketFactory = httpsSocketFactory;
this.trustStoreString = trustStoreString;
this.connectionBinArray = connectionBins;
this.throttleBinArray = new ThrottleBin[connectionBins.length];
@@ -1210,10 +1135,11 @@ public class ThrottledFetcher
if (connManager != null)
{
- connManager.closeIdleConnections(idleTimeout);
- connManager.deleteClosedConnections();
+ connManager.closeIdleConnections(idleTimeout, TimeUnit.MILLISECONDS);
+ connManager.closeExpiredConnections();
// Need to determine if there's a valid connection in the connection manager still, or if it is empty.
- return connManager.getConnectionsInPool() == 0;
+ //return connManager.getConnectionsInPool() == 0;
+ return true;
}
else
return true;
@@ -1312,47 +1238,6 @@ public class ThrottledFetcher
}
}
- protected static class ExecuteMethodThread extends Thread
- {
- protected HttpClient client;
- protected HostConfiguration hostConfiguration;
- protected HttpMethodBase executeMethod;
- protected Throwable exception = null;
- protected int rval = 0;
-
- public ExecuteMethodThread(HttpClient client, HostConfiguration hostConfiguration, HttpMethodBase executeMethod)
- {
- super();
- setDaemon(true);
- this.client = client;
- this.hostConfiguration = hostConfiguration;
- this.executeMethod = executeMethod;
- }
-
- public void run()
- {
- try
- {
- // Call the execute method appropriately
- rval = client.executeMethod(hostConfiguration,executeMethod,null);
- }
- catch (Throwable e)
- {
- this.exception = e;
- }
- }
-
- public Throwable getException()
- {
- return exception;
- }
-
- public int getResponse()
- {
- return rval;
- }
- }
-
/** Execute the fetch and get the return code. This method uses the
* standard logging mechanism to keep track of the fetch attempt. It also
* signals the following conditions: ServiceInterruption (if a dynamic
@@ -1375,225 +1260,263 @@ public class ThrottledFetcher
LoginCookies loginCookies)
throws ManifoldCFException, ServiceInterruption
{
- StringBuilder sb = new StringBuilder(protocol);
- sb.append("://").append(server);
+ // Set up scheme
+ SSLSocketFactory myFactory = new SSLSocketFactory(new InterruptibleSocketFactory(httpsSocketFactory,connectionTimeoutMilliseconds),
+ new AllowAllHostnameVerifier());
+ Scheme myHttpsProtocol = new Scheme("https", 443, myFactory);
+
+ int resolvedPort;
+ String displayedPort;
if (port != -1)
{
if (!(protocol.equals("http") && port == 80) &&
!(protocol.equals("https") && port == 443))
- sb.append(":").append(Integer.toString(port));
+ displayedPort = ":"+Integer.toString(port);
+ else
+ displayedPort = "";
+ resolvedPort = port;
+ }
+ else
+ {
+ if (protocol.equals("http"))
+ resolvedPort = 80;
+ else if (protocol.equals("https"))
+ resolvedPort = 443;
+ else
+ throw new IllegalArgumentException("Unexpected protocol: "+protocol);
+ displayedPort = "";
}
- sb.append(urlPath);
+
+ StringBuilder sb = new StringBuilder(protocol);
+ sb.append("://").append(server).append(displayedPort).append(urlPath);
String fetchUrl = sb.toString();
+
+ HttpHost fetchHost = new HttpHost(server,resolvedPort,protocol);
+ HttpHost hostHost;
+
if (host != null)
{
sb.setLength(0);
- sb.append(protocol).append("://").append(host);
- if (port != -1)
- {
- if (!(protocol.equals("http") && port == 80) &&
- !(protocol.equals("https") && port == 443))
- sb.append(":").append(Integer.toString(port));
- }
- sb.append(urlPath);
+ sb.append(protocol).append("://").append(host).append(displayedPort).append(urlPath);
myUrl = sb.toString();
+ hostHost = new HttpHost(host,resolvedPort,protocol);
}
else
+ {
myUrl = fetchUrl;
-
- if (recordEverything)
- // Start a new data session
- dataSession = dataRecorder.getSession(myUrl);
-
- try
+ hostHost = fetchHost;
+ }
+
+ if (connManager == null)
{
- if (connManager == null)
- connManager = new MultiThreadedHttpConnectionManager();
- HttpConnectionManagerParams httpConParam = connManager.getParams();
- httpConParam.setDefaultMaxConnectionsPerHost(1);
- httpConParam.setMaxTotalConnections(1);
- httpConParam.setConnectionTimeout(connectionTimeoutMilliseconds);
- httpConParam.setSoTimeout(socketTimeoutMilliseconds);
- connManager.setParams(httpConParam);
-
- long startTime = 0L;
- if (Logging.connectors.isDebugEnabled())
- {
- startTime = System.currentTimeMillis();
- Logging.connectors.debug("WEB: Waiting for an HttpClient object");
- }
+ PoolingClientConnectionManager localConnManager = new PoolingClientConnectionManager();
+ localConnManager.setMaxTotal(1);
+ localConnManager.setDefaultMaxPerRoute(1);
+ connManager = localConnManager;
+ }
+
+ // Set up protocol registry
+ connManager.getSchemeRegistry().register(myHttpsProtocol);
+
+ long startTime = 0L;
+ if (Logging.connectors.isDebugEnabled())
+ {
+ startTime = System.currentTimeMillis();
+ Logging.connectors.debug("WEB: Waiting for an HttpClient object");
+ }
+ // If we already have an httpclient object, great. Otherwise we have to get one, and initialize it with
+ // those parameters that aren't expected to change.
+ if (httpClient == null)
+ {
+ BasicHttpParams params = new BasicHttpParams();
+ params.setParameter(ClientPNames.DEFAULT_HOST,fetchHost);
+ params.setBooleanParameter(CoreConnectionPNames.TCP_NODELAY,true);
+ params.setBooleanParameter(CoreConnectionPNames.STALE_CONNECTION_CHECK,false);
+ params.setBooleanParameter(ClientPNames.ALLOW_CIRCULAR_REDIRECTS,true);
+ // MEDIUM_SECURITY compatibility level not supported in HttpComponents. Try BROWSER_NETSCAPE?
+ HttpClientParams.setCookiePolicy(params,CookiePolicy.BROWSER_COMPATIBILITY);
+ params.setBooleanParameter(CookieSpecPNames.SINGLE_COOKIE_HEADER,new Boolean(true));
- HttpClient client = new HttpClient(connManager);
- // Permit circular redirections, because that is how some sites set cookies
- client.getParams().setParameter(org.apache.commons.httpclient.params.HttpClientParams.ALLOW_CIRCULAR_REDIRECTS,new Boolean(true));
- // If there are redirects, this is essential to make sure the right socket factory gets used
- client.getParams().setParameter(org.apache.commons.httpclient.params.HttpClientParams.PROTOCOL_FACTORY,myFactory);
+ DefaultHttpClient localHttpClient = new DefaultHttpClient(connManager,params);
+ localHttpClient.setRedirectStrategy(new DefaultRedirectStrategy());
+ localHttpClient.getCookieSpecs().register(CookiePolicy.BROWSER_COMPATIBILITY, new CookieSpecFactory()
+ {
- HostConfiguration clientConf = new HostConfiguration();
- // clientConf.setLocalAddress(currentAddr);
+ public CookieSpec newInstance(HttpParams params)
+ {
+ return new LaxBrowserCompatSpec();
+ }
+
+ }
+ );
- // Set up protocol to use
- clientConf.setParams(new HostParams());
- clientConf.setHost(server,port,myFactory.getProtocol(protocol));
// If there's a proxy, set that too.
if (proxyHost != null && proxyHost.length() > 0)
{
- clientConf.setProxy(proxyHost,proxyPort);
+ // Configure proxy authentication
if (proxyAuthUsername != null && proxyAuthUsername.length() > 0)
{
- // Set up NTLM credentials for this fetch too.
- client.getState().setProxyCredentials(AuthScope.ANY,
- new NTCredentials(proxyAuthUsername,(proxyAuthPassword==null)?"":proxyAuthPassword,currentHost,(proxyAuthDomain==null)?"":proxyAuthDomain));
+ localHttpClient.getCredentialsProvider().setCredentials(
+ new AuthScope(proxyHost, proxyPort),
+ new NTCredentials(proxyAuthUsername, (proxyAuthPassword==null)?"":proxyAuthPassword, currentHost, (proxyAuthDomain==null)?"":proxyAuthDomain));
}
+
+ HttpHost proxy = new HttpHost(proxyHost, proxyPort);
+
+ localHttpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
}
+ // Set up authentication to use
+ if (authentication != null)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: For "+myUrl+", discovered matching authentication credentials");
+ localHttpClient.getCredentialsProvider().setCredentials(AuthScope.ANY,
+ authentication.makeCredentialsObject(host));
+ }
+
+ httpClient = localHttpClient;
+ }
+
+ // Set the parameters we haven't keyed on (so these can change from request to request)
+ httpClient.getParams().setIntParameter(CoreConnectionPNames.SO_TIMEOUT,socketTimeoutMilliseconds);
+ httpClient.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,connectionTimeoutMilliseconds);
+ httpClient.getParams().setBooleanParameter(ClientPNames.HANDLE_REDIRECTS,redirectOK);
+
+ if (host != null)
+ {
if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Got an HttpClient object after "+new Long(System.currentTimeMillis()-startTime).toString()+" ms.");
+ Logging.connectors.debug("WEB: For "+myUrl+", setting virtual host to "+host);
+ httpClient.getParams().setParameter(ClientPNames.VIRTUAL_HOST,hostHost);
+ }
- startFetchTime = System.currentTimeMillis();
- int pageFetchMethod = FormData.SUBMITMETHOD_GET;
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Got an HttpClient object after "+new Long(System.currentTimeMillis()-startTime).toString()+" ms.");
+
+ startFetchTime = System.currentTimeMillis();
+
+ int pageFetchMethod = FormData.SUBMITMETHOD_GET;
+ if (formData != null)
+ pageFetchMethod = formData.getSubmitMethod();
+ switch (pageFetchMethod)
+ {
+ case FormData.SUBMITMETHOD_GET:
+ // MUST be just the path, or apparently we wind up resetting the HostConfiguration
+ // Add additional parameters to url path
+ String fullUrlPath;
if (formData != null)
- pageFetchMethod = formData.getSubmitMethod();
- switch (pageFetchMethod)
{
- case FormData.SUBMITMETHOD_GET:
- // MUST be just the path, or apparently we wind up resetting the HostConfiguration
- // Add additional parameters to url path
- String fullUrlPath;
- if (formData != null)
- {
- StringBuilder psb = new StringBuilder(urlPath);
- Iterator iter = formData.getElementIterator();
- char appendChar;
- if (urlPath.indexOf("?") == -1)
- appendChar = '?';
- else
- appendChar = '&';
+ StringBuilder psb = new StringBuilder(urlPath);
+ Iterator iter = formData.getElementIterator();
+ char appendChar;
+ if (urlPath.indexOf("?") == -1)
+ appendChar = '?';
+ else
+ appendChar = '&';
+ try
+ {
while (iter.hasNext())
{
- FormDataElement e = (FormDataElement)iter.next();
+ FormDataElement el = (FormDataElement)iter.next();
psb.append(appendChar);
appendChar = '&';
- String param = e.getElementName();
- String value = e.getElementValue();
+ String param = el.getElementName();
+ String value = el.getElementValue();
psb.append(java.net.URLEncoder.encode(param,"utf-8"));
if (value != null)
+ {
psb.append('=').append(java.net.URLEncoder.encode(value,"utf-8"));
+ }
}
- fullUrlPath = psb.toString();
- }
- else
- {
- fullUrlPath = urlPath;
}
- // Hack; apparently httpclient treats // as a protocol specifier and so it rips off the first section of the path in that case.
- while (fullUrlPath.startsWith("//"))
- fullUrlPath = fullUrlPath.substring(1);
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Get method for '"+fullUrlPath+"'");
- fetchMethod = new GetMethod(fullUrlPath);
- break;
- case FormData.SUBMITMETHOD_POST:
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Post method for '"+urlPath+"'");
- // MUST be just the path, or apparently we wind up resetting the HostConfiguration
- PostMethod postMethod = new PostMethod(urlPath);
- // Add parameters to post variables
- if (formData != null)
+ catch (java.io.UnsupportedEncodingException e)
{
- Iterator iter = formData.getElementIterator();
- while (iter.hasNext())
- {
- FormDataElement e = (FormDataElement)iter.next();
- String param = e.getElementName();
- String value = e.getElementValue();
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Post parameter name '"+param+"' value '"+value+"' for '"+urlPath+"'");
- postMethod.addParameter(param,value);
- }
+ throw new ManifoldCFException("Unsupported encoding: "+e.getMessage(),e);
}
- fetchMethod = postMethod;
- break;
- default:
- throw new ManifoldCFException("Illegal method type: "+Integer.toString(pageFetchMethod));
- }
- // Set all appropriate headers and parameters
- fetchMethod.setRequestHeader("User-Agent",userAgent);
- fetchMethod.setRequestHeader("From",from);
- HttpMethodParams params = fetchMethod.getParams();
- if (host != null)
+ fullUrlPath = psb.toString();
+ }
+ else
{
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: For "+myUrl+", setting virtual host to "+host);
- params.setVirtualHost(host);
+ fullUrlPath = urlPath;
}
- params.setSoTimeout(socketTimeoutMilliseconds);
- params.setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY_MEDIUM_SECURITY);
- params.setParameter(HttpMethodParams.SINGLE_COOKIE_HEADER,new Boolean(true));
- fetchMethod.setParams(params);
- fetchMethod.setFollowRedirects(redirectOK);
-
- // Clear all current cookies
- HttpState state = client.getState();
- state.clearCookies();
+ // Hack; apparently httpclient treats // as a protocol specifier and so it rips off the first section of the path in that case.
+ while (fullUrlPath.startsWith("//"))
+ fullUrlPath = fullUrlPath.substring(1);
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Get method for '"+fullUrlPath+"'");
+ fetchMethod = new HttpGet(fullUrlPath);
+ break;
+ case FormData.SUBMITMETHOD_POST:
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Post method for '"+urlPath+"'");
+ // MUST be just the path, or apparently we wind up resetting the HostConfiguration
+ HttpPost postMethod = new HttpPost(urlPath);
+ List<NameValuePair> nvps = new ArrayList<NameValuePair>();
- // If we have any cookies to set, set them.
- if (loginCookies != null)
+ // Add parameters to post variables
+ if (formData != null)
{
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: Adding "+Integer.toString(loginCookies.getCookieCount())+" cookies for '"+urlPath+"'");
- int h = 0;
- while (h < loginCookies.getCookieCount())
+ Iterator iter = formData.getElementIterator();
+ while (iter.hasNext())
{
- state.addCookie(loginCookies.getCookie(h++));
+ FormDataElement e = (FormDataElement)iter.next();
+ String param = e.getElementName();
+ String value = e.getElementValue();
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Post parameter name '"+param+"' value '"+value+"' for '"+urlPath+"'");
+ nvps.add(new BasicNameValuePair(param,value));
}
}
+ try
+ {
+ postMethod.setEntity(new UrlEncodedFormEntity(nvps,HTTP.UTF_8));
+ }
+ catch (java.io.UnsupportedEncodingException e)
+ {
+ throw new ManifoldCFException("Unsupported UTF-8 encoding: "+e.getMessage(),e);
+ }
+ fetchMethod = postMethod;
+ break;
+ default:
+ throw new ManifoldCFException("Illegal method type: "+Integer.toString(pageFetchMethod));
+ }
- // Copy out the current cookies, in case the fetch fails
- lastFetchCookies = loginCookies;
+ // Set all appropriate headers and parameters
+ fetchMethod.setHeader(new BasicHeader("User-Agent",userAgent));
+ fetchMethod.setHeader(new BasicHeader("From",from));
+
+ // Clear all current cookies
+ httpClient.getCookieStore().clear();
- // Set up authentication to use
- if (authentication != null)
+ // If we have any cookies to set, set them.
+ if (loginCookies != null)
+ {
+ if (Logging.connectors.isDebugEnabled())
+ Logging.connectors.debug("WEB: Adding "+Integer.toString(loginCookies.getCookieCount())+" cookies for '"+urlPath+"'");
+ int h = 0;
+ while (h < loginCookies.getCookieCount())
{
- if (Logging.connectors.isDebugEnabled())
- Logging.connectors.debug("WEB: For "+myUrl+", discovered matching authentication credentials");
- state.setCredentials(AuthScope.ANY, authentication.makeCredentialsObject(host));
+ httpClient.getCookieStore().addCookie(loginCookies.getCookie(h++));
}
+ }
- // Set the state. May not be necessary, but I'd rather not depend on undocumented httpclient implementation details.
- client.setState(state);
+ // Copy out the current cookies, in case the fetch fails
+ lastFetchCookies = loginCookies;
- // Fire it off!
+ // Create the thread
+ methodThread = new ExecuteMethodThread(this, httpClient, fetchMethod);
+ try
+ {
+ methodThread.start();
+ threadStarted = true;
try
{
- ExecuteMethodThread t = new ExecuteMethodThread(client,clientConf,fetchMethod);
- try
- {
- t.start();
- t.join();
- Throwable thr = t.getException();
- if (thr != null)
- {
- throw thr;
- }
- statusCode = t.getResponse();
- if (recordEverything)
- dataSession.setResponseCode(statusCode);
- }
- catch (InterruptedException e)
- {
- t.interrupt();
- // We need the caller to abandon any connections left around, so rethrow in a way that forces them to process the event properly.
- throw e;
- }
-
- // At least we didn't get an exception! Copy out the current cookies for later reference.
- lastFetchCookies = new CookieSet(client.getState().getCookies());
-
+ statusCode = methodThread.getResponseCode();
+ lastFetchCookies = methodThread.getCookies();
switch (statusCode)
{
case HttpStatus.SC_REQUEST_TIMEOUT:
@@ -1616,112 +1539,85 @@ public class ThrottledFetcher
default:
return;
}
-
- }
- catch (java.net.SocketTimeoutException e)
- {
- throwable = e;
- long currentTime = System.currentTimeMillis();
- throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_5MIN,
- currentTime + TIME_2HRS,-1,false);
- }
- catch (org.apache.commons.httpclient.ConnectTimeoutException e)
- {
- throwable = e;
- long currentTime = System.currentTimeMillis();
- throw new ServiceInterruption("Timed out waiting for connection for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_5MIN,
- currentTime + TIME_2HRS,-1,false);
- }
- catch (InterruptedIOException e)
- {
- //Logging.connectors.warn("IO interruption seen",e);
- throwable = new ManifoldCFException("Interrupted: "+e.getMessage(),e);
- statusCode = FETCH_INTERRUPTED;
- throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
}
- catch (org.apache.commons.httpclient.RedirectException e)
- {
- throwable = e;
- statusCode = FETCH_CIRCULAR_REDIRECT;
- if (recordEverything)
- dataSession.setResponseCode(statusCode);
- return;
- }
- catch (org.apache.commons.httpclient.NoHttpResponseException e)
- {
- throwable = e;
- long currentTime = System.currentTimeMillis();
- throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_15MIN,
- currentTime + TIME_2HRS,-1,false);
- }
- catch (java.net.ConnectException e)
- {
- throwable = e;
- long currentTime = System.currentTimeMillis();
- throw new ServiceInterruption("Timed out waiting for a connection for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_2HRS,
- currentTime + TIME_6HRS,-1,false);
- }
- catch (javax.net.ssl.SSLException e)
- {
- // Probably this is an incorrectly configured trust store
- throwable = new ManifoldCFException("SSL handshake error: "+e.getMessage()+"; check your connection's Certificate configuration",e);
- statusCode = FETCH_IO_ERROR;
- if (recordEverything)
- dataSession.setResponseCode(statusCode);
- return;
- }
- catch (IOException e)
+ catch (InterruptedException e)
{
- // Treat this as a bad url. We don't know what happened, but it isn't something we are going to naively
- // retry on.
- throwable = e;
- statusCode = FETCH_IO_ERROR;
- if (recordEverything)
- dataSession.setResponseCode(statusCode);
- return;
+ methodThread.interrupt();
+ methodThread = null;
+ threadStarted = false;
+ throw e;
}
}
catch (InterruptedException e)
{
- // Drop the current connection, and in fact the whole pool, on the floor.
+ // Drop the current connection on the floor, so it cannot be reused.
fetchMethod = null;
- connManager = null;
throwable = new ManifoldCFException("Interrupted: "+e.getMessage(),e);
statusCode = FETCH_INTERRUPTED;
throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
}
- catch (IllegalArgumentException e)
+ catch (java.net.SocketTimeoutException e)
{
- throwable = new ManifoldCFException("Illegal URI: '"+myUrl+"'",e);
- statusCode = FETCH_BAD_URI;
- if (recordEverything)
- dataSession.setResponseCode(statusCode);
- return;
+ throwable = e;
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Timed out waiting for IO for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_5MIN,
+ currentTime + TIME_2HRS,-1,false);
+ }
+ catch (ConnectTimeoutException e)
+ {
+ throwable = e;
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Timed out waiting for connection for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_5MIN,
+ currentTime + TIME_2HRS,-1,false);
+ }
+ catch (InterruptedIOException e)
+ {
+ //Logging.connectors.warn("IO interruption seen",e);
+ throwable = new ManifoldCFException("Interrupted: "+e.getMessage(),e);
+ statusCode = FETCH_INTERRUPTED;
+ throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
}
- catch (IllegalStateException e)
+ catch (RedirectException e)
{
- throwable = new ManifoldCFException("Illegal state while fetching URI: '"+myUrl+"'",e);
- statusCode = FETCH_SEQUENCE_ERROR;
- if (recordEverything)
- dataSession.setResponseCode(statusCode);
+ throwable = e;
+ statusCode = FETCH_CIRCULAR_REDIRECT;
return;
}
- catch (ServiceInterruption e)
+ catch (NoHttpResponseException e)
{
- throw e;
+ throwable = e;
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Timed out waiting for response for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_15MIN,
+ currentTime + TIME_2HRS,-1,false);
}
- catch (ManifoldCFException e)
+ catch (java.net.ConnectException e)
{
- throw e;
+ throwable = e;
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Timed out waiting for a connection for '"+myUrl+"': "+e.getMessage(), e, currentTime + TIME_2HRS,
+ currentTime + TIME_6HRS,-1,false);
+ }
+ catch (javax.net.ssl.SSLException e)
+ {
+ // Probably this is an incorrectly configured trust store
+ throwable = new ManifoldCFException("SSL handshake error: "+e.getMessage()+"; check your connection's Certificate configuration",e);
+ statusCode = FETCH_IO_ERROR;
+ return;
+ }
+ catch (IOException e)
+ {
+ // Treat this as a bad url. We don't know what happened, but it isn't something we are going to naively
+ // retry on.
+ throwable = e;
+ statusCode = FETCH_IO_ERROR;
+ return;
}
catch (Throwable e)
{
Logging.connectors.debug("WEB: Caught an unexpected exception: "+e.getMessage(),e);
throwable = e;
statusCode = FETCH_UNKNOWN_ERROR;
- if (recordEverything)
- dataSession.setResponseCode(statusCode);
return;
}
@@ -1754,23 +1650,28 @@ public class ThrottledFetcher
public Map<String,List<String>> getResponseHeaders()
throws ManifoldCFException, ServiceInterruption
{
- Header[] headers = fetchMethod.getResponseHeaders();
- Map<String,List<String>> rval = new HashMap<String,List<String>>();
- int i = 0;
- while (i < headers.length)
+ if (fetchMethod == null)
+ throw new ManifoldCFException("Attempt to get headers when there is no method");
+ if (methodThread == null || threadStarted == false)
+ throw new ManifoldCFException("Attempt to get headers when no method thread");
+ try
{
- Header h = headers[i++];
- String name = h.getName();
- String value = h.getValue();
- List<String> values = rval.get(name);
- if (values == null)
- {
- values = new ArrayList<String>();
- rval.put(name,values);
- }
- values.add(value);
+ return methodThread.getResponseHeaders();
}
- return rval;
+ catch (InterruptedException e)
+ {
+ methodThread.interrupt();
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ catch (HttpException e)
+ {
+ handleHTTPException(e,"reading headers");
+ }
+ catch (IOException e)
+ {
+ handleIOException(e,"reading headers");
+ }
+ return null;
}
/** Get a specified response header, if it exists.
@@ -1781,60 +1682,59 @@ public class ThrottledFetcher
public String getResponseHeader(String headerName)
throws ManifoldCFException, ServiceInterruption
{
- Header h = fetchMethod.getResponseHeader(headerName);
- if (h == null)
- return null;
- if (recordEverything)
- dataSession.addHeader(headerName,h.getValue());
- return h.getValue();
- }
-
- /** Get the response input stream. It is the responsibility of the caller
- * to close this stream when done.
- */
- @Override
- public InputStream getResponseBodyStream()
- throws ManifoldCFException, ServiceInterruption
- {
if (fetchMethod == null)
- throw new ManifoldCFException("Attempt to get a response when there is no method");
+ throw new ManifoldCFException("Attempt to get a header when there is no method");
+ if (methodThread == null || threadStarted == false)
+ throw new ManifoldCFException("Attempt to get a header when no method thread");
try
{
- if (recordEverything)
- dataSession.endHeader();
- InputStream bodyStream = fetchMethod.getResponseBodyAsStream();
- if (bodyStream == null)
- {
- Logging.connectors.debug("Web: Couldn't set up response stream for '"+myUrl+"', retrying");
- throw new ServiceInterruption("Failed to set up body response stream for "+myUrl,null,TIME_5MIN,-1L,2,false);
- }
- return new ThrottledInputstream(this,bodyStream,dataSession);
+ return methodThread.getFirstHeader(headerName);
}
- catch (java.net.SocketTimeoutException e)
+ catch (InterruptedException e)
{
- Logging.connectors.debug("Web: Socket timeout exception setting up response stream for '"+myUrl+"', retrying");
- throw new ServiceInterruption("Socket timeout exception setting up response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
+ methodThread.interrupt();
+ throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ catch (HttpException e)
{
- Logging.connectors.debug("Web: Connect timeout exception setting up response stream for '"+myUrl+"', retrying");
- throw new ServiceInterruption("Connect timeout exception setting up response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
+ handleHTTPException(e,"reading header");
}
- catch (InterruptedIOException e)
+ catch (IOException e)
+ {
+ handleIOException(e,"reading header");
+ }
+ return null;
+ }
+
+ /** Get the response input stream. It is the responsibility of the caller
+ * to close this stream when done.
+ */
+ @Override
+ public InputStream getResponseBodyStream()
+ throws ManifoldCFException, ServiceInterruption
+ {
+ if (fetchMethod == null)
+ throw new ManifoldCFException("Attempt to get an input stream when there is no method");
+ if (methodThread == null || threadStarted == false)
+ throw new ManifoldCFException("Attempt to get an input stream when no method thread");
+ try
+ {
+ return methodThread.getSafeInputStream();
+ }
+ catch (InterruptedException e)
{
- //Logging.connectors.warn("IO interruption seen: "+e.getMessage(),e);
+ methodThread.interrupt();
throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
}
catch (IOException e)
{
- Logging.connectors.debug("Web: IO exception setting up response stream for '"+myUrl+"', retrying");
- throw new ServiceInterruption("IO exception setting up response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
+ handleIOException(e, "reading response stream");
}
- catch (IllegalStateException e)
+ catch (HttpException e)
{
- Logging.connectors.debug("Web: State error getting response body for '"+myUrl+"', retrying");
- throw new ServiceInterruption("State error getting response body: "+e.getMessage(),e,TIME_5MIN,-1L,2,false);
+ handleHTTPException(e, "reading response stream");
}
+ return null;
}
/** Get limited response as a string.
@@ -1860,31 +1760,11 @@ public class ThrottledFetcher
is.close();
}
}
- catch (java.net.SocketTimeoutException e)
- {
- Logging.connectors.debug("Web: Socket timeout exception reading response stream for '"+myUrl+"', retrying");
- throw new ServiceInterruption("Socket timeout exception reading response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
- }
- catch (org.apache.commons.httpclient.ConnectTimeoutException e)
- {
- Logging.connectors.debug("Web: Connect timeout exception reading response stream for '"+myUrl+"', retrying");
- throw new ServiceInterruption("Connect timeout exception reading response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
- }
- catch (InterruptedIOException e)
- {
- //Logging.connectors.warn("IO interruption seen: "+e.getMessage(),e);
- throw new ManifoldCFException("Interrupted: "+e.getMessage(),e,ManifoldCFException.INTERRUPTED);
- }
catch (IOException e)
{
- Logging.connectors.debug("Web: IO exception reading response stream for '"+myUrl+"', retrying");
- throw new ServiceInterruption("IO exception reading response stream: "+e.getMessage(),e,System.currentTimeMillis()+TIME_5MIN,-1L,2,false);
- }
- catch (IllegalStateException e)
- {
- Logging.connectors.debug("Web: State error reading response body for '"+myUrl+"', retrying");
- throw new ServiceInterruption("State error reading response body: "+e.getMessage(),e,TIME_5MIN,-1L,2,false);
+ handleIOException(e,"reading limited response");
}
+ return null;
}
/** Note that the connection fetch was interrupted by something.
@@ -1908,6 +1788,9 @@ public class ThrottledFetcher
{
if (fetchType != null)
{
+ // Abort the connection, if not already complete
+ methodThread.abort();
+
long endTime = System.currentTimeMillis();
int i = 0;
while (i < throttleBinArray.length)
@@ -1932,20 +1815,25 @@ public class ThrottledFetcher
Logging.connectors.debug("WEB: Fetch exception for '"+myUrl+"'",throwable);
}
-
- // Clear out all the parameters
- if (fetchMethod != null)
+ // Shut down (join) the connection thread, if any, and if it started
+ if (methodThread != null)
{
- try
+ if (threadStarted)
{
- fetchMethod.releaseConnection();
- }
- catch (IllegalStateException e)
- {
- // looks like the fetch method didn't have one, or it was already released. Just eat the exception.
+ try
+ {
+ methodThread.finishUp();
+ }
+ catch (InterruptedException e)
+ {
+ throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
+ }
+ threadStarted = false;
}
- fetchMethod = null;
+ methodThread = null;
}
+
+ fetchMethod = null;
throwable = null;
startFetchTime = -1L;
myUrl = null;
@@ -1999,6 +1887,61 @@ public class ThrottledFetcher
poolLock.notifyAll();
}
}
+
+ protected void handleHTTPException(HttpException e, String activity)
+ throws ServiceInterruption, ManifoldCFException
+ {
+ long currentTime = System.currentTimeMillis();
+ Logging.connectors.debug("Web: HTTP exception "+activity+" for '"+myUrl+"', retrying");
+ throw new ServiceInterruption("HTTP exception "+activity+": "+e.getMessage(),e,currentTime+TIME_5MIN,-1L,2,false);
+ }
+
+ protected void handleIOException(IOException e, String activity)
+ throws ServiceInterruption, ManifoldCFException
+ {
+ if (e instanceof java.net.SocketTimeoutException)
+ {
+ long currentTime = System.currentTimeMillis();
+ Logging.connectors.debug("Web: Socket timeout exception "+activity+" for '"+myUrl+"', retrying");
+ throw new ServiceInterruption("Socket timeout exception "+activity+": "+e.getMessage(),e,currentTime+TIME_5MIN,-1L,2,false);
+ }
+ if (e instanceof ConnectTimeoutException)
+ {
+ long currentTime = System.currentTimeMillis();
+ Logging.connectors.debug("Web: Connect timeout exception "+activity+" for '"+myUrl+"', retrying");
+ throw new ServiceInterruption("Connect timeout exception "+activity+": "+e.getMessage(),e,currentTime+TIME_5MIN,-1L,2,false);
+ }
+ if (e instanceof InterruptedIOException)
+ {
+ methodThread.interrupt();
+ throw new ManifoldCFException("Interrupted",ManifoldCFException.INTERRUPTED);
+ }
+ if (e instanceof NoHttpResponseException)
+ {
+ // Give up after 2 hours.
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Timed out "+activity+" for '"+myUrl+"'", e, currentTime + 15L * 60000L,
+ currentTime + 120L * 60000L,-1,false);
+ }
+ if (e instanceof java.net.ConnectException)
+ {
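+ // Give up after 12 hours (720 minutes). NOTE(review): the ConnectException catch in the fetch path uses TIME_6HRS; confirm which limit is intended.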
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("Timed out "+activity+" for '"+myUrl+"'", e, currentTime + 1000000L,
+ currentTime + 720L * 60000L,-1,false);
+ }
+ if (e instanceof java.net.NoRouteToHostException)
+ {
+ // This exception means we know the IP address but can't get there. That's either a firewall issue, or it's something transient
+ // with the network. Some degree of retry is probably wise.
+ long currentTime = System.currentTimeMillis();
+ throw new ServiceInterruption("No route to host during "+activity+" for '"+myUrl+"'", e, currentTime + 1000000L,
+ currentTime + 720L * 60000L,-1,false);
+ }
+ long currentTime = System.currentTimeMillis();
+ Logging.connectors.debug("Web: IO exception "+activity+" for '"+myUrl+"', retrying");
+ throw new ServiceInterruption("IO exception "+activity+": "+e.getMessage(),e,currentTime+TIME_5MIN,-1L,2,false);
+ }
}
@@ -2015,15 +1958,12 @@ public class ThrottledFetcher
/** The stream we are wrapping. */
protected InputStream inputStream;
- protected DataSession dataSession;
-
/** Constructor.
*/
- public ThrottledInputstream(ThrottledConnection connection, InputStream is, DataSession dataSession)
+ public ThrottledInputstream(ThrottledConnection connection, InputStream is)
{
this.throttledConnection = connection;
this.inputStream = is;
- this.dataSession = dataSession;
}
/** Read a byte.
@@ -2094,8 +2034,6 @@ public class ThrottledFetcher
try
{
amt = inputStream.read(b,off,len);
- if (recordEverything && amt != -1)
- dataSession.write(b,off,amt);
return amt;
}
finally
@@ -2175,7 +2113,7 @@ public class ThrottledFetcher
{
Logging.connectors.debug("Socket timeout exception trying to close connection: "+e.getMessage(),e);
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ catch (ConnectTimeoutException e)
{
Logging.connectors.debug("Socket connection timeout exception trying to close connection: "+e.getMessage(),e);
}
@@ -2221,143 +2159,128 @@ public class ThrottledFetcher
}
}
- /** HTTPClient secure socket factory, which implements SecureProtocolSocketFactory
+ /** SSL Socket factory which wraps another socket factory but allows timeout on socket
+ * creation.
*/
- protected static class WebSecureSocketFactory implements SecureProtocolSocketFactory
+ protected static class InterruptibleSocketFactory extends javax.net.ssl.SSLSocketFactory
{
- /** This is the javax.net socket factory.
- */
- protected javax.net.ssl.SSLSocketFactory socketFactory;
-
- /** Constructor */
- public WebSecureSocketFactory(javax.net.ssl.SSLSocketFactory socketFactory)
+ protected final javax.net.ssl.SSLSocketFactory wrappedFactory;
+ protected final long connectTimeoutMilliseconds;
+
+ public InterruptibleSocketFactory(javax.net.ssl.SSLSocketFactory wrappedFactory, long connectTimeoutMilliseconds)
{
- this.socketFactory = socketFactory;
+ this.wrappedFactory = wrappedFactory;
+ this.connectTimeoutMilliseconds = connectTimeoutMilliseconds;
}
- public Socket createSocket(
- String host,
- int port,
- InetAddress clientHost,
- int clientPort)
+ @Override
+ public Socket createSocket()
+ throws IOException
+ {
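+ // No connection is attempted here (the returned socket is unconnected), so delegate directly rather than using a timeout thread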
+ return wrappedFactory.createSocket();
+ }
+
+ @Override
+ public Socket createSocket(String host, int port)
throws IOException, UnknownHostException
{
- return socketFactory.createSocket(
- host,
- port,
- clientHost,
- clientPort
- );
+ return fireOffThread(InetAddress.getByName(host),port,null,-1);
}
- public Socket createSocket(
- final String host,
- final int port,
- final InetAddress localAddress,
- final int localPort,
- final HttpConnectionParams params
- ) throws IOException, UnknownHostException, ConnectTimeoutException
+ @Override
+ public Socket createSocket(InetAddress host, int port)
+ throws IOException
{
- if (params == null)
- {
- throw new IllegalArgumentException("Parameters may not be null");
- }
- int timeout = params.getConnectionTimeout();
- if (timeout == 0)
- {
- return createSocket(host, port, localAddress, localPort);
- }
- else
- {
- // We need to implement a connection timeout somehow - probably with a new thread.
- SocketCreateThread thread = new SocketCreateThread(socketFactory,host,port,localAddress,localPort);
- thread.start();
- try
- {
- // Wait for thread to complete for only a certain amount of time!
- thread.join(timeout);
- // If join() times out, then the thread is going to still be alive.
- if (thread.isAlive())
- {
- // Kill the thread - not that this will necessarily work, but we need to try
- thread.interrupt();
- throw new ConnectTimeoutException("Secure connection timed out");
- }
- // The thread terminated. Throw an error if there is one, otherwise return the result.
- Throwable t = thread.getException();
- if (t != null)
- {
- if (t instanceof java.net.SocketTimeoutException)
- throw (java.net.SocketTimeoutException)t;
- else if (t instanceof org.apache.commons.httpclient.ConnectTimeoutException)
- throw (org.apache.commons.httpclient.ConnectTimeoutException)t;
- else if (t instanceof InterruptedIOException)
- throw (InterruptedIOException)t;
- else if (t instanceof IOException)
- throw (IOException)t;
- else if (t instanceof UnknownHostException)
- throw (UnknownHostException)t;
- else if (t instanceof Error)
- throw (Error)t;
- else if (t instanceof RuntimeException)
- throw (RuntimeException)t;
- throw new Error("Received an unexpected exception: "+t.getMessage(),t);
- }
- return thread.getResult();
- }
- catch (InterruptedException e)
- {
- throw new InterruptedIOException("Interrupted: "+e.getMessage());
- }
- }
+ return fireOffThread(host,port,null,-1);
}
-
- public Socket createSocket(String host, int port)
+
+ @Override
+ public Socket createSocket(String host, int port, InetAddress localHost, int localPort)
throws IOException, UnknownHostException
{
- return socketFactory.createSocket(
- host,
- port
- );
+ return fireOffThread(InetAddress.getByName(host),port,localHost,localPort);
}
-
- public Socket createSocket(
- Socket socket,
- String host,
- int port,
- boolean autoClose)
- throws IOException, UnknownHostException
+
+ @Override
+ public Socket createSocket(InetAddress address, int port, InetAddress localAddress, int localPort)
+ throws IOException
{
- return socketFactory.createSocket(
- socket,
- host,
- port,
- autoClose
- );
+ return fireOffThread(address,port,localAddress,localPort);
}
-
- public boolean equals(Object obj)
+
+ @Override
+ public Socket createSocket(Socket s, String host, int port, boolean autoClose)
+ throws IOException
{
- if (obj == null || !(obj instanceof WebSecureSocketFactory))
- return false;
- // Each object is unique
- return super.equals(obj);
+ // The supplied socket is already open, so delegate directly rather than using a timeout thread
+ return wrappedFactory.createSocket(s,host,port,autoClose);
}
-
- public int hashCode()
+
+ @Override
+ public String[] getDefaultCipherSuites()
+ {
+ return wrappedFactory.getDefaultCipherSuites();
+ }
+
+ @Override
+ public String[] getSupportedCipherSuites()
{
- return super.hashCode();
+ return wrappedFactory.getSupportedCipherSuites();
}
+
+ protected Socket fireOffThread(InetAddress address, int port, InetAddress localHost, int localPort)
+ throws IOException
+ {
+ SocketCreateThread thread = new SocketCreateThread(wrappedFactory,address,port,localHost,localPort);
+ thread.start();
+ try
+ {
+ // Wait for thread to complete for only a certain amount of time!
+ thread.join(connectTimeoutMilliseconds);
+ // If join() times out, then the thread is going to still be alive.
+ if (thread.isAlive())
+ {
+ // Kill the thread - not that this will necessarily work, but we need to try
+ thread.interrupt();
+ throw new ConnectTimeoutException("Secure connection timed out");
+ }
+ // The thread terminated. Throw an error if there is one, otherwise return the result.
+ Throwable t = thread.getException();
+ if (t != null)
+ {
+ if (t instanceof java.net.SocketTimeoutException)
+ throw (java.net.SocketTimeoutException)t;
+ else if (t instanceof ConnectTimeoutException)
+ throw (ConnectTimeoutException)t;
+ else if (t instanceof InterruptedIOException)
+ throw (InterruptedIOException)t;
+ else if (t instanceof IOException)
+ throw (IOException)t;
+ else if (t instanceof Error)
+ throw (Error)t;
+ else if (t instanceof RuntimeException)
+ throw (RuntimeException)t;
+ throw new Error("Received an unexpected exception: "+t.getMessage(),t);
+ }
+ return thread.getResult();
+ }
+ catch (InterruptedException e)
+ {
+ throw new InterruptedIOException("Interrupted: "+e.getMessage());
+ }
+ }
+
}
-
+
/** Create a secure socket in a thread, so that we can "give up" after a while if the socket fails to connect.
*/
protected static class SocketCreateThread extends Thread
{
// Socket factory
protected javax.net.ssl.SSLSocketFactory socketFactory;
- protected String host;
+ protected InetAddress host;
protected int port;
protected InetAddress clientHost;
protected int clientPort;
@@ -2369,7 +2292,7 @@ public class ThrottledFetcher
/** Create the thread */
public SocketCreateThread(javax.net.ssl.SSLSocketFactory socketFactory,
- String host,
+ InetAddress host,
int port,
InetAddress clientHost,
int clientPort)
@@ -2386,7 +2309,10 @@ public class ThrottledFetcher
{
try
{
- rval = socketFactory.createSocket(host,port,clientHost,clientPort);
+ if (clientHost == null)
+ rval = socketFactory.createSocket(host,port);
+ else
+ rval = socketFactory.createSocket(host,port,clientHost,clientPort);
}
catch (Throwable e)
{
@@ -2405,4 +2331,353 @@ public class ThrottledFetcher
}
}
+ /** Class to override browser compatibility to make it not check cookie paths. See CONNECTORS-97.
+ */
+ protected static class LaxBrowserCompatSpec extends BrowserCompatSpec
+ {
+
+ public LaxBrowserCompatSpec()
+ {
+ super();
+ registerAttribHandler(ClientCookie.PATH_ATTR, new BasicPathHandler()
+ {
+ @Override
+ public void validate(Cookie cookie, CookieOrigin origin) throws MalformedCookieException
+ {
+ // No validation
+ }
+
+ }
+ );
+ }
+
+ }
+
+ /** This thread does the actual socket communication with the server.
+ * It's set up so that it can be abandoned at shutdown time.
+ *
+ * The way it works is as follows:
+ * - it starts the transaction
+ * - it receives the response, and saves that for the calling class to inspect
+ * - it transfers the data part to an input stream provided to the calling class
+ * - it shuts the connection down
+ *
+ * If there is an error, the sequence is aborted, and an exception is recorded
+ * for the calling class to examine.
+ *
+ * The calling class basically accepts the sequence above. It starts the
+ * thread, and tries to get a response code. If instead an exception is seen,
+ * the exception is thrown up the stack.
+ */
+ protected static class ExecuteMethodThread extends Thread
+ {
+ /** The connection */
+ protected final ThrottledConnection theConnection;
+ /** Client and method, all preconfigured */
+ protected final AbstractHttpClient httpClient;
+ protected final HttpRequestBase executeMethod;
+
+ protected HttpResponse response = null;
+ protected Throwable responseException = null;
+ protected LoginCookies cookies = null;
+ protected Throwable cookieException = null;
+ protected XThreadInputStream threadStream = null;
+ protected boolean streamCreated = false;
+ protected Throwable streamException = null;
+ protected boolean abortThread = false;
+
+ protected Throwable shutdownException = null;
+
+ protected Throwable generalException = null;
+
+ public ExecuteMethodThread(ThrottledConnection theConnection,
+ AbstractHttpClient httpClient, HttpRequestBase executeMethod)
+ {
+ super();
+ setDaemon(true);
+ this.theConnection = theConnection;
+ this.httpClient = httpClient;
+ this.executeMethod = executeMethod;
+ }
+
+ public void run()
+ {
+ try
+ {
+ try
+ {
+ // Call the execute method appropriately
+ synchronized (this)
+ {
+ if (!abortThread)
+ {
+ try
+ {
+ response = httpClient.execute(executeMethod);
+ }
+ catch (java.net.SocketTimeoutException e)
+ {
+ responseException = e;
+ }
+ catch (ConnectTimeoutException e)
+ {
+ responseException = e;
+ }
+ catch (InterruptedIOException e)
+ {
+ throw e;
+ }
+ catch (Throwable e)
+ {
+ responseException = e;
+ }
+ this.notifyAll();
+ }
+ }
+
+ // Fetch the cookies
+ if (responseException == null)
+ {
+ synchronized (this)
+ {
+ if (!abortThread)
+ {
+ try
+ {
+ cookies = new CookieSet(httpClient.getCookieStore().getCookies());
+ }
+ catch (Throwable e)
+ {
+ cookieException = e;
+ }
+ this.notifyAll();
+ }
+ }
+ }
+
+ // Start the transfer of the content
+ if (cookieException == null && responseException == null)
+ {
+ synchronized (this)
+ {
+ if (!abortThread)
+ {
+ try
+ {
+ InputStream bodyStream = response.getEntity().getContent();
+ if (bodyStream != null)
+ {
+ bodyStream = new ThrottledInputstream(theConnection,bodyStream);
+ threadStream = new XThreadInputStream(bodyStream);
+ }
+ streamCreated = true;
+ }
+ catch (java.net.SocketTimeoutException e)
+ {
+ streamException = e;
+ }
+ catch (ConnectTimeoutException e)
+ {
+ streamException = e;
+ }
+ catch (InterruptedIOException e)
+ {
+ throw e;
+ }
+ catch (Throwable e)
+ {
+ streamException = e;
+ }
+ this.notifyAll();
+ }
+ }
+ }
+
+ if (cookieException == null && responseException == null && streamException == null)
+ {
+ if (threadStream != null)
+ {
+ // Stuff the content until we are done
+ threadStream.stuffQueue();
+ }
+ }
+
+ }
+ finally
+ {
+ synchronized (this)
+ {
+ try
+ {
+ executeMethod.abort();
+ }
+ catch (Throwable e)
+ {
+ shutdownException = e;
+ }
+ this.notifyAll();
+ }
+ }
+ }
+ catch (Throwable e)
+ {
+ // Exceptions reaching here are expected to be interruption-related only (e.g. the InterruptedIOException rethrown above), as a result of the thread being aborted.
+ this.generalException = e;
+ }
+ }
+
+ public int getResponseCode()
+ throws InterruptedException, IOException, HttpException
+ {
+ // Must wait until the response object is there
+ while (true)
+ {
+ synchronized (this)
+ {
+ checkException(responseException);
+ if (response != null)
+ return response.getStatusLine().getStatusCode();
+ wait();
+ }
+ }
+ }
+
+ public Map<String,List<String>> getResponseHeaders()
+ throws InterruptedException, IOException, HttpException
+ {
+ // Must wait for the response object to appear
+ while (true)
+ {
+ synchronized (this)
+ {
+ checkException(responseException);
+ if (response != null)
+ {
+ Header[] headers = response.getAllHeaders();
+ Map<String,List<String>> rval = new HashMap<String,List<String>>();
+ int i = 0;
+ while (i < headers.length)
+ {
+ Header h = headers[i++];
+ String name = h.getName();
+ String value = h.getValue();
+ List<String> values = rval.get(name);
+ if (values == null)
+ {
+ values = new ArrayList<String>();
+ rval.put(name,values);
+ }
+ values.add(value);
+ }
+ return rval;
+ }
+ wait();
+ }
+ }
+
+ }
+
+ public String getFirstHeader(String headerName)
+ throws InterruptedException, IOException, HttpException
+ {
+ // Must wait for the response object to appear
+ while (true)
+ {
+ synchronized (this)
+ {
+ checkException(responseException);
+ if (response != null)
+ {
+ Header h = response.getFirstHeader(headerName);
+ if (h == null)
+ return null;
+ return h.getValue();
+ }
+ wait();
+ }
+ }
+ }
+
+ public LoginCookies getCookies()
+ throws InterruptedException, IOException, HttpException
+ {
+ while (true)
+ {
+ synchronized (this)
+ {
+ if (responseException != null)
+ throw new IllegalStateException("Check for response before getting cookies");
+ checkException(cookieException);
+ if (cookies != null)
+ return cookies;
+ wait();
+ }
+ }
+ }
+
+ public InputStream getSafeInputStream()
+ throws InterruptedException, IOException, HttpException
+ {
+ // Must wait until stream is created, or until we note an exception was thrown.
+ while (true)
+ {
+ synchronized (this)
+ {
+ if (responseException != null)
+ throw new IllegalStateException("Check for response before getting stream");
+ if (cookieException != null)
+ throw new IllegalStateException("Check for cookies before getting stream");
+ checkException(streamException);
+ if (streamCreated)
+ return threadStream;
+ wait();
+ }
+ }
+ }
+
+ public void abort()
+ {
+ // This will be called during the finally
+ // block in the case where all is well (and
+ // the stream completed) and in the case where
+ // there were exceptions.
+ synchronized (this)
+ {
+ if (streamCreated)
+ {
+ if (threadStream != null)
+ threadStream.abort();
+ }
+ abortThread = true;
+ }
+ }
+
+ public void finishUp()
+ throws InterruptedException
+ {
+ join();
+ }
+
+ protected synchronized void checkException(Throwable exception)
+ throws IOException, HttpException
+ {
+ if (exception != null)
+ {
+ // Rethrow the recorded exception as the most specific type this method can throw. Nothing is cleared here, so the same exception will be thrown again on later calls.
+ Throwable e = exception;
+ if (e instanceof IOException)
+ throw (IOException)e;
+ else if (e instanceof HttpException)
+ throw (HttpException)e;
+ else if (e instanceof RuntimeException)
+ throw (RuntimeException)e;
+ else if (e instanceof Error)
+ throw (Error)e;
+ else
+ throw new RuntimeException("Unhandled exception of type: "+e.getClass().getName(),e);
+ }
+ }
+
+ }
+
}
Modified: manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/TrustsDescription.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/TrustsDescription.java?rev=1416452&r1=1416451&r2=1416452&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/TrustsDescription.java (original)
+++ manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/TrustsDescription.java Mon Dec 3 11:19:48 2012
@@ -23,10 +23,6 @@ import org.apache.manifoldcf.crawler.sys
import java.util.*;
import java.util.regex.*;
-import org.apache.commons.httpclient.*;
-import org.apache.commons.httpclient.methods.*;
-import org.apache.commons.httpclient.params.*;
-
/** This class describes trust information pulled from a configuration.
* The data contained is organized by regular expression performed on a url. What we store
* for each regular expression is a Pattern, for efficiency.
Modified: manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1416452&r1=1416451&r2=1416452&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Mon Dec 3 11:19:48 2012
@@ -32,15 +32,17 @@ import org.apache.manifoldcf.agents.comm
import org.apache.manifoldcf.agents.common.XMLStringContext;
import org.apache.manifoldcf.agents.common.XMLFileContext;
+import org.apache.http.conn.ConnectTimeoutException;
+import org.apache.http.client.RedirectException;
+import org.apache.http.client.CircularRedirectException;
+import org.apache.http.NoHttpResponseException;
+import org.apache.http.HttpException;
+
import java.io.*;
import java.util.*;
import java.net.*;
import java.util.regex.*;
-import org.apache.commons.httpclient.*;
-import org.apache.commons.httpclient.methods.*;
-import org.apache.commons.httpclient.params.*;
-
/** This is the Web Crawler implementation of the IRepositoryConnector interface.
* This connector may be superseded by one that calls out to Python, or by an entirely
* Python Connector Framework, depending on how the winds blow.
@@ -1412,7 +1414,7 @@ public class WebcrawlerConnector extends
{
throw new ManifoldCFException("Socket timeout error closing stream: "+e.getMessage(),e);
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ catch (ConnectTimeoutException e)
{
throw new ManifoldCFException("Socket connect timeout error closing stream: "+e.getMessage(),e);
}
@@ -5174,7 +5176,7 @@ public class WebcrawlerConnector extends
{
Logging.connectors.warn("Web: Couldn't clear robots cache: "+e2.getMessage(),e2);
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e2)
+ catch (ConnectTimeoutException e2)
{
Logging.connectors.warn("Web: Couldn't clear robots cache: "+e2.getMessage(),e2);
}
@@ -5188,7 +5190,7 @@ public class WebcrawlerConnector extends
Logging.connectors.warn("Web: Couldn't clear robots cache: "+e2.getMessage(),e2);
}
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ catch (ConnectTimeoutException e)
{
if (Logging.connectors.isDebugEnabled())
Logging.connectors.debug("Web: Fetch of robots.txt from "+protocol+"://"+hostIPAddressAndPort+"(host='"+hostName+"') generated Socket Connect Timeout Exception: "+e.getMessage(),e);
@@ -5202,7 +5204,7 @@ public class WebcrawlerConnector extends
{
Logging.connectors.warn("Web: Couldn't clear robots cache: "+e2.getMessage(),e2);
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e2)
+ catch (ConnectTimeoutException e2)
{
Logging.connectors.warn("Web: Couldn't clear robots cache: "+e2.getMessage(),e2);
}
@@ -5235,7 +5237,7 @@ public class WebcrawlerConnector extends
{
Logging.connectors.warn("Web: Couldn't clear robots cache: "+e2.getMessage(),e2);
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e2)
+ catch (ConnectTimeoutException e2)
{
Logging.connectors.warn("Web: Couldn't clear robots cache: "+e2.getMessage(),e2);
}
@@ -6062,7 +6064,7 @@ public class WebcrawlerConnector extends
{
throw new ManifoldCFException("Socket timeout exception: "+e.getMessage(),e);
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ catch (ConnectTimeoutException e)
{
throw new ManifoldCFException("Socket connect timeout exception: "+e.getMessage(),e);
}
@@ -6801,7 +6803,7 @@ public class WebcrawlerConnector extends
{
throw new ManifoldCFException("Socket timeout exception: "+e.getMessage(),e);
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ catch (ConnectTimeoutException e)
{
throw new ManifoldCFException("Socket connect timeout exception: "+e.getMessage(),e);
}
@@ -6856,7 +6858,7 @@ public class WebcrawlerConnector extends
{
throw new ManifoldCFException("Socket timeout exception accessing cached document: "+e.getMessage(),e);
}
- catch (org.apache.commons.httpclient.ConnectTimeoutException e)
+ catch (ConnectTimeoutException e)
{
throw new ManifoldCFException("Socket timeout exception accessing cached document: "+e.getMessage(),e);
}
Modified: manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/pom.xml
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/pom.xml?rev=1416452&r1=1416451&r2=1416452&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/pom.xml (original)
+++ manifoldcf/branches/CONNECTORS-120-1/connectors/webcrawler/pom.xml Mon Dec 3 11:19:48 2012
@@ -89,9 +89,9 @@
<version>${project.version}</version>
</dependency>
<dependency>
- <groupId>commons-httpclient</groupId>
- <artifactId>commons-httpclient</artifactId>
- <version>${commons-httpclient.version}</version>
+ <groupId>org.apache.httpcomponents</groupId>
+ <artifactId>httpclient</artifactId>
+ <version>${httpcomponent.version}</version>
</dependency>
<dependency>
<groupId>commons-logging</groupId>