You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2014/04/06 23:04:55 UTC

svn commit: r1585345 - in /manifoldcf/trunk: ./ connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java

Author: kwright
Date: Sun Apr  6 21:04:55 2014
New Revision: 1585345

URL: http://svn.apache.org/r1585345
Log:
Merge in CONNECTORS-911 fix for the web connector

Modified:
    manifoldcf/trunk/   (props changed)
    manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java

Propchange: manifoldcf/trunk/
------------------------------------------------------------------------------
  Merged /manifoldcf/branches/CONNECTORS-911-2:r1584050-1584051
  Merged /manifoldcf/branches/CONNECTORS-911:r1580649-1584049,1584052-1585344

Modified: manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1585345&r1=1585344&r2=1585345&view=diff
==============================================================================
--- manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ manifoldcf/trunk/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Sun Apr  6 21:04:55 2014
@@ -33,15 +33,20 @@ import java.util.zip.GZIPInputStream;
 import java.util.concurrent.TimeUnit;
 import java.nio.charset.Charset;
 
-import org.apache.http.conn.ClientConnectionManager;
+import org.apache.http.conn.HttpClientConnectionManager;
 import org.apache.http.client.HttpClient;
+import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
+import org.apache.http.impl.client.HttpClientBuilder;
+import org.apache.http.protocol.HttpRequestExecutor;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.client.config.RequestConfig;
+import org.apache.http.client.CredentialsProvider;
+import org.apache.http.impl.client.BasicCredentialsProvider;
+import org.apache.http.config.SocketConfig;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.client.methods.HttpPost;
 import org.apache.http.NameValuePair;
-import org.apache.http.impl.conn.PoolingClientConnectionManager;
-import org.apache.http.conn.scheme.Scheme;
-import org.apache.http.conn.ssl.SSLSocketFactory;
-import org.apache.http.conn.ssl.AllowAllHostnameVerifier;
+import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
 import org.apache.http.conn.ssl.BrowserCompatHostnameVerifier;
 import org.apache.http.HttpEntity;
 import org.apache.http.HttpResponse;
@@ -49,23 +54,13 @@ import org.apache.http.auth.AuthScope;
 import org.apache.http.auth.NTCredentials;
 import org.apache.http.client.methods.HttpGet;
 import org.apache.http.client.methods.HttpRequestBase;
-import org.apache.http.impl.client.DefaultHttpClient;
-import org.apache.http.impl.client.AbstractHttpClient;
 import org.apache.http.impl.client.DefaultRedirectStrategy;
 import org.apache.http.util.EntityUtils;
-import org.apache.http.params.BasicHttpParams;
-import org.apache.http.params.HttpParams;
-import org.apache.http.params.CoreConnectionPNames;
 import org.apache.http.HttpStatus;
 import org.apache.http.HttpHost;
 import org.apache.http.Header;
 import org.apache.http.HeaderElement;
-import org.apache.http.conn.params.ConnRoutePNames;
 import org.apache.http.message.BasicHeader;
-import org.apache.http.client.params.ClientPNames;
-import org.apache.http.client.params.HttpClientParams;
-import org.apache.http.client.params.CookiePolicy;
-import org.apache.http.cookie.params.CookieSpecPNames;
 import org.apache.http.impl.cookie.BasicClientCookie;
 import org.apache.http.message.BasicNameValuePair;
 import org.apache.http.protocol.HTTP;
@@ -75,14 +70,22 @@ import org.apache.http.cookie.ClientCook
 import org.apache.http.cookie.Cookie;
 import org.apache.http.impl.cookie.BasicPathHandler;
 import org.apache.http.impl.cookie.BrowserCompatSpec;
-import org.apache.http.cookie.CookieSpecFactory;
 import org.apache.http.cookie.CookieSpec;
 import org.apache.http.client.CookieStore;
 import org.apache.http.protocol.HttpContext;
 import org.apache.http.protocol.BasicHttpContext;
-import org.apache.http.client.protocol.ClientContext;
+import org.apache.http.client.protocol.HttpClientContext;
 import org.apache.http.cookie.CookieIdentityComparator;
 import org.apache.http.client.HttpRequestRetryHandler;
+import org.apache.http.cookie.CookieSpecProvider;
+import org.apache.http.config.RegistryBuilder;
+import org.apache.http.config.Registry;
+import org.apache.http.client.config.CookieSpecs;
+import org.apache.http.impl.cookie.BestMatchSpecFactory;
+import org.apache.http.impl.cookie.BrowserCompatSpecFactory;
+import org.apache.http.impl.cookie.RFC2965SpecFactory;
+import org.apache.http.impl.cookie.NetscapeDraftSpecFactory;
+import org.apache.http.impl.cookie.IgnoreSpecFactory;
 
 import org.apache.http.cookie.MalformedCookieException;
 import org.apache.http.conn.ConnectTimeoutException;
@@ -142,6 +145,15 @@ public class ThrottledFetcher
 
   protected static final Charset UTF_8 = Charset.forName("UTF-8");
 
+  private static final Registry<CookieSpecProvider> cookieSpecRegistry =
+    RegistryBuilder.<CookieSpecProvider>create()
+      .register(CookieSpecs.BEST_MATCH, new BestMatchSpecFactory())
+      .register(CookieSpecs.STANDARD, new RFC2965SpecFactory())
+      .register(CookieSpecs.BROWSER_COMPATIBILITY, new LaxBrowserCompatSpecFactory())
+      .register(CookieSpecs.NETSCAPE, new NetscapeDraftSpecFactory())
+      .register(CookieSpecs.IGNORE_COOKIES, new IgnoreSpecFactory())
+      .build();
+
   /** Constructor.  Private since we never instantiate.
   */
   private ThrottledFetcher()
@@ -257,9 +269,9 @@ public class ThrottledFetcher
     protected long expireTime = -1L;
 
     /** The http connection manager.  The pool is of size 1.  */
-    protected PoolingClientConnectionManager connManager = null;
+    protected HttpClientConnectionManager connManager = null;
     /** The http client object. */
-    protected AbstractHttpClient httpClient = null;
+    protected HttpClient httpClient = null;
     /** The method object */
     protected HttpRequestBase fetchMethod = null;
     /** The error trace, if any */
@@ -395,9 +407,8 @@ public class ThrottledFetcher
       throws ManifoldCFException, ServiceInterruption
     {
       // Set up scheme
-      SSLSocketFactory myFactory = new SSLSocketFactory(new InterruptibleSocketFactory(httpsSocketFactory,connectionTimeoutMilliseconds),
-        new AllowAllHostnameVerifier());
-      Scheme myHttpsProtocol = new Scheme("https", 443, myFactory);
+      SSLConnectionSocketFactory myFactory = new SSLConnectionSocketFactory(new InterruptibleSocketFactory(httpsSocketFactory,connectionTimeoutMilliseconds),
+        SSLConnectionSocketFactory.ALLOW_ALL_HOSTNAME_VERIFIER);
 
       int hostPort;
       String displayedPort;
@@ -443,15 +454,9 @@ public class ThrottledFetcher
       
       if (connManager == null)
       {
-        PoolingClientConnectionManager localConnManager = new PoolingClientConnectionManager();
-        localConnManager.setMaxTotal(1);
-        localConnManager.setDefaultMaxPerRoute(1);
-        connManager = localConnManager;
+        connManager = new PoolingHttpClientConnectionManager();
       }
       
-      // Set up protocol registry
-      connManager.getSchemeRegistry().register(myHttpsProtocol);
-      
       long startTime = 0L;
       if (Logging.connectors.isDebugEnabled())
       {
@@ -459,10 +464,63 @@ public class ThrottledFetcher
         Logging.connectors.debug("WEB: Waiting for an HttpClient object");
       }
 
-      // If we already have an httpclient object, great.  Otherwise we have to get one, and initialize it with
-      // those parameters that aren't expected to change.
-      if (httpClient == null)
+      CredentialsProvider credentialsProvider = new BasicCredentialsProvider();
+
+      // Set up authentication to use
+      if (authentication != null)
       {
+        if (Logging.connectors.isDebugEnabled())
+          Logging.connectors.debug("WEB: For "+myUrl+", discovered matching authentication credentials");
+        credentialsProvider.setCredentials(AuthScope.ANY,
+          authentication.makeCredentialsObject(host));
+      }
+
+      RequestConfig.Builder requestBuilder = RequestConfig.custom()
+        .setCircularRedirectsAllowed(true)
+        .setSocketTimeout(socketTimeoutMilliseconds)
+        .setStaleConnectionCheckEnabled(true)
+        .setExpectContinueEnabled(true)
+        .setConnectTimeout(connectionTimeoutMilliseconds)
+        .setConnectionRequestTimeout(socketTimeoutMilliseconds)
+        .setCookieSpec(CookieSpecs.BROWSER_COMPATIBILITY)
+        .setRedirectsEnabled(redirectOK);
+
+      // If there's a proxy, set that too.
+      if (proxyHost != null && proxyHost.length() > 0)
+      {
+        // Configure proxy authentication
+        if (proxyAuthUsername != null && proxyAuthUsername.length() > 0)
+        {
+          credentialsProvider.setCredentials(
+            new AuthScope(proxyHost, proxyPort),
+            new NTCredentials(proxyAuthUsername, (proxyAuthPassword==null)?"":proxyAuthPassword, currentHost, (proxyAuthDomain==null)?"":proxyAuthDomain));
+        }
+
+        HttpHost proxy = new HttpHost(proxyHost, proxyPort);
+
+        requestBuilder.setProxy(proxy);
+      }
+
+
+      httpClient = HttpClients.custom()
+        .setConnectionManager(connManager)
+        .setMaxConnTotal(1)
+        .setMaxConnPerRoute(1)
+        .disableAutomaticRetries()
+        .setDefaultCookieSpecRegistry(cookieSpecRegistry)
+        .setDefaultRequestConfig(requestBuilder.build())
+        .setDefaultSocketConfig(SocketConfig.custom()
+          .setTcpNoDelay(true)
+          .setSoTimeout(socketTimeoutMilliseconds)
+          .build())
+        .setDefaultCredentialsProvider(credentialsProvider)
+        .setSSLSocketFactory(myFactory)
+        .setRequestExecutor(new HttpRequestExecutor(socketTimeoutMilliseconds))
+        .setRedirectStrategy(new DefaultRedirectStrategy())
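+        // Cookie policy: BROWSER_COMPATIBILITY is chosen via RequestConfig.setCookieSpec() above and resolved through cookieSpecRegistry; TODO confirm this fully replaces the old setCookiePolicy(CookiePolicy.BROWSER_COMPATIBILITY)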
+        .build();
+
+        /*
         BasicHttpParams params = new BasicHttpParams();
         params.setParameter(ClientPNames.DEFAULT_HOST,fetchHost);
         params.setBooleanParameter(CoreConnectionPNames.TCP_NODELAY,true);
@@ -497,45 +555,18 @@ public class ThrottledFetcher
           }
         );
 
-        // If there's a proxy, set that too.
-        if (proxyHost != null && proxyHost.length() > 0)
-        {
-          // Configure proxy authentication
-          if (proxyAuthUsername != null && proxyAuthUsername.length() > 0)
-          {
-            localHttpClient.getCredentialsProvider().setCredentials(
-              new AuthScope(proxyHost, proxyPort),
-              new NTCredentials(proxyAuthUsername, (proxyAuthPassword==null)?"":proxyAuthPassword, currentHost, (proxyAuthDomain==null)?"":proxyAuthDomain));
-          }
-
-          HttpHost proxy = new HttpHost(proxyHost, proxyPort);
 
-          localHttpClient.getParams().setParameter(ConnRoutePNames.DEFAULT_PROXY, proxy);
-        }
-
-        // Set up authentication to use
-        if (authentication != null)
-        {
-          if (Logging.connectors.isDebugEnabled())
-            Logging.connectors.debug("WEB: For "+myUrl+", discovered matching authentication credentials");
-          localHttpClient.getCredentialsProvider().setCredentials(AuthScope.ANY,
-            authentication.makeCredentialsObject(host));
-        }
           
         httpClient = localHttpClient;
-      }
+        */
 
 
       // Set the parameters we haven't keyed on (so these can change from request to request)
-      httpClient.getParams().setIntParameter(CoreConnectionPNames.SO_TIMEOUT,socketTimeoutMilliseconds);
-      httpClient.getParams().setIntParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,connectionTimeoutMilliseconds);
-      httpClient.getParams().setBooleanParameter(ClientPNames.HANDLE_REDIRECTS,redirectOK);
 
       if (host != null)
       {
         if (Logging.connectors.isDebugEnabled())
           Logging.connectors.debug("WEB: For "+myUrl+", setting virtual host to "+host);
-        httpClient.getParams().setParameter(ClientPNames.VIRTUAL_HOST,hostHost);
       }
 
 
@@ -650,10 +681,8 @@ public class ThrottledFetcher
       // Copy out the current cookies, in case the fetch fails
       lastFetchCookies = loginCookies;
 
-      //httpClient.setCookieStore(cookieStore);
-      
       // Create the thread
-      methodThread = new ExecuteMethodThread(this, fetchThrottler, httpClient, fetchMethod, cookieStore);
+      methodThread = new ExecuteMethodThread(this, fetchThrottler, httpClient, hostHost, fetchMethod, cookieStore);
       try
       {
         methodThread.start();
@@ -1273,6 +1302,16 @@ public class ThrottledFetcher
     }
   }
 
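+  /** Cookie spec factory that hands out LaxBrowserCompatSpec instances; registered under CookieSpecs.BROWSER_COMPATIBILITY in cookieSpecRegistry.
+  */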
+  protected static class LaxBrowserCompatSpecFactory extends BrowserCompatSpecFactory
+  {
+    public CookieSpec create(HttpContext context)
+    {
+      return new LaxBrowserCompatSpec();
+    }
+  }
+  
   /** Class to override browser compatibility to make it not check cookie paths.  See CONNECTORS-97.
   */
   protected static class LaxBrowserCompatSpec extends BrowserCompatSpec
@@ -1318,7 +1357,8 @@ public class ThrottledFetcher
     /** The fetch throttler */
     protected final IFetchThrottler fetchThrottler;
     /** Client and method, all preconfigured */
-    protected final AbstractHttpClient httpClient;
+    protected final HttpClient httpClient;
+    protected final HttpHost target;
     protected final HttpRequestBase executeMethod;
     protected final CookieStore cookieStore;
     
@@ -1337,13 +1377,14 @@ public class ThrottledFetcher
     protected Throwable generalException = null;
     
     public ExecuteMethodThread(ThrottledConnection theConnection, IFetchThrottler fetchThrottler,
-      AbstractHttpClient httpClient, HttpRequestBase executeMethod, CookieStore cookieStore)
+      HttpClient httpClient, HttpHost target, HttpRequestBase executeMethod, CookieStore cookieStore)
     {
       super();
       setDaemon(true);
       this.theConnection = theConnection;
       this.fetchThrottler = fetchThrottler;
       this.httpClient = httpClient;
+      this.target = target;
       this.executeMethod = executeMethod;
       this.cookieStore = cookieStore;
     }
@@ -1362,8 +1403,8 @@ public class ThrottledFetcher
               try
               {
                 HttpContext context = new BasicHttpContext();
-                context.setAttribute(ClientContext.COOKIE_STORE,cookieStore);
-                response = httpClient.execute(executeMethod,context);
+                context.setAttribute(HttpClientContext.COOKIE_STORE,cookieStore);
+                response = httpClient.execute(target,executeMethod,context);
               }
               catch (java.net.SocketTimeoutException e)
               {