You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/12/17 09:18:41 UTC

svn commit: r1551480 - in /manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler: ThrottleDescription.java ThrottledFetcher.java WebcrawlerConnector.java

Author: kwright
Date: Tue Dec 17 08:18:41 2013
New Revision: 1551480

URL: http://svn.apache.org/r1551480
Log:
Refactor a bit in prep for integrating common throttling service

Modified:
    manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java
    manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
    manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java

Modified: manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java?rev=1551480&r1=1551479&r2=1551480&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java (original)
+++ manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java Tue Dec 17 08:18:41 2013
@@ -33,13 +33,13 @@ import java.util.regex.*;
 * any given bin value as much as possible.  For that reason I've organized this structure
 * accordingly.
 */
-public class ThrottleDescription
+public class ThrottleDescription implements IThrottleSpec
 {
   public static final String _rcsid = "@(#)$Id: ThrottleDescription.java 988245 2010-08-23 18:39:35Z kwright $";
 
   /** This is the hash that contains everything.  It's keyed by the regexp string itself.
   * Values are ThrottleItem's. */
-  protected HashMap patternHash = new HashMap();
+  protected Map<String,ThrottleItem> patternHash = new HashMap<String,ThrottleItem>();
 
   /** Constructor.  Build the description from the ConfigParams. */
   public ThrottleDescription(ConfigParams configData)
@@ -146,17 +146,15 @@ public class ThrottleDescription
   }
 
   /** Given a bin name, find the max open connections to use for that bin.
-  *@return -1 if no limit found.
+  *@return Integer.MAX_VALUE if no limit found.
   */
+  @Override
   public int getMaxOpenConnections(String binName)
   {
     // Go through the regexps and match; for each match, find the maximum possible.
     int maxCount = -1;
-    Iterator iter = patternHash.keySet().iterator();
-    while (iter.hasNext())
+    for (ThrottleItem ti : patternHash.values())
     {
-      String binDescription = (String)iter.next();
-      ThrottleItem ti = (ThrottleItem)patternHash.get(binDescription);
       Integer limit = ti.getMaxOpenConnections();
       if (limit != null)
       {
@@ -169,22 +167,24 @@ public class ThrottleDescription
         }
       }
     }
+    if (maxCount == -1)
+      maxCount = Integer.MAX_VALUE;
+    else if (maxCount == 0)
+      maxCount = 1;
     return maxCount;
   }
 
   /** Look up minimum milliseconds per byte for a bin.
   *@return 0.0 if no limit found.
   */
+  @Override
   public double getMinimumMillisecondsPerByte(String binName)
   {
     // Go through the regexps and match; for each match, find the maximum possible.
     double minMilliseconds = 0.0;
     boolean seenSomething = false;
-    Iterator iter = patternHash.keySet().iterator();
-    while (iter.hasNext())
+    for (ThrottleItem ti : patternHash.values())
     {
-      String binDescription = (String)iter.next();
-      ThrottleItem ti = (ThrottleItem)patternHash.get(binDescription);
       Double limit = ti.getMinimumMillisecondsPerByte();
       if (limit != null)
       {
@@ -206,16 +206,14 @@ public class ThrottleDescription
   /** Look up minimum milliseconds for a fetch for a bin.
   *@return 0 if no limit found.
   */
+  @Override
   public long getMinimumMillisecondsPerFetch(String binName)
   {
     // Go through the regexps and match; for each match, find the maximum possible.
     long minMilliseconds = 0L;
     boolean seenSomething = false;
-    Iterator iter = patternHash.keySet().iterator();
-    while (iter.hasNext())
+    for (ThrottleItem ti : patternHash.values())
     {
-      String binDescription = (String)iter.next();
-      ThrottleItem ti = (ThrottleItem)patternHash.get(binDescription);
       Long limit = ti.getMinimumMillisecondsPerFetch();
       if (limit != null)
       {
@@ -239,7 +237,7 @@ public class ThrottleDescription
   protected static class ThrottleItem
   {
     /** The bin-matching pattern. */
-    protected Pattern pattern;
+    protected final Pattern pattern;
     /** The minimum milliseconds between bytes, or null if no limit. */
     protected Double minimumMillisecondsPerByte = null;
     /** The minimum milliseconds per fetch, or null if no limit */

Modified: manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1551480&r1=1551479&r2=1551480&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Tue Dec 17 08:18:41 2013
@@ -164,10 +164,11 @@ public class ThrottledFetcher
   *@param connectionLimit is the maximum number of connections permitted.
   *@return an IThrottledConnection object that can be used to fetch from the port.
   */
-  public static IThrottledConnection getConnection(String protocol, String server, int port,
+  public static IThrottledConnection getConnection(IThreadContext threadContext,
+    String protocol, String server, int port,
     PageCredentials authentication,
     IKeystoreManager trustStore,
-    ThrottleDescription throttleDescription, String[] binNames,
+    IThrottleSpec throttleDescription, String[] binNames,
     int connectionLimit,
     String proxyHost, int proxyPort, String proxyAuthDomain, String proxyAuthUsername, String proxyAuthPassword)
     throws ManifoldCFException
@@ -291,12 +292,6 @@ public class ThrottledFetcher
               // Figure out the connection limit for this bin, based on the throttle description
               int maxConnections = throttleDescription.getMaxOpenConnections(binName);
 
-              // If no restriction, use a very large value.
-              if (maxConnections == -1)
-                maxConnections = Integer.MAX_VALUE;
-              else if (maxConnections == 0)
-                maxConnections = 1;
-
               // Now, do what we need to do to reserve our connection for this bin.
               // If we can't reserve it now, we plan on undoing everything we did, so
               // whatever we do must be reversible.  Furthermore, nothing we call here
@@ -436,7 +431,7 @@ public class ThrottledFetcher
 
 
   /** Flush connections that have timed out from inactivity. */
-  public static void flushIdleConnections()
+  public static void flushIdleConnections(IThreadContext threadContext)
     throws ManifoldCFException
   {
     synchronized (poolLock)
@@ -1150,7 +1145,7 @@ public class ThrottledFetcher
 
     /** Set up the connection.  This allows us to feed all bins the correct bandwidth limit info.
     */
-    public void setup(ThrottleDescription description)
+    public void setup(IThrottleSpec description)
     {
       // Go through all bins, and set up the current limits.
       int i = 0;

Modified: manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1551480&r1=1551479&r2=1551480&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Tue Dec 17 08:18:41 2013
@@ -160,6 +160,8 @@ public class WebcrawlerConnector extends
   protected int connectionTimeoutMilliseconds = 60000;
   /** Socket timeout, milliseconds */
   protected int socketTimeoutMilliseconds = 300000;
+  /** Throttle group name */
+  protected String throttleGroupName = null;
 
   // Canonicalization enabling/disabling.  Eventually this will probably need to be by regular expression.
 
@@ -354,6 +356,9 @@ public class WebcrawlerConnector extends
     {
       String x;
 
+      // Either set this from the connection name, or just have one.  Right now, we have one.
+      String throttleGroupName = "";
+      
       String emailAddress = params.getParameter(WebcrawlerConfig.PARAMETER_EMAIL);
       if (emailAddress == null)
         throw new ManifoldCFException("Missing email address");
@@ -406,7 +411,7 @@ public class WebcrawlerConnector extends
   public void poll()
     throws ManifoldCFException
   {
-    ThrottledFetcher.flushIdleConnections();
+    ThrottledFetcher.flushIdleConnections(currentContext);
   }
 
   /** Check status of connection.
@@ -425,6 +430,7 @@ public class WebcrawlerConnector extends
   public void disconnect()
     throws ManifoldCFException
   {
+    throttleGroupName = null;
     throttleDescription = null;
     credentialsDescription = null;
     trustsDescription = null;
@@ -711,7 +717,8 @@ public class WebcrawlerConnector extends
 
                   // Prepare to perform the fetch, and decide what to do with the document.
                   //
-                  IThrottledConnection connection = ThrottledFetcher.getConnection(protocol,ipAddress,port,
+                  IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
+                    protocol,ipAddress,port,
                     credential,trustStore,throttleDescription,binNames,connectionLimit,
                     proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
                   try
@@ -5126,7 +5133,8 @@ public class WebcrawlerConnector extends
       // We've successfully obtained a lock on reading robots for this server!  Now, guarantee that we'll free it, by instantiating a try/finally
       try
       {
-        IThrottledConnection connection = ThrottledFetcher.getConnection(protocol,hostIPAddress,port,credential,
+        IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
+          protocol,hostIPAddress,port,credential,
           trustStore,throttleDescription,binNames,connectionLimit,
           proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
         try