You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2013/12/17 09:18:41 UTC
svn commit: r1551480 - in
/manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler:
ThrottleDescription.java ThrottledFetcher.java WebcrawlerConnector.java
Author: kwright
Date: Tue Dec 17 08:18:41 2013
New Revision: 1551480
URL: http://svn.apache.org/r1551480
Log:
Refactor a bit in prep for integrating common throttling service
Modified:
manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java
manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
Modified: manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java?rev=1551480&r1=1551479&r2=1551480&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java (original)
+++ manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottleDescription.java Tue Dec 17 08:18:41 2013
@@ -33,13 +33,13 @@ import java.util.regex.*;
* any given bin value as much as possible. For that reason I've organized this structure
* accordingly.
*/
-public class ThrottleDescription
+public class ThrottleDescription implements IThrottleSpec
{
public static final String _rcsid = "@(#)$Id: ThrottleDescription.java 988245 2010-08-23 18:39:35Z kwright $";
/** This is the hash that contains everything. It's keyed by the regexp string itself.
* Values are ThrottleItem's. */
- protected HashMap patternHash = new HashMap();
+ protected Map<String,ThrottleItem> patternHash = new HashMap<String,ThrottleItem>();
/** Constructor. Build the description from the ConfigParams. */
public ThrottleDescription(ConfigParams configData)
@@ -146,17 +146,15 @@ public class ThrottleDescription
}
/** Given a bin name, find the max open connections to use for that bin.
- *@return -1 if no limit found.
+ *@return Integer.MAX_VALUE if no limit found.
*/
+ @Override
public int getMaxOpenConnections(String binName)
{
// Go through the regexps and match; for each match, find the maximum possible.
int maxCount = -1;
- Iterator iter = patternHash.keySet().iterator();
- while (iter.hasNext())
+ for (ThrottleItem ti : patternHash.values())
{
- String binDescription = (String)iter.next();
- ThrottleItem ti = (ThrottleItem)patternHash.get(binDescription);
Integer limit = ti.getMaxOpenConnections();
if (limit != null)
{
@@ -169,22 +167,24 @@ public class ThrottleDescription
}
}
}
+ if (maxCount == -1)
+ maxCount = Integer.MAX_VALUE;
+ else if (maxCount == 0)
+ maxCount = 1;
return maxCount;
}
/** Look up minimum milliseconds per byte for a bin.
*@return 0.0 if no limit found.
*/
+ @Override
public double getMinimumMillisecondsPerByte(String binName)
{
// Go through the regexps and match; for each match, find the maximum possible.
double minMilliseconds = 0.0;
boolean seenSomething = false;
- Iterator iter = patternHash.keySet().iterator();
- while (iter.hasNext())
+ for (ThrottleItem ti : patternHash.values())
{
- String binDescription = (String)iter.next();
- ThrottleItem ti = (ThrottleItem)patternHash.get(binDescription);
Double limit = ti.getMinimumMillisecondsPerByte();
if (limit != null)
{
@@ -206,16 +206,14 @@ public class ThrottleDescription
/** Look up minimum milliseconds for a fetch for a bin.
*@return 0 if no limit found.
*/
+ @Override
public long getMinimumMillisecondsPerFetch(String binName)
{
// Go through the regexps and match; for each match, find the maximum possible.
long minMilliseconds = 0L;
boolean seenSomething = false;
- Iterator iter = patternHash.keySet().iterator();
- while (iter.hasNext())
+ for (ThrottleItem ti : patternHash.values())
{
- String binDescription = (String)iter.next();
- ThrottleItem ti = (ThrottleItem)patternHash.get(binDescription);
Long limit = ti.getMinimumMillisecondsPerFetch();
if (limit != null)
{
@@ -239,7 +237,7 @@ public class ThrottleDescription
protected static class ThrottleItem
{
/** The bin-matching pattern. */
- protected Pattern pattern;
+ protected final Pattern pattern;
/** The minimum milliseconds between bytes, or null if no limit. */
protected Double minimumMillisecondsPerByte = null;
/** The minimum milliseconds per fetch, or null if no limit */
Modified: manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java?rev=1551480&r1=1551479&r2=1551480&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java (original)
+++ manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/ThrottledFetcher.java Tue Dec 17 08:18:41 2013
@@ -164,10 +164,11 @@ public class ThrottledFetcher
*@param connectionLimit is the maximum number of connections permitted.
*@return an IThrottledConnection object that can be used to fetch from the port.
*/
- public static IThrottledConnection getConnection(String protocol, String server, int port,
+ public static IThrottledConnection getConnection(IThreadContext threadContext,
+ String protocol, String server, int port,
PageCredentials authentication,
IKeystoreManager trustStore,
- ThrottleDescription throttleDescription, String[] binNames,
+ IThrottleSpec throttleDescription, String[] binNames,
int connectionLimit,
String proxyHost, int proxyPort, String proxyAuthDomain, String proxyAuthUsername, String proxyAuthPassword)
throws ManifoldCFException
@@ -291,12 +292,6 @@ public class ThrottledFetcher
// Figure out the connection limit for this bin, based on the throttle description
int maxConnections = throttleDescription.getMaxOpenConnections(binName);
- // If no restriction, use a very large value.
- if (maxConnections == -1)
- maxConnections = Integer.MAX_VALUE;
- else if (maxConnections == 0)
- maxConnections = 1;
-
// Now, do what we need to do to reserve our connection for this bin.
// If we can't reserve it now, we plan on undoing everything we did, so
// whatever we do must be reversible. Furthermore, nothing we call here
@@ -436,7 +431,7 @@ public class ThrottledFetcher
/** Flush connections that have timed out from inactivity. */
- public static void flushIdleConnections()
+ public static void flushIdleConnections(IThreadContext threadContext)
throws ManifoldCFException
{
synchronized (poolLock)
@@ -1150,7 +1145,7 @@ public class ThrottledFetcher
/** Set up the connection. This allows us to feed all bins the correct bandwidth limit info.
*/
- public void setup(ThrottleDescription description)
+ public void setup(IThrottleSpec description)
{
// Go through all bins, and set up the current limits.
int i = 0;
Modified: manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java
URL: http://svn.apache.org/viewvc/manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java?rev=1551480&r1=1551479&r2=1551480&view=diff
==============================================================================
--- manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java (original)
+++ manifoldcf/branches/CONNECTORS-829/connectors/webcrawler/connector/src/main/java/org/apache/manifoldcf/crawler/connectors/webcrawler/WebcrawlerConnector.java Tue Dec 17 08:18:41 2013
@@ -160,6 +160,8 @@ public class WebcrawlerConnector extends
protected int connectionTimeoutMilliseconds = 60000;
/** Socket timeout, milliseconds */
protected int socketTimeoutMilliseconds = 300000;
+ /** Throttle group name */
+ protected String throttleGroupName = null;
// Canonicalization enabling/disabling. Eventually this will probably need to be by regular expression.
@@ -354,6 +356,9 @@ public class WebcrawlerConnector extends
{
String x;
+ // Either set this from the connection name, or just have one. Right now, we have one.
+ String throttleGroupName = "";
+
String emailAddress = params.getParameter(WebcrawlerConfig.PARAMETER_EMAIL);
if (emailAddress == null)
throw new ManifoldCFException("Missing email address");
@@ -406,7 +411,7 @@ public class WebcrawlerConnector extends
public void poll()
throws ManifoldCFException
{
- ThrottledFetcher.flushIdleConnections();
+ ThrottledFetcher.flushIdleConnections(currentContext);
}
/** Check status of connection.
@@ -425,6 +430,7 @@ public class WebcrawlerConnector extends
public void disconnect()
throws ManifoldCFException
{
+ throttleGroupName = null;
throttleDescription = null;
credentialsDescription = null;
trustsDescription = null;
@@ -711,7 +717,8 @@ public class WebcrawlerConnector extends
// Prepare to perform the fetch, and decide what to do with the document.
//
- IThrottledConnection connection = ThrottledFetcher.getConnection(protocol,ipAddress,port,
+ IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
+ protocol,ipAddress,port,
credential,trustStore,throttleDescription,binNames,connectionLimit,
proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
try
@@ -5126,7 +5133,8 @@ public class WebcrawlerConnector extends
// We've successfully obtained a lock on reading robots for this server! Now, guarantee that we'll free it, by instantiating a try/finally
try
{
- IThrottledConnection connection = ThrottledFetcher.getConnection(protocol,hostIPAddress,port,credential,
+ IThrottledConnection connection = ThrottledFetcher.getConnection(currentContext,
+ protocol,hostIPAddress,port,credential,
trustStore,throttleDescription,binNames,connectionLimit,
proxyHost,proxyPort,proxyAuthDomain,proxyAuthUsername,proxyAuthPassword);
try