You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@manifoldcf.apache.org by kw...@apache.org on 2010/02/18 15:31:31 UTC
svn commit: r911418 [2/8] - in /incubator/lcf/trunk/modules:
connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/
framework/agents/org/apache/lcf/agents/agentmanager/
framework/agents/org/apache/lcf/agents/incrementalingest/ fr...
Modified: incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/RobotsManager.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/RobotsManager.java?rev=911418&r1=911417&r2=911418&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/RobotsManager.java (original)
+++ incubator/lcf/trunk/modules/connectors/webcrawler/connector/org/apache/lcf/crawler/connectors/webcrawler/RobotsManager.java Thu Feb 18 14:31:31 2010
@@ -34,891 +34,885 @@
*/
public class RobotsManager extends org.apache.lcf.core.database.BaseTable
{
- public static final String _rcsid = "@(#)$Id$";
+ public static final String _rcsid = "@(#)$Id$";
- // Robots cache class. Only one needed.
- protected static RobotsCacheClass robotsCacheClass = new RobotsCacheClass();
+ // Robots cache class. Only one needed.
+ protected static RobotsCacheClass robotsCacheClass = new RobotsCacheClass();
- // Database fields
- protected final static String hostField = "hostname";
- protected final static String robotsField = "robotsdata";
- protected final static String expirationField = "expirationtime";
-
- // Cache manager. This handle is set up during the constructor.
- ICacheManager cacheManager;
-
- /** Constructor. Note that one robotsmanager handle is only useful within a specific thread context,
- * so the calling connector object logic must recreate the handle whenever the thread context changes.
- *@param tc is the thread context.
- *@param database is the database handle.
- */
- public RobotsManager(IThreadContext tc, IDBInterface database)
- throws LCFException
- {
- super(database,"robotsdata");
- cacheManager = CacheManagerFactory.make(tc);
- }
-
- /** Install the manager.
- */
- public void install()
- throws LCFException
- {
- beginTransaction();
- try
- {
- Map existing = getTableSchema(null,null);
- if (existing == null)
- {
- // Install the table.
- HashMap map = new HashMap();
- map.put(hostField,new ColumnDescription("VARCHAR(255)",true,false,null,null,false));
- map.put(expirationField,new ColumnDescription("BIGINT",false,false,null,null,false));
- map.put(robotsField,new ColumnDescription("BYTEA",false,true,null,null,false));
- performCreate(map,null);
- }
- }
- catch (LCFException e)
- {
- signalRollback();
- throw e;
- }
- catch (Error e)
- {
- signalRollback();
- throw e;
- }
- finally
- {
- endTransaction();
- }
- }
-
- /** Uninstall the manager.
- */
- public void deinstall()
- throws LCFException
- {
- performDrop(null);
- }
-
-
- /** Read robots.txt data from the cache or from the database.
- *@param hostName is the host for which the data is desired.
- *@param currentTime is the time of the check.
- *@return null if the record needs to be fetched, true if fetch is allowed.
- */
- public Boolean checkFetchAllowed(String userAgent, String hostName, long currentTime, String pathString,
- IVersionActivity activities)
- throws LCFException
- {
- // Build description objects
- HostDescription[] objectDescriptions = new HostDescription[1];
- StringSetBuffer ssb = new StringSetBuffer();
- ssb.add(getRobotsKey(hostName));
- objectDescriptions[0] = new HostDescription(hostName,new StringSet(ssb));
-
- HostExecutor exec = new HostExecutor(this,activities,objectDescriptions[0]);
- cacheManager.findObjectsAndExecute(objectDescriptions,null,exec,getTransactionID());
-
- // We do the expiration check here, rather than in the query, so that caching
- // is possible.
- RobotsData rd = exec.getResults();
- if (rd == null || rd.getExpirationTime() <= currentTime)
- return null;
- return new Boolean(rd.isFetchAllowed(userAgent,pathString));
- }
-
- /** Write robots.txt, replacing any existing row.
- *@param hostName is the host.
- *@param expirationTime is the time this data should expire.
- *@param data is the robots data stream. May be null.
- */
- public void writeRobotsData(String hostName, long expirationTime, InputStream data)
- throws LCFException, IOException
- {
- TempFileInput tfi = null;
- try
- {
- if (data != null)
- {
- try
- {
- tfi = new TempFileInput(data);
- }
- catch (LCFException e)
- {
- if (e.getErrorCode() == LCFException.INTERRUPTED)
- throw e;
- throw new IOException("Fetch failed: "+e.getMessage());
- }
- }
-
- StringSetBuffer ssb = new StringSetBuffer();
- ssb.add(getRobotsKey(hostName));
- StringSet cacheKeys = new StringSet(ssb);
- ICacheHandle ch = cacheManager.enterCache(null,cacheKeys,getTransactionID());
- try
- {
-
- beginTransaction();
- try
- {
- // See whether the instance exists
- ArrayList params = new ArrayList();
- params.add(hostName);
- IResultSet set = performQuery("SELECT * FROM "+getTableName()+" WHERE "+
- hostField+"=?",params,null,null);
- HashMap values = new HashMap();
- values.put(expirationField,new Long(expirationTime));
- if (tfi != null)
- values.put(robotsField,tfi);
- if (set.getRowCount() > 0)
- {
- // Update
- params.clear();
- params.add(hostName);
- performUpdate(values," WHERE "+hostField+"=?",params,null);
- }
- else
- {
- // Insert
- values.put(hostField,hostName);
- // We only need the general key because this is new.
- performInsert(values,null);
- }
- cacheManager.invalidateKeys(ch);
- }
- catch (LCFException e)
- {
- signalRollback();
- throw e;
- }
- catch (Error e)
- {
- signalRollback();
- throw e;
- }
- finally
- {
- endTransaction();
- }
- }
- finally
- {
- cacheManager.leaveCache(ch);
- }
- }
- finally
- {
- if (tfi != null)
- tfi.discard();
- }
- }
-
- // Protected methods and classes
-
- /** Construct a key which represents an individual host name.
- *@param hostName is the name of the connector.
- *@return the cache key.
- */
- protected static String getRobotsKey(String hostName)
- {
- return "ROBOTS_"+hostName;
- }
-
- /** Read robots data, if it exists.
- *@return null if the data doesn't exist at all. Return robots data if it does.
- */
- protected RobotsData readRobotsData(String hostName, IVersionActivity activities)
- throws LCFException
- {
- try
- {
- ArrayList list = new ArrayList();
- list.add(hostName);
- IResultSet set = performQuery("SELECT "+robotsField+","+expirationField+" FROM "+getTableName()+
- " WHERE "+hostField+"=?",list,null,null);
- if (set.getRowCount() == 0)
- return null;
- if (set.getRowCount() > 1)
- throw new LCFException("Unexpected number of robotsdata rows matching '"+hostName+"': "+Integer.toString(set.getRowCount()));
- IResultRow row = set.getRow(0);
- long expiration = ((Long)row.getValue(expirationField)).longValue();
- BinaryInput bi = (BinaryInput)row.getValue(robotsField);
- if (bi == null)
- return new RobotsData(null,expiration,hostName,activities);
- try
- {
- InputStream is = bi.getStream();
- return new RobotsData(is,expiration,hostName,activities);
- }
- finally
- {
- bi.discard();
- }
- }
- catch (InterruptedIOException e)
- {
- throw new LCFException("Interrupted: "+e.getMessage(),e,LCFException.INTERRUPTED);
- }
- catch (IOException e)
- {
- throw new LCFException("IO error reading robots data for "+hostName+": "+e.getMessage(),e);
- }
- }
-
- /** Convert a string from the robots file into a readable form that does NOT contain NUL characters (since postgresql does not accept those).
- */
- protected static String makeReadable(String inputString)
- {
- StringBuffer sb = new StringBuffer();
- int i = 0;
- while (i < inputString.length())
- {
- char y = inputString.charAt(i++);
- if (y >= ' ')
- sb.append(y);
- else
- {
- sb.append('^');
- sb.append((char)(y + '@'));
- }
- }
- return sb.toString();
- }
-
- /** This is a cached data item.
- */
- protected static class RobotsData
- {
- protected long expiration;
- protected ArrayList records = null;
-
- /** Constructor. */
- public RobotsData(InputStream is, long expiration, String hostName, IVersionActivity activities)
- throws IOException, LCFException
- {
- this.expiration = expiration;
- if (is == null)
- {
- records = null;
- return;
- }
- Reader r = new InputStreamReader(is,"utf-8");
- try
- {
- BufferedReader br = new BufferedReader(r);
- try
- {
- parseRobotsTxt(br,hostName,activities);
- }
- finally
- {
- br.close();
- }
- }
- finally
- {
- r.close();
- }
- }
-
- /** Check if fetch is allowed */
- public boolean isFetchAllowed(String userAgent, String pathString)
- {
- if (records == null)
- return true;
-
- boolean wasDisallowed = false;
- boolean wasAllowed = false;
-
- // First matching user-agent takes precedence, according to the following chunk of spec:
- // "These name tokens are used in User-agent lines in /robots.txt to
- // identify to which specific robots the record applies. The robot
- // must obey the first record in /robots.txt that contains a User-
- // Agent line whose value contains the name token of the robot as a
- // substring. The name comparisons are case-insensitive. If no such
- // record exists, it should obey the first record with a User-agent
- // line with a "*" value, if present. If no record satisfied either
- // condition, or no records are present at all, access is unlimited."
-
- boolean sawAgent = false;
-
- String userAgentUpper = userAgent.toUpperCase();
-
- int i = 0;
- while (i < records.size())
- {
- Record r = (Record)records.get(i++);
- if (r.isAgentMatch(userAgentUpper,false))
- {
- if (r.isDisallowed(pathString))
- wasDisallowed = true;
- if (r.isAllowed(pathString))
- wasAllowed = true;
-
- sawAgent = true;
- break;
- }
- }
- if (sawAgent == false)
- {
- i = 0;
- while (i < records.size())
- {
- Record r = (Record)records.get(i++);
- if (r.isAgentMatch("*",true))
- {
- if (r.isDisallowed(pathString))
- wasDisallowed = true;
- if (r.isAllowed(pathString))
- wasAllowed = true;
-
- sawAgent = true;
- break;
- }
- }
- }
-
- if (sawAgent == false)
- return true;
-
- // Allowed always overrides disallowed
- if (wasAllowed)
- return true;
- if (wasDisallowed)
- return false;
-
- // No match -> crawl allowed
- return true;
- }
-
- /** Get expiration */
- public long getExpirationTime()
- {
- return expiration;
- }
-
- /** Parse the robots.txt file using a reader.
- * Is NOT expected to close the stream.
- */
- protected void parseRobotsTxt(BufferedReader r, String hostName, IVersionActivity activities)
- throws IOException, LCFException
- {
- boolean parseCompleted = false;
- boolean robotsWasHtml = false;
- boolean foundErrors = false;
- String description = null;
-
- long startParseTime = System.currentTimeMillis();
- try
- {
- records = new ArrayList();
- Record record = null;
- boolean seenAction = false;
- while (true)
- {
- String x = r.readLine();
- if (x == null)
- break;
- int numSignPos = x.indexOf("#");
- if (numSignPos != -1)
- x = x.substring(0,numSignPos);
- String lowercaseLine = x.toLowerCase().trim();
- if (lowercaseLine.startsWith("user-agent:"))
- {
- if (seenAction)
- {
- records.add(record);
- record = null;
- seenAction = false;
- }
- if (record == null)
- record = new Record();
-
- String agentName = x.substring("User-agent:".length()).trim();
- record.addAgent(agentName);
- }
- else if (lowercaseLine.startsWith("user-agent"))
- {
- if (seenAction)
- {
- records.add(record);
- record = null;
- seenAction = false;
- }
- if (record == null)
- record = new Record();
-
- String agentName = x.substring("User-agent".length()).trim();
- record.addAgent(agentName);
- }
- else if (lowercaseLine.startsWith("disallow:"))
- {
- if (record == null)
- {
- description = "Disallow without User-agent";
- Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
- foundErrors = true;
- }
- else
- {
- String disallowPath = x.substring("Disallow:".length()).trim();
- // The spec says that a blank disallow means let everything through.
- if (disallowPath.length() > 0)
- record.addDisallow(disallowPath);
- seenAction = true;
- }
- }
- else if (lowercaseLine.startsWith("disallow"))
- {
- if (record == null)
- {
- description = "Disallow without User-agent";
- Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
- foundErrors = true;
- }
- else
- {
- String disallowPath = x.substring("Disallow".length()).trim();
- // The spec says that a blank disallow means let everything through.
- if (disallowPath.length() > 0)
- record.addDisallow(disallowPath);
- seenAction = true;
- }
- }
- else if (lowercaseLine.startsWith("allow:"))
- {
- if (record == null)
- {
- description = "Allow without User-agent";
- Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
- foundErrors = true;
- }
- else
- {
- String allowPath = x.substring("Allow:".length()).trim();
- // The spec says that a blank disallow means let everything through.
- if (allowPath.length() > 0)
- record.addAllow(allowPath);
- seenAction = true;
- }
- }
- else if (lowercaseLine.startsWith("allow"))
- {
- if (record == null)
- {
- description = "Allow without User-agent";
- Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
- foundErrors = true;
- }
- else
- {
- String allowPath = x.substring("Allow".length()).trim();
- // The spec says that a blank disallow means let everything through.
- if (allowPath.length() > 0)
- record.addAllow(allowPath);
- seenAction = true;
- }
- }
- else if (lowercaseLine.startsWith("crawl-delay:"))
- {
- // We don't complain about this, but right now we don't listen to it either.
- }
- else if (lowercaseLine.startsWith("crawl-delay"))
- {
- // We don't complain about this, but right now we don't listen to it either.
- }
- else
- {
- // If it's not just a blank line, complain
- if (x.trim().length() > 0)
- {
- String problemLine = makeReadable(x);
- description = "Unknown robots.txt line: '"+problemLine+"'";
- Logging.connectors.warn("Web: Unknown robots.txt line from '"+hostName+"': '"+problemLine+"'");
- if (x.indexOf("<html") != -1 || x.indexOf("<HTML") != -1)
- {
- // Looks like some kind of an html file, probably as a result of a redirection, so just abort as if we have a page error
- robotsWasHtml = true;
- parseCompleted = true;
- break;
- }
- foundErrors = true;
- }
- }
- }
- if (record != null)
- records.add(record);
- parseCompleted = true;
- }
- finally
- {
- // Log the fact that we attempted to parse robots.txt, as well as what happened
- // These are the following situations we will report:
- // (1) INCOMPLETE - Parsing did not complete - if the stream was interrupted
- // (2) HTML - Robots was html - if the robots data seemed to be html
- // (3) ERRORS - Robots had errors - if the robots data was accepted but had errors in it
- // (4) SUCCESS - Robots parsed successfully - if the robots data was parsed without problem
- String status;
- if (parseCompleted)
- {
- if (robotsWasHtml)
- {
- status = "HTML";
- description = "Robots file contained HTML, skipped";
- }
- else
- {
- if (foundErrors)
- {
- status = "ERRORS";
- // description should already be set
- }
- else
- {
- status = "SUCCESS";
- description = null;
- }
- }
- }
- else
- {
- status = "INCOMPLETE";
- description = "Parsing was interrupted";
- }
-
- activities.recordActivity(new Long(startParseTime),WebcrawlerConnector.ACTIVITY_ROBOTSPARSE,
- null,hostName,status,description,null);
-
- }
- }
-
- }
-
- /** Check if path matches specification */
- protected static boolean doesPathMatch(String path, String spec)
- {
- // For robots 1.0, this function would do just this:
- // return path.startsWith(spec);
- // However, we implement the "google bot" spec, which allows wildcard matches that are, in fact, regular-expression-like in some ways.
- // The "specification" can be found here: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40367
- return doesPathMatch(path,0,spec,0);
- }
-
- /** Recursive method for matching specification to path. */
- protected static boolean doesPathMatch(String path, int pathIndex, String spec, int specIndex)
- {
- while (true)
- {
- if (specIndex == spec.length())
- // Hit the end of the specification! We're done.
- return true;
- char specChar = spec.charAt(specIndex++);
- if (specChar == '*')
- {
- // Found a specification wildcard.
- // Eat up all the '*' characters at this position - otherwise each additional one increments the exponent of how long this can take,
- // making denial-of-service via robots parsing a possibility.
- while (specIndex < spec.length())
- {
- if (spec.charAt(specIndex) != '*')
- break;
- specIndex++;
- }
- // It represents zero or more characters, so we must recursively try for a match against all remaining characters in the path string.
- while (true)
- {
- boolean match = doesPathMatch(path,pathIndex,spec,specIndex);
- if (match)
- return true;
- if (path.length() == pathIndex)
- // Nothing further to try, and no match
- return false;
- pathIndex++;
- // Try again
- }
- }
- else if (specChar == '$' && specIndex == spec.length())
- {
- // Found a specification end-of-path character.
- // (It can only be legitimately the last character of the specification.)
- return pathIndex == path.length();
- }
- if (pathIndex == path.length())
- // Hit the end of the path! (but not the end of the specification!)
- return false;
- if (path.charAt(pathIndex) != specChar)
- return false;
- // On to the next match
- pathIndex++;
- }
- }
-
- /** This is the object description for a robots host object.
- * This is the key that is used to look up cached data.
- */
- protected static class HostDescription extends org.apache.lcf.core.cachemanager.BaseDescription
- {
- protected String hostName;
- protected String criticalSectionName;
- protected StringSet cacheKeys;
-
- public HostDescription(String hostName, StringSet invKeys)
- {
- super("robotscache");
- this.hostName = hostName;
- criticalSectionName = getClass().getName()+"-"+hostName;
- cacheKeys = invKeys;
- }
-
- public String getHostName()
- {
- return hostName;
- }
-
- public int hashCode()
- {
- return hostName.hashCode();
- }
-
- public boolean equals(Object o)
- {
- if (!(o instanceof HostDescription))
- return false;
- HostDescription d = (HostDescription)o;
- return d.hostName.equals(hostName);
- }
-
- public String getCriticalSectionName()
- {
- return criticalSectionName;
- }
-
- /** Get the cache keys for an object (which may or may not exist yet in
- * the cache). This method is called in order for cache manager to throw the correct locks.
- * @return the object's cache keys, or null if the object should not
- * be cached.
- */
- public StringSet getObjectKeys()
- {
- return cacheKeys;
- }
-
- /** Get the object class for an object. The object class is used to determine
- * the group of objects treated in the same LRU manner.
- * @return the newly created object's object class, or null if there is no
- * such class, and LRU behavior is not desired.
- */
- public ICacheClass getObjectClass()
- {
- return robotsCacheClass;
- }
- }
-
- /** Cache class for robots.
- * An instance of this class describes the cache class for robots data caching. There's
- * only ever a need for one, so that will be created statically.
- */
- protected static class RobotsCacheClass implements ICacheClass
- {
- /** Get the name of the object class.
- * This determines the set of objects that are treated in the same
- * LRU pool.
- *@return the class name.
- */
- public String getClassName()
- {
- // We count all the robot data, so this is a constant string.
- return "ROBOTSCLASS";
- }
-
- /** Get the maximum LRU count of the object class.
- *@return the maximum number of the objects of the particular class
- * allowed.
- */
- public int getMaxLRUCount()
- {
- // Hardwired for the moment; 2000 robots data records will be cached,
- // and no more.
- return 2000;
- }
-
- }
-
- /** This is the executor object for locating robots host objects.
- * This object furnishes the operations the cache manager needs to rebuild objects that it needs that are
- * not in the cache at the moment.
- */
- protected static class HostExecutor extends org.apache.lcf.core.cachemanager.ExecutorBase
- {
- // Member variables
- protected RobotsManager thisManager;
- protected RobotsData returnValue;
- protected HostDescription thisHost;
- protected IVersionActivity activities;
-
- /** Constructor.
- *@param manager is the RobotsManager class instance.
- *@param objectDescription is the desired object description.
- */
- public HostExecutor(RobotsManager manager, IVersionActivity activities, HostDescription objectDescription)
- {
- super();
- thisManager = manager;
- this.activities = activities;
- thisHost = objectDescription;
- returnValue = null;
- }
-
- /** Get the result.
- *@return the looked-up or read cached instance.
- */
- public RobotsData getResults()
- {
- return returnValue;
- }
-
- /** Create a set of new objects to operate on and cache. This method is called only
- * if the specified object(s) are NOT available in the cache. The specified objects
- * should be created and returned; if they are not created, it means that the
- * execution cannot proceed, and the execute() method will not be called.
- * @param objectDescriptions is the set of unique identifier of the object.
- * @return the newly created objects to cache, or null, if any object cannot be created.
- * The order of the returned objects must correspond to the order of the object descriptinos.
- */
- public Object[] create(ICacheDescription[] objectDescriptions) throws LCFException
- {
- // I'm not expecting multiple values to be request, so it's OK to walk through the objects
- // and do a request at a time.
- RobotsData[] rval = new RobotsData[objectDescriptions.length];
- int i = 0;
- while (i < rval.length)
- {
- HostDescription desc = (HostDescription)objectDescriptions[i];
- // I need to cache both the data and the expiration date, and pick up both when I
- // do the query. This is because I don't want to cache based on request time, since that
- // would screw up everything!
- rval[i] = thisManager.readRobotsData(desc.getHostName(),activities);
- i++;
- }
-
- return rval;
- }
-
-
- /** Notify the implementing class of the existence of a cached version of the
- * object. The object is passed to this method so that the execute() method below
- * will have it available to operate on. This method is also called for all objects
- * that are freshly created as well.
- * @param objectDescription is the unique identifier of the object.
- * @param cachedObject is the cached object.
- */
- public void exists(ICacheDescription objectDescription, Object cachedObject) throws LCFException
- {
- // Cast what came in as what it really is
- HostDescription objectDesc = (HostDescription)objectDescription;
- RobotsData robotsData = (RobotsData)cachedObject;
- if (objectDesc.equals(thisHost))
- returnValue = robotsData;
- }
-
- /** Perform the desired operation. This method is called after either createGetObject()
- * or exists() is called for every requested object.
- */
- public void execute() throws LCFException
- {
- // Does nothing; we only want to fetch objects in this cacher.
- }
-
-
- }
-
- /** This class represents a record in a robots.txt file. It contains one or
- * more user-agents, and one or more disallows.
- */
- protected static class Record
- {
- protected ArrayList userAgents = new ArrayList();
- protected ArrayList disallows = new ArrayList();
- protected ArrayList allows = new ArrayList();
-
- /** Constructor.
- */
- public Record()
- {
- }
-
- /** Add a user-agent.
- */
- public void addAgent(String agentName)
- {
- userAgents.add(agentName);
- }
-
- /** Add a disallow.
- */
- public void addDisallow(String disallowPath)
- {
- disallows.add(disallowPath);
- }
-
- /** Add an allow.
- */
- public void addAllow(String allowPath)
- {
- allows.add(allowPath);
- }
-
- /** See if user-agent matches.
- */
- public boolean isAgentMatch(String agentNameUpper, boolean exactMatch)
- {
- int i = 0;
- while (i < userAgents.size())
- {
- String agent = ((String)userAgents.get(i++)).toUpperCase();
- if (exactMatch && agent.trim().equals(agentNameUpper))
- return true;
- if (!exactMatch && agentNameUpper.indexOf(agent) != -1)
- return true;
- }
- return false;
- }
-
- /** See if path is disallowed. Only called if user-agent has already
- * matched. (This checks if there's an explicit match with one of the
- * Disallows clauses.)
- */
- public boolean isDisallowed(String path)
- {
- int i = 0;
- while (i < disallows.size())
- {
- String disallow = (String)disallows.get(i++);
- if (doesPathMatch(path,disallow))
- return true;
- }
- return false;
- }
-
- /** See if path is allowed. Only called if user-agent has already
- * matched. (This checks if there's an explicit match with one of the
- * Allows clauses).
- */
- public boolean isAllowed(String path)
- {
- int i = 0;
- while (i < allows.size())
- {
- String allow = (String)allows.get(i++);
- if (doesPathMatch(path,allow))
- return true;
- }
- return false;
- }
-
- }
+ // Database fields
+ protected final static String hostField = "hostname";
+ protected final static String robotsField = "robotsdata";
+ protected final static String expirationField = "expirationtime";
+
+ // Cache manager. This handle is set up during the constructor.
+ ICacheManager cacheManager;
+
+ /** Constructor. Note that one robotsmanager handle is only useful within a specific thread context,
+ * so the calling connector object logic must recreate the handle whenever the thread context changes.
+ *@param tc is the thread context.
+ *@param database is the database handle.
+ */
+ public RobotsManager(IThreadContext tc, IDBInterface database)
+ throws LCFException
+ {
+ super(database,"robotsdata");
+ cacheManager = CacheManagerFactory.make(tc);
+ }
+
+ /** Install the manager.
+ */
+ public void install()
+ throws LCFException
+ {
+ // Standard practice: outer loop on install methods, no transactions
+ while (true)
+ {
+ Map existing = getTableSchema(null,null);
+ if (existing == null)
+ {
+ // Install the table.
+ HashMap map = new HashMap();
+ map.put(hostField,new ColumnDescription("VARCHAR(255)",true,false,null,null,false));
+ map.put(expirationField,new ColumnDescription("BIGINT",false,false,null,null,false));
+ map.put(robotsField,new ColumnDescription("BYTEA",false,true,null,null,false));
+ performCreate(map,null);
+ }
+ else
+ {
+ // Upgrade code, if needed, goes here
+ }
+
+ // Handle indexes, if needed
+
+ break;
+ }
+ }
+
+ /** Uninstall the manager.
+ */
+ public void deinstall()
+ throws LCFException
+ {
+ performDrop(null);
+ }
+
+
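+ /** Check whether a fetch is allowed, using robots.txt data read from the cache or from the database.
+ *@param hostName is the host for which the data is desired.
+ *@param currentTime is the time of the check; a record whose expiration time is at or before this time is treated as needing to be fetched again.
+ *@return null if the record needs to be fetched (missing or expired), true if fetch is allowed, false if it is disallowed.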
+ */
+ public Boolean checkFetchAllowed(String userAgent, String hostName, long currentTime, String pathString,
+ IVersionActivity activities)
+ throws LCFException
+ {
+ // Build description objects
+ HostDescription[] objectDescriptions = new HostDescription[1];
+ StringSetBuffer ssb = new StringSetBuffer();
+ ssb.add(getRobotsKey(hostName));
+ objectDescriptions[0] = new HostDescription(hostName,new StringSet(ssb));
+
+ HostExecutor exec = new HostExecutor(this,activities,objectDescriptions[0]);
+ cacheManager.findObjectsAndExecute(objectDescriptions,null,exec,getTransactionID());
+
+ // We do the expiration check here, rather than in the query, so that caching
+ // is possible.
+ RobotsData rd = exec.getResults();
+ if (rd == null || rd.getExpirationTime() <= currentTime)
+ return null;
+ return new Boolean(rd.isFetchAllowed(userAgent,pathString));
+ }
+
+ /** Write robots.txt, replacing any existing row.
+ *@param hostName is the host.
+ *@param expirationTime is the time this data should expire.
+ *@param data is the robots data stream. May be null.
+ */
+ public void writeRobotsData(String hostName, long expirationTime, InputStream data)
+ throws LCFException, IOException
+ {
+ TempFileInput tfi = null;
+ try
+ {
+ if (data != null)
+ {
+ try
+ {
+ tfi = new TempFileInput(data);
+ }
+ catch (LCFException e)
+ {
+ if (e.getErrorCode() == LCFException.INTERRUPTED)
+ throw e;
+ throw new IOException("Fetch failed: "+e.getMessage());
+ }
+ }
+
+ StringSetBuffer ssb = new StringSetBuffer();
+ ssb.add(getRobotsKey(hostName));
+ StringSet cacheKeys = new StringSet(ssb);
+ ICacheHandle ch = cacheManager.enterCache(null,cacheKeys,getTransactionID());
+ try
+ {
+
+ beginTransaction();
+ try
+ {
+ // See whether the instance exists
+ ArrayList params = new ArrayList();
+ params.add(hostName);
+ IResultSet set = performQuery("SELECT * FROM "+getTableName()+" WHERE "+
+ hostField+"=?",params,null,null);
+ HashMap values = new HashMap();
+ values.put(expirationField,new Long(expirationTime));
+ if (tfi != null)
+ values.put(robotsField,tfi);
+ if (set.getRowCount() > 0)
+ {
+ // Update
+ params.clear();
+ params.add(hostName);
+ performUpdate(values," WHERE "+hostField+"=?",params,null);
+ }
+ else
+ {
+ // Insert
+ values.put(hostField,hostName);
+ // We only need the general key because this is new.
+ performInsert(values,null);
+ }
+ cacheManager.invalidateKeys(ch);
+ }
+ catch (LCFException e)
+ {
+ signalRollback();
+ throw e;
+ }
+ catch (Error e)
+ {
+ signalRollback();
+ throw e;
+ }
+ finally
+ {
+ endTransaction();
+ }
+ }
+ finally
+ {
+ cacheManager.leaveCache(ch);
+ }
+ }
+ finally
+ {
+ if (tfi != null)
+ tfi.discard();
+ }
+ }
+
+ // Protected methods and classes
+
+ /** Construct a key which represents an individual host name.
+ *@param hostName is the name of the host.
+ *@return the cache key.
+ */
+ protected static String getRobotsKey(String hostName)
+ {
+ return "ROBOTS_"+hostName;
+ }
+
+ /** Read robots data, if it exists.
+ *@return null if the data doesn't exist at all. Return robots data if it does.
+ */
+ protected RobotsData readRobotsData(String hostName, IVersionActivity activities)
+ throws LCFException
+ {
+ try
+ {
+ ArrayList list = new ArrayList();
+ list.add(hostName);
+ IResultSet set = performQuery("SELECT "+robotsField+","+expirationField+" FROM "+getTableName()+
+ " WHERE "+hostField+"=?",list,null,null);
+ if (set.getRowCount() == 0)
+ return null;
+ if (set.getRowCount() > 1)
+ throw new LCFException("Unexpected number of robotsdata rows matching '"+hostName+"': "+Integer.toString(set.getRowCount()));
+ IResultRow row = set.getRow(0);
+ long expiration = ((Long)row.getValue(expirationField)).longValue();
+ BinaryInput bi = (BinaryInput)row.getValue(robotsField);
+ if (bi == null)
+ return new RobotsData(null,expiration,hostName,activities);
+ try
+ {
+ InputStream is = bi.getStream();
+ return new RobotsData(is,expiration,hostName,activities);
+ }
+ finally
+ {
+ bi.discard();
+ }
+ }
+ catch (InterruptedIOException e)
+ {
+ throw new LCFException("Interrupted: "+e.getMessage(),e,LCFException.INTERRUPTED);
+ }
+ catch (IOException e)
+ {
+ throw new LCFException("IO error reading robots data for "+hostName+": "+e.getMessage(),e);
+ }
+ }
+
+ /** Convert a string from the robots file into a readable form in which every control character (anything below a space, including NUL, which postgresql does not accept) is written in caret notation, e.g. ^@ for NUL.
+ */
+ protected static String makeReadable(String inputString)
+ {
+ StringBuffer sb = new StringBuffer();
+ int i = 0;
+ while (i < inputString.length())
+ {
+ char y = inputString.charAt(i++);
+ if (y >= ' ')
+ sb.append(y);
+ else
+ {
+ sb.append('^');
+ sb.append((char)(y + '@'));
+ }
+ }
+ return sb.toString();
+ }
+
+ /** This is a cached data item: the parsed records of one host's robots.txt, plus an expiration time.
+ */
+ protected static class RobotsData
+ {
+   /** The expiration time handed to the constructor. */
+   protected long expiration;
+   /** The parsed records, in file order.  Null when there was no robots data, in which case every fetch is allowed. */
+   protected ArrayList records = null;
+
+   /** Constructor.
+   *@param is is the stream of robots.txt bytes (decoded as utf-8), or null if there is no robots data.
+   *@param expiration is the expiration time to report from getExpirationTime().
+   *@param hostName is the host the data came from; used in log messages and the recorded activity.
+   *@param activities is where the outcome of the parse is recorded.
+   */
+   public RobotsData(InputStream is, long expiration, String hostName, IVersionActivity activities)
+     throws IOException, LCFException
+   {
+     this.expiration = expiration;
+     if (is == null)
+     {
+       records = null;
+       return;
+     }
+     Reader r = new InputStreamReader(is,"utf-8");
+     try
+     {
+       BufferedReader br = new BufferedReader(r);
+       try
+       {
+         parseRobotsTxt(br,hostName,activities);
+       }
+       finally
+       {
+         br.close();
+       }
+     }
+     finally
+     {
+       r.close();
+     }
+   }
+
+   /** Check if fetch is allowed.
+   *@param userAgent is the user-agent name of the robot asking.
+   *@param pathString is the path the robot wants to fetch.
+   *@return true if the fetch is permitted.
+   */
+   public boolean isFetchAllowed(String userAgent, String pathString)
+   {
+     if (records == null)
+       return true;
+
+     // First matching user-agent takes precedence, according to the following chunk of spec:
+     // "These name tokens are used in User-agent lines in /robots.txt to
+     // identify to which specific robots the record applies.  The robot
+     // must obey the first record in /robots.txt that contains a User-
+     // Agent line whose value contains the name token of the robot as a
+     // substring.  The name comparisons are case-insensitive.  If no such
+     // record exists, it should obey the first record with a User-agent
+     // line with a "*" value, if present.  If no record satisfied either
+     // condition, or no records are present at all, access is unlimited."
+     Record applicable = findFirstRecord(userAgent.toUpperCase(),false);
+     if (applicable == null)
+       applicable = findFirstRecord("*",true);
+     if (applicable == null)
+       return true;
+
+     // Allowed always overrides disallowed
+     if (applicable.isAllowed(pathString))
+       return true;
+     // Otherwise the fetch is refused only on an explicit disallow; no match -> crawl allowed
+     return !applicable.isDisallowed(pathString);
+   }
+
+   /** Find the first record, in file order, that has a user-agent line matching the given name.
+   *@return the record, or null if no record matches.
+   */
+   private Record findFirstRecord(String agentNameUpper, boolean exactMatch)
+   {
+     int i = 0;
+     while (i < records.size())
+     {
+       Record candidate = (Record)records.get(i++);
+       if (candidate.isAgentMatch(agentNameUpper,exactMatch))
+         return candidate;
+     }
+     return null;
+   }
+
+   /** Get expiration */
+   public long getExpirationTime()
+   {
+     return expiration;
+   }
+
+   /** Extract the value part of a directive line.
+   *@param trimmedLine is the trimmed line, already known to begin with the keyword (in any case).
+   *@param keyword is the directive keyword without its colon; only its length is used.
+   *@return what follows the keyword and a directly following colon, if there is one, trimmed.
+   */
+   private static String getDirectiveValue(String trimmedLine, String keyword)
+   {
+     String rest = trimmedLine.substring(keyword.length());
+     if (rest.startsWith(":"))
+       rest = rest.substring(1);
+     return rest.trim();
+   }
+
+   /** Parse the robots.txt file using a reader.
+   * Is NOT expected to close the stream.
+   * Fills in the records member, and records one activity describing how the parse went.
+   *@param r is the reader to take the lines from.
+   *@param hostName is the host the data came from; used in log messages and the recorded activity.
+   *@param activities is where the outcome of the parse is recorded.
+   */
+   protected void parseRobotsTxt(BufferedReader r, String hostName, IVersionActivity activities)
+     throws IOException, LCFException
+   {
+     boolean parseCompleted = false;
+     boolean robotsWasHtml = false;
+     boolean foundErrors = false;
+     String description = null;
+
+     long startParseTime = System.currentTimeMillis();
+     try
+     {
+       records = new ArrayList();
+       Record record = null;
+       boolean seenAction = false;
+       boolean firstLine = true;
+       while (true)
+       {
+         String x = r.readLine();
+         if (x == null)
+           break;
+         if (firstLine)
+         {
+           firstLine = false;
+           // A byte-order mark at the start of the file decodes to U+FEFF, which trim() leaves alone.
+           // Left in place it would hide the first directive from the keyword checks below.
+           if (x.length() > 0 && x.charAt(0) == '\uFEFF')
+             x = x.substring(1);
+         }
+         int numSignPos = x.indexOf("#");
+         if (numSignPos != -1)
+           x = x.substring(0,numSignPos);
+         // The keyword checks and the value extraction must both look at the trimmed line; cutting the
+         // value out of the untrimmed line would use the wrong offset whenever the line is indented.
+         // The lowercasing uses a fixed locale so that the ASCII keywords are recognized on every JVM.
+         String trimmedLine = x.trim();
+         String lowercaseLine = trimmedLine.toLowerCase(java.util.Locale.ENGLISH);
+         if (lowercaseLine.startsWith("user-agent"))
+         {
+           // A user-agent line that follows an allow or disallow line starts a new record
+           if (seenAction)
+           {
+             records.add(record);
+             record = null;
+             seenAction = false;
+           }
+           if (record == null)
+             record = new Record();
+
+           record.addAgent(getDirectiveValue(trimmedLine,"User-agent"));
+         }
+         else if (lowercaseLine.startsWith("disallow"))
+         {
+           if (record == null)
+           {
+             description = "Disallow without User-agent";
+             Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
+             foundErrors = true;
+           }
+           else
+           {
+             String disallowPath = getDirectiveValue(trimmedLine,"Disallow");
+             // The spec says that a blank disallow means let everything through.
+             if (disallowPath.length() > 0)
+               record.addDisallow(disallowPath);
+             seenAction = true;
+           }
+         }
+         else if (lowercaseLine.startsWith("allow"))
+         {
+           if (record == null)
+           {
+             description = "Allow without User-agent";
+             Logging.connectors.warn("Web: Bad robots.txt file format from '"+hostName+"': "+description);
+             foundErrors = true;
+           }
+           else
+           {
+             String allowPath = getDirectiveValue(trimmedLine,"Allow");
+             // A blank allow adds nothing to the record, but still counts as an action line.
+             if (allowPath.length() > 0)
+               record.addAllow(allowPath);
+             seenAction = true;
+           }
+         }
+         else if (lowercaseLine.startsWith("crawl-delay"))
+         {
+           // We don't complain about this, but right now we don't listen to it either.
+         }
+         else if (trimmedLine.length() > 0)
+         {
+           // It's not just a blank line, so complain
+           String problemLine = makeReadable(x);
+           description = "Unknown robots.txt line: '"+problemLine+"'";
+           Logging.connectors.warn("Web: Unknown robots.txt line from '"+hostName+"': '"+problemLine+"'");
+           if (lowercaseLine.indexOf("<html") != -1)
+           {
+             // Looks like some kind of an html file, probably as a result of a redirection, so just abort as if we have a page error
+             robotsWasHtml = true;
+             parseCompleted = true;
+             break;
+           }
+           foundErrors = true;
+         }
+       }
+       if (record != null)
+         records.add(record);
+       parseCompleted = true;
+     }
+     finally
+     {
+       // Log the fact that we attempted to parse robots.txt, as well as what happened
+       // These are the following situations we will report:
+       // (1) INCOMPLETE - Parsing did not complete - if the stream was interrupted
+       // (2) HTML - Robots was html - if the robots data seemed to be html
+       // (3) ERRORS - Robots had errors - if the robots data was accepted but had errors in it
+       // (4) SUCCESS - Robots parsed successfully - if the robots data was parsed without problem
+       String status;
+       if (parseCompleted)
+       {
+         if (robotsWasHtml)
+         {
+           status = "HTML";
+           description = "Robots file contained HTML, skipped";
+         }
+         else
+         {
+           if (foundErrors)
+           {
+             status = "ERRORS";
+             // description should already be set
+           }
+           else
+           {
+             status = "SUCCESS";
+             description = null;
+           }
+         }
+       }
+       else
+       {
+         status = "INCOMPLETE";
+         description = "Parsing was interrupted";
+       }
+
+       activities.recordActivity(new Long(startParseTime),WebcrawlerConnector.ACTIVITY_ROBOTSPARSE,
+         null,hostName,status,description,null);
+
+     }
+   }
+
+ }
+
+ /** Check if path matches specification.
+ * The specification has to match the beginning of the path.  A '*' in the specification stands for zero
+ * or more characters, and a '$' as the last character of the specification requires the path to end there.
+ *@param path is the path to check.
+ *@param spec is the specification to check it against.
+ *@return true if the path matches.
+ */
+ protected static boolean doesPathMatch(String path, String spec)
+ {
+   // For robots 1.0, this function would do just this:
+   // return path.startsWith(spec);
+   // However, we implement the "google bot" spec, which allows wildcard matches that are, in fact, regular-expression-like in some ways.
+   // The "specification" can be found here: http://www.google.com/support/webmasters/bin/answer.py?hl=en&answer=40367
+   return doesPathMatch(path,0,spec,0);
+ }
+
+ /** Match the specification, from specIndex on, against the path, from pathIndex on.
+ * This works iteratively.  On a mismatch it goes back only to the most recent '*' and lets that wildcard
+ * take up one more character of the path.  Going back to an earlier '*' can never help, because everything
+ * between two wildcards is literal text and has already been placed at its earliest possible position.
+ * The work is therefore bounded by path length times specification length, however many wildcards a
+ * (remotely supplied) specification contains.
+ *@param path is the path to check.
+ *@param pathIndex is the position in the path to start at.
+ *@param spec is the specification to check against.
+ *@param specIndex is the position in the specification to start at.
+ *@return true if the rest of the specification matches the rest of the path.
+ */
+ protected static boolean doesPathMatch(String path, int pathIndex, String spec, int specIndex)
+ {
+   int pathLength = path.length();
+   int specLength = spec.length();
+   // Where to resume after a mismatch: the specification position just past the most recent wildcard, and
+   // the path position that wildcard currently extends to.  -1 means no wildcard has been seen yet.
+   int resumeSpecIndex = -1;
+   int resumePathIndex = -1;
+   while (true)
+   {
+     if (specIndex == specLength)
+       // Hit the end of the specification!  We're done.
+       return true;
+     char specChar = spec.charAt(specIndex);
+     if (specChar == '*')
+     {
+       // Found a specification wildcard.  A run of '*' characters means the same as a single one.
+       do
+       {
+         specIndex++;
+       }
+       while (specIndex < specLength && spec.charAt(specIndex) == '*');
+       // Start with the wildcard standing for zero characters.
+       resumeSpecIndex = specIndex;
+       resumePathIndex = pathIndex;
+       continue;
+     }
+     if (specChar == '$' && specIndex == specLength-1)
+     {
+       // Found a specification end-of-path character.
+       // (It can only be legitimately the last character of the specification; elsewhere it is a literal.)
+       if (pathIndex == pathLength)
+         return true;
+     }
+     else if (pathIndex < pathLength && path.charAt(pathIndex) == specChar)
+     {
+       // On to the next match
+       pathIndex++;
+       specIndex++;
+       continue;
+     }
+     // Mismatch.  Without a wildcard to stretch, or with the path used up, there is nothing further to try.
+     if (resumeSpecIndex == -1 || resumePathIndex >= pathLength)
+       return false;
+     // Let the most recent wildcard take one more character, and try again from there.
+     resumePathIndex++;
+     pathIndex = resumePathIndex;
+     specIndex = resumeSpecIndex;
+   }
+ }
+
+ /** Object description for the robots data of one host.
+ * The cached data is looked up by means of this key.  Two descriptions are equal when their host names are.
+ */
+ protected static class HostDescription extends org.apache.lcf.core.cachemanager.BaseDescription
+ {
+   protected String hostName;
+   protected String criticalSectionName;
+   protected StringSet cacheKeys;
+
+   /** Constructor.
+   *@param hostName is the host this description stands for.
+   *@param invKeys is the key set that getObjectKeys() will hand back.
+   */
+   public HostDescription(String hostName, StringSet invKeys)
+   {
+     super("robotscache");
+     this.hostName = hostName;
+     this.cacheKeys = invKeys;
+     // The critical section name is derived from the class name and the host name
+     this.criticalSectionName = getClass().getName()+"-"+hostName;
+   }
+
+   /** Get the host name.
+   *@return the host name given at construction.
+   */
+   public String getHostName()
+   {
+     return hostName;
+   }
+
+   /** The hash is the hash of the host name, consistent with equals(). */
+   public int hashCode()
+   {
+     return hostName.hashCode();
+   }
+
+   /** Equal to any other HostDescription with an equal host name, and to nothing else. */
+   public boolean equals(Object o)
+   {
+     return (o instanceof HostDescription) && ((HostDescription)o).hostName.equals(hostName);
+   }
+
+   /** Get the critical section name.
+   *@return the name that was built at construction.
+   */
+   public String getCriticalSectionName()
+   {
+     return criticalSectionName;
+   }
+
+   /** Get the cache keys for an object (which may or may not exist yet in
+   * the cache).  This method is called in order for cache manager to throw the correct locks.
+   * @return the object's cache keys, or null if the object should not
+   * be cached.
+   */
+   public StringSet getObjectKeys()
+   {
+     return cacheKeys;
+   }
+
+   /** Get the object class for an object.  The object class is used to determine
+   * the group of objects treated in the same LRU manner.
+   * @return the object class; here always the single statically created robots cache class.
+   */
+   public ICacheClass getObjectClass()
+   {
+     return robotsCacheClass;
+   }
+ }
+
+ /** Cache class for robots.
+ * An instance describes the cache class used for robots data caching.  Only one is
+ * ever needed, so it gets created statically.
+ */
+ protected static class RobotsCacheClass implements ICacheClass
+ {
+   /** All the robots data is counted together, so the class name is a constant string. */
+   private static final String CLASS_NAME = "ROBOTSCLASS";
+   /** Hardwired for the moment; this many robots data records will be cached, and no more. */
+   private static final int MAX_LRU_COUNT = 2000;
+
+   /** Get the name of the object class.
+   * This determines the set of objects that are treated in the same
+   * LRU pool.
+   *@return the class name.
+   */
+   public String getClassName()
+   {
+     return CLASS_NAME;
+   }
+
+   /** Get the maximum LRU count of the object class.
+   *@return the maximum number of the objects of the particular class
+   * allowed.
+   */
+   public int getMaxLRUCount()
+   {
+     return MAX_LRU_COUNT;
+   }
+
+ }
+
+ /** This is the executor object for locating robots host objects.
+ * It supplies the operations the cache manager needs in order to build the objects that are not in
+ * the cache at the moment, and it picks up the one object its creator asked for.
+ */
+ protected static class HostExecutor extends org.apache.lcf.core.cachemanager.ExecutorBase
+ {
+   // Member variables
+   protected RobotsManager thisManager;
+   protected RobotsData returnValue;
+   protected HostDescription thisHost;
+   protected IVersionActivity activities;
+
+   /** Constructor.
+   *@param manager is the RobotsManager class instance.
+   *@param activities is passed along whenever robots data has to be read.
+   *@param objectDescription is the desired object description.
+   */
+   public HostExecutor(RobotsManager manager, IVersionActivity activities, HostDescription objectDescription)
+   {
+     super();
+     this.thisManager = manager;
+     this.activities = activities;
+     this.thisHost = objectDescription;
+     this.returnValue = null;
+   }
+
+   /** Get the result.
+   *@return the looked-up or read cached instance, or null if exists() has not delivered one.
+   */
+   public RobotsData getResults()
+   {
+     return returnValue;
+   }
+
+   /** Create a set of new objects to operate on and cache.  This method is called only
+   * if the specified object(s) are NOT available in the cache.  The specified objects
+   * should be created and returned; if they are not created, it means that the
+   * execution cannot proceed, and the execute() method will not be called.
+   * @param objectDescriptions is the set of unique identifiers of the objects.
+   * @return the newly created objects to cache, or null, if any object cannot be created.
+   * The order of the returned objects must correspond to the order of the object descriptions.
+   */
+   public Object[] create(ICacheDescription[] objectDescriptions) throws LCFException
+   {
+     // Multiple values are not expected to be requested, so it's OK to walk through the objects
+     // and do one request at a time.
+     int count = objectDescriptions.length;
+     RobotsData[] created = new RobotsData[count];
+     for (int index = 0; index < count; index++)
+     {
+       // Both the data and the expiration date get read and cached together.  Caching based on
+       // request time instead would screw up everything!
+       String hostName = ((HostDescription)objectDescriptions[index]).getHostName();
+       created[index] = thisManager.readRobotsData(hostName,activities);
+     }
+     return created;
+   }
+
+
+   /** Notify the implementing class of the existence of a cached version of the
+   * object.  The object is passed to this method so that the execute() method below
+   * will have it available to operate on.  This method is also called for all objects
+   * that are freshly created as well.
+   * @param objectDescription is the unique identifier of the object.
+   * @param cachedObject is the cached object.
+   */
+   public void exists(ICacheDescription objectDescription, Object cachedObject) throws LCFException
+   {
+     // Cast what came in as what it really is
+     HostDescription description = (HostDescription)objectDescription;
+     RobotsData data = (RobotsData)cachedObject;
+     // Only the object this executor was created for is kept
+     if (description.equals(thisHost))
+       returnValue = data;
+   }
+
+   /** Perform the desired operation, once every requested object has been created or reported
+   * through exists().
+   */
+   public void execute() throws LCFException
+   {
+     // Does nothing; we only want to fetch objects in this cacher.
+   }
+
+
+ }
+
+ /** This class represents a record in a robots.txt file.  It holds the user-agent names the record
+ * applies to, together with the record's disallow and allow specifications.
+ */
+ protected static class Record
+ {
+   protected ArrayList userAgents = new ArrayList();
+   protected ArrayList disallows = new ArrayList();
+   protected ArrayList allows = new ArrayList();
+
+   /** Constructor.
+   */
+   public Record()
+   {
+   }
+
+   /** Add a user-agent.
+   *@param agentName is the value of the user-agent line.
+   */
+   public void addAgent(String agentName)
+   {
+     userAgents.add(agentName);
+   }
+
+   /** Add a disallow.
+   *@param disallowPath is the path specification to disallow.
+   */
+   public void addDisallow(String disallowPath)
+   {
+     disallows.add(disallowPath);
+   }
+
+   /** Add an allow.
+   *@param allowPath is the path specification to allow.
+   */
+   public void addAllow(String allowPath)
+   {
+     allows.add(allowPath);
+   }
+
+   /** See if user-agent matches.
+   *@param agentNameUpper is the name to look for, already in upper case.
+   *@param exactMatch is true if one of the record's agent names must equal the given name, false if it
+   * is enough for one of them to occur somewhere inside the given name.
+   *@return true if the record applies to the given name.
+   */
+   public boolean isAgentMatch(String agentNameUpper, boolean exactMatch)
+   {
+     int i = 0;
+     while (i < userAgents.size())
+     {
+       String agent = ((String)userAgents.get(i++)).trim().toUpperCase();
+       if (exactMatch)
+       {
+         if (agent.equals(agentNameUpper))
+           return true;
+       }
+       // An empty agent name (a user-agent line with no value) occurs inside every string, so it must not
+       // be allowed to take part in the substring test; otherwise such a record would apply to every robot.
+       else if (agent.length() > 0 && agentNameUpper.indexOf(agent) != -1)
+         return true;
+     }
+     return false;
+   }
+
+   /** See if path is disallowed.  Only called if user-agent has already
+   * matched.  (This checks if there's an explicit match with one of the
+   * Disallows clauses.)
+   *@param path is the path to check.
+   *@return true if one of the disallow specifications matches the path.
+   */
+   public boolean isDisallowed(String path)
+   {
+     return matchesAny(disallows,path);
+   }
+
+   /** See if path is allowed.  Only called if user-agent has already
+   * matched.  (This checks if there's an explicit match with one of the
+   * Allows clauses).
+   *@param path is the path to check.
+   *@return true if one of the allow specifications matches the path.
+   */
+   public boolean isAllowed(String path)
+   {
+     return matchesAny(allows,path);
+   }
+
+   /** Check whether the path matches at least one of the specifications in a list. */
+   private static boolean matchesAny(ArrayList specs, String path)
+   {
+     int i = 0;
+     while (i < specs.size())
+     {
+       if (doesPathMatch(path,(String)specs.get(i++)))
+         return true;
+     }
+     return false;
+   }
+
+ }
}
Modified: incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/agentmanager/AgentManager.java
URL: http://svn.apache.org/viewvc/incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/agentmanager/AgentManager.java?rev=911418&r1=911417&r2=911418&view=diff
==============================================================================
--- incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/agentmanager/AgentManager.java (original)
+++ incubator/lcf/trunk/modules/framework/agents/org/apache/lcf/agents/agentmanager/AgentManager.java Thu Feb 18 14:31:31 2010
@@ -50,8 +50,8 @@
public void install()
throws LCFException
{
- beginTransaction();
- try
+ // We always use an outer loop, in case the upgrade will need it.
+ while (true)
{
// Check if table is already present
Map existing = getTableSchema(null,null);
@@ -61,20 +61,14 @@
map.put(classNameField,new ColumnDescription("VARCHAR(255)",true,false,null,null,false));
performCreate(map,null);
}
- }
- catch (LCFException e)
- {
- signalRollback();
- throw e;
- }
- catch (Error e)
- {
- signalRollback();
- throw e;
- }
- finally
- {
- endTransaction();
+ else
+ {
+ // Any required upgrade code goes here.
+ }
+
+ // Any index creation goes here.
+
+ break;
}
}