You are viewing a plain text version of this content. The canonical link to the original was not preserved in this plain text copy.
Posted to commits@nutch.apache.org by si...@apache.org on 2007/05/10 18:29:54 UTC
svn commit: r536925 - in /lucene/nutch/trunk: CHANGES.txt
src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
Author: siren
Date: Thu May 10 09:29:51 2007
New Revision: 536925
URL: http://svn.apache.org/viewvc?view=rev&rev=536925
Log:
NUTCH-446 RobotRulesParser should ignore Crawl-delay values of other bots in robots.txt, contributed by Doğacan Güney
Modified:
lucene/nutch/trunk/CHANGES.txt
lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?view=diff&rev=536925&r1=536924&r2=536925
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Thu May 10 09:29:51 2007
@@ -11,6 +11,9 @@
(Eelco Lempsink via ab)
4. NUTCH-456 - Parse msexcel plugin speedup (Heiko Dietze via siren)
+
+ 5. NUTCH-446 - RobotRulesParser should ignore Crawl-delay values of other
+ bots in robots.txt (Dogacan Guney via siren)
Release 0.9 - 2007-04-02
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Thu May 10 09:29:51 2007
@@ -389,15 +389,17 @@
} else if ( (line.length() >= 12)
&& (line.substring(0, 12).equalsIgnoreCase("Crawl-Delay:"))) {
doneAgents = true;
- long crawlDelay = -1;
- String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
- if (delay.length() > 0) {
- try {
- crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
- } catch (Exception e) {
- LOG.info("can not parse Crawl-Delay:" + e.toString());
+ if (addRules) {
+ long crawlDelay = -1;
+ String delay = line.substring("Crawl-Delay:".length(), line.length()).trim();
+ if (delay.length() > 0) {
+ try {
+ crawlDelay = Long.parseLong(delay) * 1000; // sec to millisec
+ } catch (Exception e) {
+ LOG.info("can not parse Crawl-Delay:" + e.toString());
+ }
+ currentRules.setCrawlDelay(crawlDelay);
}
- currentRules.setCrawlDelay(crawlDelay);
}
}
}
@@ -500,7 +502,7 @@
/** command-line main for testing */
public static void main(String[] argv) {
- if (argv.length != 3) {
+ if (argv.length < 3) {
System.out.println("Usage:");
System.out.println(" java <robots-file> <url-file> <agent-name>+");
System.out.println("");
@@ -513,7 +515,7 @@
try {
FileInputStream robotsIn= new FileInputStream(argv[0]);
LineNumberReader testsIn= new LineNumberReader(new FileReader(argv[1]));
- String[] robotNames= new String[argv.length - 1];
+ String[] robotNames= new String[argv.length - 2];
for (int i= 0; i < argv.length - 2; i++)
robotNames[i]= argv[i+2];
Modified: lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java?view=diff&rev=536925&r1=536924&r2=536925
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/test/org/apache/nutch/protocol/http/api/TestRobotRulesParser.java Thu May 10 09:29:51 2007
@@ -262,6 +262,26 @@
}
}
}
+
+ public void testCrawlDelay() {
+ RobotRulesParser p = new RobotRulesParser(new String[] { "nutchbot" });
+ String delayRule1 = "User-agent: nutchbot" + CR +
+ "Crawl-delay: 10" + CR +
+ "User-agent: foobot" + CR +
+ "Crawl-delay: 20" + CR +
+ "User-agent: *" + CR +
+ "Disallow:/baz" + CR;
+ String delayRule2 = "User-agent: foobot" + CR +
+ "Crawl-delay: 20" + CR +
+ "User-agent: *" + CR +
+ "Disallow:/baz" + CR;
+ RobotRuleSet rules = p.parseRules(delayRule1.getBytes());
+ long crawlDelay = rules.getCrawlDelay();
+ assertTrue("testing crawl delay for agent nutchbot - rule 1", (crawlDelay == 10000));
+ rules = p.parseRules(delayRule2.getBytes());
+ crawlDelay = rules.getCrawlDelay();
+ assertTrue("testing crawl delay for agent nutchbot - rule 2", (crawlDelay == -1));
+ }
// helper