You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/06/21 00:15:43 UTC
svn commit: r1604291 - in /nutch: branches/2.x/ branches/2.x/conf/
branches/2.x/src/java/org/apache/nutch/fetcher/
branches/2.x/src/java/org/apache/nutch/protocol/ trunk/ trunk/conf/
trunk/src/java/org/apache/nutch/fetcher/ trunk/src/java/org/apache/nu...
Author: snagel
Date: Fri Jun 20 22:15:43 2014
New Revision: 1604291
URL: http://svn.apache.org/r1604291
Log:
NUTCH-1718 redefine http.robots.agents as "additional agent names"
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/conf/nutch-default.xml
nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
nutch/trunk/CHANGES.txt
nutch/trunk/conf/nutch-default.xml
nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Jun 20 22:15:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Current Development
+* NUTCH-1718 redefine http.robots.agents as "additional agent names" (snagel, Tejas Patil, Daniel Kugel)
+
* NUTCH-1796 Ensure Gora object builders are used as opposed to empty constructors (snagel via lewismc)
* NUTCH-1590 [SECURITY] Frame injection vulnerability in published Javadoc (jnioche)
Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Fri Jun 20 22:15:43 2014
@@ -90,11 +90,18 @@
<property>
<name>http.robots.agents</name>
- <value>*</value>
- <description>The agent strings we'll look for in robots.txt files,
- comma-separated, in decreasing order of precedence. You should
- put the value of http.agent.name as the first agent name, and keep the
- default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+ <value></value>
+ <description>Any other agents, apart from 'http.agent.name', that the robots
+ parser would look for in robots.txt. Multiple agents can be provided using
+ comma as a delimiter, e.g. mybot,foo-spider,bar-crawler
+
+ The ordering of agents does NOT matter: the robots parser makes its
+ decision based on the agent which matches first in the robots rules.
+ Also, there is NO need to add a wildcard (i.e. "*") to this string, as the
+ robots parser smartly takes care of a no-match situation.
+
+ If no value is specified, by default the HTTP agent (i.e. 'http.agent.name')
+ is used for user agent matching by the robots parser.
</description>
</property>
Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java Fri Jun 20 22:15:43 2014
@@ -255,10 +255,7 @@ public class FetcherJob extends NutchToo
}
void checkConfiguration() {
-
- // ensure that a value has been set for the agent name and that that
- // agent name is the first value in the agents we advertise for robot
- // rules parsing
+ // ensure that a value has been set for the agent name
String agentName = getConf().get("http.agent.name");
if (agentName == null || agentName.trim().length() == 0) {
String message = "Fetcher: No agents listed in 'http.agent.name'"
@@ -267,23 +264,6 @@ public class FetcherJob extends NutchToo
LOG.error(message);
}
throw new IllegalArgumentException(message);
- } else {
-
- // get all of the agents that we advertise
- String agentNames = getConf().get("http.robots.agents");
- StringTokenizer tok = new StringTokenizer(agentNames, ",");
- ArrayList<String> agents = new ArrayList<String>();
- while (tok.hasMoreTokens()) {
- agents.add(tok.nextToken().trim());
- }
-
- // if the first one is not equal to our agent name, log fatal and throw
- // an exception
- if (!(agents.get(0)).equalsIgnoreCase(agentName)) {
- String message = "Fetcher: Your 'http.agent.name' value should be "
- + "listed first in 'http.robots.agents' property.";
- LOG.warn(message);
- }
}
}
Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri Jun 20 22:15:43 2014
@@ -85,43 +85,27 @@ public abstract class RobotRulesParser i
// Grab the agent names we advertise to robots files.
String agentName = conf.get("http.agent.name");
- if (null == agentName) {
+ if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
throw new RuntimeException("Agent name not configured!");
}
+ agentNames = agentName;
- String agentNames = conf.get("http.robots.agents");
- StringTokenizer tok = new StringTokenizer(agentNames, ",");
- ArrayList<String> agents = new ArrayList<String>();
- while (tok.hasMoreTokens()) {
- agents.add(tok.nextToken().trim());
- }
-
- /**
- * If there are no agents for robots-parsing, use the
- * default agent-string. If both are present, our agent-string
- * should be the first one we advertise to robots-parsing.
- */
- if (agents.size() == 0) {
- if (LOG.isErrorEnabled()) {
- LOG.error("No agents listed in 'http.robots.agents' property!");
- }
- } else {
- StringBuffer combinedAgentsString = new StringBuffer(agentName);
- int index = 0;
-
- if ((agents.get(0)).equalsIgnoreCase(agentName))
- index++;
- else if (LOG.isErrorEnabled()) {
- LOG.error("Agent we advertise (" + agentName
- + ") not listed first in 'http.robots.agents' property!");
+ // If there are any other agents specified, append those to the list of agents
+ String otherAgents = conf.get("http.robots.agents");
+ if(otherAgents != null && !otherAgents.trim().isEmpty()) {
+ StringTokenizer tok = new StringTokenizer(otherAgents, ",");
+ StringBuilder sb = new StringBuilder(agentNames);
+ while (tok.hasMoreTokens()) {
+ String str = tok.nextToken().trim();
+ if (str.equals("*") || str.equals(agentName)) {
+ // skip wildcard "*" or agent name itself
+ // (required for backward compatibility, cf. NUTCH-1715 and NUTCH-1718)
+ } else {
+ sb.append(",").append(str);
+ }
}
- // append all the agents from the http.robots.agents property
- for(; index < agents.size(); index++) {
- combinedAgentsString.append(", " + agents.get(index));
- }
-
- this.agentNames = combinedAgentsString.toString();
+ agentNames = sb.toString();
}
}
@@ -137,8 +121,8 @@ public abstract class RobotRulesParser i
*
* @param url A string containing url
* @param content Contents of the robots file in a byte array
- * @param contentType The
- * @param robotName A string containing value of
+ * @param contentType The content type of the robots file
+ * @param robotName A string containing all the robots agent names used by parser for matching
* @return BaseRobotRules object
*/
public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) {
@@ -160,23 +144,18 @@ public abstract class RobotRulesParser i
/** command-line main for testing */
public static void main(String[] argv) {
- if (argv.length < 3) {
+ if (argv.length != 3) {
System.err.println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
System.err.println(" <robots-file> - Input robots.txt file which will be parsed.");
System.err.println(" <url-file> - Contains input URLs (1 per line) which are tested against the rules.");
- System.err.println(" <agent-names> - Input agent name. Multiple agent names can be specified using spaces.");
+ System.err.println(" <agent-names> - Input agent names. Multiple agent names can be provided using");
+ System.err.println(" comma as a delimiter without any spaces.");
System.exit(-1);
}
try {
- StringBuilder agentNames = new StringBuilder();
- for(int counter = 2; counter < argv.length; counter++)
- agentNames.append(argv[counter]).append(",");
-
- agentNames.deleteCharAt(agentNames.length()-1);
-
byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
- BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", agentNames.toString());
+ BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]);
LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
String testPath = testsIn.readLine().trim();
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun 20 22:15:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
Nutch Current Development
+* NUTCH-1718 redefine http.robots.agents as "additional agent names" (snagel, Tejas Patil, Daniel Kugel)
+
* NUTCH-1794 IndexingFilterChecker to optionally dumpText (markus)
* NUTCH-1590 [SECURITY] Frame injection vulnerability in published Javadoc (jnioche)
Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jun 20 22:15:43 2014
@@ -89,11 +89,18 @@
<property>
<name>http.robots.agents</name>
- <value>*</value>
- <description>The agent strings we'll look for in robots.txt files,
- comma-separated, in decreasing order of precedence. You should
- put the value of http.agent.name as the first agent name, and keep the
- default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+ <value></value>
+ <description>Any other agents, apart from 'http.agent.name', that the robots
+ parser would look for in robots.txt. Multiple agents can be provided using
+ comma as a delimiter, e.g. mybot,foo-spider,bar-crawler
+
+ The ordering of agents does NOT matter: the robots parser makes its
+ decision based on the agent which matches first in the robots rules.
+ Also, there is NO need to add a wildcard (i.e. "*") to this string, as the
+ robots parser smartly takes care of a no-match situation.
+
+ If no value is specified, by default the HTTP agent (i.e. 'http.agent.name')
+ is used for user agent matching by the robots parser.
</description>
</property>
Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Jun 20 22:15:43 2014
@@ -1475,10 +1475,7 @@ public class Fetcher extends Configured
}
private void checkConfiguration() {
-
- // ensure that a value has been set for the agent name and that that
- // agent name is the first value in the agents we advertise for robot
- // rules parsing
+ // ensure that a value has been set for the agent name
String agentName = getConf().get("http.agent.name");
if (agentName == null || agentName.trim().length() == 0) {
String message = "Fetcher: No agents listed in 'http.agent.name'"
@@ -1487,25 +1484,6 @@ public class Fetcher extends Configured
LOG.error(message);
}
throw new IllegalArgumentException(message);
- } else {
-
- // get all of the agents that we advertise
- String agentNames = getConf().get("http.robots.agents");
- StringTokenizer tok = new StringTokenizer(agentNames, ",");
- ArrayList<String> agents = new ArrayList<String>();
- while (tok.hasMoreTokens()) {
- agents.add(tok.nextToken().trim());
- }
-
- // if the first one is not equal to our agent name, log fatal and throw
- // an exception
- if (!(agents.get(0)).equalsIgnoreCase(agentName)) {
- String message = "Fetcher: Your 'http.agent.name' value should be "
- + "listed first in 'http.robots.agents' property.";
- if (LOG.isWarnEnabled()) {
- LOG.warn(message);
- }
- }
}
}
Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri Jun 20 22:15:43 2014
@@ -85,43 +85,27 @@ public abstract class RobotRulesParser i
// Grab the agent names we advertise to robots files.
String agentName = conf.get("http.agent.name");
- if (null == agentName) {
+ if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
throw new RuntimeException("Agent name not configured!");
}
+ agentNames = agentName;
- String agentNames = conf.get("http.robots.agents");
- StringTokenizer tok = new StringTokenizer(agentNames, ",");
- ArrayList<String> agents = new ArrayList<String>();
- while (tok.hasMoreTokens()) {
- agents.add(tok.nextToken().trim());
- }
-
- /**
- * If there are no agents for robots-parsing, use the
- * default agent-string. If both are present, our agent-string
- * should be the first one we advertise to robots-parsing.
- */
- if (agents.size() == 0) {
- if (LOG.isErrorEnabled()) {
- LOG.error("No agents listed in 'http.robots.agents' property!");
- }
- } else {
- StringBuffer combinedAgentsString = new StringBuffer(agentName);
- int index = 0;
-
- if ((agents.get(0)).equalsIgnoreCase(agentName))
- index++;
- else if (LOG.isErrorEnabled()) {
- LOG.error("Agent we advertise (" + agentName
- + ") not listed first in 'http.robots.agents' property!");
+ // If there are any other agents specified, append those to the list of agents
+ String otherAgents = conf.get("http.robots.agents");
+ if(otherAgents != null && !otherAgents.trim().isEmpty()) {
+ StringTokenizer tok = new StringTokenizer(otherAgents, ",");
+ StringBuilder sb = new StringBuilder(agentNames);
+ while (tok.hasMoreTokens()) {
+ String str = tok.nextToken().trim();
+ if (str.equals("*") || str.equals(agentName)) {
+ // skip wildcard "*" or agent name itself
+ // (required for backward compatibility, cf. NUTCH-1715 and NUTCH-1718)
+ } else {
+ sb.append(",").append(str);
+ }
}
- // append all the agents from the http.robots.agents property
- for(; index < agents.size(); index++) {
- combinedAgentsString.append(", " + agents.get(index));
- }
-
- this.agentNames = combinedAgentsString.toString();
+ agentNames = sb.toString();
}
}
@@ -137,8 +121,8 @@ public abstract class RobotRulesParser i
*
* @param url A string containing url
* @param content Contents of the robots file in a byte array
- * @param contentType The
- * @param robotName A string containing value of
+ * @param contentType The content type of the robots file
+ * @param robotName A string containing all the robots agent names used by parser for matching
* @return BaseRobotRules object
*/
public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) {
@@ -160,30 +144,24 @@ public abstract class RobotRulesParser i
/** command-line main for testing */
public static void main(String[] argv) {
- if (argv.length < 3) {
+ if (argv.length != 3) {
System.err.println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
System.err.println("\tThe <robots-file> will be parsed as a robots.txt file,");
System.err.println("\tusing the given <agent-name> to select rules. URLs ");
System.err.println("\twill be read (one per line) from <url-file>, and tested");
- System.err.println("\tagainst the rules. Multiple agent names can be specified using spaces.");
+ System.err.println("\tagainst the rules. Multiple agent names can be provided using");
+ System.err.println("\tcomma as a delimiter without any spaces.");
System.exit(-1);
}
try {
- StringBuilder agentNames = new StringBuilder();
- for(int counter = 2; counter < argv.length; counter++)
- agentNames.append(argv[counter]).append(",");
-
- agentNames.deleteCharAt(agentNames.length()-1);
-
byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
- BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", agentNames.toString());
+ BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]);
LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
String testPath = testsIn.readLine().trim();
while (testPath != null) {
- System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not allowed") +
- ":\t" + testPath);
+ System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not allowed") + ":\t" + testPath);
testPath = testsIn.readLine();
}
testsIn.close();