You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by sn...@apache.org on 2014/06/21 00:15:43 UTC

svn commit: r1604291 - in /nutch: branches/2.x/ branches/2.x/conf/ branches/2.x/src/java/org/apache/nutch/fetcher/ branches/2.x/src/java/org/apache/nutch/protocol/ trunk/ trunk/conf/ trunk/src/java/org/apache/nutch/fetcher/ trunk/src/java/org/apache/nu...

Author: snagel
Date: Fri Jun 20 22:15:43 2014
New Revision: 1604291

URL: http://svn.apache.org/r1604291
Log:
NUTCH-1718 redefine http.robots.agents as "additional agent names"

Modified:
    nutch/branches/2.x/CHANGES.txt
    nutch/branches/2.x/conf/nutch-default.xml
    nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
    nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
    nutch/trunk/CHANGES.txt
    nutch/trunk/conf/nutch-default.xml
    nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
    nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java

Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Fri Jun 20 22:15:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Current Development
 
+* NUTCH-1718 redefine http.robots.agents as "additional agent names" (snagel, Tejas Patil, Daniel Kugel)
+
 * NUTCH-1796 Ensure Gora object builders are used as oppose to empty constructors (snagel via lewismc)
 
 * NUTCH-1590 [SECURITY] Frame injection vulnerability in published Javadoc (jnioche)

Modified: nutch/branches/2.x/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/conf/nutch-default.xml?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/branches/2.x/conf/nutch-default.xml (original)
+++ nutch/branches/2.x/conf/nutch-default.xml Fri Jun 20 22:15:43 2014
@@ -90,11 +90,18 @@
 
 <property>
   <name>http.robots.agents</name>
-  <value>*</value>
-  <description>The agent strings we'll look for in robots.txt files,
-  comma-separated, in decreasing order of precedence. You should
-  put the value of http.agent.name as the first agent name, and keep the
-  default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+  <value></value>
+  <description>Any other agents, apart from 'http.agent.name', that the robots
+  parser would look for in robots.txt. Multiple agents can be provided using
+  a comma as a delimiter, e.g., mybot,foo-spider,bar-crawler.
+
+  The ordering of agents does NOT matter and the robots parser would make a
+  decision based on the agent which matches first against the robots rules.
+  Also, there is NO need to add a wildcard (i.e., "*") to this string as the
+  robots parser would smartly take care of a no-match situation.
+
+  If no value is specified, by default the HTTP agent (i.e., 'http.agent.name')
+  would be used for user-agent matching by the robots parser.
   </description>
 </property>
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/fetcher/FetcherJob.java Fri Jun 20 22:15:43 2014
@@ -255,10 +255,7 @@ public class FetcherJob extends NutchToo
   }
 
   void checkConfiguration() {
-
-    // ensure that a value has been set for the agent name and that that
-    // agent name is the first value in the agents we advertise for robot
-    // rules parsing
+    // ensure that a value has been set for the agent name
     String agentName = getConf().get("http.agent.name");
     if (agentName == null || agentName.trim().length() == 0) {
       String message = "Fetcher: No agents listed in 'http.agent.name'"
@@ -267,23 +264,6 @@ public class FetcherJob extends NutchToo
         LOG.error(message);
       }
       throw new IllegalArgumentException(message);
-    } else {
-
-      // get all of the agents that we advertise
-      String agentNames = getConf().get("http.robots.agents");
-      StringTokenizer tok = new StringTokenizer(agentNames, ",");
-      ArrayList<String> agents = new ArrayList<String>();
-      while (tok.hasMoreTokens()) {
-        agents.add(tok.nextToken().trim());
-      }
-
-      // if the first one is not equal to our agent name, log fatal and throw
-      // an exception
-      if (!(agents.get(0)).equalsIgnoreCase(agentName)) {
-        String message = "Fetcher: Your 'http.agent.name' value should be "
-            + "listed first in 'http.robots.agents' property.";
-        LOG.warn(message);
-      }
     }
   }
 

Modified: nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/branches/2.x/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri Jun 20 22:15:43 2014
@@ -85,43 +85,27 @@ public abstract class RobotRulesParser i
 
     // Grab the agent names we advertise to robots files.
     String agentName = conf.get("http.agent.name");
-    if (null == agentName) {
+    if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
       throw new RuntimeException("Agent name not configured!");
     }
+    agentNames = agentName;
 
-    String agentNames = conf.get("http.robots.agents");
-    StringTokenizer tok = new StringTokenizer(agentNames, ",");
-    ArrayList<String> agents = new ArrayList<String>();
-    while (tok.hasMoreTokens()) {
-      agents.add(tok.nextToken().trim());
-    }
-
-    /**
-     * If there are no agents for robots-parsing, use the
-     * default agent-string. If both are present, our agent-string
-     * should be the first one we advertise to robots-parsing.
-     */
-    if (agents.size() == 0) {
-      if (LOG.isErrorEnabled()) {
-        LOG.error("No agents listed in 'http.robots.agents' property!");
-      }
-    } else { 
-      StringBuffer combinedAgentsString = new StringBuffer(agentName);
-      int index = 0;
-
-      if ((agents.get(0)).equalsIgnoreCase(agentName))
-        index++;
-      else if (LOG.isErrorEnabled()) {
-        LOG.error("Agent we advertise (" + agentName
-            + ") not listed first in 'http.robots.agents' property!");
+    // If there are any other agents specified, append those to the list of agents
+    String otherAgents = conf.get("http.robots.agents");
+    if(otherAgents != null && !otherAgents.trim().isEmpty()) {
+      StringTokenizer tok = new StringTokenizer(otherAgents, ",");
+      StringBuilder sb = new StringBuilder(agentNames);
+      while (tok.hasMoreTokens()) {
+        String str = tok.nextToken().trim();
+        if (str.equals("*") || str.equals(agentName)) {
+          // skip wildcard "*" or agent name itself
+          // (required for backward compatibility, cf. NUTCH-1715 and NUTCH-1718)
+        } else {
+          sb.append(",").append(str);
+        }
       }
 
-      // append all the agents from the http.robots.agents property
-      for(; index < agents.size(); index++) {
-        combinedAgentsString.append(", " + agents.get(index));
-      }
-
-      this.agentNames = combinedAgentsString.toString();
+      agentNames = sb.toString();
     }
   }
 
@@ -137,8 +121,8 @@ public abstract class RobotRulesParser i
    *    
    * @param url A string containing url
    * @param content Contents of the robots file in a byte array 
-   * @param contentType The 
-   * @param robotName A string containing value of  
+   * @param contentType The content type of the robots file
+   * @param robotName A string containing all the robots agent names used by parser for matching
    * @return BaseRobotRules object 
    */
   public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) {
@@ -160,23 +144,18 @@ public abstract class RobotRulesParser i
   /** command-line main for testing */
   public static void main(String[] argv) {
 
-    if (argv.length < 3) {
+    if (argv.length != 3) {
       System.err.println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
       System.err.println("    <robots-file> - Input robots.txt file which will be parsed.");
       System.err.println("    <url-file>    - Contains input URLs (1 per line) which are tested against the rules.");
-      System.err.println("    <agent-names> - Input agent name. Multiple agent names can be specified using spaces.");
+      System.err.println("    <agent-names> - Input agent names. Multiple agent names can be provided using");
+      System.err.println("                    comma as a delimiter without any spaces.");
       System.exit(-1);
     }
 
     try {
-      StringBuilder agentNames = new StringBuilder();
-      for(int counter = 2; counter < argv.length; counter++) 
-        agentNames.append(argv[counter]).append(",");
-
-      agentNames.deleteCharAt(agentNames.length()-1);
-
       byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
-      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", agentNames.toString());
+      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]);
 
       LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
       String testPath = testsIn.readLine().trim();

Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Fri Jun 20 22:15:43 2014
@@ -2,6 +2,8 @@ Nutch Change Log
 
 Nutch Current Development
 
+* NUTCH-1718 redefine http.robots.agents as "additional agent names" (snagel, Tejas Patil, Daniel Kugel)
+
 * NUTCH-1794 IndexingFilterChecker to optionally dumpText (markus)
 
 * NUTCH-1590 [SECURITY] Frame injection vulnerability in published Javadoc (jnioche)

Modified: nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/nutch/trunk/conf/nutch-default.xml?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/trunk/conf/nutch-default.xml (original)
+++ nutch/trunk/conf/nutch-default.xml Fri Jun 20 22:15:43 2014
@@ -89,11 +89,18 @@
 
 <property>
   <name>http.robots.agents</name>
-  <value>*</value>
-  <description>The agent strings we'll look for in robots.txt files,
-  comma-separated, in decreasing order of precedence. You should
-  put the value of http.agent.name as the first agent name, and keep the
-  default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+  <value></value>
+  <description>Any other agents, apart from 'http.agent.name', that the robots
+  parser would look for in robots.txt. Multiple agents can be provided using
+  a comma as a delimiter, e.g., mybot,foo-spider,bar-crawler.
+
+  The ordering of agents does NOT matter and the robots parser would make a
+  decision based on the agent which matches first against the robots rules.
+  Also, there is NO need to add a wildcard (i.e., "*") to this string as the
+  robots parser would smartly take care of a no-match situation.
+
+  If no value is specified, by default the HTTP agent (i.e., 'http.agent.name')
+  would be used for user-agent matching by the robots parser.
   </description>
 </property>
 

Modified: nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher.java Fri Jun 20 22:15:43 2014
@@ -1475,10 +1475,7 @@ public class Fetcher extends Configured 
   }
 
   private void checkConfiguration() {
-
-    // ensure that a value has been set for the agent name and that that
-    // agent name is the first value in the agents we advertise for robot
-    // rules parsing
+    // ensure that a value has been set for the agent name
     String agentName = getConf().get("http.agent.name");
     if (agentName == null || agentName.trim().length() == 0) {
       String message = "Fetcher: No agents listed in 'http.agent.name'"
@@ -1487,25 +1484,6 @@ public class Fetcher extends Configured 
         LOG.error(message);
       }
       throw new IllegalArgumentException(message);
-    } else {
-
-      // get all of the agents that we advertise
-      String agentNames = getConf().get("http.robots.agents");
-      StringTokenizer tok = new StringTokenizer(agentNames, ",");
-      ArrayList<String> agents = new ArrayList<String>();
-      while (tok.hasMoreTokens()) {
-        agents.add(tok.nextToken().trim());
-      }
-
-      // if the first one is not equal to our agent name, log fatal and throw
-      // an exception
-      if (!(agents.get(0)).equalsIgnoreCase(agentName)) {
-        String message = "Fetcher: Your 'http.agent.name' value should be "
-            + "listed first in 'http.robots.agents' property.";
-        if (LOG.isWarnEnabled()) {
-          LOG.warn(message);
-        }
-      }
     }
   }
 

Modified: nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java?rev=1604291&r1=1604290&r2=1604291&view=diff
==============================================================================
--- nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java (original)
+++ nutch/trunk/src/java/org/apache/nutch/protocol/RobotRulesParser.java Fri Jun 20 22:15:43 2014
@@ -85,43 +85,27 @@ public abstract class RobotRulesParser i
 
     // Grab the agent names we advertise to robots files.
     String agentName = conf.get("http.agent.name");
-    if (null == agentName) {
+    if (agentName == null || (agentName = agentName.trim()).isEmpty()) {
       throw new RuntimeException("Agent name not configured!");
     }
+    agentNames = agentName;
 
-    String agentNames = conf.get("http.robots.agents");
-    StringTokenizer tok = new StringTokenizer(agentNames, ",");
-    ArrayList<String> agents = new ArrayList<String>();
-    while (tok.hasMoreTokens()) {
-      agents.add(tok.nextToken().trim());
-    }
-
-    /**
-     * If there are no agents for robots-parsing, use the
-     * default agent-string. If both are present, our agent-string
-     * should be the first one we advertise to robots-parsing.
-     */
-    if (agents.size() == 0) {
-      if (LOG.isErrorEnabled()) {
-        LOG.error("No agents listed in 'http.robots.agents' property!");
-      }
-    } else { 
-      StringBuffer combinedAgentsString = new StringBuffer(agentName);
-      int index = 0;
-
-      if ((agents.get(0)).equalsIgnoreCase(agentName))
-        index++;
-      else if (LOG.isErrorEnabled()) {
-        LOG.error("Agent we advertise (" + agentName
-            + ") not listed first in 'http.robots.agents' property!");
+    // If there are any other agents specified, append those to the list of agents
+    String otherAgents = conf.get("http.robots.agents");
+    if(otherAgents != null && !otherAgents.trim().isEmpty()) {
+      StringTokenizer tok = new StringTokenizer(otherAgents, ",");
+      StringBuilder sb = new StringBuilder(agentNames);
+      while (tok.hasMoreTokens()) {
+        String str = tok.nextToken().trim();
+        if (str.equals("*") || str.equals(agentName)) {
+          // skip wildcard "*" or agent name itself
+          // (required for backward compatibility, cf. NUTCH-1715 and NUTCH-1718)
+        } else {
+          sb.append(",").append(str);
+        }
       }
 
-      // append all the agents from the http.robots.agents property
-      for(; index < agents.size(); index++) {
-        combinedAgentsString.append(", " + agents.get(index));
-      }
-
-      this.agentNames = combinedAgentsString.toString();
+      agentNames = sb.toString();
     }
   }
 
@@ -137,8 +121,8 @@ public abstract class RobotRulesParser i
    *    
    * @param url A string containing url
    * @param content Contents of the robots file in a byte array 
-   * @param contentType The 
-   * @param robotName A string containing value of  
+   * @param contentType The content type of the robots file
+   * @param robotName A string containing all the robots agent names used by parser for matching
    * @return BaseRobotRules object 
    */
   public BaseRobotRules parseRules (String url, byte[] content, String contentType, String robotName) {
@@ -160,30 +144,24 @@ public abstract class RobotRulesParser i
   /** command-line main for testing */
   public static void main(String[] argv) {
 
-    if (argv.length < 3) {
+    if (argv.length != 3) {
       System.err.println("Usage: RobotRulesParser <robots-file> <url-file> <agent-names>\n");
       System.err.println("\tThe <robots-file> will be parsed as a robots.txt file,");
       System.err.println("\tusing the given <agent-name> to select rules.  URLs ");
       System.err.println("\twill be read (one per line) from <url-file>, and tested");
-      System.err.println("\tagainst the rules. Multiple agent names can be specified using spaces.");
+      System.err.println("\tagainst the rules. Multiple agent names can be provided using");
+      System.err.println("\tcomma as a delimiter without any spaces.");
       System.exit(-1);
     }
 
     try {
-      StringBuilder agentNames = new StringBuilder();
-      for(int counter = 2; counter < argv.length; counter++) 
-        agentNames.append(argv[counter]).append(",");
-
-      agentNames.deleteCharAt(agentNames.length()-1);
-
       byte[] robotsBytes = Files.toByteArray(new File(argv[0]));
-      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", agentNames.toString());
+      BaseRobotRules rules = robotParser.parseContent(argv[0], robotsBytes, "text/plain", argv[2]);
 
       LineNumberReader testsIn = new LineNumberReader(new FileReader(argv[1]));
       String testPath = testsIn.readLine().trim();
       while (testPath != null) {
-        System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not allowed") +
-            ":\t" + testPath);
+        System.out.println( (rules.isAllowed(testPath) ? "allowed" : "not allowed") + ":\t" + testPath);
         testPath = testsIn.readLine();
       }
       testsIn.close();