You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by ab...@apache.org on 2006/07/20 02:04:57 UTC

svn commit: r423670 - /lucene/nutch/trunk/conf/nutch-default.xml

Author: ab
Date: Wed Jul 19 17:04:56 2006
New Revision: 423670

URL: http://svn.apache.org/viewvc?rev=423670&view=rev
Log:
Set http.agent.name and related properties to empty values. This forces
people to put some sensible values there, and protects the Nutch project
from being blamed for someone else's misbehavior.

Modified:
    lucene/nutch/trunk/conf/nutch-default.xml

Modified: lucene/nutch/trunk/conf/nutch-default.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/conf/nutch-default.xml?rev=423670&r1=423669&r2=423670&view=diff
==============================================================================
--- lucene/nutch/trunk/conf/nutch-default.xml (original)
+++ lucene/nutch/trunk/conf/nutch-default.xml Wed Jul 19 17:04:56 2006
@@ -33,15 +33,31 @@
 
 <property>
   <name>http.agent.name</name>
-  <value>NutchCVS</value>
-  <description>Our HTTP 'User-Agent' request header.</description>
+  <value></value>
+  <description>HTTP 'User-Agent' request header. MUST NOT be empty - 
+  please set this to a single word uniquely related to your organization.
+
+  NOTE: You should also check other related properties:
+
+	http.robots.agents
+	http.agent.description
+	http.agent.url
+	http.agent.email
+	http.agent.version
+
+  and set their values appropriately.
+
+  </description>
 </property>
 
 <property>
   <name>http.robots.agents</name>
-  <value>NutchCVS,Nutch,*</value>
+  <value>*</value>
   <description>The agent strings we'll look for in robots.txt files,
-  comma-separated, in decreasing order of precedence.</description>
+  comma-separated, in decreasing order of precedence. You should
+  put the value of http.agent.name as the first agent name, and keep the
+  default * at the end of the list. E.g.: BlurflDev,Blurfl,*
+  </description>
 </property>
 
 <property>
@@ -55,7 +71,7 @@
 
 <property>
   <name>http.agent.description</name>
-  <value>Nutch</value>
+  <value></value>
   <description>Further description of our bot- this text is used in
   the User-Agent header.  It appears in parenthesis after the agent name.
   </description>
@@ -63,22 +79,26 @@
 
 <property>
   <name>http.agent.url</name>
-  <value>http://lucene.apache.org/nutch/bot.html</value>
+  <value></value>
   <description>A URL to advertise in the User-Agent header.  This will 
-   appear in parenthesis after the agent name.
+   appear in parentheses after the agent name. Custom dictates that this
+   should be a URL of a page explaining the purpose and behavior of this
+   crawler.
   </description>
 </property>
 
 <property>
   <name>http.agent.email</name>
-  <value>nutch-agent@lucene.apache.org</value>
+  <value></value>
   <description>An email address to advertise in the HTTP 'From' request
-   header and User-Agent header.</description>
+   header and User-Agent header. A good practice is to mangle this
+   address (e.g. 'info at example dot com') to avoid harvesting by spammers.
+  </description>
 </property>
 
 <property>
   <name>http.agent.version</name>
-  <value>0.8-dev</value>
+  <value>Nutch-0.8-dev</value>
   <description>A version string to advertise in the User-Agent 
    header.</description>
 </property>