You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by si...@apache.org on 2009/02/24 10:54:30 UTC

svn commit: r747319 - in /lucene/nutch/trunk: ./ src/java/org/apache/nutch/fetcher/ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/test/ src/test/org/apache/nutch/fetcher/

Author: siren
Date: Tue Feb 24 09:54:30 2009
New Revision: 747319

URL: http://svn.apache.org/viewvc?rev=747319&view=rev
Log:
NUTCH-247 - Robot parser to restrict, contributed by kubes

Modified:
    lucene/nutch/trunk/CHANGES.txt
    lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
    lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
    lucene/nutch/trunk/src/test/crawl-tests.xml
    lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java

Modified: lucene/nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/CHANGES.txt?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/CHANGES.txt (original)
+++ lucene/nutch/trunk/CHANGES.txt Tue Feb 24 09:54:30 2009
@@ -357,6 +357,8 @@
 133. NUTCH-626 - Fetcher2 breaks out the domain with db.ignore.external.links
      set at cross domain redirects (Remco Verhoef, dogacan via siren)
 
+134. NUTCH-247 - Robot parser to restrict (kubes, siren)
+
 Release 0.9 - 2007-04-02
 
  1. Changed log4j confiquration to log to stdout on commandline

Modified: lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java (original)
+++ lucene/nutch/trunk/src/java/org/apache/nutch/fetcher/Fetcher2.java Tue Feb 24 09:54:30 2009
@@ -933,6 +933,8 @@
   public void fetch(Path segment, int threads, boolean parsing)
     throws IOException {
 
+    checkConfiguration();
+
     if (LOG.isInfoEnabled()) {
       LOG.info("Fetcher: starting");
       LOG.info("Fetcher: segment: " + segment);
@@ -995,4 +997,40 @@
     fetcher.fetch(segment, threads, parsing);              // run the Fetcher
 
   }
+
+  /** Validates the agent configuration: throws IllegalArgumentException when
+   *  'http.agent.name' is unset or blank, and only logs a warning when that
+   *  name is not the first entry of 'http.robots.agents'. */
+  private void checkConfiguration() {
+    String agentName = getConf().get("http.agent.name");
+    if (agentName == null || agentName.trim().length() == 0) {
+      String message = "Fetcher: No agents listed in 'http.agent.name'"
+          + " property.";
+      if (LOG.isFatalEnabled()) {
+        LOG.fatal(message);
+      }
+      throw new IllegalArgumentException(message);
+    }
+
+    // Collect the advertised agents; a missing property yields an empty list
+    // because StringTokenizer throws NullPointerException on a null string.
+    String agentNames = getConf().get("http.robots.agents");
+    ArrayList<String> agents = new ArrayList<String>();
+    if (agentNames != null) {
+      StringTokenizer tok = new StringTokenizer(agentNames, ",");
+      while (tok.hasMoreTokens()) {
+        agents.add(tok.nextToken().trim());
+      }
+    }
+    // Warn (never fail) if our name is not advertised first; the isEmpty()
+    // guard keeps agents.get(0) from throwing on an empty list.
+    if (agents.isEmpty() || !agents.get(0).equalsIgnoreCase(agentName)) {
+      String message = "Fetcher: Your 'http.agent.name' value should be "
+          + "listed first in 'http.robots.agents' property.";
+      if (LOG.isWarnEnabled()) {
+        LOG.warn(message);
+      }
+    }
+  }
+
 }

Modified: lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java (original)
+++ lucene/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/RobotRulesParser.java Tue Feb 24 09:54:30 2009
@@ -223,9 +223,6 @@
     // Grab the agent names we advertise to robots files.
     //
     String agentName = conf.get("http.agent.name");
-    if (null == agentName) {
-      throw new RuntimeException("Agent name not configured!");
-    }
     String agentNames = conf.get("http.robots.agents");
     StringTokenizer tok = new StringTokenizer(agentNames, ",");
     ArrayList agents = new ArrayList();
@@ -233,23 +230,6 @@
       agents.add(tok.nextToken().trim());
     }
 
-    //
-    // If there are no agents for robots-parsing, use our
-    // default agent-string.  If both are present, our agent-string
-    // should be the first one we advertise to robots-parsing.
-    //
-    if (agents.size() == 0) {
-      agents.add(agentName);
-      if (LOG.isFatalEnabled()) {
-        LOG.fatal("No agents listed in 'http.robots.agents' property!");
-      }
-    } else if (!((String)agents.get(0)).equalsIgnoreCase(agentName)) {
-      agents.add(0, agentName);
-      if (LOG.isFatalEnabled()) {
-        LOG.fatal("Agent we advertise (" + agentName
-                + ") not listed first in 'http.robots.agents' property!");
-      }
-    }
     setRobotNames((String[]) agents.toArray(new String[agents.size()]));
   }
 

Modified: lucene/nutch/trunk/src/test/crawl-tests.xml
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/crawl-tests.xml?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/crawl-tests.xml (original)
+++ lucene/nutch/trunk/src/test/crawl-tests.xml Tue Feb 24 09:54:30 2009
@@ -33,5 +33,20 @@
   <value>test-nutch,*</value>
 </property>
 
+<property>
+  <name>http.agent.name.check</name>
+  <value>true</value>
+</property>
+
+<property>                                                                                                                                                   
+  <name>http.robots.agents</name>                                                                                                                            
+  <value>test-nutch,*</value>                                                                                                                                
+  <description>The agent strings we'll look for in robots.txt files,                                                                                         
+  comma-separated, in decreasing order of precedence. You should                                                                                             
+  put the value of http.agent.name as the first agent name, and keep the                                                                                     
+  default * at the end of the list. E.g.: BlurflDev,Blurfl,*                                                                                                 
+  </description>                                                                                                                                             
+</property>
+
 </configuration>
 

Modified: lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java
URL: http://svn.apache.org/viewvc/lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java?rev=747319&r1=747318&r2=747319&view=diff
==============================================================================
--- lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java (original)
+++ lucene/nutch/trunk/src/test/org/apache/nutch/fetcher/TestFetcher.java Tue Feb 24 09:54:30 2009
@@ -167,5 +167,24 @@
   private void addUrl(ArrayList<String> urls, String page) {
     urls.add("http://127.0.0.1:" + server.getListeners()[0].getPort() + "/" + page);
   }
+  
+  /** Checks that fetch() rejects a blank 'http.agent.name' by throwing an
+   *  IllegalArgumentException carrying the expected message. */
+  public void testAgentNameCheck() {
+    String failure = "no exception thrown for blank 'http.agent.name'";
+    conf.set("http.agent.name", "");
+    try {
+      conf.setBoolean("fetcher.parse", true);
+      Fetcher2 fetcher = new Fetcher2(conf);
+      fetcher.fetch(null, 1, false);
+    } catch (IllegalArgumentException iae) {
+      boolean expected = ("Fetcher: No agents listed in "
+          + "'http.agent.name' property.").equals(iae.getMessage());
+      failure = expected ? null : "unexpected message: " + iae.getMessage();
+    } catch (Exception e) {
+      failure = "unexpected exception: " + e; // report instead of swallowing
+    }
+    assertTrue(failure, failure == null);
+  }
 
 }