Posted to commits@nutch.apache.org by te...@apache.org on 2013/05/21 03:31:44 UTC

svn commit: r1484638 - in /nutch/trunk: ./ src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/ src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/

Author: tejasp
Date: Tue May 21 01:31:44 2013
New Revision: 1484638

URL: http://svn.apache.org/r1484638
Log:
NUTCH-1513 Support Robots.txt for Ftp urls

Added:
    nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
Modified:
    nutch/trunk/CHANGES.txt
    nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
    nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java

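For reference, the new parser locates a server's robots file by resolving "/robots.txt" against any URL on that host with java.net.URL, mirroring what the added FtpRobotRulesParser below does before fetching the file over FTP. A minimal standalone sketch (the FTP host is hypothetical):

    import java.net.URL;

    public class FtpRobotsUrlDemo {
      public static void main(String[] args) throws Exception {
        // Resolve the per-host robots.txt location from an arbitrary URL
        // on the server, mirroring new URL(url, "/robots.txt") in the
        // parser added by this commit.
        URL page = new URL("ftp://ftp.example.org/pub/data/file.txt");
        URL robots = new URL(page, "/robots.txt");
        System.out.println(robots);   // prints ftp://ftp.example.org/robots.txt
      }
    }
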
Modified: nutch/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/trunk/CHANGES.txt?rev=1484638&r1=1484637&r2=1484638&view=diff
==============================================================================
--- nutch/trunk/CHANGES.txt (original)
+++ nutch/trunk/CHANGES.txt Tue May 21 01:31:44 2013
@@ -2,6 +2,8 @@ Nutch Change Log
 
 (trunk): Current Development
 
+* NUTCH-1513 Support Robots.txt for Ftp urls (tejas)
+
 * NUTCH-1249 and NUTCH-1275 : Resolve all issues flagged up by adding javac -Xlint argument (tejasp)
 
 * NUTCH-1053 Parsing of RSS feeds fails (tejasp)

Modified: nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java?rev=1484638&r1=1484637&r2=1484638&view=diff
==============================================================================
--- nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java (original)
+++ nutch/trunk/src/plugin/lib-http/src/java/org/apache/nutch/protocol/http/api/HttpBase.java Tue May 21 01:31:44 2013
@@ -131,8 +131,6 @@ public abstract class HttpBase implement
   public Configuration getConf() {
     return this.conf;
   }
-   
-  
   
   public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
     

Modified: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1484638&r1=1484637&r2=1484638&view=diff
==============================================================================
--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)
+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Tue May 21 01:31:44 2013
@@ -29,11 +29,9 @@ import org.apache.nutch.net.protocols.Re
 import org.apache.hadoop.conf.Configuration;
 
 import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.RobotRulesParser;
 import org.apache.nutch.protocol.Protocol;
 import org.apache.nutch.protocol.ProtocolOutput;
 import org.apache.nutch.protocol.ProtocolStatus;
-
 import crawlercommons.robots.BaseRobotRules;
 
 import java.net.URL;
@@ -48,8 +46,6 @@ import java.io.IOException;
  *                             {@code ftp.server.timeout}, {@code ftp.password}, 
  *                             {@code ftp.keep.connection} and {@code ftp.follow.talk}.
  * For details see "FTP properties" section in {@code nutch-default.xml}.
- *
- * @author John Xing
  */
 public class Ftp implements Protocol {
 
@@ -84,9 +80,11 @@ public class Ftp implements Protocol {
 
   private Configuration conf;
 
+  private FtpRobotRulesParser robots = null;
 
   // constructor
   public Ftp() {
+    robots = new FtpRobotRulesParser();
   }
 
   /** Set the timeout. */
@@ -240,6 +238,7 @@ public class Ftp implements Protocol {
     this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
     this.keepConnection = conf.getBoolean("ftp.keep.connection", false);
     this.followTalk = conf.getBoolean("ftp.follow.talk", false);
+    this.robots.setConf(conf);
   }
 
   /**
@@ -250,12 +249,10 @@ public class Ftp implements Protocol {
   }
 
   /** 
-   * Currently, no robots parsing is done for ftp protocol 
-   * and this returns a set of empty rules which will allow every url.
-   * There a jira logged for the same NUTCH-1513
+   * Get the robots rules for a given URL.
    */
   public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
-    return RobotRulesParser.EMPTY_RULES;
+    return robots.getRobotRulesSet(this, url);
   }
 
   public int getBufferSize() {

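With the change above, Ftp.getRobotRules() returns the rules actually published by the server instead of RobotRulesParser.EMPTY_RULES. A hedged usage sketch; the host name is hypothetical and the configuration bootstrap via NutchConfiguration is an assumption about the caller, not part of this commit:

    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.io.Text;
    import org.apache.nutch.crawl.CrawlDatum;
    import org.apache.nutch.protocol.ftp.Ftp;
    import org.apache.nutch.util.NutchConfiguration;
    import crawlercommons.robots.BaseRobotRules;

    public class FtpRobotsCheck {
      public static void main(String[] args) {
        Configuration conf = NutchConfiguration.create();
        Ftp ftp = new Ftp();
        ftp.setConf(conf);   // also configures the embedded FtpRobotRulesParser

        // Hypothetical URL; ask whether its host's robots.txt permits fetching it.
        Text url = new Text("ftp://ftp.example.org/pub/data/file.txt");
        BaseRobotRules rules = ftp.getRobotRules(url, new CrawlDatum());
        System.out.println("allowed: " + rules.isAllowed(url.toString()));
      }
    }
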
Added: nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java?rev=1484638&view=auto
==============================================================================
--- nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java (added)
+++ nutch/trunk/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java Tue May 21 01:31:44 2013
@@ -0,0 +1,99 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
+/**
+ * This class parses robots.txt files for URLs that use the FTP protocol.
+ * It extends the generic {@link RobotRulesParser} class and provides an
+ * FTP-specific implementation for obtaining the robots file.
+ */
+public class FtpRobotRulesParser extends RobotRulesParser {
+
+  private static final String CONTENT_TYPE = "text/plain";
+  public static final Logger LOG = LoggerFactory.getLogger(FtpRobotRulesParser.class);
+
+  FtpRobotRulesParser() { }
+
+  public FtpRobotRulesParser(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * The hosts for which the caching of robots rules is yet to be done,
+   * it sends a Ftp request to the host corresponding to the {@link URL} 
+   * passed, gets robots file, parses the rules and caches the rules object
+   * to avoid re-work in future.
+   * 
+   *  @param ftp The {@link Protocol} object
+   *  @param url URL whose robots rules are requested
+   *  
+   *  @return A {@link BaseRobotRules} object holding the rules
+   */
+  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {
+
+    String protocol = url.getProtocol().toLowerCase();  // normalize to lower case
+    String host = url.getHost().toLowerCase();          // normalize to lower case
+
+    BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host);
+
+    boolean cacheRule = true;
+
+    if (robotRules == null) {                     // cache miss
+      if (LOG.isTraceEnabled())
+        LOG.trace("cache miss " + url);
+
+      try {
+        Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
+        ProtocolOutput output = ((Ftp)ftp).getProtocolOutput(robotsUrl, new CrawlDatum());
+        ProtocolStatus status = output.getStatus();
+
+        if (status.getCode() == ProtocolStatus.SUCCESS) {
+          robotRules =  parseRules(url.toString(), output.getContent().getContent(), 
+                                  CONTENT_TYPE, agentNames);
+        } else {                                       
+          robotRules = EMPTY_RULES;                 // use default rules
+        }
+      } catch (Throwable t) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+        }
+        cacheRule = false;
+        robotRules = EMPTY_RULES;
+      }
+
+      if (cacheRule)
+        CACHE.put(protocol + ":" + host, robotRules);  // cache rules for host
+    }
+    return robotRules;
+  }
+}
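
Two design points in the parser above are easy to miss: rules are cached under the key protocol + ":" + host, so every path on an FTP server shares one parsed rule set, and when fetching robots.txt throws, EMPTY_RULES is returned but deliberately not cached (cacheRule is set to false), so the next URL from that host triggers a fresh attempt rather than permanently treating the server as unrestricted. A sketch of the per-host sharing (hypothetical host; a == b holds only if the first fetch succeeds and its result is cached):

    import java.net.URL;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.nutch.protocol.ftp.Ftp;
    import org.apache.nutch.protocol.ftp.FtpRobotRulesParser;
    import org.apache.nutch.util.NutchConfiguration;
    import crawlercommons.robots.BaseRobotRules;

    public class FtpRobotsCacheDemo {
      public static void main(String[] args) throws Exception {
        Configuration conf = NutchConfiguration.create();
        Ftp ftp = new Ftp();
        ftp.setConf(conf);
        FtpRobotRulesParser parser = new FtpRobotRulesParser(conf);

        // Both URLs live on the same host, so they map to one cache entry
        // ("ftp:ftp.example.org"); the second lookup does no FTP fetch.
        BaseRobotRules a = parser.getRobotRulesSet(ftp, new URL("ftp://ftp.example.org/pub/a.txt"));
        BaseRobotRules b = parser.getRobotRulesSet(ftp, new URL("ftp://ftp.example.org/data/b.txt"));
        System.out.println(a == b);
      }
    }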