You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by te...@apache.org on 2013/05/21 03:29:55 UTC
svn commit: r1484637 - in /nutch/branches/2.x: CHANGES.txt
src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
Author: tejasp
Date: Tue May 21 01:29:55 2013
New Revision: 1484637
URL: http://svn.apache.org/r1484637
Log:
NUTCH-1513 Support Robots.txt for Ftp urls
Added:
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
Modified:
nutch/branches/2.x/CHANGES.txt
nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
Modified: nutch/branches/2.x/CHANGES.txt
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/CHANGES.txt?rev=1484637&r1=1484636&r2=1484637&view=diff
==============================================================================
--- nutch/branches/2.x/CHANGES.txt (original)
+++ nutch/branches/2.x/CHANGES.txt Tue May 21 01:29:55 2013
@@ -2,6 +2,8 @@ Nutch Change Log
Release 2.2 - Current Development
+* NUTCH-1513 Support Robots.txt for Ftp urls (tejasp)
+
* NUTCH-1053 Parsing of RSS feeds fails (tejasp)
* NUTCH-1563 FetchSchedule#getFields is never used by GeneratorJob (Feng)
Modified: nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java?rev=1484637&r1=1484636&r2=1484637&view=diff
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java (original)
+++ nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/Ftp.java Tue May 21 01:29:55 2013
@@ -28,7 +28,6 @@ import org.apache.commons.net.ftp.FTPFil
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.Content;
-import org.apache.nutch.protocol.RobotRulesParser;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolOutput;
import org.apache.nutch.protocol.ProtocolStatusCodes;
@@ -46,8 +45,6 @@ import crawlercommons.robots.BaseRobotRu
* {@code ftp.server.timeout}, {@code ftp.password},
* {@code ftp.keep.connection} and {@code ftp.follow.talk}.
* For details see "FTP properties" section in {@code nutch-default.xml}.
- *
- * @author John Xing
*/
public class Ftp implements Protocol {
@@ -89,8 +86,11 @@ public class Ftp implements Protocol {
private Configuration conf;
+ private FtpRobotRulesParser robots = null;
+
// constructor
public Ftp() {
+ robots = new FtpRobotRulesParser();
}
/** Set the timeout. */
@@ -179,6 +179,7 @@ public class Ftp implements Protocol {
this.serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
this.keepConnection = conf.getBoolean("ftp.keep.connection", false);
this.followTalk = conf.getBoolean("ftp.follow.talk", false);
+ this.robots.setConf(conf);
}
/**
@@ -257,11 +258,9 @@ public class Ftp implements Protocol {
}
/**
- * Currently, no robots parsing is done for ftp protocol
- * and this returns a set of empty rules which will allow every url.
- * There a jira logged for the same NUTCH-1513
+ * Get the robots rules for a given url
*/
public BaseRobotRules getRobotRules(String url, WebPage page) {
- return RobotRulesParser.EMPTY_RULES;
+ return robots.getRobotRulesSet(this, url);
}
}
Added: nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
URL: http://svn.apache.org/viewvc/nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java?rev=1484637&view=auto
==============================================================================
--- nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java (added)
+++ nutch/branches/2.x/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java Tue May 21 01:29:55 2013
@@ -0,0 +1,100 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.net.URL;
+
+import org.apache.commons.io.IOUtils;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatusCodes;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.apache.nutch.storage.WebPage;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
+/**
+ * This class is used for parsing robots rules for urls belonging to the
+ * FTP protocol.
+ * It extends the generic {@link RobotRulesParser} class and contains the
+ * Ftp protocol specific implementation for obtaining the robots file:
+ * the file is requested through the {@link Ftp} protocol object.
+ * The parsing routine ({@code parseRules}), the rules cache ({@code CACHE}),
+ * {@code EMPTY_RULES} and {@code agentNames} used below are not declared in
+ * this class; presumably they are inherited from {@link RobotRulesParser}
+ * (TODO confirm against the parent class).
+ */
+public class FtpRobotRulesParser extends RobotRulesParser {
+
+ private static final String CONTENT_TYPE = "text/plain"; // content type handed to parseRules for the fetched robots file
+ public static final Logger LOG = LoggerFactory.getLogger(FtpRobotRulesParser.class);
+
+ FtpRobotRulesParser() { } // package-private; no configuration is supplied through this constructor
+
+ public FtpRobotRulesParser(Configuration conf) {
+ super(conf);
+ }
+
+ /**
+ * Returns the robots rules for the host of the given {@link URL}.
+ * The rules are looked up in the cache under the key
+ * {@code protocol + ":" + host} (both lower-cased). On a cache miss an
+ * Ftp request for {@code /robots.txt} is sent to that host, the returned
+ * content is parsed and the resulting rules object is cached to avoid
+ * re-work in future.
+ * If the request status is not {@link ProtocolStatusCodes#SUCCESS},
+ * {@code EMPTY_RULES} is used and cached instead.
+ * If anything is thrown while fetching or parsing, the failure is logged
+ * at info level and {@code EMPTY_RULES} is returned without being cached,
+ * so a later call for the same host attempts the fetch again.
+ *
+ * @param ftp The {@link Protocol} object; it is cast to {@link Ftp}
+ * @param url URL whose protocol and host select the robots file
+ *
+ * @return robotRules A {@link BaseRobotRules} object for the rules
+ */
+ public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {
+
+ String protocol = url.getProtocol().toLowerCase(); // normalize to lower case
+ String host = url.getHost().toLowerCase(); // normalize to lower case
+
+ BaseRobotRules robotRules = (SimpleRobotRules) CACHE.get(protocol + ":" + host); // NOTE(review): this cast runs outside the try block below, so a failed cast would propagate to the caller
+
+ boolean cacheRule = true; // cleared in the catch block so that a failed fetch is not cached
+
+ if (robotRules == null) { // cache miss
+
+ if (LOG.isTraceEnabled())
+ LOG.trace("cache miss " + url);
+
+ try {
+ String robotsUrl = new URL(url, "/robots.txt").toString(); // "/robots.txt" is resolved against url
+ ProtocolOutput output = ((Ftp)ftp).getProtocolOutput(robotsUrl, new WebPage()); // assumes ftp is an Ftp; a failed cast is caught below
+ int statusCode = output.getStatus().getCode();
+
+ if (statusCode == ProtocolStatusCodes.SUCCESS) {
+ robotRules = parseRules(url.toString(), output.getContent().getContent(),
+ CONTENT_TYPE, agentNames); // NOTE(review): passes url, not robotsUrl, as the first argument - confirm what parseRules expects
+ } else {
+ robotRules = EMPTY_RULES; // use default rules
+ }
+ } catch (Throwable t) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString()); // only t.toString() is logged, not the stack trace
+ }
+ cacheRule = false; // skip caching so the fetch is attempted again on a later call
+ robotRules = EMPTY_RULES;
+ }
+
+ if (cacheRule)
+ CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
+ }
+ return robotRules;
+ }
+}