You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:24 UTC
[08/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build
for nutch-core and nutch-plugins
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java
new file mode 100644
index 0000000..da25d87
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java
@@ -0,0 +1,595 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+
+import java.net.InetAddress;
+import java.net.Socket;
+
+import java.util.List;
+//import java.util.LinkedList;
+
+import org.apache.commons.net.MalformedServerReplyException;
+
+import org.apache.commons.net.ftp.FTP;
+import org.apache.commons.net.ftp.FTPCommand;
+import org.apache.commons.net.ftp.FTPFile;
+import org.apache.commons.net.ftp.FTPFileEntryParser;
+import org.apache.commons.net.ftp.FTPReply;
+
+import org.apache.commons.net.ftp.FTPConnectionClosedException;
+
+/***********************************************
+ * Client.java encapsulates functionalities necessary for nutch to get dir list
+ * and retrieve file from an FTP server. This class takes care of all low level
+ * details of interacting with an FTP server and provides a convenient higher
+ * level interface.
+ *
+ * Modified from FtpClient.java in apache commons-net.
+ *
+ * Notes by John Xing: ftp server implementations are hardly uniform and none
+ * seems to follow RFCs whole-heartedly. We have no choice, but assume common
+ * denominator as following: (1) Use stream mode for data transfer. Block mode
+ * will be better for multiple file downloading and partial file downloading.
+ * However not every ftpd has block mode support. (2) Use passive mode for data
+ * connection. So Nutch will work if we run behind firewall. (3) Data connection
+ * is opened/closed per ftp command for the reasons listed in (1). There are ftp
+ * servers out there, when partial downloading is enforced by closing data
+ * channel socket on our client side, the server side immediately closes control
+ * channel (socket). Our codes deal with such a bad behavior. (4) LIST is used
+ * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but
+ * not as ubiquitously implemented as LIST. (5) Avoid using ABOR in single
+ * thread? Do not use it at all.
+ *
+ * About exceptions: some specific failures are re-thrown as one of the
+ * FtpException* classes. In fact, each method either throws one of the
+ * FtpException* classes or passes an IOException through.
+ *
+ * @author John Xing
+ ***********************************************/
+
+public class Client extends FTP {
+ private int __dataTimeout; // SO_TIMEOUT (ms) applied to each new data socket; a negative value means "do not set"
+ private int __passivePort; // data port parsed from the last PASV reply; -1 until one has been parsed
+ private String __passiveHost; // data host parsed from the last PASV reply; null until one has been parsed
+ // private int __fileType, __fileFormat;
+ private boolean __remoteVerificationEnabled; // when true, a data socket failing verifyRemote() is closed and rejected
+ // private FTPFileEntryParser __entryParser;
+ private String __systemName; // cached SYST answer; reset to null by __initDefaults()
+
+ /**
+  * Creates a client whose per-connection state is reset, with no data
+  * connection timeout configured and remote verification switched on.
+  */
+ public Client() {
+   this.__initDefaults();
+   this.__remoteVerificationEnabled = true;
+   this.__dataTimeout = -1;
+ }
+
+ // Resets the per-connection state: the cached system name and the passive
+ // endpoint. File type/format and entry parser are no longer tracked here.
+ private void __initDefaults() {
+   this.__systemName = null;
+   this.__passivePort = -1;
+   this.__passiveHost = null;
+ }
+
+ // parse reply for pass()
+ private void __parsePassiveModeReply(String reply)
+ throws MalformedServerReplyException {
+ int i, index, lastIndex;
+ String octet1, octet2;
+ StringBuffer host;
+
+ reply = reply.substring(reply.indexOf('(') + 1, reply.indexOf(')')).trim();
+
+ host = new StringBuffer(24);
+ lastIndex = 0;
+ index = reply.indexOf(',');
+ host.append(reply.substring(lastIndex, index));
+
+ for (i = 0; i < 3; i++) {
+ host.append('.');
+ lastIndex = index + 1;
+ index = reply.indexOf(',', lastIndex);
+ host.append(reply.substring(lastIndex, index));
+ }
+
+ lastIndex = index + 1;
+ index = reply.indexOf(',', lastIndex);
+
+ octet1 = reply.substring(lastIndex, index);
+ octet2 = reply.substring(index + 1);
+
+ // index and lastIndex now used as temporaries
+ try {
+ index = Integer.parseInt(octet1);
+ lastIndex = Integer.parseInt(octet2);
+ } catch (NumberFormatException e) {
+ throw new MalformedServerReplyException(
+ "Could not parse passive host information.\nServer Reply: " + reply);
+ }
+
+ index <<= 8;
+ index |= lastIndex;
+
+ __passiveHost = host.toString();
+ __passivePort = index;
+ }
+
+ /**
+  * Opens a passive-mode data connection for one FTP command: sends PASV,
+  * connects to the endpoint the server announced, then sends the command.
+  *
+  * @param command
+  *          FTPCommand code of the command that needs the data connection
+  * @param arg
+  *          argument sent along with the command
+  * @return the connected data socket, or null if the server did not answer
+  *         the command with a positive preliminary reply
+  * @throws IOException
+  *           if talking to the server or connecting the data socket fails
+  * @throws FtpExceptionCanNotHaveDataConnection
+  *           if PASV is refused, its reply cannot be parsed, or remote
+  *           verification rejects the peer of the data socket
+  */
+ protected Socket __openPassiveDataConnection(int command, String arg)
+     throws IOException, FtpExceptionCanNotHaveDataConnection {
+
+   if (pasv() != FTPReply.ENTERING_PASSIVE_MODE)
+     throw new FtpExceptionCanNotHaveDataConnection("pasv() failed. "
+         + getReplyString());
+
+   try {
+     __parsePassiveModeReply(getReplyStrings()[0]);
+   } catch (MalformedServerReplyException e) {
+     throw new FtpExceptionCanNotHaveDataConnection(e.getMessage());
+   }
+
+   // 20040317, xing: a retry loop that re-issued PASV while the server kept
+   // announcing the previous port used to live here to accommodate
+   // ill-behaved servers; it is disabled.
+
+   Socket socket = _socketFactory_.createSocket(__passiveHost, __passivePort);
+
+   // From here on the socket must be closed on every path that does not
+   // hand it to the caller, including exceptions thrown by the calls below.
+   boolean handedOff = false;
+   try {
+     if (!FTPReply.isPositivePreliminary(sendCommand(command, arg))) {
+       return null;
+     }
+
+     if (__remoteVerificationEnabled && !verifyRemote(socket)) {
+       InetAddress host1 = socket.getInetAddress();
+       InetAddress host2 = getRemoteAddress();
+
+       // our precaution
+       throw new FtpExceptionCanNotHaveDataConnection(
+           "Host attempting data connection " + host1.getHostAddress()
+               + " is not same as server " + host2.getHostAddress()
+               + " So we intentionally close it for security precaution.");
+     }
+
+     if (__dataTimeout >= 0)
+       socket.setSoTimeout(__dataTimeout);
+
+     handedOff = true;
+     return socket;
+   } finally {
+     if (!handedOff) {
+       try {
+         socket.close();
+       } catch (IOException ignored) {
+         // best effort: do not let a failing close hide the real outcome
+       }
+     }
+   }
+ }
+
+ /***
+  * Sets the timeout, in milliseconds, for data connections. The value is
+  * applied to a data connection immediately after it has been opened.
+  *
+  * @param timeout
+  *          timeout in milliseconds
+  ***/
+ public void setDataTimeout(int timeout) {
+   this.__dataTimeout = timeout;
+ }
+
+ /***
+  * Restores the connection parameters to their default values and closes the
+  * connection to the FTP server.
+  * <p>
+  *
+  * @exception IOException
+  *              If an error occurs while disconnecting.
+  ***/
+ public void disconnect() throws IOException {
+   // Reset our own state first, then let the superclass close the connection.
+   this.__initDefaults();
+   super.disconnect();
+   // Nothing to do for the data connection: every FTP command that involves
+   // one closes it itself.
+ }
+
+ /***
+  * Turns on or off the check that the remote host of a data connection is
+  * the same host the control connection is attached to. Verification is
+  * enabled by default. The value may be changed at any time, whether or not
+  * the client is currently connected.
+  * <p>
+  *
+  * @param enable
+  *          True to enable verification, false to disable verification.
+  ***/
+ public void setRemoteVerificationEnabled(boolean enable) {
+   this.__remoteVerificationEnabled = enable;
+ }
+
+ /***
+  * Tells whether the remote host taking part in data connections is
+  * verified. Verification is enabled by default.
+  * <p>
+  *
+  * @return True if verification is enabled, false if not.
+  ***/
+ public boolean isRemoteVerificationEnabled() {
+   return this.__remoteVerificationEnabled;
+ }
+
+ /***
+ * Login to the FTP server using the provided username and password.
+ * <p>
+ *
+ * @param username
+ * The username to login under.
+ * @param password
+ * The password to use.
+ * @return True if successfully completed, false if not.
+ * @exception FTPConnectionClosedException
+ * If the FTP server prematurely closes the connection as a
+ * result of the client being idle or some other reason causing
+ * the server to send FTP reply code 421. This exception may be
+ * caught either as an IOException or independently as itself.
+ * @exception IOException
+ * If an I/O error occurs while either sending a command to the
+ * server or receiving a reply from the server.
+ ***/
+ public boolean login(String username, String password) throws IOException {
+ user(username);
+
+ if (FTPReply.isPositiveCompletion(getReplyCode()))
+ return true;
+
+ // If we get here, we either have an error code, or an intermmediate
+ // reply requesting password.
+ if (!FTPReply.isPositiveIntermediate(getReplyCode()))
+ return false;
+
+ return FTPReply.isPositiveCompletion(pass(password));
+ }
+
+ /***
+ * Logout of the FTP server by sending the QUIT command.
+ * <p>
+ *
+ * @return True if successfully completed, false if not.
+ * @exception FTPConnectionClosedException
+ * If the FTP server prematurely closes the connection as a
+ * result of the client being idle or some other reason causing
+ * the server to send FTP reply code 421. This exception may be
+ * caught either as an IOException or independently as itself.
+ * @exception IOException
+ * If an I/O error occurs while either sending a command to the
+ * server or receiving a reply from the server.
+ ***/
+ public boolean logout() throws IOException {
+ return FTPReply.isPositiveCompletion(quit());
+ }
+
+ /**
+  * Retrieves the LIST reply for a path and appends every entry the parser
+  * recognises to the given list. The data connection is always closed before
+  * this method returns or throws.
+  *
+  * @param path
+  *          path to list; may be null
+  * @param entries
+  *          list that receives the parsed entries
+  * @param limit
+  *          download limit counted in characters of accepted listing lines;
+  *          reading stops with the line that takes the total just over the
+  *          limit. A negative value means no limit.
+  * @param parser
+  *          parser used to read and parse the listing lines
+  * @throws IOException
+  *           if an I/O error occurs on the control or data connection
+  * @throws FtpExceptionCanNotHaveDataConnection
+  *           if no data connection could be opened for LIST
+  * @throws FtpExceptionUnknownForcedDataClose
+  *           if the server answers the closed data connection with a reply
+  *           code that is not known to be harmless
+  * @throws FtpExceptionControlClosedByForcedDataClose
+  *           if the server closes the control connection in response to the
+  *           closed data connection
+  */
+ public void retrieveList(String path, List<FTPFile> entries, int limit,
+     FTPFileEntryParser parser) throws IOException,
+     FtpExceptionCanNotHaveDataConnection, FtpExceptionUnknownForcedDataClose,
+     FtpExceptionControlClosedByForcedDataClose {
+   Socket socket = __openPassiveDataConnection(FTPCommand.LIST, path);
+
+   if (socket == null)
+     throw new FtpExceptionCanNotHaveDataConnection("LIST "
+         + ((path == null) ? "" : path));
+
+   try {
+     BufferedReader reader = new BufferedReader(new InputStreamReader(
+         socket.getInputStream()));
+
+     int count = 0;
+     String line;
+     while ((line = parser.readNextEntry(reader)) != null) {
+       FTPFile ftpFile = parser.parseFTPEntry(line);
+       // skip non-formatted lines
+       if (ftpFile == null) {
+         continue;
+       }
+       entries.add(ftpFile);
+       count += line.length();
+       // impose download limit if limit >= 0, otherwise no limit
+       // here, cut off is up to the line when total bytes is just over limit
+       if (limit >= 0 && count > limit) {
+         break;
+       }
+     }
+   } finally {
+     // Always close the data channel, whether the listing was read
+     // completely, cut off at the limit, or reading failed. Different ftp
+     // servers respond differently to an early close, see below.
+     socket.close();
+   }
+
+   // scenarios:
+   // (1) download limit not reached: no special care here
+   // (2) download limit is reached and the data channel was force-closed:
+   // different servers have different reply codes
+
+   try {
+     int reply = getReply();
+     if (!_notBadReply(reply))
+       throw new FtpExceptionUnknownForcedDataClose(getReplyString());
+   } catch (FTPConnectionClosedException e) {
+     // some ftp servers will close control channel if data channel socket
+     // is closed by our end before all data has been read out. Check:
+     // tux414.q-tam.hp.com FTP server (hp.com version whp02)
+     // so must catch FTPConnectionClosedException thrown by getReply() above
+     // disconnect();
+     throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage());
+   }
+
+ }
+
+ /**
+  * Retrieves the file at a path and copies its bytes to the given stream.
+  * The data connection is always closed before this method returns or
+  * throws.
+  *
+  * @param path
+  *          path of the file to retrieve; may be null
+  * @param os
+  *          stream that receives the file content
+  * @param limit
+  *          download limit in bytes; at most this many bytes are written.
+  *          A negative value means no limit.
+  * @throws IOException
+  *           if an I/O error occurs on the control or data connection or
+  *           while writing to the output stream
+  * @throws FtpExceptionCanNotHaveDataConnection
+  *           if no data connection could be opened for RETR
+  * @throws FtpExceptionUnknownForcedDataClose
+  *           if the server answers the closed data connection with a reply
+  *           code that is not known to be harmless
+  * @throws FtpExceptionControlClosedByForcedDataClose
+  *           if the server closes the control connection in response to the
+  *           closed data connection
+  */
+ public void retrieveFile(String path, OutputStream os, int limit)
+     throws IOException, FtpExceptionCanNotHaveDataConnection,
+     FtpExceptionUnknownForcedDataClose,
+     FtpExceptionControlClosedByForcedDataClose {
+
+   Socket socket = __openPassiveDataConnection(FTPCommand.RETR, path);
+
+   if (socket == null)
+     throw new FtpExceptionCanNotHaveDataConnection("RETR "
+         + ((path == null) ? "" : path));
+
+   try {
+     InputStream input = socket.getInputStream();
+
+     // 20040318, xing, treat everything as BINARY_FILE_TYPE for now
+     // fixme, should we instruct server here for binary file type?
+
+     byte[] buf = new byte[org.apache.commons.net.io.Util.DEFAULT_COPY_BUFFER_SIZE];
+     // Bytes still allowed under the limit. Counting down instead of summing
+     // up keeps the arithmetic free of int overflow for very large limits.
+     int remaining = limit;
+     int len;
+     while ((len = input.read(buf, 0, buf.length)) != -1) {
+       // impose download limit if limit >= 0, otherwise no limit
+       // here, cut off is exactly of limit bytes
+       if (limit >= 0 && len > remaining) {
+         os.write(buf, 0, remaining);
+         os.flush();
+         break;
+       }
+       os.write(buf, 0, len);
+       os.flush();
+       if (limit >= 0) {
+         remaining -= len;
+       }
+     }
+   } finally {
+     // Always close the data channel, whether the file was read completely,
+     // cut off at the limit, or copying failed. Different ftp servers
+     // respond differently to an early close, see below.
+     socket.close();
+   }
+
+   // scenarios:
+   // (1) download limit not reached: no special care here
+   // (2) download limit is reached and the data channel was force-closed:
+   // different servers have different reply codes
+
+   // do not need this
+   // sendCommand("ABOR");
+
+   try {
+     int reply = getReply();
+     if (!_notBadReply(reply))
+       throw new FtpExceptionUnknownForcedDataClose(getReplyString());
+   } catch (FTPConnectionClosedException e) {
+     // some ftp servers will close control channel if data channel socket
+     // is closed by our end before all data has been read out. Check:
+     // tux414.q-tam.hp.com FTP server (hp.com version whp02)
+     // so must catch FTPConnectionClosedException thrown by getReply() above
+     // disconnect();
+     throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage());
+   }
+
+ }
+
+ /**
+  * Checks the reply received after the data connection has been closed.
+  *
+  * @param reply
+  *          reply code read from the control connection
+  * @return true if the reply is a positive completion or one of the codes
+  *         that servers are known to send when the client closes the data
+  *         connection early (426, 450, 451); false for any other code
+  */
+ private boolean _notBadReply(int reply) {
+   if (FTPReply.isPositiveCompletion(reply)) {
+     return true;
+   }
+
+   // None of the codes below is followed by a second reply, so no extra
+   // getReply() is needed for them.
+   switch (reply) {
+   case 426: // FTPReply.TRANSFER_ABORTED, e.g. foggy FTP server (Version wu-2.6.2(2)
+   case 450: // FTPReply.FILE_ACTION_NOT_TAKEN, e.g. ProFTPD [ftp.kernel.org]
+   case 451: // FTPReply.ACTION_ABORTED, e.g. ProFTPD [ftp.kernel.org]
+     return true;
+   default:
+     // what other kind of ftp server out there?
+     return false;
+   }
+ }
+
+ /***
+ * Sets the file type to be transferred. This should be one of
+ * <code> FTP.ASCII_FILE_TYPE </code>, <code> FTP.IMAGE_FILE_TYPE </code>,
+ * etc. The file type only needs to be set when you want to change the type.
+ * After changing it, the new type stays in effect until you change it again.
+ * The default file type is <code> FTP.ASCII_FILE_TYPE </code> if this method
+ * is never called.
+ * <p>
+ *
+ * @param fileType
+ * The <code> _FILE_TYPE </code> constant indcating the type of file.
+ * @return True if successfully completed, false if not.
+ * @exception FTPConnectionClosedException
+ * If the FTP server prematurely closes the connection as a
+ * result of the client being idle or some other reason causing
+ * the server to send FTP reply code 421. This exception may be
+ * caught either as an IOException or independently as itself.
+ * @exception IOException
+ * If an I/O error occurs while either sending a command to the
+ * server or receiving a reply from the server.
+ ***/
+ public boolean setFileType(int fileType) throws IOException {
+ if (FTPReply.isPositiveCompletion(type(fileType))) {
+ /*
+ * __fileType = fileType; __fileFormat = FTP.NON_PRINT_TEXT_FORMAT;
+ */
+ return true;
+ }
+ return false;
+ }
+
+ /***
+ * Fetches the system type name from the server and returns the string. This
+ * value is cached for the duration of the connection after the first call to
+ * this method. In other words, only the first time that you invoke this
+ * method will it issue a SYST command to the FTP server. FTPClient will
+ * remember the value and return the cached value until a call to disconnect.
+ * <p>
+ *
+ * @return The system type name obtained from the server. null if the
+ * information could not be obtained.
+ * @exception FTPConnectionClosedException
+ * If the FTP server prematurely closes the connection as a
+ * result of the client being idle or some other reason causing
+ * the server to send FTP reply code 421. This exception may be
+ * caught either as an IOException or independently as itself.
+ * @exception IOException
+ * If an I/O error occurs while either sending a command to the
+ * server or receiving a reply from the server.
+ ***/
+ public String getSystemName() throws IOException, FtpExceptionBadSystResponse {
+ // if (syst() == FTPReply.NAME_SYSTEM_TYPE)
+ // Technically, we should expect a NAME_SYSTEM_TYPE response, but
+ // in practice FTP servers deviate, so we soften the condition to
+ // a positive completion.
+ if (__systemName == null && FTPReply.isPositiveCompletion(syst())) {
+ __systemName = (getReplyStrings()[0]).substring(4);
+ } else {
+ throw new FtpExceptionBadSystResponse("Bad response of SYST: "
+ + getReplyString());
+ }
+
+ return __systemName;
+ }
+
+ /***
+ * Sends a NOOP command to the FTP server. This is useful for preventing
+ * server timeouts.
+ * <p>
+ *
+ * @return True if successfully completed, false if not.
+ * @exception FTPConnectionClosedException
+ * If the FTP server prematurely closes the connection as a
+ * result of the client being idle or some other reason causing
+ * the server to send FTP reply code 421. This exception may be
+ * caught either as an IOException or independently as itself.
+ * @exception IOException
+ * If an I/O error occurs while either sending a command to the
+ * server or receiving a reply from the server.
+ ***/
+ public boolean sendNoOp() throws IOException {
+ return FTPReply.isPositiveCompletion(noop());
+ }
+
+ // client.stat(path);
+ // client.sendCommand("STAT");
+ // client.sendCommand("STAT",path);
+ // client.sendCommand("MDTM",path);
+ // client.sendCommand("SIZE",path);
+ // client.sendCommand("HELP","SITE");
+ // client.sendCommand("SYST");
+ // client.setRestartOffset(120);
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java
new file mode 100644
index 0000000..772f3bb
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -0,0 +1,267 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.net.ftp.FTPFileEntryParser;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import crawlercommons.robots.BaseRobotRules;
+
+import java.net.URL;
+
+import java.io.IOException;
+
+/**
+ * This class is a protocol plugin used for ftp: scheme. It creates
+ * {@link FtpResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},
+ * {@code ftp.content.limit}, {@code ftp.timeout}, {@code ftp.server.timeout},
+ * {@code ftp.keep.connection} and {@code ftp.follow.talk}. For details see
+ * the "FTP properties" section in {@code nutch-default.xml}.
+ */
+public class Ftp implements Protocol {
+
+ public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);
+
+ private static final int BUFFER_SIZE = 16384; // 16*1024 = 16384
+
+ static final int MAX_REDIRECTS = 5; // redirects getProtocolOutput() follows before failing with "Too many redirects"
+
+ int timeout; // setConf() reads "ftp.timeout" (default 10000); presumably milliseconds, main() multiplies its -timeout argument by 1000
+
+ int maxContentLength; // setConf() reads "ftp.content.limit" (default 64 * 1024); point at which content is truncated
+
+ String userName; // setConf() reads "ftp.username" (default "anonymous")
+ String passWord; // setConf() reads "ftp.password" (default "anonymous@example.com")
+
+ // typical/default server timeout is 120*1000 millisec.
+ // better be conservative here
+ int serverTimeout; // setConf() reads "ftp.server.timeout" (default 60 * 1000)
+
+ // when to have client start anew
+ long renewalTime = -1;
+
+ boolean keepConnection; // setConf() reads "ftp.keep.connection" (default false)
+
+ boolean followTalk; // setConf() reads "ftp.follow.talk" (default false)
+
+ // ftp client
+ Client client = null; // logged out and disconnected by finalize() if still connected
+ // ftp dir list entry parser
+ FTPFileEntryParser parser = null;
+
+ private Configuration conf; // stored by setConf(), returned by getConf()
+
+ private FtpRobotRulesParser robots = null; // created in the constructor, configured in setConf()
+
+ /** Creates the plugin together with its robots rules parser. */
+ public Ftp() {
+   this.robots = new FtpRobotRulesParser();
+ }
+
+ /**
+  * Sets the timeout.
+  *
+  * @param to
+  *          the new timeout value
+  */
+ public void setTimeout(int to) {
+   this.timeout = to;
+ }
+
+ /**
+  * Sets the point at which content is truncated.
+  *
+  * @param length
+  *          the new content limit
+  */
+ public void setMaxContentLength(int length) {
+   this.maxContentLength = length;
+ }
+
+ /**
+  * Sets the followTalk flag.
+  *
+  * @param followTalk
+  *          the new value of the flag
+  */
+ public void setFollowTalk(boolean followTalk) {
+   this.followTalk = followTalk;
+ }
+
+ /**
+  * Sets the keepConnection flag.
+  *
+  * @param keepConnection
+  *          the new value of the flag
+  */
+ public void setKeepConnection(boolean keepConnection) {
+   this.keepConnection = keepConnection;
+ }
+
+ /**
+  * Creates a {@link FtpResponse} object corresponding to the url and returns a
+  * {@link ProtocolOutput} object as per the content received. Redirects
+  * (codes 300-399) are followed up to {@link #MAX_REDIRECTS} times; the
+  * redirect target is resolved against the url that produced it, so both
+  * absolute and relative {@code Location} values work. The response code of
+  * every request is recorded in the datum's metadata.
+  *
+  * @param url
+  *          Text containing the ftp url
+  * @param datum
+  *          The CrawlDatum object corresponding to the url
+  *
+  * @return {@link ProtocolOutput} object for the url; on any failure a
+  *         {@link ProtocolOutput} without content whose status wraps the
+  *         exception
+  */
+ public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+   String urlString = url.toString();
+   try {
+     URL u = new URL(urlString);
+
+     int redirects = 0;
+
+     while (true) {
+       FtpResponse response = new FtpResponse(u, datum, this, getConf()); // make a request
+
+       int code = response.getCode();
+       datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+           new Text(Integer.toString(code)));
+
+       if (code == 200) { // got a good response
+         return new ProtocolOutput(response.toContent()); // return it
+       }
+
+       if (code >= 300 && code < 400) { // handle redirect
+         if (redirects == MAX_REDIRECTS)
+           throw new FtpException("Too many redirects: " + url);
+
+         String location = response.getHeader("Location");
+         if (location == null)
+           throw new FtpException("Redirect (" + code
+               + ") without Location header: " + u);
+
+         // Resolve against the current url: an absolute Location replaces
+         // it, a relative one (e.g. a bare path) is completed from it.
+         u = new URL(u, location);
+         redirects++;
+         LOG.trace("redirect to {}", u);
+       } else { // convert to exception
+         throw new FtpError(code);
+       }
+     }
+   } catch (Exception e) {
+     // Protocol boundary: every failure is reported through the status.
+     return new ProtocolOutput(null, new ProtocolStatus(e));
+   }
+ }
+
+ /**
+  * Last-resort cleanup: if the ftp client is still connected, logs out and
+  * disconnects. The disconnect is attempted even when the logout fails, so a
+  * server that rejects or drops QUIT does not leave the connection open.
+  * Failures are only logged, since there is nobody to report them to.
+  */
+ @Override
+ protected void finalize() {
+   Client c = this.client;
+   if (c == null || !c.isConnected()) {
+     return;
+   }
+   try {
+     c.logout();
+   } catch (IOException e) {
+     LOG.debug("ftp logout failed during finalize", e);
+   } finally {
+     try {
+       c.disconnect();
+     } catch (IOException e) {
+       LOG.debug("ftp disconnect failed during finalize", e);
+     }
+   }
+ }
+
+ /**
+  * For debugging: fetches one ftp url and prints its content type, length
+  * and last-modified value to stderr, optionally dumping the content to
+  * stdout. Prints the usage and exits with -1 when the arguments are
+  * incomplete, and exits with -1 when the fetch yields no content.
+  *
+  * @param args
+  *          command line, see the usage string
+  * @throws Exception
+  *           if a numeric option value cannot be parsed
+  */
+ public static void main(String[] args) throws Exception {
+   int timeout = Integer.MIN_VALUE;
+   int maxContentLength = Integer.MIN_VALUE;
+   String logLevel = "info";
+   boolean followTalk = false;
+   boolean keepConnection = false;
+   boolean dumpContent = false;
+   String urlString = null;
+
+   String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
+
+   if (args.length == 0) {
+     System.err.println(usage);
+     System.exit(-1);
+   }
+
+   for (int i = 0; i < args.length; i++) {
+     String arg = args[i];
+     boolean takesValue = arg.equals("-logLevel") || arg.equals("-timeout")
+         || arg.equals("-maxContentLength");
+     // An option that needs a value must not be the last argument.
+     if (takesValue && i == args.length - 1) {
+       System.err.println(usage);
+       System.exit(-1);
+     }
+
+     if (arg.equals("-logLevel")) {
+       logLevel = args[++i];
+     } else if (arg.equals("-followTalk")) {
+       followTalk = true;
+     } else if (arg.equals("-keepConnection")) {
+       keepConnection = true;
+     } else if (arg.equals("-timeout")) {
+       timeout = Integer.parseInt(args[++i]) * 1000;
+     } else if (arg.equals("-maxContentLength")) {
+       maxContentLength = Integer.parseInt(args[++i]);
+     } else if (arg.equals("-dumpContent")) {
+       dumpContent = true;
+     } else if (i != args.length - 1) {
+       System.err.println(usage);
+       System.exit(-1);
+     } else {
+       urlString = arg;
+     }
+   }
+
+   // The url is mandatory; it is missing when the last argument was an option.
+   if (urlString == null) {
+     System.err.println(usage);
+     System.exit(-1);
+   }
+
+   // NOTE(review): setConf() is never called here, so getConf() returns null
+   // and only the values set below are initialised -- confirm this is intended.
+   Ftp ftp = new Ftp();
+
+   ftp.setFollowTalk(followTalk);
+   ftp.setKeepConnection(keepConnection);
+
+   if (timeout != Integer.MIN_VALUE) // set timeout
+     ftp.setTimeout(timeout);
+
+   if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
+     ftp.setMaxContentLength(maxContentLength);
+
+   // set log level
+   // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
+
+   ProtocolOutput output = ftp.getProtocolOutput(new Text(urlString),
+       new CrawlDatum());
+   Content content = output.getContent();
+
+   // A failed fetch carries a status but no content.
+   if (content == null) {
+     System.err.println("Fetch failed: " + output.getStatus());
+     System.exit(-1);
+   }
+
+   System.err.println("Content-Type: " + content.getContentType());
+   System.err.println("Content-Length: "
+       + content.getMetadata().get(Response.CONTENT_LENGTH));
+   System.err.println("Last-Modified: "
+       + content.getMetadata().get(Response.LAST_MODIFIED));
+   if (dumpContent) {
+     System.out.print(new String(content.getContent()));
+   }
+ }
+
+ /**
+  * Stores the {@link Configuration} object, reads the ftp settings from it
+  * (falling back to the built-in defaults) and passes it on to the robots
+  * rules parser.
+  *
+  * @param conf
+  *          the configuration to use
+  */
+ public void setConf(Configuration conf) {
+   this.conf = conf;
+
+   // credentials
+   userName = conf.get("ftp.username", "anonymous");
+   passWord = conf.get("ftp.password", "anonymous@example.com");
+
+   // limits and timeouts
+   maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
+   timeout = conf.getInt("ftp.timeout", 10000);
+   serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
+
+   // behaviour switches
+   keepConnection = conf.getBoolean("ftp.keep.connection", false);
+   followTalk = conf.getBoolean("ftp.follow.talk", false);
+
+   robots.setConf(conf);
+ }
+
+ /**
+  * Returns the {@link Configuration} object.
+  *
+  * @return the configuration last passed to {@link #setConf(Configuration)}
+  */
+ public Configuration getConf() {
+   return conf;
+ }
+
+ /**
+ * Get the robots rules for a given url
+ */
+ public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+ return robots.getRobotRulesSet(this, url);
+ }
+
+ /**
+  * Returns the buffer size constant of this plugin.
+  *
+  * @return the value of BUFFER_SIZE
+  */
+ public int getBufferSize() {
+   return Ftp.BUFFER_SIZE;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java
new file mode 100644
index 0000000..b63a67e
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Thrown for Ftp error codes.
+ */
+public class FtpError extends FtpException {
+
+  /** The error code this exception was created for. */
+  private final int code;
+
+  /**
+   * Creates the error for the given code; the message is
+   * {@code "Ftp Error: " + code}.
+   *
+   * @param code
+   *          the ftp error code
+   */
+  public FtpError(int code) {
+    super("Ftp Error: " + code);
+    this.code = code;
+  }
+
+  /**
+   * Returns the error code this exception was created for.
+   *
+   * @return the ftp error code
+   */
+  public int getCode() {
+    return code;
+  }
+
+  /**
+   * Returns the error code this exception was created for. The argument is
+   * ignored; this overload used to return its own argument, which made the
+   * stored code unreachable, and is kept only so existing callers compile.
+   *
+   * @param code
+   *          ignored
+   * @return the ftp error code
+   * @deprecated use {@link #getCode()}
+   */
+  @Deprecated
+  public int getCode(int code) {
+    return this.code;
+  }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java
new file mode 100644
index 0000000..5a29668
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import org.apache.nutch.protocol.ProtocolException;
+
+/***
+ * Superclass for important exceptions thrown during FTP talk, that must be
+ * handled with care.
+ *
+ * @author John Xing
+ */
+public class FtpException extends ProtocolException {
+
+ /**
+ * Creates an exception with a detail message and the underlying cause.
+ *
+ * @param message
+ * description of the failure
+ * @param cause
+ * the exception that triggered this one
+ */
+ public FtpException(String message, Throwable cause) {
+ super(message, cause);
+ }
+
+ /**
+ * Creates an exception that wraps the underlying cause.
+ *
+ * @param cause
+ * the exception that triggered this one
+ */
+ public FtpException(Throwable cause) {
+ super(cause);
+ }
+
+ /**
+ * Creates an exception with a detail message.
+ *
+ * @param message
+ * description of the failure
+ */
+ public FtpException(String message) {
+ super(message);
+ }
+
+ /** Creates an exception without message or cause. */
+ public FtpException() {
+ super();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
new file mode 100644
index 0000000..689ac8e
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating bad reply of SYST command.
+ *
+ * @author John Xing
+ */
+public class FtpExceptionBadSystResponse extends FtpException {
+
+ /**
+ * Package-private: only code in this package creates this exception.
+ *
+ * @param msg
+ * detail message handed to {@link FtpException}
+ */
+ FtpExceptionBadSystResponse(final String msg) {
+ super(msg);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
new file mode 100644
index 0000000..9f35b74
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating failure of opening data connection.
+ *
+ * @author John Xing
+ */
+public class FtpExceptionCanNotHaveDataConnection extends FtpException {
+
+ /**
+ * Package-private: only code in this package creates this exception.
+ *
+ * @param msg
+ * detail message handed to {@link FtpException}
+ */
+ FtpExceptionCanNotHaveDataConnection(final String msg) {
+ super(msg);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
new file mode 100644
index 0000000..c058fcb
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating control channel is closed by server end, due to forced
+ * closure of data channel at client (our) end.
+ *
+ * @author John Xing
+ */
+public class FtpExceptionControlClosedByForcedDataClose extends FtpException {
+
+ /**
+ * Package-private: only code in this package creates this exception.
+ *
+ * @param msg
+ * detail message handed to {@link FtpException}
+ */
+ FtpExceptionControlClosedByForcedDataClose(final String msg) {
+ super(msg);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
new file mode 100644
index 0000000..9083d7c
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating unrecognizable reply from server after forced closure of
+ * data channel by client (our) side.
+ *
+ * @author John Xing
+ */
+public class FtpExceptionUnknownForcedDataClose extends FtpException {
+
+ /**
+ * Package-private: only code in this package creates this exception.
+ *
+ * @param msg
+ * detail message handed to {@link FtpException}
+ */
+ FtpExceptionUnknownForcedDataClose(final String msg) {
+ super(msg);
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java
new file mode 100644
index 0000000..f7c7c6d
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java
@@ -0,0 +1,521 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import org.apache.commons.net.ftp.FTP;
+import org.apache.commons.net.ftp.FTPFile;
+import org.apache.commons.net.ftp.FTPReply;
+import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
+import org.apache.commons.net.ftp.parser.ParserInitializationException;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.hadoop.conf.Configuration;
+
+import java.net.InetAddress;
+import java.net.URL;
+import java.util.List;
+import java.util.LinkedList;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+/**
+ * FtpResponse.java mimics ftp replies as http response. It tries its best to
+ * follow http's way for headers, response codes as well as exceptions.
+ *
+ * Comments: In this class, all FtpException*.java thrown by Client.java and
+ * some important commons-net exceptions passed by Client.java must have been
+ * properly dealt with. They'd better not be leaked to the caller of this class.
+ */
+public class FtpResponse {
+
+ // url of this response as a string; the constructor sets it from
+ // url.toString()
+ private String orig;
+ // base url handed to Content; the constructor sets it to the same string
+ // as orig
+ private String base;
+ // bytes of the fetched file or of the generated directory page; stays null
+ // when nothing was retrieved
+ private byte[] content;
+ // used by toContent() in place of a null content
+ private static final byte[] EMPTY_CONTENT = new byte[0];
+ // http-style status code chosen for this ftp exchange
+ private int code;
+ // http-style headers (content length, last modified, ...) of this response
+ private Metadata headers = new Metadata();
+
+ // protocol object whose client, parser and settings are used for the fetch
+ private final Ftp ftp;
+ // configuration read for "store.ip.address" and passed on to Content
+ private Configuration conf;
+
+ /**
+ * Returns the http-style status code of this response.
+ *
+ * @return the response code
+ */
+ public int getCode() {
+ return this.code;
+ }
+
+ /** Returns the value of a named header. */
+ public String getHeader(String name) {
+ return headers.get(name);
+ }
+
+ /**
+ * Returns the body of this response.
+ *
+ * @return the content bytes, or null if none were stored
+ */
+ public byte[] getContent() {
+ return this.content;
+ }
+
+ public Content toContent() {
+ return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
+ getHeader(Response.CONTENT_TYPE), headers, this.conf);
+ }
+
+ /**
+ * Fetches the given ftp url and records the outcome as an http-style
+ * response (code, headers, content).
+ *
+ * The client held by the {@link Ftp} object is reused when possible: it is
+ * dropped after idling past the renewal time, created when missing,
+ * disconnected when it points at another host, and then connected, logged
+ * in, switched to binary mode and given a listing parser. Paths ending in
+ * "/" are fetched as directories, all others as files.
+ *
+ * Failures during connection setup end the constructor early with only the
+ * code set: 500 when connecting, setting the file type or finding a parser
+ * fails, 401 when the login is refused.
+ *
+ * @param url
+ * the ftp url to fetch
+ * @param datum
+ * crawl datum whose modified time is passed to the fetch methods
+ * @param ftp
+ * protocol object providing client, parser and settings
+ * @param conf
+ * configuration; "store.ip.address" enables the "_ip_" header
+ * @throws FtpException
+ * if the url is not an ftp url, or wrapping any exception raised
+ * during the fetch (the client is dropped in that case)
+ * @throws IOException
+ * declared for callers; exceptions raised inside the fetch are
+ * rethrown wrapped in FtpException
+ */
+ public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
+ throws FtpException, IOException {
+
+ this.orig = url.toString();
+ this.base = url.toString();
+ this.ftp = ftp;
+ this.conf = conf;
+
+ if (!"ftp".equals(url.getProtocol()))
+ throw new FtpException("Not a ftp url:" + url);
+
+ // compare contents, not references: the two differ when the url carries
+ // a query part
+ if (!url.getPath().equals(url.getFile())) {
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("url.getPath() != url.getFile(): " + url);
+ }
+ }
+
+ String path = "".equals(url.getPath()) ? "/" : url.getPath();
+
+ try {
+
+ if (ftp.followTalk) {
+ if (Ftp.LOG.isInfoEnabled()) {
+ Ftp.LOG.info("fetching " + url);
+ }
+ } else {
+ if (Ftp.LOG.isTraceEnabled()) {
+ Ftp.LOG.trace("fetching " + url);
+ }
+ }
+
+ InetAddress addr = InetAddress.getByName(url.getHost());
+ if (addr != null && conf.getBoolean("store.ip.address", false)) {
+ headers.add("_ip_", addr.getHostAddress());
+ }
+
+ // idled too long, remote server or ourselves may have timed out,
+ // should start anew.
+ if (ftp.client != null && ftp.keepConnection
+ && ftp.renewalTime < System.currentTimeMillis()) {
+ if (Ftp.LOG.isInfoEnabled()) {
+ Ftp.LOG.info("delete client because idled too long");
+ }
+ ftp.client = null;
+ }
+
+ // start anew if needed
+ if (ftp.client == null) {
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("start client");
+ }
+ // the real client
+ ftp.client = new Client();
+ // when to renew, take the lesser
+ // ftp.renewalTime = System.currentTimeMillis()
+ // + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout :
+ // ftp.serverTimeout);
+
+ // timeout for control connection
+ ftp.client.setDefaultTimeout(ftp.timeout);
+ // timeout for data connection
+ ftp.client.setDataTimeout(ftp.timeout);
+
+ // follow ftp talk?
+ if (ftp.followTalk)
+ ftp.client.addProtocolCommandListener(new PrintCommandListener(
+ Ftp.LOG));
+ }
+
+ // quit from previous site if at a different site now
+ if (ftp.client.isConnected()) {
+ InetAddress remoteAddress = ftp.client.getRemoteAddress();
+ if (!addr.equals(remoteAddress)) {
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("disconnect from " + remoteAddress
+ + " before connect to " + addr);
+ }
+ // quit from current site
+ ftp.client.logout();
+ ftp.client.disconnect();
+ }
+ }
+
+ // connect to current site if needed
+ if (!ftp.client.isConnected()) {
+
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("connect to " + addr);
+ }
+
+ ftp.client.connect(addr);
+ if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) {
+ ftp.client.disconnect();
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("ftp.client.connect() failed: " + addr + " "
+ + ftp.client.getReplyString());
+ }
+ this.code = 500; // http Internal Server Error
+ return;
+ }
+
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("log into " + addr);
+ }
+
+ if (!ftp.client.login(ftp.userName, ftp.passWord)) {
+ // login failed.
+ // please note that some server may return 421 immediately
+ // after USER anonymous, thus ftp.client.login() won't return false,
+ // but throw exception, which then will be handled by caller
+ // (not dealt with here at all) .
+ ftp.client.disconnect();
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("ftp.client.login() failed: " + addr);
+ }
+ this.code = 401; // http Unauthorized
+ return;
+ }
+
+ // insist on binary file type
+ if (!ftp.client.setFileType(FTP.BINARY_FILE_TYPE)) {
+ ftp.client.logout();
+ ftp.client.disconnect();
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("ftp.client.setFileType() failed: " + addr);
+ }
+ this.code = 500; // http Internal Server Error
+ return;
+ }
+
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("set parser for " + addr);
+ }
+
+ // SYST is valid only after login
+ try {
+ ftp.parser = null;
+ String parserKey = ftp.client.getSystemName();
+ // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8
+ if (parserKey.startsWith("UNKNOWN Type: L8"))
+ parserKey = "UNIX Type: L8";
+ ftp.parser = (new DefaultFTPFileEntryParserFactory())
+ .createFileEntryParser(parserKey);
+ } catch (FtpExceptionBadSystResponse e) {
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG
+ .warn("ftp.client.getSystemName() failed: " + addr + " " + e);
+ }
+ ftp.parser = null;
+ } catch (ParserInitializationException e) {
+ // ParserInitializationException is RuntimeException defined in
+ // org.apache.commons.net.ftp.parser.ParserInitializationException
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("createFileEntryParser() failed. " + addr + " " + e);
+ }
+ ftp.parser = null;
+ } finally {
+ // NOTE: the return below sits inside finally, so any exception still
+ // propagating from the try block above is discarded when no parser
+ // could be set up
+ if (ftp.parser == null) {
+ // do not log as severe, otherwise
+ // FetcherThread/RequestScheduler will abort
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("ftp.parser is null: " + addr);
+ }
+ ftp.client.logout();
+ ftp.client.disconnect();
+ this.code = 500; // http Internal Server Error
+ return;
+ }
+ }
+
+ } else {
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("use existing connection");
+ }
+ }
+
+ this.content = null;
+
+ if (path.endsWith("/")) {
+ getDirAsHttpResponse(path, datum.getModifiedTime());
+ } else {
+ getFileAsHttpResponse(path, datum.getModifiedTime());
+ }
+
+ // reset next renewalTime, take the lesser
+ if (ftp.client != null && ftp.keepConnection) {
+ ftp.renewalTime = System.currentTimeMillis()
+ + ((ftp.timeout < ftp.serverTimeout) ? ftp.timeout
+ : ftp.serverTimeout);
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("reset renewalTime to "
+ + HttpDateFormat.toString(ftp.renewalTime));
+ }
+ }
+
+ // getDirAsHttpResponse() or getFileAsHttpResponse() above
+ // may have deleted ftp.client
+ if (ftp.client != null && !ftp.keepConnection) {
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("disconnect from " + addr);
+ }
+ ftp.client.logout();
+ ftp.client.disconnect();
+ }
+
+ } catch (Exception e) {
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("Error: ", e);
+ }
+ // for any un-foreseen exception (run time exception or not),
+ // do ultimate clean and leave ftp.client for garbage collection
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("delete client due to exception");
+ }
+ ftp.client = null;
+ // or do explicit garbage collection?
+ // System.gc();
+ // can we be less dramatic, using the following instead?
+ // probably unnecessary for our practical purpose here
+ // try {
+ // ftp.client.logout();
+ // ftp.client.disconnect();
+ // }
+ throw new FtpException(e);
+ // throw e;
+ }
+
+ }
+
+ /**
+ * Fetches an ftp file and records it as an http-style response.
+ *
+ * The file's listing entry is retrieved first to fill the content-length and
+ * last-modified headers. Resulting codes: 200 with content on success (also
+ * when the server cut the control channel during the file transfer), 304
+ * when the file's timestamp is not newer than lastModified, 400 when the
+ * listing itself was cut off or the server reply after a forced data close
+ * is unrecognized, 300 with a Location header when the path turns out to be
+ * a directory, 404 when it is neither file nor directory.
+ *
+ * NOTE(review): the code reads the first listing entry unconditionally, so
+ * an empty listing raises IndexOutOfBoundsException from list.get(0).
+ *
+ * @param path
+ * path of the file on the server
+ * @param lastModified
+ * time in milliseconds the timestamp of the file is compared with
+ * @throws IOException
+ * passed on from the underlying client calls
+ */
+ private void getFileAsHttpResponse(String path, long lastModified)
+ throws IOException {
+
+ ByteArrayOutputStream os = null;
+ List<FTPFile> list = null;
+
+ try {
+ // first get its possible attributes
+ list = new LinkedList<FTPFile>();
+ ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
+
+ FTPFile ftpFile = list.get(0);
+ this.headers.set(Response.CONTENT_LENGTH,
+ Long.toString(ftpFile.getSize()));
+ this.headers.set(Response.LAST_MODIFIED,
+ HttpDateFormat.toString(ftpFile.getTimestamp()));
+ // don't retrieve the file if not changed.
+ if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
+ code = 304;
+ return;
+ }
+ os = new ByteArrayOutputStream(ftp.getBufferSize());
+ ftp.client.retrieveFile(path, os, ftp.maxContentLength);
+
+ this.content = os.toByteArray();
+
+ // // approximate bytes sent and read
+ // if (this.httpAccounting != null) {
+ // this.httpAccounting.incrementBytesSent(path.length());
+ // this.httpAccounting.incrementBytesRead(this.content.length);
+ // }
+
+ this.code = 200; // http OK
+
+ } catch (FtpExceptionControlClosedByForcedDataClose e) {
+
+ // control connection is off, clean up
+ // ftp.client.disconnect();
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("delete client because server cut off control channel: "
+ + e);
+ }
+ ftp.client = null;
+
+ // in case this FtpExceptionControlClosedByForcedDataClose is
+ // thrown by retrieveList() (not retrieveFile()) above,
+ if (os == null) { // indicating throwing by retrieveList()
+ // throw new FtpException("fail to get attributes: "+path);
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG
+ .warn("Please try larger maxContentLength for ftp.client.retrieveList(). "
+ + e);
+ }
+ // in a way, this is our request fault
+ this.code = 400; // http Bad request
+ return;
+ }
+
+ // retrieveFile() was interrupted: keep whatever bytes arrived so far
+ FTPFile ftpFile = list.get(0);
+ this.headers.set(Response.CONTENT_LENGTH,
+ Long.toString(ftpFile.getSize()));
+ // this.headers.put("content-type", "text/html");
+ this.headers.set(Response.LAST_MODIFIED,
+ HttpDateFormat.toString(ftpFile.getTimestamp()));
+ this.content = os.toByteArray();
+ if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
+ code = 304;
+ return;
+ }
+
+ // // approximate bytes sent and read
+ // if (this.httpAccounting != null) {
+ // this.httpAccounting.incrementBytesSent(path.length());
+ // this.httpAccounting.incrementBytesRead(this.content.length);
+ // }
+
+ this.code = 200; // http OK
+
+ } catch (FtpExceptionCanNotHaveDataConnection e) {
+
+ if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
+ // it is not a file, but dir, so redirect as a dir
+ this.headers.set(Response.LOCATION, path + "/");
+ this.code = 300; // http redirect
+ // fixme, should we do ftp.client.cwd("/"), back to top dir?
+ } else {
+ // it is not a dir either
+ this.code = 404; // http Not Found
+ }
+
+ } catch (FtpExceptionUnknownForcedDataClose e) {
+ // Please note control channel is still live.
+ // in a way, this is our request fault
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
+ + "If this is acceptable, please modify Client.java accordingly. "
+ + e);
+ }
+ this.code = 400; // http Bad Request
+ }
+
+ }
+
+ /**
+ * Fetches an ftp directory listing and records it as an http-style response
+ * whose content is a generated html index page.
+ *
+ * Resulting codes: 404 when changing into the directory fails, 200 with the
+ * html page on success (also when the server cut the control channel, in
+ * which case the entries listed so far are used and the client is dropped),
+ * 400 when the server reply after a forced data close is unrecognized, 500
+ * when no data connection could be opened.
+ *
+ * @param path
+ * path of the directory on the server
+ * @param lastModified
+ * not used by this method
+ * @throws IOException
+ * passed on from the underlying client calls
+ */
+ private void getDirAsHttpResponse(String path, long lastModified)
+ throws IOException {
+ List<FTPFile> list = new LinkedList<FTPFile>();
+
+ try {
+
+ // change to that dir first
+ if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
+ this.code = 404; // http Not Found
+ return;
+ }
+
+ // fixme, should we do ftp.client.cwd("/"), back to top dir?
+
+ ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser);
+ // the top directory gets no "../" entry
+ this.content = list2html(list, path, !"/".equals(path));
+ this.headers.set(Response.CONTENT_LENGTH,
+ Integer.toString(this.content.length));
+ this.headers.set(Response.CONTENT_TYPE, "text/html");
+ // this.headers.put("Last-Modified", null);
+
+ // // approximate bytes sent and read
+ // if (this.httpAccounting != null) {
+ // this.httpAccounting.incrementBytesSent(path.length());
+ // this.httpAccounting.incrementBytesRead(this.content.length);
+ // }
+
+ this.code = 200; // http OK
+
+ } catch (FtpExceptionControlClosedByForcedDataClose e) {
+
+ // control connection is off, clean up
+ // ftp.client.disconnect();
+ if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+ Ftp.LOG.info("delete client because server cut off control channel: "
+ + e);
+ }
+ ftp.client = null;
+
+ // build the page from whatever entries were collected before the cut
+ this.content = list2html(list, path, !"/".equals(path));
+ this.headers.set(Response.CONTENT_LENGTH,
+ Integer.toString(this.content.length));
+ this.headers.set(Response.CONTENT_TYPE, "text/html");
+ // this.headers.put("Last-Modified", null);
+
+ // // approximate bytes sent and read
+ // if (this.httpAccounting != null) {
+ // this.httpAccounting.incrementBytesSent(path.length());
+ // this.httpAccounting.incrementBytesRead(this.content.length);
+ // }
+
+ this.code = 200; // http OK
+
+ } catch (FtpExceptionUnknownForcedDataClose e) {
+ // Please note control channel is still live.
+ // in a way, this is our request fault
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
+ + "If this is acceptable, please modify Client.java accordingly. "
+ + e);
+ }
+ this.code = 400; // http Bad Request
+ } catch (FtpExceptionCanNotHaveDataConnection e) {
+ if (Ftp.LOG.isWarnEnabled()) {
+ Ftp.LOG.warn("" + e);
+ }
+ this.code = 500; // http Internal Server Error
+ }
+
+ }
+
+ /**
+ * Generates an html index page from an ftp directory listing.
+ *
+ * Directories and plain files become links followed by their timestamp and,
+ * for files, their size; entries named "." or ".." and entries that are
+ * neither directory nor file are left out. Names and the path come from the
+ * remote server and the url, so they are html-escaped before being placed
+ * into the markup.
+ *
+ * @param list
+ * entries of the directory
+ * @param path
+ * directory path shown in title and heading
+ * @param includeDotDot
+ * whether to add a link to the parent directory
+ * @return the page encoded as UTF-8 bytes
+ */
+ private byte[] list2html(List<FTPFile> list, String path,
+ boolean includeDotDot) {
+
+ String safePath = escapeHtml(path);
+
+ // StringBuffer x = new
+ // StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>");
+ StringBuilder x = new StringBuilder("<html><head>");
+ x.append("<title>Index of ").append(safePath).append("</title></head>\n");
+ x.append("<body><h1>Index of ").append(safePath).append("</h1><pre>\n");
+
+ if (includeDotDot) {
+ x.append("<a href='../'>../</a>\t-\t-\t-\n");
+ }
+
+ for (FTPFile f : list) {
+ String rawName = f.getName();
+ String name = escapeHtml(rawName);
+ String time = HttpDateFormat.toString(f.getTimestamp());
+ if (f.isDirectory()) {
+ // some ftp server LIST "." and "..", we skip them here
+ if (".".equals(rawName) || "..".equals(rawName))
+ continue;
+ x.append("<a href='").append(name).append("/'>").append(name)
+ .append("/</a>\t");
+ x.append(time).append("\t-\n");
+ } else if (f.isFile()) {
+ x.append("<a href='").append(name).append("'>").append(name)
+ .append("</a>\t");
+ x.append(time).append("\t").append(f.getSize()).append("\n");
+ } else {
+ // ignore isSymbolicLink()
+ // ignore isUnknown()
+ }
+ }
+
+ x.append("</pre></body></html>\n");
+
+ // fixed charset so the page does not depend on the platform default
+ return x.toString().getBytes(java.nio.charset.StandardCharsets.UTF_8);
+ }
+
+ /**
+ * Escapes the characters that are special in html text and in quoted
+ * attribute values.
+ *
+ * @param text
+ * text to escape; null yields "null", as string concatenation would
+ * @return the escaped text
+ */
+ private static String escapeHtml(String text) {
+ if (text == null) {
+ return "null";
+ }
+ StringBuilder out = new StringBuilder(text.length() + 16);
+ for (int i = 0; i < text.length(); i++) {
+ char c = text.charAt(i);
+ switch (c) {
+ case '&':
+ out.append("&amp;");
+ break;
+ case '<':
+ out.append("&lt;");
+ break;
+ case '>':
+ out.append("&gt;");
+ break;
+ case '"':
+ out.append("&quot;");
+ break;
+ case '\'':
+ out.append("&#39;");
+ break;
+ default:
+ out.append(c);
+ }
+ }
+ return out.toString();
+ }
+
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
new file mode 100644
index 0000000..3764864
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
+/**
+ * This class is used for parsing robots for urls belonging to FTP protocol. It
+ * extends the generic {@link RobotRulesParser} class and contains Ftp protocol
+ * specific implementation for obtaining the robots file.
+ */
+public class FtpRobotRulesParser extends RobotRulesParser {
+
+ /** Content type passed to the rules parser for a fetched robots.txt. */
+ private static final String CONTENT_TYPE = "text/plain";
+ public static final Logger LOG = LoggerFactory
+ .getLogger(FtpRobotRulesParser.class);
+
+ FtpRobotRulesParser() {
+ }
+
+ public FtpRobotRulesParser(Configuration conf) {
+ super(conf);
+ }
+
+ /**
+ * Returns the robots rules for the host of the given url. Rules are looked
+ * up in the cache under "protocol:host" first. On a cache miss a
+ * whitelisted host gets empty rules; otherwise /robots.txt is fetched over
+ * ftp and parsed, with empty rules used when the fetch does not succeed.
+ * The result is cached unless fetching or parsing threw, so that the fetch
+ * is tried again later.
+ *
+ * @param ftp
+ * The {@link Protocol} object; it is cast to {@link Ftp}
+ * @param url
+ * URL
+ *
+ * @return robotRules A {@link BaseRobotRules} object for the rules
+ */
+ public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {
+
+ // lower-case with Locale.ROOT so the cache key does not depend on the
+ // default locale of the JVM
+ String protocol = url.getProtocol().toLowerCase(java.util.Locale.ROOT);
+ String host = url.getHost().toLowerCase(java.util.Locale.ROOT);
+ String cacheKey = protocol + ":" + host;
+
+ if (LOG.isTraceEnabled() && isWhiteListed(url)) {
+ LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+ }
+
+ BaseRobotRules robotRules = CACHE.get(cacheKey);
+
+ if (robotRules != null) {
+ return robotRules; // cached rule
+ } else if (LOG.isTraceEnabled()) {
+ LOG.trace("cache miss " + url);
+ }
+
+ boolean cacheRule = true;
+
+ if (isWhiteListed(url)) {
+ // check in advance whether a host is whitelisted
+ // (we do not need to fetch robots.txt)
+ robotRules = EMPTY_RULES;
+ LOG.info("Whitelisted host found for: {}", url);
+ LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
+
+ } else {
+ try {
+ Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
+ ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl,
+ new CrawlDatum());
+ ProtocolStatus status = output.getStatus();
+
+ if (status.getCode() == ProtocolStatus.SUCCESS) {
+ robotRules = parseRules(url.toString(), output.getContent()
+ .getContent(), CONTENT_TYPE, agentNames);
+ } else {
+ robotRules = EMPTY_RULES; // use default rules
+ }
+ } catch (Throwable t) {
+ if (LOG.isInfoEnabled()) {
+ LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+ }
+ cacheRule = false; // try again later to fetch robots.txt
+ robotRules = EMPTY_RULES;
+ }
+
+ }
+
+ if (cacheRule)
+ CACHE.put(cacheKey, robotRules); // cache rules for host
+
+ return robotRules;
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
new file mode 100644
index 0000000..c68eac8
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.io.BufferedReader;
+import java.io.StringReader;
+import java.io.IOException;
+
+import org.slf4j.Logger;
+
+import org.apache.commons.net.ProtocolCommandEvent;
+import org.apache.commons.net.ProtocolCommandListener;
+
+/***
+ * This is a support class for logging all ftp command/reply traffic.
+ *
+ * @author John Xing
+ ***/
+public class PrintCommandListener implements ProtocolCommandListener {
+ /** Receives every command and reply line at info level. */
+ private final Logger log;
+
+ /**
+ * @param logger
+ * logger the ftp traffic is written to
+ */
+ public PrintCommandListener(Logger logger) {
+ this.log = logger;
+ }
+
+ /** Logs a command sent to the server. */
+ public void protocolCommandSent(ProtocolCommandEvent event) {
+ try {
+ writeLines(event);
+ } catch (IOException e) {
+ if (log.isInfoEnabled()) {
+ log.info("PrintCommandListener.protocolCommandSent(): " + e);
+ }
+ }
+ }
+
+ /** Logs a reply received from the server. */
+ public void protocolReplyReceived(ProtocolCommandEvent event) {
+ try {
+ writeLines(event);
+ } catch (IOException e) {
+ if (log.isInfoEnabled()) {
+ log.info("PrintCommandListener.protocolReplyReceived(): " + e);
+ }
+ }
+ }
+
+ /**
+ * Writes the message of the event to the logger, one "ftp> " prefixed
+ * entry per line; does nothing unless info logging is enabled.
+ */
+ private void writeLines(ProtocolCommandEvent event) throws IOException {
+ if (log.isInfoEnabled()) {
+ BufferedReader lines = new BufferedReader(new StringReader(
+ event.getMessage()));
+ for (String line = lines.readLine(); line != null; line = lines
+ .readLine()) {
+ log.info("ftp> " + line);
+ }
+ }
+ }
+}
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html
new file mode 100644
index 0000000..d936930
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the ftp protocol.</p><p></p>
+</body>
+</html>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/build.xml b/nutch-plugins/protocol-htmlunit/build.xml
new file mode 100644
index 0000000..899214c
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/build.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-htmlunit" default="jar-core">
+
+ <import file="../build-plugin.xml"/>
+
+ <!-- Build compilation dependencies: runs the "jar" target in ../lib-http and ../lib-htmlunit, each with inheritall="false" -->
+ <target name="deps-jar">
+ <ant target="jar" inheritall="false" dir="../lib-http"/>
+ <ant target="jar" inheritall="false" dir="../lib-htmlunit"/>
+ </target>
+
+ <!-- Add compilation dependencies to classpath: lib-http and lib-htmlunit jars found under ${nutch.root}/build, plus ${build.dir}/test/conf -->
+ <path id="plugin.deps">
+ <fileset dir="${nutch.root}/build">
+ <include name="**/lib-http/*.jar" />
+ <include name="**/lib-htmlunit/*.jar" />
+ </fileset>
+ <pathelement location="${build.dir}/test/conf"/>
+ </path>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/ivy.xml b/nutch-plugins/protocol-htmlunit/ivy.xml
new file mode 100644
index 0000000..8aa78d2
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/ivy.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" ?>
+
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+
+<ivy-module version="1.0">
+ <info organisation="org.apache.nutch" module="${ant.project.name}"> <!-- module name is taken from the ant.project.name property -->
+ <license name="Apache 2.0"/>
+ <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+ <description>
+ Apache Nutch
+ </description>
+ </info>
+
+ <configurations>
+ <include file="../../..//ivy/ivy-configurations.xml"/> <!-- NOTE(review): path has a doubled slash ("..//ivy"); confirm it resolves to the intended file -->
+ </configurations>
+
+ <publications>
+ <!-- Publish a single artifact, named after this module, in the "master" configuration -->
+ <artifact conf="master"/>
+ </publications>
+
+</ivy-module>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/plugin.xml b/nutch-plugins/protocol-htmlunit/plugin.xml
new file mode 100644
index 0000000..36bcb80
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements. See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+ id="protocol-htmlunit"
+ name="HtmlUnit Protocol Plug-in"
+ version="1.0.0"
+ provider-name="nutch.apache.org">
+
+ <runtime>
+ <library name="protocol-htmlunit.jar"> <!-- the plugin's own jar; the export pattern below is "*" -->
+ <export name="*"/>
+ </library>
+ </runtime>
+
+ <requires> <!-- plugins imported by this one -->
+ <import plugin="nutch-extensionpoints"/>
+ <import plugin="lib-http"/>
+ <import plugin="lib-htmlunit"/>
+ </requires>
+
+ <extension id="org.apache.nutch.protocol.http"
+ name="HttpProtocol"
+ point="org.apache.nutch.protocol.Protocol"> <!-- contributes to the Protocol extension point -->
+
+ <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+ class="org.apache.nutch.protocol.htmlunit.Http">
+ <parameter name="protocolName" value="http"/> <!-- first registration: protocolName "http" -->
+ </implementation>
+
+ <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+ class="org.apache.nutch.protocol.htmlunit.Http">
+ <parameter name="protocolName" value="https"/> <!-- same id and class registered again for protocolName "https" -->
+ </implementation>
+
+ </extension>
+</plugin>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/pom.xml b/nutch-plugins/protocol-htmlunit/pom.xml
new file mode 100644
index 0000000..e5a57d7
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/pom.xml
@@ -0,0 +1,51 @@
+<!--
+ ~ Licensed to the Apache Software Foundation (ASF) under one or more
+ ~ contributor license agreements. See the NOTICE file distributed with
+ ~ this work for additional information regarding copyright ownership.
+ ~ The ASF licenses this file to You under the Apache License, Version 2.0
+ ~ (the "License"); you may not use this file except in compliance with
+ ~ the License. You may obtain a copy of the License at
+ ~
+ ~ http://www.apache.org/licenses/LICENSE-2.0
+ ~
+ ~ Unless required by applicable law or agreed to in writing, software
+ ~ distributed under the License is distributed on an "AS IS" BASIS,
+ ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ ~ See the License for the specific language governing permissions and
+ ~ limitations under the License.
+ -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+ xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+ <modelVersion>4.0.0</modelVersion>
+
+ <parent> <!-- parent is the nutch-plugins POM, located via relativePath ../pom.xml -->
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>nutch-plugins</artifactId>
+ <version>1.13-SNAPSHOT</version>
+ <relativePath>../pom.xml</relativePath>
+ </parent>
+ <artifactId>protocol-htmlunit</artifactId>
+ <packaging>jar</packaging>
+
+ <name>protocol-htmlunit</name>
+ <url>http://nutch.apache.org</url>
+
+ <properties>
+ <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+ </properties>
+
+ <dependencies> <!-- both are org.apache.nutch artifacts whose version is ${project.parent.version} -->
+ <dependency>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>lib-htmlunit</artifactId>
+ <version>${project.parent.version}</version>
+ </dependency>
+ <dependency>
+ <groupId>org.apache.nutch</groupId>
+ <artifactId>lib-http</artifactId>
+ <version>${project.parent.version}</version>
+ </dependency>
+ </dependencies>
+
+</project>
http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java
new file mode 100644
index 0000000..c40ed69
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.IOException;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/** {@link HttpBase} subclass whose responses are HttpResponse objects (resolved from this package). */
+public class Http extends HttpBase {
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /** Creates the protocol instance, passing {@link #LOG} to the {@link HttpBase} constructor. */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Applies the configuration by delegating unchanged to the superclass implementation.
+   * @param conf the {@link org.apache.hadoop.conf.Configuration} object to apply
+   */
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /** Builds the response for {@code url}; the {@code redirect} flag is not consulted here. */
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    Response fetched = new HttpResponse(this, url, datum);
+    return fetched;
+  }
+
+  /** Command-line entry: configures a fresh instance, then hands it and args to main(Http, args). */
+  public static void main(String[] args) throws Exception {
+    Http plugin = new Http();
+    plugin.setConf(NutchConfiguration.create());
+    main(plugin, args);
+  }
+}