You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/16 19:48:24 UTC

[08/51] [partial] nutch git commit: NUTCH-2292 : Mavenize the build for nutch-core and nutch-plugins

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java
new file mode 100644
index 0000000..da25d87
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Client.java
@@ -0,0 +1,595 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.OutputStream;
+
+import java.net.InetAddress;
+import java.net.Socket;
+
+import java.util.List;
+//import java.util.LinkedList;
+
+import org.apache.commons.net.MalformedServerReplyException;
+
+import org.apache.commons.net.ftp.FTP;
+import org.apache.commons.net.ftp.FTPCommand;
+import org.apache.commons.net.ftp.FTPFile;
+import org.apache.commons.net.ftp.FTPFileEntryParser;
+import org.apache.commons.net.ftp.FTPReply;
+
+import org.apache.commons.net.ftp.FTPConnectionClosedException;
+
+/***********************************************
+ * Client.java encapsulates functionalities necessary for nutch to get dir list
+ * and retrieve file from an FTP server. This class takes care of all low level
+ * details of interacting with an FTP server and provides a convenient higher
+ * level interface.
+ * 
+ * Modified from FtpClient.java in apache commons-net.
+ * 
+ * Notes by John Xing: ftp server implementations are hardly uniform and none
+ * seems to follow RFCs whole-heartedly. We have no choice, but assume common
+ * denominator as following: (1) Use stream mode for data transfer. Block mode
+ * will be better for multiple file downloading and partial file downloading.
+ * However not every ftpd has block mode support. (2) Use passive mode for data
+ * connection. So Nutch will work if we run behind firewall. (3) Data connection
+ * is opened/closed per ftp command for the reasons listed in (1). There are ftp
+ * servers out there, when partial downloading is enforced by closing data
+ * channel socket on our client side, the server side immediately closes control
+ * channel (socket). Our codes deal with such a bad behavior. (4) LIST is used
+ * to obtain remote file attributes if possible. MDTM & SIZE would be nice, but
+ * not as ubiquitously implemented as LIST. (5) Avoid using ABOR in single
+ * thread? Do not use it at all.
+ * 
+ * About exceptions: Some specific exceptions are re-thrown as one of the
+ * FtpException* classes. In fact, each function either throws an
+ * FtpException* or passes through an IOException.
+ * 
+ * @author John Xing
+ ***********************************************/
+
+public class Client extends FTP {
+  private int __dataTimeout;
+  private int __passivePort;
+  private String __passiveHost;
+  // private int __fileType, __fileFormat;
+  private boolean __remoteVerificationEnabled;
+  // private FTPFileEntryParser __entryParser;
+  private String __systemName;
+
+  /**
+   * Creates a client with no passive endpoint and no cached system name, a
+   * data timeout of -1 and remote verification switched on.
+   */
+  public Client() {
+    this.__remoteVerificationEnabled = true;
+    this.__dataTimeout = -1;
+    __initDefaults();
+  }
+
+  /**
+   * Resets the per-connection state: the passive-mode host and port and the
+   * cached system name.
+   */
+  private void __initDefaults() {
+    __systemName = null;
+    __passivePort = -1;
+    __passiveHost = null;
+    // file type, file format and entry parser are not tracked by this class
+  }
+
+  // parse reply for pass()
+  private void __parsePassiveModeReply(String reply)
+      throws MalformedServerReplyException {
+    int i, index, lastIndex;
+    String octet1, octet2;
+    StringBuffer host;
+
+    reply = reply.substring(reply.indexOf('(') + 1, reply.indexOf(')')).trim();
+
+    host = new StringBuffer(24);
+    lastIndex = 0;
+    index = reply.indexOf(',');
+    host.append(reply.substring(lastIndex, index));
+
+    for (i = 0; i < 3; i++) {
+      host.append('.');
+      lastIndex = index + 1;
+      index = reply.indexOf(',', lastIndex);
+      host.append(reply.substring(lastIndex, index));
+    }
+
+    lastIndex = index + 1;
+    index = reply.indexOf(',', lastIndex);
+
+    octet1 = reply.substring(lastIndex, index);
+    octet2 = reply.substring(index + 1);
+
+    // index and lastIndex now used as temporaries
+    try {
+      index = Integer.parseInt(octet1);
+      lastIndex = Integer.parseInt(octet2);
+    } catch (NumberFormatException e) {
+      throw new MalformedServerReplyException(
+          "Could not parse passive host information.\nServer Reply: " + reply);
+    }
+
+    index <<= 8;
+    index |= lastIndex;
+
+    __passiveHost = host.toString();
+    __passivePort = index;
+  }
+
+  /**
+   * Opens a passive-mode data connection and issues the given command on the
+   * control channel.
+   * <p>
+   * Sequence: send PASV, parse the reply for host and port, connect a socket
+   * to that endpoint, send {@code command}, optionally verify that the data
+   * peer is the control-connection host, then apply the data timeout if one
+   * is set (&gt;= 0).
+   * <p>
+   * An earlier workaround that re-issued PASV while the server kept returning
+   * the previous port (20040317, xing) is disabled and is not reproduced here.
+   * 
+   * @param command
+   *          the FTP command code to send once the data socket is connected
+   * @param arg
+   *          the argument for the command
+   * @return the connected data socket, or {@code null} if the server's reply
+   *         to the command is not a positive preliminary reply
+   * @throws IOException
+   *           if an I/O error occurs on the control or data connection
+   * @throws FtpExceptionCanNotHaveDataConnection
+   *           if PASV fails, its reply cannot be parsed, or remote
+   *           verification rejects the data peer
+   */
+  protected Socket __openPassiveDataConnection(int command, String arg)
+      throws IOException, FtpExceptionCanNotHaveDataConnection {
+    if (pasv() != FTPReply.ENTERING_PASSIVE_MODE)
+      throw new FtpExceptionCanNotHaveDataConnection("pasv() failed. "
+          + getReplyString());
+
+    try {
+      __parsePassiveModeReply(getReplyStrings()[0]);
+    } catch (MalformedServerReplyException e) {
+      throw new FtpExceptionCanNotHaveDataConnection(e.getMessage());
+    }
+
+    Socket socket = _socketFactory_.createSocket(__passiveHost, __passivePort);
+
+    // Only a fully set-up socket is handed to the caller; on every other
+    // exit, including exceptions, it is closed here so it cannot leak.
+    boolean handedOver = false;
+    try {
+      if (!FTPReply.isPositivePreliminary(sendCommand(command, arg)))
+        return null;
+
+      if (__remoteVerificationEnabled && !verifyRemote(socket)) {
+        InetAddress host1 = socket.getInetAddress();
+        InetAddress host2 = getRemoteAddress();
+
+        // our precaution
+        throw new FtpExceptionCanNotHaveDataConnection(
+            "Host attempting data connection " + host1.getHostAddress()
+                + " is not same as server " + host2.getHostAddress()
+                + " So we intentionally close it for security precaution.");
+      }
+
+      if (__dataTimeout >= 0)
+        socket.setSoTimeout(__dataTimeout);
+
+      handedOver = true;
+      return socket;
+    } finally {
+      if (!handedOver) {
+        try {
+          socket.close();
+        } catch (IOException ignored) {
+          // best effort: the outcome being reported matters more than a
+          // failure to close an already unusable data socket
+        }
+      }
+    }
+  }
+
+  /**
+   * Stores the timeout, in milliseconds, for data connections. It is applied
+   * to the data socket right after that socket has been opened; a negative
+   * value means no timeout is applied.
+   * 
+   * @param timeout
+   *          the data connection timeout in milliseconds
+   */
+  public void setDataTimeout(int timeout) {
+    this.__dataTimeout = timeout;
+  }
+
+  /**
+   * Restores the connection parameters to their default values and then
+   * closes the connection to the FTP server.
+   * 
+   * @exception IOException
+   *              If an error occurs while disconnecting.
+   */
+  public void disconnect() throws IOException {
+    // Reset our own state first, then let the superclass close the control
+    // connection. Data connections need no handling here: every command that
+    // involves a data connection closes it itself.
+    this.__initDefaults();
+    super.disconnect();
+  }
+
+  /**
+   * Switches on or off the check that the remote host of a data connection is
+   * the same host the control connection is attached to. Verification is on
+   * by default. The value may be changed at any time, connected or not.
+   * 
+   * @param enable
+   *          {@code true} to enable verification, {@code false} to disable it
+   */
+  public void setRemoteVerificationEnabled(boolean enable) {
+    this.__remoteVerificationEnabled = enable;
+  }
+
+  /**
+   * Tells whether the remote host taking part in data connections is
+   * verified. Verification is enabled by default.
+   * 
+   * @return {@code true} if verification is enabled, {@code false} otherwise
+   */
+  public boolean isRemoteVerificationEnabled() {
+    return this.__remoteVerificationEnabled;
+  }
+
+  /**
+   * Logs in to the FTP server with the given username and password. The
+   * password is only sent when the server answers the username with a
+   * positive intermediate reply.
+   * 
+   * @param username
+   *          The username to login under.
+   * @param password
+   *          The password to use.
+   * @return True if successfully completed, false if not.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   */
+  public boolean login(String username, String password) throws IOException {
+    user(username);
+    final int userReply = getReplyCode();
+
+    // the user name alone was enough
+    if (FTPReply.isPositiveCompletion(userReply))
+      return true;
+
+    // the server asks for the password: the outcome is that of PASS
+    if (FTPReply.isPositiveIntermediate(userReply))
+      return FTPReply.isPositiveCompletion(pass(password));
+
+    // anything else is an error reply
+    return false;
+  }
+
+  /**
+   * Sends the QUIT command to log out of the FTP server.
+   * 
+   * @return True if successfully completed, false if not.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   */
+  public boolean logout() throws IOException {
+    final int quitReply = quit();
+    return FTPReply.isPositiveCompletion(quitReply);
+  }
+
+  /**
+   * Issues LIST for {@code path} over a passive data connection and appends
+   * the parsed entries to {@code entries}.
+   * <p>
+   * Lines the parser cannot turn into an {@link FTPFile} are skipped. When
+   * {@code limit >= 0}, reading stops after the first entry that pushes the
+   * accumulated line length over {@code limit}; that entry is still added.
+   * The data socket is always closed before the control-channel reply is
+   * read, even if reading or parsing fails.
+   * 
+   * @param path
+   *          the remote path to list
+   * @param entries
+   *          the list that receives the parsed entries
+   * @param limit
+   *          download limit in characters of listing text, or a negative
+   *          value for no limit
+   * @param parser
+   *          the parser used to read and parse listing entries
+   * @throws IOException
+   *           if an I/O error occurs on the control or data connection
+   * @throws FtpExceptionCanNotHaveDataConnection
+   *           if no data connection could be opened
+   * @throws FtpExceptionUnknownForcedDataClose
+   *           if the server's reply after closing the data socket is not one
+   *           of the accepted codes
+   * @throws FtpExceptionControlClosedByForcedDataClose
+   *           if the server closed the control channel in response to the
+   *           data socket being closed
+   */
+  public void retrieveList(String path, List<FTPFile> entries, int limit,
+      FTPFileEntryParser parser) throws IOException,
+      FtpExceptionCanNotHaveDataConnection, FtpExceptionUnknownForcedDataClose,
+      FtpExceptionControlClosedByForcedDataClose {
+    Socket socket = __openPassiveDataConnection(FTPCommand.LIST, path);
+
+    if (socket == null)
+      throw new FtpExceptionCanNotHaveDataConnection("LIST "
+          + ((path == null) ? "" : path));
+
+    try {
+      BufferedReader reader = new BufferedReader(new InputStreamReader(
+          socket.getInputStream()));
+
+      int count = 0;
+      String line;
+      while ((line = parser.readNextEntry(reader)) != null) {
+        FTPFile ftpFile = parser.parseFTPEntry(line);
+        // skip non-formatted lines
+        if (ftpFile == null)
+          continue;
+        entries.add(ftpFile);
+        count += line.length();
+        // impose download limit if limit >= 0, otherwise no limit
+        // here, cut off is up to the line when total bytes is just over limit
+        if (limit >= 0 && count > limit)
+          break;
+      }
+    } finally {
+      // always close the data channel, whether the listing was read
+      // completely, cut off by the limit, or aborted by an exception;
+      // different ftp servers respond differently to this, see below
+      socket.close();
+    }
+
+    try {
+      int reply = getReply();
+      if (!_notBadReply(reply))
+        throw new FtpExceptionUnknownForcedDataClose(getReplyString());
+    } catch (FTPConnectionClosedException e) {
+      // some ftp servers will close control channel if data channel socket
+      // is closed by our end before all data has been read out. Check:
+      // tux414.q-tam.hp.com FTP server (hp.com version whp02)
+      // so must catch FTPConnectionClosedException thrown by getReply() above
+      throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage());
+    }
+  }
+
+  /**
+   * Issues RETR for {@code path} over a passive data connection and copies
+   * the received bytes to {@code os}.
+   * <p>
+   * When {@code limit >= 0}, exactly {@code limit} bytes are written once
+   * more than that has been received, and the transfer is cut short. The data
+   * socket is always closed before the control-channel reply is read, even if
+   * reading or writing fails. Everything is transferred as received; no ASCII
+   * conversion is applied and no file type is requested from the server here.
+   * 
+   * @param path
+   *          the remote file to retrieve
+   * @param os
+   *          the stream that receives the file content; it is flushed but not
+   *          closed
+   * @param limit
+   *          maximum number of bytes to write, or a negative value for no
+   *          limit
+   * @throws IOException
+   *           if an I/O error occurs on the control or data connection or on
+   *           {@code os}
+   * @throws FtpExceptionCanNotHaveDataConnection
+   *           if no data connection could be opened
+   * @throws FtpExceptionUnknownForcedDataClose
+   *           if the server's reply after closing the data socket is not one
+   *           of the accepted codes
+   * @throws FtpExceptionControlClosedByForcedDataClose
+   *           if the server closed the control channel in response to the
+   *           data socket being closed
+   */
+  public void retrieveFile(String path, OutputStream os, int limit)
+      throws IOException, FtpExceptionCanNotHaveDataConnection,
+      FtpExceptionUnknownForcedDataClose,
+      FtpExceptionControlClosedByForcedDataClose {
+
+    Socket socket = __openPassiveDataConnection(FTPCommand.RETR, path);
+
+    if (socket == null)
+      throw new FtpExceptionCanNotHaveDataConnection("RETR "
+          + ((path == null) ? "" : path));
+
+    try {
+      InputStream input = socket.getInputStream();
+
+      int len;
+      int count = 0;
+      byte[] buf = new byte[org.apache.commons.net.io.Util.DEFAULT_COPY_BUFFER_SIZE];
+      while ((len = input.read(buf, 0, buf.length)) != -1) {
+        count += len;
+        // impose download limit if limit >= 0, otherwise no limit
+        // here, cut off is exactly of limit bytes
+        if (limit >= 0 && count > limit) {
+          os.write(buf, 0, len - (count - limit));
+          // the truncated tail must be flushed like every other chunk
+          os.flush();
+          break;
+        }
+        os.write(buf, 0, len);
+        os.flush();
+      }
+    } finally {
+      // always close the data channel, whether the file was read completely,
+      // cut off by the limit, or aborted by an exception;
+      // different ftp servers respond differently to this, see below
+      socket.close();
+    }
+
+    // ABOR is deliberately not sent here
+
+    try {
+      int reply = getReply();
+      if (!_notBadReply(reply))
+        throw new FtpExceptionUnknownForcedDataClose(getReplyString());
+    } catch (FTPConnectionClosedException e) {
+      // some ftp servers will close control channel if data channel socket
+      // is closed by our end before all data has been read out. Check:
+      // tux414.q-tam.hp.com FTP server (hp.com version whp02)
+      // so must catch FTPConnectionClosedException thrown by getReply() above
+      throw new FtpExceptionControlClosedByForcedDataClose(e.getMessage());
+    }
+  }
+
+  /**
+   * Checks the control-channel reply received after the data connection has
+   * been closed.
+   * <p>
+   * Besides any positive completion reply, the codes 426, 450 and 451 are
+   * accepted, because some servers send them when the client closes the data
+   * socket early. No second reply is read for these codes.
+   * 
+   * @param reply
+   *          the reply code to check
+   * @return {@code true} if the reply is acceptable, {@code false} otherwise
+   */
+  private boolean _notBadReply(int reply) {
+    if (FTPReply.isPositiveCompletion(reply))
+      return true;
+
+    switch (reply) {
+    case 426: // FTPReply.TRANSFER_ABORTED
+      // e.g. foggy FTP server (Version wu-2.6.2(2)
+    case 450: // FTPReply.FILE_ACTION_NOT_TAKEN
+      // e.g. ProFTPD [ftp.kernel.org]
+    case 451: // FTPReply.ACTION_ABORTED
+      // e.g. ProFTPD [ftp.kernel.org]
+      return true;
+    default:
+      // what other kind of ftp server out there?
+      return false;
+    }
+  }
+
+  /**
+   * Sets the file type to be transferred. This should be one of
+   * <code> FTP.ASCII_FILE_TYPE </code>, <code> FTP.IMAGE_FILE_TYPE </code>,
+   * etc. The file type only needs to be set when you want to change the type.
+   * After changing it, the new type stays in effect until you change it again.
+   * The default file type is <code> FTP.ASCII_FILE_TYPE </code> if this method
+   * is never called.
+   * <p>
+   * This class does not record the chosen type; it only reports whether the
+   * server accepted the command.
+   * 
+   * @param fileType
+   *          The <code> _FILE_TYPE </code> constant indicating the type of
+   *          file.
+   * @return True if successfully completed, false if not.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   */
+  public boolean setFileType(int fileType) throws IOException {
+    final int typeReply = type(fileType);
+    return FTPReply.isPositiveCompletion(typeReply);
+  }
+
+  /***
+   * Fetches the system type name from the server and returns the string. This
+   * value is cached for the duration of the connection after the first call to
+   * this method. In other words, only the first time that you invoke this
+   * method will it issue a SYST command to the FTP server. FTPClient will
+   * remember the value and return the cached value until a call to disconnect.
+   * <p>
+   * 
+   * @return The system type name obtained from the server. null if the
+   *         information could not be obtained.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   ***/
+  public String getSystemName() throws IOException, FtpExceptionBadSystResponse {
+    // if (syst() == FTPReply.NAME_SYSTEM_TYPE)
+    // Technically, we should expect a NAME_SYSTEM_TYPE response, but
+    // in practice FTP servers deviate, so we soften the condition to
+    // a positive completion.
+    if (__systemName == null && FTPReply.isPositiveCompletion(syst())) {
+      __systemName = (getReplyStrings()[0]).substring(4);
+    } else {
+      throw new FtpExceptionBadSystResponse("Bad response of SYST: "
+          + getReplyString());
+    }
+
+    return __systemName;
+  }
+
+  /**
+   * Sends a NOOP command to the FTP server, which is useful for preventing
+   * server timeouts.
+   * 
+   * @return True if successfully completed, false if not.
+   * @exception FTPConnectionClosedException
+   *              If the FTP server prematurely closes the connection as a
+   *              result of the client being idle or some other reason causing
+   *              the server to send FTP reply code 421. This exception may be
+   *              caught either as an IOException or independently as itself.
+   * @exception IOException
+   *              If an I/O error occurs while either sending a command to the
+   *              server or receiving a reply from the server.
+   */
+  public boolean sendNoOp() throws IOException {
+    final int noopReply = noop();
+    return FTPReply.isPositiveCompletion(noopReply);
+  }
+
+  // client.stat(path);
+  // client.sendCommand("STAT");
+  // client.sendCommand("STAT",path);
+  // client.sendCommand("MDTM",path);
+  // client.sendCommand("SIZE",path);
+  // client.sendCommand("HELP","SITE");
+  // client.sendCommand("SYST");
+  // client.setRestartOffset(120);
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java
new file mode 100644
index 0000000..772f3bb
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/Ftp.java
@@ -0,0 +1,267 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import org.apache.commons.net.ftp.FTPFileEntryParser;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.net.protocols.Response;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Nutch;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import crawlercommons.robots.BaseRobotRules;
+
+import java.net.URL;
+
+import java.io.IOException;
+
+/**
+ * This class is a protocol plugin used for ftp: scheme. It creates
+ * {@link FtpResponse} object and gets the content of the url from it.
+ * Configurable parameters are {@code ftp.username}, {@code ftp.password},
+ * {@code ftp.content.limit}, {@code ftp.timeout}, {@code ftp.server.timeout},
+ * {@code ftp.keep.connection} and {@code ftp.follow.talk}. For details see the
+ * "FTP properties" section in {@code nutch-default.xml}.
+ */
+public class Ftp implements Protocol {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Ftp.class);
+
+  private static final int BUFFER_SIZE = 16384; // 16*1024 = 16384
+
+  static final int MAX_REDIRECTS = 5;
+
+  int timeout;
+
+  int maxContentLength;
+
+  String userName;
+  String passWord;
+
+  // typical/default server timeout is 120*1000 millisec.
+  // better be conservative here
+  int serverTimeout;
+
+  // when to have client start anew
+  long renewalTime = -1;
+
+  boolean keepConnection;
+
+  boolean followTalk;
+
+  // ftp client
+  Client client = null;
+  // ftp dir list entry parser
+  FTPFileEntryParser parser = null;
+
+  private Configuration conf;
+
+  private FtpRobotRulesParser robots = null;
+
+  /** Creates the protocol plugin together with its robots rules parser. */
+  public Ftp() {
+    this.robots = new FtpRobotRulesParser();
+  }
+
+  /**
+   * Sets the timeout.
+   * 
+   * @param to
+   *          the new timeout value
+   */
+  public void setTimeout(int to) {
+    this.timeout = to;
+  }
+
+  /**
+   * Sets the point at which content is truncated.
+   * 
+   * @param length
+   *          the maximum content length
+   */
+  public void setMaxContentLength(int length) {
+    this.maxContentLength = length;
+  }
+
+  /**
+   * Sets the followTalk flag.
+   * 
+   * @param followTalk
+   *          the new value of the flag
+   */
+  public void setFollowTalk(boolean followTalk) {
+    this.followTalk = followTalk;
+  }
+
+  /**
+   * Sets the keepConnection flag.
+   * 
+   * @param keepConnection
+   *          the new value of the flag
+   */
+  public void setKeepConnection(boolean keepConnection) {
+    this.keepConnection = keepConnection;
+  }
+
+  /**
+   * Fetches the given ftp url through {@link FtpResponse} and wraps the result
+   * in a {@link ProtocolOutput}.
+   * <p>
+   * The response code of every request is stored in the datum's metadata
+   * under {@link Nutch#PROTOCOL_STATUS_CODE_KEY}. A code of 200 returns the
+   * content; codes 300..399 are followed via the "Location" header, up to
+   * {@link #MAX_REDIRECTS} times; any other code is treated as an error. Any
+   * exception raised on the way is turned into a {@link ProtocolStatus}.
+   * 
+   * @param url
+   *          Text containing the ftp url
+   * @param datum
+   *          The CrawlDatum object corresponding to the url
+   * 
+   * @return {@link ProtocolOutput} object for the url
+   */
+  public ProtocolOutput getProtocolOutput(Text url, CrawlDatum datum) {
+    String urlString = url.toString();
+    try {
+      URL target = new URL(urlString);
+
+      for (int redirects = 0;; redirects++) {
+        // make a request
+        FtpResponse response = new FtpResponse(target, datum, this, getConf());
+
+        int code = response.getCode();
+        datum.getMetaData().put(Nutch.PROTOCOL_STATUS_CODE_KEY,
+            new Text(Integer.toString(code)));
+
+        // got a good response: return it
+        if (code == 200)
+          return new ProtocolOutput(response.toContent());
+
+        // neither success nor redirect: convert to exception
+        if (code < 300 || code >= 400)
+          throw new FtpError(code);
+
+        // handle redirect
+        if (redirects == MAX_REDIRECTS)
+          throw new FtpException("Too many redirects: " + url);
+        target = new URL(response.getHeader("Location"));
+        if (LOG.isTraceEnabled()) {
+          LOG.trace("redirect to " + target);
+        }
+      }
+    } catch (Exception e) {
+      return new ProtocolOutput(null, new ProtocolStatus(e));
+    }
+  }
+
+  /**
+   * Logs out and disconnects the ftp client if it is still connected.
+   * I/O errors are ignored.
+   */
+  @Override
+  protected void finalize() {
+    if (this.client == null || !this.client.isConnected()) {
+      return;
+    }
+    try {
+      try {
+        this.client.logout();
+      } finally {
+        // disconnect even when logout fails, so the connection is not left
+        // open
+        this.client.disconnect();
+      }
+    } catch (IOException ignored) {
+      // nothing useful can be done about a failed shutdown at this point
+    }
+  }
+
+  /**
+   * For debugging: fetches a single url and prints its content type, length
+   * and last-modified metadata to stderr, optionally dumping the content to
+   * stdout.
+   * <p>
+   * Prints the usage message and exits with -1 when no arguments are given,
+   * when an option that needs a value is the last argument, when an unknown
+   * argument appears before the last position, or when no url is given. Exits
+   * with -1 as well when the fetch yields no content.
+   * 
+   * @param args
+   *          command line: options followed by the url
+   */
+  public static void main(String[] args) throws Exception {
+    int timeout = Integer.MIN_VALUE;
+    int maxContentLength = Integer.MIN_VALUE;
+    // parsed for command line compatibility; setting the level is disabled
+    String logLevel = "info";
+    boolean followTalk = false;
+    boolean keepConnection = false;
+    boolean dumpContent = false;
+    String urlString = null;
+
+    String usage = "Usage: Ftp [-logLevel level] [-followTalk] [-keepConnection] [-timeout N] [-maxContentLength L] [-dumpContent] url";
+
+    if (args.length == 0) {
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    for (int i = 0; i < args.length; i++) {
+      boolean needsValue = args[i].equals("-logLevel")
+          || args[i].equals("-timeout")
+          || args[i].equals("-maxContentLength");
+      if (needsValue && i == args.length - 1) {
+        // the option's value is missing: args[++i] would run off the array
+        System.err.println(usage);
+        System.exit(-1);
+      }
+
+      if (args[i].equals("-logLevel")) {
+        logLevel = args[++i];
+      } else if (args[i].equals("-followTalk")) {
+        followTalk = true;
+      } else if (args[i].equals("-keepConnection")) {
+        keepConnection = true;
+      } else if (args[i].equals("-timeout")) {
+        timeout = Integer.parseInt(args[++i]) * 1000;
+      } else if (args[i].equals("-maxContentLength")) {
+        maxContentLength = Integer.parseInt(args[++i]);
+      } else if (args[i].equals("-dumpContent")) {
+        dumpContent = true;
+      } else if (i != args.length - 1) {
+        System.err.println(usage);
+        System.exit(-1);
+      } else {
+        urlString = args[i];
+      }
+    }
+
+    if (urlString == null) {
+      // only options were given; there is nothing to fetch
+      System.err.println(usage);
+      System.exit(-1);
+    }
+
+    Ftp ftp = new Ftp();
+
+    ftp.setFollowTalk(followTalk);
+    ftp.setKeepConnection(keepConnection);
+
+    if (timeout != Integer.MIN_VALUE) // set timeout
+      ftp.setTimeout(timeout);
+
+    if (maxContentLength != Integer.MIN_VALUE) // set maxContentLength
+      ftp.setMaxContentLength(maxContentLength);
+
+    // set log level
+    // LOG.setLevel(Level.parse((new String(logLevel)).toUpperCase()));
+
+    Content content = ftp.getProtocolOutput(new Text(urlString),
+        new CrawlDatum()).getContent();
+
+    if (content == null) {
+      // getProtocolOutput reports failures with null content
+      System.err.println("Failed to fetch " + urlString);
+      System.exit(-1);
+    }
+
+    System.err.println("Content-Type: " + content.getContentType());
+    System.err.println("Content-Length: "
+        + content.getMetadata().get(Response.CONTENT_LENGTH));
+    System.err.println("Last-Modified: "
+        + content.getMetadata().get(Response.LAST_MODIFIED));
+    if (dumpContent) {
+      System.out.print(new String(content.getContent()));
+    }
+  }
+
+  /**
+   * Stores the {@link Configuration} object, reads the ftp settings from it
+   * (using built-in defaults for missing keys) and passes it on to the robots
+   * rules parser.
+   */
+  public void setConf(Configuration conf) {
+    this.conf = conf;
+
+    // credentials
+    userName = conf.get("ftp.username", "anonymous");
+    passWord = conf.get("ftp.password", "anonymous@example.com");
+
+    // limits and timeouts
+    maxContentLength = conf.getInt("ftp.content.limit", 64 * 1024);
+    timeout = conf.getInt("ftp.timeout", 10000);
+    serverTimeout = conf.getInt("ftp.server.timeout", 60 * 1000);
+
+    // connection behaviour
+    keepConnection = conf.getBoolean("ftp.keep.connection", false);
+    followTalk = conf.getBoolean("ftp.follow.talk", false);
+
+    robots.setConf(conf);
+  }
+
+  /**
+   * Returns the {@link Configuration} object last passed to setConf, or
+   * {@code null} if none has been set.
+   */
+  public Configuration getConf() {
+    return conf;
+  }
+
+  /**
+   * Get the robots rules for a given url
+   */
+  public BaseRobotRules getRobotRules(Text url, CrawlDatum datum) {
+    return robots.getRobotRulesSet(this, url);
+  }
+
+  /** Returns the fixed buffer size, {@code BUFFER_SIZE} (16384). */
+  public int getBufferSize() {
+    return Ftp.BUFFER_SIZE;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java
new file mode 100644
index 0000000..b63a67e
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpError.java
@@ -0,0 +1,36 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Thrown for Ftp error codes.
+ * <p>
+ * The numeric code handed to the constructor is embedded in the exception
+ * message and can be read back with {@link #getCode()}.
+ */
+public class FtpError extends FtpException {
+
+  private final int code;
+
+  /**
+   * Creates an error for the given code.
+   * 
+   * @param code
+   *          the error code; the message becomes "Ftp Error: " followed by
+   *          this value
+   */
+  public FtpError(int code) {
+    super("Ftp Error: " + code);
+    this.code = code;
+  }
+
+  /**
+   * Returns the error code this exception was created with.
+   * 
+   * @return the code passed to the constructor
+   */
+  public int getCode() {
+    return code;
+  }
+
+  /**
+   * Legacy accessor kept so that existing callers still compile.
+   * <p>
+   * It used to return its own argument, because the parameter shadowed the
+   * field, so the stored code could never be read. The argument is now ignored
+   * and the stored code is returned.
+   * 
+   * @param code
+   *          ignored
+   * @return the code passed to the constructor
+   * @deprecated use {@link #getCode()} instead
+   */
+  @Deprecated
+  public int getCode(int code) {
+    return this.code;
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java
new file mode 100644
index 0000000..5a29668
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpException.java
@@ -0,0 +1,46 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import org.apache.nutch.protocol.ProtocolException;
+
+/**
+ * Base class of the significant exceptions raised during an FTP conversation;
+ * callers are expected to handle them deliberately.
+ * 
+ * @author John Xing
+ */
+public class FtpException extends ProtocolException {
+
+  /**
+   * Creates an exception that carries both a message and a cause.
+   * 
+   * @param message
+   *          description of the failure
+   * @param cause
+   *          the underlying exception
+   */
+  public FtpException(String message, Throwable cause) {
+    super(message, cause);
+  }
+
+  /**
+   * Creates an exception that only wraps a cause.
+   * 
+   * @param cause
+   *          the underlying exception
+   */
+  public FtpException(Throwable cause) {
+    super(cause);
+  }
+
+  /**
+   * Creates an exception that only carries a message.
+   * 
+   * @param message
+   *          description of the failure
+   */
+  public FtpException(String message) {
+    super(message);
+  }
+
+  /** Creates an exception with neither message nor cause. */
+  public FtpException() {
+    super();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
new file mode 100644
index 0000000..689ac8e
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionBadSystResponse.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating bad reply of SYST command.
+ * 
+ * @author John Xing
+ */
+public class FtpExceptionBadSystResponse extends FtpException {
+  /**
+   * Package-private constructor: instances are only created inside this
+   * package.
+   * 
+   * @param msg
+   *          message passed on to {@link FtpException}
+   */
+  FtpExceptionBadSystResponse(String msg) {
+    super(msg);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
new file mode 100644
index 0000000..9f35b74
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionCanNotHaveDataConnection.java
@@ -0,0 +1,29 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating failure of opening data connection.
+ * 
+ * @author John Xing
+ */
+public class FtpExceptionCanNotHaveDataConnection extends FtpException {
+  /**
+   * Package-private constructor: instances are only created inside this
+   * package.
+   * 
+   * @param msg
+   *          message passed on to {@link FtpException}
+   */
+  FtpExceptionCanNotHaveDataConnection(String msg) {
+    super(msg);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
new file mode 100644
index 0000000..c058fcb
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionControlClosedByForcedDataClose.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating control channel is closed by server end, due to forced
+ * closure of data channel at client (our) end.
+ * 
+ * @author John Xing
+ */
+public class FtpExceptionControlClosedByForcedDataClose extends FtpException {
+  /**
+   * Package-private constructor: instances are only created inside this
+   * package.
+   * 
+   * @param msg
+   *          message passed on to {@link FtpException}
+   */
+  FtpExceptionControlClosedByForcedDataClose(String msg) {
+    super(msg);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
new file mode 100644
index 0000000..9083d7c
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpExceptionUnknownForcedDataClose.java
@@ -0,0 +1,30 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+/**
+ * Exception indicating unrecognizable reply from server after forced closure of
+ * data channel by client (our) side.
+ * 
+ * @author John Xing
+ */
+public class FtpExceptionUnknownForcedDataClose extends FtpException {
+  /**
+   * Package-private constructor: instances are only created inside this
+   * package.
+   * 
+   * @param msg
+   *          message passed on to {@link FtpException}
+   */
+  FtpExceptionUnknownForcedDataClose(String msg) {
+    super(msg);
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java
new file mode 100644
index 0000000..f7c7c6d
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpResponse.java
@@ -0,0 +1,521 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import org.apache.commons.net.ftp.FTP;
+import org.apache.commons.net.ftp.FTPFile;
+import org.apache.commons.net.ftp.FTPReply;
+import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
+import org.apache.commons.net.ftp.parser.ParserInitializationException;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Content;
+import org.apache.nutch.metadata.Metadata;
+import org.apache.nutch.net.protocols.HttpDateFormat;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.hadoop.conf.Configuration;
+
+import java.net.InetAddress;
+import java.net.URL;
+import java.util.List;
+import java.util.LinkedList;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+
+/**
+ * FtpResponse.java mimics ftp replies as http response. It tries its best to
+ * follow http's way for headers, response codes as well as exceptions.
+ * 
+ * Comments: In this class, all FtpException*.java thrown by Client.java and
+ * some important commons-net exceptions passed by Client.java must have been
+ * properly dealt with. They'd better not be leaked to the caller of this class.
+ */
+public class FtpResponse {
+
+  // string form of the requested url; passed to Content by toContent()
+  private String orig;
+  // set to the same url string as orig by the constructor
+  private String base;
+  // body of the response; stays null when nothing was retrieved
+  private byte[] content;
+  // substituted by toContent() when content is null
+  private static final byte[] EMPTY_CONTENT = new byte[0];
+  // http-like status code describing the outcome of the ftp request
+  private int code;
+  // http-like headers (Content-Length, Last-Modified, Content-Type, ...)
+  private Metadata headers = new Metadata();
+
+  // protocol instance supplying the settings, the client and the list parser
+  private final Ftp ftp;
+  // configuration given to the constructor; passed on to Content
+  private Configuration conf;
+
+  /**
+   * Gives the http-like status code recorded for this response.
+   * 
+   * @return the response code
+   */
+  public int getCode() {
+    return this.code;
+  }
+
+  /**
+   * Looks up a header of this response by name.
+   * 
+   * @param name
+   *          the header name
+   * @return whatever the header metadata holds for that name
+   */
+  public String getHeader(String name) {
+    final String value = this.headers.get(name);
+    return value;
+  }
+
+  /**
+   * Gives the raw body of this response.
+   * 
+   * @return the content bytes, or <code>null</code> when none were stored
+   */
+  public byte[] getContent() {
+    return this.content;
+  }
+
+  public Content toContent() {
+    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
+        getHeader(Response.CONTENT_TYPE), headers, this.conf);
+  }
+
+  /**
+   * Fetches the given ftp url and records the outcome as an http-like response
+   * (status code, headers and content).
+   * <p>
+   * The client held by <code>ftp</code> is created when missing, dropped when
+   * it idled past <code>ftp.renewalTime</code>, and moved to another host when
+   * the url points elsewhere. A path ending in "/" is fetched as a directory
+   * listing, any other path as a file. Codes set directly here are 500
+   * (connect, file type or list parser setup failed) and 401 (login failed).
+   * 
+   * @param url
+   *          the url to fetch; its protocol must be "ftp"
+   * @param datum
+   *          supplies the modified time handed to the file/dir retrieval
+   * @param ftp
+   *          protocol instance whose settings, client and parser are used and
+   *          updated
+   * @param conf
+   *          configuration; "store.ip.address" is read here and the object is
+   *          later passed to the created Content
+   * @throws FtpException
+   *           if the url is not an ftp url, or wrapping any exception raised
+   *           while talking to the server (the client is dropped in that case)
+   * @throws IOException
+   *           declared by the signature; failures inside the fetch are wrapped
+   *           in FtpException
+   */
+  public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
+      throws FtpException, IOException {
+
+    this.orig = url.toString();
+    this.base = url.toString();
+    this.ftp = ftp;
+    this.conf = conf;
+
+    if (!"ftp".equals(url.getProtocol()))
+      throw new FtpException("Not a ftp url:" + url);
+
+    // compare the strings by value; != would only compare object references
+    String urlPath = url.getPath();
+    String urlFile = url.getFile();
+    boolean samePathAndFile = (urlPath == null) ? (urlFile == null) : urlPath
+        .equals(urlFile);
+    if (!samePathAndFile) {
+      if (Ftp.LOG.isWarnEnabled()) {
+        Ftp.LOG.warn("url.getPath() != url.getFile(): " + url);
+      }
+    }
+
+    String path = "".equals(url.getPath()) ? "/" : url.getPath();
+
+    try {
+
+      if (ftp.followTalk) {
+        if (Ftp.LOG.isInfoEnabled()) {
+          Ftp.LOG.info("fetching " + url);
+        }
+      } else {
+        if (Ftp.LOG.isTraceEnabled()) {
+          Ftp.LOG.trace("fetching " + url);
+        }
+      }
+
+      InetAddress addr = InetAddress.getByName(url.getHost());
+      if (addr != null && conf.getBoolean("store.ip.address", false)) {
+        headers.add("_ip_", addr.getHostAddress());
+      }
+
+      // idled too long, remote server or ourselves may have timed out,
+      // should start anew.
+      if (ftp.client != null && ftp.keepConnection
+          && ftp.renewalTime < System.currentTimeMillis()) {
+        if (Ftp.LOG.isInfoEnabled()) {
+          Ftp.LOG.info("delete client because idled too long");
+        }
+        ftp.client = null;
+      }
+
+      // start anew if needed
+      if (ftp.client == null) {
+        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+          Ftp.LOG.info("start client");
+        }
+        // the real client
+        ftp.client = new Client();
+        // when to renew, take the lesser
+        // ftp.renewalTime = System.currentTimeMillis()
+        // + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout :
+        // ftp.serverTimeout);
+
+        // timeout for control connection
+        ftp.client.setDefaultTimeout(ftp.timeout);
+        // timeout for data connection
+        ftp.client.setDataTimeout(ftp.timeout);
+
+        // follow ftp talk?
+        if (ftp.followTalk)
+          ftp.client.addProtocolCommandListener(new PrintCommandListener(
+              Ftp.LOG));
+      }
+
+      // quit from previous site if at a different site now
+      if (ftp.client.isConnected()) {
+        InetAddress remoteAddress = ftp.client.getRemoteAddress();
+        if (!addr.equals(remoteAddress)) {
+          if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+            Ftp.LOG.info("disconnect from " + remoteAddress
+                + " before connect to " + addr);
+          }
+          // quit from current site
+          ftp.client.logout();
+          ftp.client.disconnect();
+        }
+      }
+
+      // connect to current site if needed
+      if (!ftp.client.isConnected()) {
+
+        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+          Ftp.LOG.info("connect to " + addr);
+        }
+
+        ftp.client.connect(addr);
+        if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) {
+          ftp.client.disconnect();
+          if (Ftp.LOG.isWarnEnabled()) {
+            Ftp.LOG.warn("ftp.client.connect() failed: " + addr + " "
+                + ftp.client.getReplyString());
+          }
+          this.code = 500; // http Internal Server Error
+          return;
+        }
+
+        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+          Ftp.LOG.info("log into " + addr);
+        }
+
+        if (!ftp.client.login(ftp.userName, ftp.passWord)) {
+          // login failed.
+          // please note that some server may return 421 immediately
+          // after USER anonymous, thus ftp.client.login() won't return false,
+          // but throw exception, which then will be handled by caller
+          // (not dealt with here at all) .
+          ftp.client.disconnect();
+          if (Ftp.LOG.isWarnEnabled()) {
+            Ftp.LOG.warn("ftp.client.login() failed: " + addr);
+          }
+          this.code = 401; // http Unauthorized
+          return;
+        }
+
+        // insist on binary file type
+        if (!ftp.client.setFileType(FTP.BINARY_FILE_TYPE)) {
+          ftp.client.logout();
+          ftp.client.disconnect();
+          if (Ftp.LOG.isWarnEnabled()) {
+            Ftp.LOG.warn("ftp.client.setFileType() failed: " + addr);
+          }
+          this.code = 500; // http Internal Server Error
+          return;
+        }
+
+        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+          Ftp.LOG.info("set parser for " + addr);
+        }
+
+        // SYST is valid only after login
+        try {
+          ftp.parser = null;
+          String parserKey = ftp.client.getSystemName();
+          // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8
+          if (parserKey.startsWith("UNKNOWN Type: L8"))
+            parserKey = "UNIX Type: L8";
+          ftp.parser = (new DefaultFTPFileEntryParserFactory())
+              .createFileEntryParser(parserKey);
+        } catch (FtpExceptionBadSystResponse e) {
+          if (Ftp.LOG.isWarnEnabled()) {
+            Ftp.LOG
+                .warn("ftp.client.getSystemName() failed: " + addr + " " + e);
+          }
+          ftp.parser = null;
+        } catch (ParserInitializationException e) {
+          // ParserInitializationException is RuntimeException defined in
+          // org.apache.commons.net.ftp.parser.ParserInitializationException
+          if (Ftp.LOG.isWarnEnabled()) {
+            Ftp.LOG.warn("createFileEntryParser() failed. " + addr + " " + e);
+          }
+          ftp.parser = null;
+        } finally {
+          // NOTE: the return below sits inside finally, so any other exception
+          // still propagating out of the try block is discarded and the
+          // failure is reported as code 500 instead
+          if (ftp.parser == null) {
+            // do not log as severe, otherwise
+            // FetcherThread/RequestScheduler will abort
+            if (Ftp.LOG.isWarnEnabled()) {
+              Ftp.LOG.warn("ftp.parser is null: " + addr);
+            }
+            ftp.client.logout();
+            ftp.client.disconnect();
+            this.code = 500; // http Internal Server Error
+            return;
+          }
+        }
+
+      } else {
+        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+          Ftp.LOG.info("use existing connection");
+        }
+      }
+
+      this.content = null;
+
+      if (path.endsWith("/")) {
+        getDirAsHttpResponse(path, datum.getModifiedTime());
+      } else {
+        getFileAsHttpResponse(path, datum.getModifiedTime());
+      }
+
+      // reset next renewalTime, take the lesser
+      if (ftp.client != null && ftp.keepConnection) {
+        ftp.renewalTime = System.currentTimeMillis()
+            + ((ftp.timeout < ftp.serverTimeout) ? ftp.timeout
+                : ftp.serverTimeout);
+        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+          Ftp.LOG.info("reset renewalTime to "
+              + HttpDateFormat.toString(ftp.renewalTime));
+        }
+      }
+
+      // getDirAsHttpResponse() or getFileAsHttpResponse() above
+      // may have deleted ftp.client
+      if (ftp.client != null && !ftp.keepConnection) {
+        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+          Ftp.LOG.info("disconnect from " + addr);
+        }
+        ftp.client.logout();
+        ftp.client.disconnect();
+      }
+
+    } catch (Exception e) {
+      if (Ftp.LOG.isWarnEnabled()) {
+        Ftp.LOG.warn("Error: ", e);
+      }
+      // for any un-foreseen exception (run time exception or not),
+      // do ultimate clean and leave ftp.client for garbage collection
+      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+        Ftp.LOG.info("delete client due to exception");
+      }
+      ftp.client = null;
+      // or do explicit garbage collection?
+      // System.gc();
+      // can we be less dramatic, using the following instead?
+      // probably unnecessary for our practical purpose here
+      // try {
+      // ftp.client.logout();
+      // ftp.client.disconnect();
+      // }
+      throw new FtpException(e);
+      // throw e;
+    }
+
+  }
+
+  /**
+   * Retrieves a single file and records it as an http-like response.
+   * <p>
+   * The listing entry of the file is fetched first to fill the Content-Length
+   * and Last-Modified headers; the body is only downloaded when the entry is
+   * newer than <code>lastModified</code>. Resulting codes: 200 (body stored;
+   * when the server closed the control channel during the transfer, the bytes
+   * received so far are stored), 304 (not modified), 300 with a Location
+   * header (path is a directory), 404 (nothing found under the path) and 400
+   * (listing could not be completed, or unrecognized server reply).
+   * 
+   * @param path
+   *          path of the file on the server
+   * @param lastModified
+   *          time in milliseconds; entries not newer than this yield 304
+   * @throws IOException
+   *           if the client fails with an I/O error
+   */
+  private void getFileAsHttpResponse(String path, long lastModified)
+      throws IOException {
+
+    ByteArrayOutputStream os = null;
+    List<FTPFile> list = null;
+
+    try {
+      // first get its possible attributes
+      list = new LinkedList<FTPFile>();
+      ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
+
+      if (list.isEmpty()) {
+        // nothing was listed for this path, so there is no entry to read the
+        // attributes from; report it as missing rather than failing on
+        // list.get(0) with an IndexOutOfBoundsException
+        if (Ftp.LOG.isWarnEnabled()) {
+          Ftp.LOG.warn("empty listing returned for file: " + path);
+        }
+        this.code = 404; // http Not Found
+        return;
+      }
+
+      FTPFile ftpFile = list.get(0);
+      setFileHeaders(ftpFile);
+      // don't retrieve the file if not changed.
+      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
+        code = 304;
+        return;
+      }
+      os = new ByteArrayOutputStream(ftp.getBufferSize());
+      ftp.client.retrieveFile(path, os, ftp.maxContentLength);
+
+      this.content = os.toByteArray();
+
+      // // approximate bytes sent and read
+      // if (this.httpAccounting != null) {
+      // this.httpAccounting.incrementBytesSent(path.length());
+      // this.httpAccounting.incrementBytesRead(this.content.length);
+      // }
+
+      this.code = 200; // http OK
+
+    } catch (FtpExceptionControlClosedByForcedDataClose e) {
+
+      // control connection is off, clean up
+      // ftp.client.disconnect();
+      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+        Ftp.LOG.info("delete client because server cut off control channel: "
+            + e);
+      }
+      ftp.client = null;
+
+      // in case this FtpExceptionControlClosedByForcedDataClose is
+      // thrown by retrieveList() (not retrieveFile()) above,
+      if (os == null) { // indicating throwing by retrieveList()
+        // throw new FtpException("fail to get attributes: "+path);
+        if (Ftp.LOG.isWarnEnabled()) {
+          Ftp.LOG
+              .warn("Please try larger maxContentLength for ftp.client.retrieveList(). "
+                  + e);
+        }
+        // in a way, this is our request fault
+        this.code = 400; // http Bad request
+        return;
+      }
+
+      // os is only created after the first listing entry was read, so the
+      // list is known to be non-empty here
+      FTPFile ftpFile = list.get(0);
+      setFileHeaders(ftpFile);
+      // this.headers.put("content-type", "text/html");
+      this.content = os.toByteArray();
+      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
+        code = 304;
+        return;
+      }
+
+      // // approximate bytes sent and read
+      // if (this.httpAccounting != null) {
+      // this.httpAccounting.incrementBytesSent(path.length());
+      // this.httpAccounting.incrementBytesRead(this.content.length);
+      // }
+
+      this.code = 200; // http OK
+
+    } catch (FtpExceptionCanNotHaveDataConnection e) {
+
+      if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
+        // it is not a file, but dir, so redirect as a dir
+        this.headers.set(Response.LOCATION, path + "/");
+        this.code = 300; // http redirect
+        // fixme, should we do ftp.client.cwd("/"), back to top dir?
+      } else {
+        // it is not a dir either
+        this.code = 404; // http Not Found
+      }
+
+    } catch (FtpExceptionUnknownForcedDataClose e) {
+      // Please note control channel is still live.
+      // in a way, this is our request fault
+      if (Ftp.LOG.isWarnEnabled()) {
+        Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
+            + "If this is acceptable, please modify Client.java accordingly. "
+            + e);
+      }
+      this.code = 400; // http Bad Request
+    }
+
+  }
+
+  // copy size and modification time of a listing entry into the headers
+  private void setFileHeaders(FTPFile ftpFile) {
+    this.headers.set(Response.CONTENT_LENGTH,
+        Long.toString(ftpFile.getSize()));
+    this.headers.set(Response.LAST_MODIFIED,
+        HttpDateFormat.toString(ftpFile.getTimestamp()));
+  }
+
+  /**
+   * Retrieves a directory listing and records it as an http-like response
+   * whose body is an html index page.
+   * <p>
+   * Resulting codes: 200 (listing stored; when the server closed the control
+   * channel during the transfer, the entries received so far are used), 404
+   * (cannot change into the directory), 400 (unrecognized server reply) and
+   * 500 (no data connection).
+   * 
+   * @param path
+   *          path of the directory on the server
+   * @param lastModified
+   *          not used for directories
+   * @throws IOException
+   *           if the client fails with an I/O error
+   */
+  private void getDirAsHttpResponse(String path, long lastModified)
+      throws IOException {
+    List<FTPFile> list = new LinkedList<FTPFile>();
+
+    try {
+
+      // change to that dir first
+      if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
+        this.code = 404; // http Not Found
+        return;
+      }
+
+      // fixme, should we do ftp.client.cwd("/"), back to top dir?
+
+      ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser);
+      setDirListingAsContent(list, path);
+
+    } catch (FtpExceptionControlClosedByForcedDataClose e) {
+
+      // control connection is off, clean up
+      // ftp.client.disconnect();
+      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
+        Ftp.LOG.info("delete client because server cut off control channel: "
+            + e);
+      }
+      ftp.client = null;
+
+      // still answer with whatever entries were collected
+      setDirListingAsContent(list, path);
+
+    } catch (FtpExceptionUnknownForcedDataClose e) {
+      // Please note control channel is still live.
+      // in a way, this is our request fault
+      if (Ftp.LOG.isWarnEnabled()) {
+        Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
+            + "If this is acceptable, please modify Client.java accordingly. "
+            + e);
+      }
+      this.code = 400; // http Bad Request
+    } catch (FtpExceptionCanNotHaveDataConnection e) {
+      if (Ftp.LOG.isWarnEnabled()) {
+        Ftp.LOG.warn("" + e);
+      }
+      this.code = 500; // http Internal Server Error
+    }
+
+  }
+
+  // store the html index of the given entries as a 200 text/html response;
+  // the "../" link is left out for the root directory
+  private void setDirListingAsContent(List<FTPFile> list, String path) {
+    this.content = list2html(list, path, !"/".equals(path));
+    this.headers.set(Response.CONTENT_LENGTH,
+        Integer.toString(this.content.length));
+    this.headers.set(Response.CONTENT_TYPE, "text/html");
+    // this.headers.put("Last-Modified", null);
+
+    // // approximate bytes sent and read
+    // if (this.httpAccounting != null) {
+    // this.httpAccounting.incrementBytesSent(path.length());
+    // this.httpAccounting.incrementBytesRead(this.content.length);
+    // }
+
+    this.code = 200; // http OK
+  }
+
+  /**
+   * Generates an html index page from an ftp directory listing.
+   * <p>
+   * Directories (except "." and "..") and plain files become one link line
+   * each; symbolic links and unknown entries are left out. The path and the
+   * entry names come from the url and the remote server, so they are
+   * html-escaped before being written into the page.
+   * 
+   * @param list
+   *          the parsed directory entries
+   * @param path
+   *          the directory path shown in the title and heading
+   * @param includeDotDot
+   *          whether to emit a leading "../" link
+   * @return the page as bytes in the platform default charset
+   */
+  private byte[] list2html(List<FTPFile> list, String path,
+      boolean includeDotDot) {
+
+    String safePath = escapeHtml(path);
+
+    // StringBuffer x = new
+    // StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>");
+    StringBuilder x = new StringBuilder("<html><head>");
+    x.append("<title>Index of ").append(safePath).append("</title></head>\n");
+    x.append("<body><h1>Index of ").append(safePath).append("</h1><pre>\n");
+
+    if (includeDotDot) {
+      x.append("<a href='../'>../</a>\t-\t-\t-\n");
+    }
+
+    for (FTPFile f : list) {
+      String name = f.getName();
+      String time = HttpDateFormat.toString(f.getTimestamp());
+      if (f.isDirectory()) {
+        // some ftp server LIST "." and "..", we skip them here
+        if (name.equals(".") || name.equals(".."))
+          continue;
+        String safeName = escapeHtml(name);
+        x.append("<a href='").append(safeName).append("/'>").append(safeName)
+            .append("/</a>\t");
+        x.append(time).append("\t-\n");
+      } else if (f.isFile()) {
+        String safeName = escapeHtml(name);
+        x.append("<a href='").append(safeName).append("'>").append(safeName)
+            .append("</a>\t");
+        x.append(time).append("\t").append(f.getSize()).append("\n");
+      } else {
+        // ignore isSymbolicLink()
+        // ignore isUnknown()
+      }
+    }
+
+    x.append("</pre></body></html>\n");
+
+    return x.toString().getBytes();
+  }
+
+  // replace the characters that are special in html text and in quoted
+  // attribute values by character references; null is rendered as "null",
+  // matching plain string concatenation
+  private static String escapeHtml(String s) {
+    String text = String.valueOf(s);
+    StringBuilder out = new StringBuilder(text.length() + 16);
+    for (int i = 0; i < text.length(); i++) {
+      char c = text.charAt(i);
+      switch (c) {
+      case '&':
+        out.append("&amp;");
+        break;
+      case '<':
+        out.append("&lt;");
+        break;
+      case '>':
+        out.append("&gt;");
+        break;
+      case '"':
+        out.append("&quot;");
+        break;
+      case '\'':
+        out.append("&#39;");
+        break;
+      default:
+        out.append(c);
+      }
+    }
+    return out.toString();
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
new file mode 100644
index 0000000..3764864
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
@@ -0,0 +1,121 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.io.Text;
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.protocol.Protocol;
+import org.apache.nutch.protocol.ProtocolOutput;
+import org.apache.nutch.protocol.ProtocolStatus;
+import org.apache.nutch.protocol.RobotRulesParser;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import crawlercommons.robots.BaseRobotRules;
+import crawlercommons.robots.SimpleRobotRules;
+
+/**
+ * This class is used for parsing robots for urls belonging to FTP protocol. It
+ * extends the generic {@link RobotRulesParser} class and contains Ftp protocol
+ * specific implementation for obtaining the robots file.
+ */
+public class FtpRobotRulesParser extends RobotRulesParser {
+
+  private static final String CONTENT_TYPE = "text/plain";
+  public static final Logger LOG = LoggerFactory
+      .getLogger(FtpRobotRulesParser.class);
+
+  FtpRobotRulesParser() {
+  }
+
+  public FtpRobotRulesParser(Configuration conf) {
+    super(conf);
+  }
+
+  /**
+   * The hosts for which the caching of robots rules is yet to be done, it sends
+   * a Ftp request to the host corresponding to the {@link URL} passed, gets
+   * robots file, parses the rules and caches the rules object to avoid re-work
+   * in future.
+   * 
+   * @param ftp
+   *          The {@link Protocol} object
+   * @param url
+   *          URL
+   * 
+   * @return robotRules A {@link BaseRobotRules} object for the rules
+   */
+  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {
+
+    String protocol = url.getProtocol().toLowerCase(); // normalize to lower
+                                                       // case
+    String host = url.getHost().toLowerCase(); // normalize to lower case
+
+    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
+      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
+    }
+
+    BaseRobotRules robotRules = CACHE.get(protocol + ":" + host);
+
+    if (robotRules != null) {
+      return robotRules; // cached rule
+    } else if (LOG.isTraceEnabled()) {
+      LOG.trace("cache miss " + url);
+    }
+
+    boolean cacheRule = true;
+
+    if (isWhiteListed(url)) {
+      // check in advance whether a host is whitelisted
+      // (we do not need to fetch robots.txt)
+      robotRules = EMPTY_RULES;
+      LOG.info("Whitelisted host found for: {}", url);
+      LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
+
+    } else {
+      try {
+        Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
+        ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl,
+            new CrawlDatum());
+        ProtocolStatus status = output.getStatus();
+
+        if (status.getCode() == ProtocolStatus.SUCCESS) {
+          robotRules = parseRules(url.toString(), output.getContent()
+              .getContent(), CONTENT_TYPE, agentNames);
+        } else {
+          robotRules = EMPTY_RULES; // use default rules
+        }
+      } catch (Throwable t) {
+        if (LOG.isInfoEnabled()) {
+          LOG.info("Couldn't get robots.txt for " + url + ": " + t.toString());
+        }
+        cacheRule = false; // try again later to fetch robots.txt
+        robotRules = EMPTY_RULES;
+      }
+
+    }
+
+    if (cacheRule)
+      CACHE.put(protocol + ":" + host, robotRules); // cache rules for host
+
+    return robotRules;
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
new file mode 100644
index 0000000..c68eac8
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
@@ -0,0 +1,71 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.nutch.protocol.ftp;
+
+import java.io.BufferedReader;
+import java.io.StringReader;
+import java.io.IOException;
+
+import org.slf4j.Logger;
+
+import org.apache.commons.net.ProtocolCommandEvent;
+import org.apache.commons.net.ProtocolCommandListener;
+
+/**
+ * Support class that writes all ftp command/reply traffic to a logger, one
+ * log entry per line of the message and only when the info level is enabled.
+ * 
+ * @author John Xing
+ */
+public class PrintCommandListener implements ProtocolCommandListener {
+  private final Logger log;
+
+  /**
+   * @param logger
+   *          logger that receives the traffic at info level
+   */
+  public PrintCommandListener(Logger logger) {
+    this.log = logger;
+  }
+
+  /** Logs a command that was sent to the server. */
+  @Override
+  public void protocolCommandSent(ProtocolCommandEvent event) {
+    logQuietly(event, "PrintCommandListener.protocolCommandSent(): ");
+  }
+
+  /** Logs a reply that was received from the server. */
+  @Override
+  public void protocolReplyReceived(ProtocolCommandEvent event) {
+    logQuietly(event, "PrintCommandListener.protocolReplyReceived(): ");
+  }
+
+  /**
+   * Logs the event and reports an I/O failure at info level, prefixed with
+   * the given origin, instead of propagating it.
+   */
+  private void logQuietly(ProtocolCommandEvent event, String origin) {
+    try {
+      logLines(event);
+    } catch (IOException e) {
+      if (log.isInfoEnabled()) {
+        log.info(origin + e);
+      }
+    }
+  }
+
+  /** Writes every line of the event message as its own "ftp> " entry. */
+  private void logLines(ProtocolCommandEvent event) throws IOException {
+    if (log.isInfoEnabled()) {
+      BufferedReader reader = new BufferedReader(new StringReader(
+          event.getMessage()));
+      for (String line = reader.readLine(); line != null; line = reader
+          .readLine()) {
+        log.info("ftp> " + line);
+      }
+    }
+  }
+}

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html
new file mode 100644
index 0000000..d936930
--- /dev/null
+++ b/nutch-plugins/protocol-ftp/src/main/java/org/apache/nutch/protocol/ftp/package.html
@@ -0,0 +1,5 @@
+<html>
+<body>
+<p>Protocol plugin which supports retrieving documents via the ftp protocol.</p><p></p>
+</body>
+</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/build.xml b/nutch-plugins/protocol-htmlunit/build.xml
new file mode 100644
index 0000000..899214c
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/build.xml
@@ -0,0 +1,37 @@
+<?xml version="1.0"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<project name="protocol-htmlunit" default="jar-core">
+
+  <import file="../build-plugin.xml"/>
+
+  <!-- Build compilation dependencies -->
+  <target name="deps-jar">
+    <ant target="jar" inheritall="false" dir="../lib-http"/>
+    <ant target="jar" inheritall="false" dir="../lib-htmlunit"/>
+  </target>
+
+  <!-- Add compilation dependencies to classpath -->
+  <path id="plugin.deps">
+    <fileset dir="${nutch.root}/build">
+      <include name="**/lib-http/*.jar" />
+      <include name="**/lib-htmlunit/*.jar" />
+    </fileset>
+    <pathelement location="${build.dir}/test/conf"/>
+  </path>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/ivy.xml b/nutch-plugins/protocol-htmlunit/ivy.xml
new file mode 100644
index 0000000..8aa78d2
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/ivy.xml
@@ -0,0 +1,38 @@
+<?xml version="1.0" ?>
+
+<!--
+   Licensed to the Apache Software Foundation (ASF) under one or more
+   contributor license agreements.  See the NOTICE file distributed with
+   this work for additional information regarding copyright ownership.
+   The ASF licenses this file to You under the Apache License, Version 2.0
+   (the "License"); you may not use this file except in compliance with
+   the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+-->
+
+<ivy-module version="1.0">
+  <info organisation="org.apache.nutch" module="${ant.project.name}">
+    <license name="Apache 2.0"/>
+    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
+    <description>
+        Apache Nutch
+    </description>
+  </info>
+
+  <configurations>
+    <include file="../../..//ivy/ivy-configurations.xml"/>
+  </configurations>
+
+  <publications>
+    <!--get the artifact from our module name-->
+    <artifact conf="master"/>
+  </publications>
+
+</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/plugin.xml b/nutch-plugins/protocol-htmlunit/plugin.xml
new file mode 100644
index 0000000..36bcb80
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/plugin.xml
@@ -0,0 +1,51 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ Licensed to the Apache Software Foundation (ASF) under one or more
+ contributor license agreements.  See the NOTICE file distributed with
+ this work for additional information regarding copyright ownership.
+ The ASF licenses this file to You under the Apache License, Version 2.0
+ (the "License"); you may not use this file except in compliance with
+ the License.  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+-->
+<plugin
+   id="protocol-htmlunit"
+   name="HtmlUnit Protocol Plug-in"
+   version="1.0.0"
+   provider-name="nutch.apache.org">
+
+  <runtime>
+    <library name="protocol-htmlunit.jar">
+      <export name="*"/>
+    </library>
+  </runtime>
+
+  <requires>
+    <import plugin="nutch-extensionpoints"/>
+    <import plugin="lib-http"/>
+    <import plugin="lib-htmlunit"/>
+  </requires>
+
+  <extension id="org.apache.nutch.protocol.http"
+             name="HttpProtocol"
+             point="org.apache.nutch.protocol.Protocol">
+
+    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+                    class="org.apache.nutch.protocol.htmlunit.Http">
+      <parameter name="protocolName" value="http"/>
+    </implementation>
+      
+    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
+                    class="org.apache.nutch.protocol.htmlunit.Http">
+      <parameter name="protocolName" value="https"/>
+    </implementation>
+
+   </extension>
+</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/pom.xml
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/pom.xml b/nutch-plugins/protocol-htmlunit/pom.xml
new file mode 100644
index 0000000..e5a57d7
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/pom.xml
@@ -0,0 +1,51 @@
+<!--
+  ~ Licensed to the Apache Software Foundation (ASF) under one or more
+  ~ contributor license agreements.  See the NOTICE file distributed with
+  ~ this work for additional information regarding copyright ownership.
+  ~ The ASF licenses this file to You under the Apache License, Version 2.0
+  ~ (the "License"); you may not use this file except in compliance with
+  ~ the License.  You may obtain a copy of the License at
+  ~
+  ~     http://www.apache.org/licenses/LICENSE-2.0
+  ~
+  ~ Unless required by applicable law or agreed to in writing, software
+  ~ distributed under the License is distributed on an "AS IS" BASIS,
+  ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  ~ See the License for the specific language governing permissions and
+  ~ limitations under the License.
+  -->
+
+<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>org.apache.nutch</groupId>
+        <artifactId>nutch-plugins</artifactId>
+        <version>1.13-SNAPSHOT</version>
+        <relativePath>../pom.xml</relativePath>
+    </parent>
+    <artifactId>protocol-htmlunit</artifactId>
+    <packaging>jar</packaging>
+
+    <name>protocol-htmlunit</name>
+    <url>http://nutch.apache.org</url>
+
+    <properties>
+        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
+    </properties>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-htmlunit</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+        <dependency>
+            <groupId>org.apache.nutch</groupId>
+            <artifactId>lib-http</artifactId>
+            <version>${project.parent.version}</version>
+        </dependency>
+    </dependencies>
+
+</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/0bf453e5/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java
----------------------------------------------------------------------
diff --git a/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java
new file mode 100644
index 0000000..c40ed69
--- /dev/null
+++ b/nutch-plugins/protocol-htmlunit/src/main/java/org/apache/nutch/protocol/htmlunit/Http.java
@@ -0,0 +1,63 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.nutch.protocol.htmlunit;
+
+import java.io.IOException;
+import java.net.URL;
+
+import org.apache.hadoop.conf.Configuration;
+
+import org.apache.nutch.crawl.CrawlDatum;
+import org.apache.nutch.net.protocols.Response;
+import org.apache.nutch.protocol.ProtocolException;
+import org.apache.nutch.protocol.http.api.HttpBase;
+import org.apache.nutch.util.NutchConfiguration;
+
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * Protocol implementation of the htmlunit plugin. It relies on
+ * {@link HttpBase} for everything except the creation of the response, which
+ * is delegated to this package's <code>HttpResponse</code>.
+ */
+public class Http extends HttpBase {
+
+  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
+
+  /**
+   * Creates the protocol and hands this class' logger to the base class.
+   */
+  public Http() {
+    super(LOG);
+  }
+
+  /**
+   * Sets the {@link org.apache.hadoop.conf.Configuration} object by passing
+   * it on to the base class.
+   * 
+   * @param conf
+   *          configuration to use
+   */
+  @Override
+  public void setConf(Configuration conf) {
+    super.setConf(conf);
+  }
+
+  /**
+   * Command line entry point: configures an instance with the Nutch
+   * configuration and runs the main routine of the base class with it.
+   * 
+   * @param args
+   *          command line arguments, passed through unchanged
+   */
+  public static void main(String[] args) throws Exception {
+    Http protocol = new Http();
+    protocol.setConf(NutchConfiguration.create());
+    HttpBase.main(protocol, args);
+  }
+
+  /**
+   * Creates the response for the given URL.
+   * 
+   * @param url
+   *          URL to fetch
+   * @param datum
+   *          crawl datum belonging to the URL
+   * @param redirect
+   *          not used by this implementation
+   */
+  @Override
+  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
+      throws ProtocolException, IOException {
+    return new HttpResponse(this, url, datum);
+  }
+}