You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@nutch.apache.org by th...@apache.org on 2016/07/05 22:48:49 UTC

[05/69] [abbrv] [partial] nutch git commit: Re arranged the source code as per maven conventions for build

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
deleted file mode 100644
index f7c7c6d..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpResponse.java
+++ /dev/null
@@ -1,521 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-import org.apache.commons.net.ftp.FTP;
-import org.apache.commons.net.ftp.FTPFile;
-import org.apache.commons.net.ftp.FTPReply;
-import org.apache.commons.net.ftp.parser.DefaultFTPFileEntryParserFactory;
-import org.apache.commons.net.ftp.parser.ParserInitializationException;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.protocol.Content;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.hadoop.conf.Configuration;
-
-import java.net.InetAddress;
-import java.net.URL;
-import java.util.List;
-import java.util.LinkedList;
-import java.io.ByteArrayOutputStream;
-import java.io.IOException;
-
-/**
- * FtpResponse.java mimics ftp replies as http response. It tries its best to
- * follow http's way for headers, response codes as well as exceptions.
- * 
- * Comments: In this class, all FtpException*.java thrown by Client.java and
- * some important commons-net exceptions passed by Client.java must have been
- * properly dealt with. They'd better not be leaked to the caller of this class.
- */
-public class FtpResponse {
-
-  private String orig;
-  private String base;
-  private byte[] content;
-  private static final byte[] EMPTY_CONTENT = new byte[0];
-  private int code;
-  private Metadata headers = new Metadata();
-
-  private final Ftp ftp;
-  private Configuration conf;
-
-  /** Returns the response code. */
-  public int getCode() {
-    return code;
-  }
-
-  /** Returns the value of a named header. */
-  public String getHeader(String name) {
-    return headers.get(name);
-  }
-
-  public byte[] getContent() {
-    return content;
-  }
-
-  public Content toContent() {
-    return new Content(orig, base, (content != null ? content : EMPTY_CONTENT),
-        getHeader(Response.CONTENT_TYPE), headers, this.conf);
-  }
-
-  public FtpResponse(URL url, CrawlDatum datum, Ftp ftp, Configuration conf)
-      throws FtpException, IOException {
-
-    this.orig = url.toString();
-    this.base = url.toString();
-    this.ftp = ftp;
-    this.conf = conf;
-
-    if (!"ftp".equals(url.getProtocol()))
-      throw new FtpException("Not a ftp url:" + url);
-
-    if (url.getPath() != url.getFile()) {
-      if (Ftp.LOG.isWarnEnabled()) {
-        Ftp.LOG.warn("url.getPath() != url.getFile(): " + url);
-      }
-    }
-
-    String path = "".equals(url.getPath()) ? "/" : url.getPath();
-
-    try {
-
-      if (ftp.followTalk) {
-        if (Ftp.LOG.isInfoEnabled()) {
-          Ftp.LOG.info("fetching " + url);
-        }
-      } else {
-        if (Ftp.LOG.isTraceEnabled()) {
-          Ftp.LOG.trace("fetching " + url);
-        }
-      }
-
-      InetAddress addr = InetAddress.getByName(url.getHost());
-      if (addr != null && conf.getBoolean("store.ip.address", false) == true) {
-        headers.add("_ip_", addr.getHostAddress());
-      }
-
-      // idled too long, remote server or ourselves may have timed out,
-      // should start anew.
-      if (ftp.client != null && ftp.keepConnection
-          && ftp.renewalTime < System.currentTimeMillis()) {
-        if (Ftp.LOG.isInfoEnabled()) {
-          Ftp.LOG.info("delete client because idled too long");
-        }
-        ftp.client = null;
-      }
-
-      // start anew if needed
-      if (ftp.client == null) {
-        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-          Ftp.LOG.info("start client");
-        }
-        // the real client
-        ftp.client = new Client();
-        // when to renew, take the lesser
-        // ftp.renewalTime = System.currentTimeMillis()
-        // + ((ftp.timeout<ftp.serverTimeout) ? ftp.timeout :
-        // ftp.serverTimeout);
-
-        // timeout for control connection
-        ftp.client.setDefaultTimeout(ftp.timeout);
-        // timeout for data connection
-        ftp.client.setDataTimeout(ftp.timeout);
-
-        // follow ftp talk?
-        if (ftp.followTalk)
-          ftp.client.addProtocolCommandListener(new PrintCommandListener(
-              Ftp.LOG));
-      }
-
-      // quit from previous site if at a different site now
-      if (ftp.client.isConnected()) {
-        InetAddress remoteAddress = ftp.client.getRemoteAddress();
-        if (!addr.equals(remoteAddress)) {
-          if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-            Ftp.LOG.info("disconnect from " + remoteAddress
-                + " before connect to " + addr);
-          }
-          // quit from current site
-          ftp.client.logout();
-          ftp.client.disconnect();
-        }
-      }
-
-      // connect to current site if needed
-      if (!ftp.client.isConnected()) {
-
-        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-          Ftp.LOG.info("connect to " + addr);
-        }
-
-        ftp.client.connect(addr);
-        if (!FTPReply.isPositiveCompletion(ftp.client.getReplyCode())) {
-          ftp.client.disconnect();
-          if (Ftp.LOG.isWarnEnabled()) {
-            Ftp.LOG.warn("ftp.client.connect() failed: " + addr + " "
-                + ftp.client.getReplyString());
-          }
-          this.code = 500; // http Internal Server Error
-          return;
-        }
-
-        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-          Ftp.LOG.info("log into " + addr);
-        }
-
-        if (!ftp.client.login(ftp.userName, ftp.passWord)) {
-          // login failed.
-          // please note that some server may return 421 immediately
-          // after USER anonymous, thus ftp.client.login() won't return false,
-          // but throw exception, which then will be handled by caller
-          // (not dealt with here at all) .
-          ftp.client.disconnect();
-          if (Ftp.LOG.isWarnEnabled()) {
-            Ftp.LOG.warn("ftp.client.login() failed: " + addr);
-          }
-          this.code = 401; // http Unauthorized
-          return;
-        }
-
-        // insist on binary file type
-        if (!ftp.client.setFileType(FTP.BINARY_FILE_TYPE)) {
-          ftp.client.logout();
-          ftp.client.disconnect();
-          if (Ftp.LOG.isWarnEnabled()) {
-            Ftp.LOG.warn("ftp.client.setFileType() failed: " + addr);
-          }
-          this.code = 500; // http Internal Server Error
-          return;
-        }
-
-        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-          Ftp.LOG.info("set parser for " + addr);
-        }
-
-        // SYST is valid only after login
-        try {
-          ftp.parser = null;
-          String parserKey = ftp.client.getSystemName();
-          // some server reports as UNKNOWN Type: L8, but in fact UNIX Type: L8
-          if (parserKey.startsWith("UNKNOWN Type: L8"))
-            parserKey = "UNIX Type: L8";
-          ftp.parser = (new DefaultFTPFileEntryParserFactory())
-              .createFileEntryParser(parserKey);
-        } catch (FtpExceptionBadSystResponse e) {
-          if (Ftp.LOG.isWarnEnabled()) {
-            Ftp.LOG
-                .warn("ftp.client.getSystemName() failed: " + addr + " " + e);
-          }
-          ftp.parser = null;
-        } catch (ParserInitializationException e) {
-          // ParserInitializationException is RuntimeException defined in
-          // org.apache.commons.net.ftp.parser.ParserInitializationException
-          if (Ftp.LOG.isWarnEnabled()) {
-            Ftp.LOG.warn("createFileEntryParser() failed. " + addr + " " + e);
-          }
-          ftp.parser = null;
-        } finally {
-          if (ftp.parser == null) {
-            // do not log as severe, otherwise
-            // FetcherThread/RequestScheduler will abort
-            if (Ftp.LOG.isWarnEnabled()) {
-              Ftp.LOG.warn("ftp.parser is null: " + addr);
-            }
-            ftp.client.logout();
-            ftp.client.disconnect();
-            this.code = 500; // http Internal Server Error
-            return;
-          }
-        }
-
-      } else {
-        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-          Ftp.LOG.info("use existing connection");
-        }
-      }
-
-      this.content = null;
-
-      if (path.endsWith("/")) {
-        getDirAsHttpResponse(path, datum.getModifiedTime());
-      } else {
-        getFileAsHttpResponse(path, datum.getModifiedTime());
-      }
-
-      // reset next renewalTime, take the lesser
-      if (ftp.client != null && ftp.keepConnection) {
-        ftp.renewalTime = System.currentTimeMillis()
-            + ((ftp.timeout < ftp.serverTimeout) ? ftp.timeout
-                : ftp.serverTimeout);
-        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-          Ftp.LOG.info("reset renewalTime to "
-              + HttpDateFormat.toString(ftp.renewalTime));
-        }
-      }
-
-      // getDirAsHttpResponse() or getFileAsHttpResponse() above
-      // may have deleted ftp.client
-      if (ftp.client != null && !ftp.keepConnection) {
-        if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-          Ftp.LOG.info("disconnect from " + addr);
-        }
-        ftp.client.logout();
-        ftp.client.disconnect();
-      }
-
-    } catch (Exception e) {
-      if (Ftp.LOG.isWarnEnabled()) {
-        Ftp.LOG.warn("Error: ", e);
-      }
-      // for any un-foreseen exception (run time exception or not),
-      // do ultimate clean and leave ftp.client for garbage collection
-      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-        Ftp.LOG.info("delete client due to exception");
-      }
-      ftp.client = null;
-      // or do explicit garbage collection?
-      // System.gc();
-      // can we be less dramatic, using the following instead?
-      // probably unnecessary for our practical purpose here
-      // try {
-      // ftp.client.logout();
-      // ftp.client.disconnect();
-      // }
-      throw new FtpException(e);
-      // throw e;
-    }
-
-  }
-
-  // get ftp file as http response
-  private void getFileAsHttpResponse(String path, long lastModified)
-      throws IOException {
-
-    ByteArrayOutputStream os = null;
-    List<FTPFile> list = null;
-
-    try {
-      // first get its possible attributes
-      list = new LinkedList<FTPFile>();
-      ftp.client.retrieveList(path, list, ftp.maxContentLength, ftp.parser);
-
-      FTPFile ftpFile = (FTPFile) list.get(0);
-      this.headers.set(Response.CONTENT_LENGTH,
-          new Long(ftpFile.getSize()).toString());
-      this.headers.set(Response.LAST_MODIFIED,
-          HttpDateFormat.toString(ftpFile.getTimestamp()));
-      // don't retrieve the file if not changed.
-      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
-        code = 304;
-        return;
-      }
-      os = new ByteArrayOutputStream(ftp.getBufferSize());
-      ftp.client.retrieveFile(path, os, ftp.maxContentLength);
-
-      this.content = os.toByteArray();
-
-      // // approximate bytes sent and read
-      // if (this.httpAccounting != null) {
-      // this.httpAccounting.incrementBytesSent(path.length());
-      // this.httpAccounting.incrementBytesRead(this.content.length);
-      // }
-
-      this.code = 200; // http OK
-
-    } catch (FtpExceptionControlClosedByForcedDataClose e) {
-
-      // control connection is off, clean up
-      // ftp.client.disconnect();
-      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-        Ftp.LOG.info("delete client because server cut off control channel: "
-            + e);
-      }
-      ftp.client = null;
-
-      // in case this FtpExceptionControlClosedByForcedDataClose is
-      // thrown by retrieveList() (not retrieveFile()) above,
-      if (os == null) { // indicating throwing by retrieveList()
-        // throw new FtpException("fail to get attibutes: "+path);
-        if (Ftp.LOG.isWarnEnabled()) {
-          Ftp.LOG
-              .warn("Please try larger maxContentLength for ftp.client.retrieveList(). "
-                  + e);
-        }
-        // in a way, this is our request fault
-        this.code = 400; // http Bad request
-        return;
-      }
-
-      FTPFile ftpFile = (FTPFile) list.get(0);
-      this.headers.set(Response.CONTENT_LENGTH,
-          new Long(ftpFile.getSize()).toString());
-      // this.headers.put("content-type", "text/html");
-      this.headers.set(Response.LAST_MODIFIED,
-          HttpDateFormat.toString(ftpFile.getTimestamp()));
-      this.content = os.toByteArray();
-      if (ftpFile.getTimestamp().getTimeInMillis() <= lastModified) {
-        code = 304;
-        return;
-      }
-
-      // // approximate bytes sent and read
-      // if (this.httpAccounting != null) {
-      // this.httpAccounting.incrementBytesSent(path.length());
-      // this.httpAccounting.incrementBytesRead(this.content.length);
-      // }
-
-      this.code = 200; // http OK
-
-    } catch (FtpExceptionCanNotHaveDataConnection e) {
-
-      if (FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
-        // it is not a file, but dir, so redirect as a dir
-        this.headers.set(Response.LOCATION, path + "/");
-        this.code = 300; // http redirect
-        // fixme, should we do ftp.client.cwd("/"), back to top dir?
-      } else {
-        // it is not a dir either
-        this.code = 404; // http Not Found
-      }
-
-    } catch (FtpExceptionUnknownForcedDataClose e) {
-      // Please note control channel is still live.
-      // in a way, this is our request fault
-      if (Ftp.LOG.isWarnEnabled()) {
-        Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
-            + "If this is acceptable, please modify Client.java accordingly. "
-            + e);
-      }
-      this.code = 400; // http Bad Request
-    }
-
-  }
-
-  // get ftp dir list as http response
-  private void getDirAsHttpResponse(String path, long lastModified)
-      throws IOException {
-    List<FTPFile> list = new LinkedList<FTPFile>();
-
-    try {
-
-      // change to that dir first
-      if (!FTPReply.isPositiveCompletion(ftp.client.cwd(path))) {
-        this.code = 404; // http Not Found
-        return;
-      }
-
-      // fixme, should we do ftp.client.cwd("/"), back to top dir?
-
-      ftp.client.retrieveList(null, list, ftp.maxContentLength, ftp.parser);
-      this.content = list2html(list, path, "/".equals(path) ? false : true);
-      this.headers.set(Response.CONTENT_LENGTH,
-          new Integer(this.content.length).toString());
-      this.headers.set(Response.CONTENT_TYPE, "text/html");
-      // this.headers.put("Last-Modified", null);
-
-      // // approximate bytes sent and read
-      // if (this.httpAccounting != null) {
-      // this.httpAccounting.incrementBytesSent(path.length());
-      // this.httpAccounting.incrementBytesRead(this.content.length);
-      // }
-
-      this.code = 200; // http OK
-
-    } catch (FtpExceptionControlClosedByForcedDataClose e) {
-
-      // control connection is off, clean up
-      // ftp.client.disconnect();
-      if ((ftp.followTalk) && (Ftp.LOG.isInfoEnabled())) {
-        Ftp.LOG.info("delete client because server cut off control channel: "
-            + e);
-      }
-      ftp.client = null;
-
-      this.content = list2html(list, path, "/".equals(path) ? false : true);
-      this.headers.set(Response.CONTENT_LENGTH,
-          new Integer(this.content.length).toString());
-      this.headers.set(Response.CONTENT_TYPE, "text/html");
-      // this.headers.put("Last-Modified", null);
-
-      // // approximate bytes sent and read
-      // if (this.httpAccounting != null) {
-      // this.httpAccounting.incrementBytesSent(path.length());
-      // this.httpAccounting.incrementBytesRead(this.content.length);
-      // }
-
-      this.code = 200; // http OK
-
-    } catch (FtpExceptionUnknownForcedDataClose e) {
-      // Please note control channel is still live.
-      // in a way, this is our request fault
-      if (Ftp.LOG.isWarnEnabled()) {
-        Ftp.LOG.warn("Unrecognized reply after forced close of data channel. "
-            + "If this is acceptable, please modify Client.java accordingly. "
-            + e);
-      }
-      this.code = 400; // http Bad Request
-    } catch (FtpExceptionCanNotHaveDataConnection e) {
-      if (Ftp.LOG.isWarnEnabled()) {
-        Ftp.LOG.warn("" + e);
-      }
-      this.code = 500; // http Iternal Server Error
-    }
-
-  }
-
-  // generate html page from ftp dir list
-  private byte[] list2html(List<FTPFile> list, String path,
-      boolean includeDotDot) {
-
-    // StringBuffer x = new
-    // StringBuffer("<!doctype html public \"-//ietf//dtd html//en\"><html><head>");
-    StringBuffer x = new StringBuffer("<html><head>");
-    x.append("<title>Index of " + path + "</title></head>\n");
-    x.append("<body><h1>Index of " + path + "</h1><pre>\n");
-
-    if (includeDotDot) {
-      x.append("<a href='../'>../</a>\t-\t-\t-\n");
-    }
-
-    for (int i = 0; i < list.size(); i++) {
-      FTPFile f = (FTPFile) list.get(i);
-      String name = f.getName();
-      String time = HttpDateFormat.toString(f.getTimestamp());
-      if (f.isDirectory()) {
-        // some ftp server LIST "." and "..", we skip them here
-        if (name.equals(".") || name.equals(".."))
-          continue;
-        x.append("<a href='" + name + "/" + "'>" + name + "/</a>\t");
-        x.append(time + "\t-\n");
-      } else if (f.isFile()) {
-        x.append("<a href='" + name + "'>" + name + "</a>\t");
-        x.append(time + "\t" + f.getSize() + "\n");
-      } else {
-        // ignore isSymbolicLink()
-        // ignore isUnknown()
-      }
-    }
-
-    x.append("</pre></body></html>\n");
-
-    return new String(x).getBytes();
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
deleted file mode 100644
index 3764864..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/FtpRobotRulesParser.java
+++ /dev/null
@@ -1,121 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-import java.net.URL;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.hadoop.io.Text;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.protocol.Protocol;
-import org.apache.nutch.protocol.ProtocolOutput;
-import org.apache.nutch.protocol.ProtocolStatus;
-import org.apache.nutch.protocol.RobotRulesParser;
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-import crawlercommons.robots.BaseRobotRules;
-import crawlercommons.robots.SimpleRobotRules;
-
-/**
- * This class is used for parsing robots for urls belonging to FTP protocol. It
- * extends the generic {@link RobotRulesParser} class and contains Ftp protocol
- * specific implementation for obtaining the robots file.
- */
-public class FtpRobotRulesParser extends RobotRulesParser {
-
-  /** Content type passed to the rules parser for a fetched robots.txt. */
-  private static final String CONTENT_TYPE = "text/plain";
-  public static final Logger LOG = LoggerFactory
-      .getLogger(FtpRobotRulesParser.class);
-
-  FtpRobotRulesParser() {
-  }
-
-  public FtpRobotRulesParser(Configuration conf) {
-    super(conf);
-  }
-
-  /**
-   * The hosts for which the caching of robots rules is yet to be done, it sends
-   * a Ftp request to the host corresponding to the {@link URL} passed, gets
-   * robots file, parses the rules and caches the rules object to avoid re-work
-   * in future. Rules are cached per lower-cased "protocol:host"; a failed
-   * fetch yields empty rules that are not cached, so it is retried later.
-   * 
-   * @param ftp
-   *          The {@link Protocol} object; it is cast to {@link Ftp}
-   * @param url
-   *          URL
-   * 
-   * @return robotRules A {@link BaseRobotRules} object for the rules
-   */
-  public BaseRobotRules getRobotRulesSet(Protocol ftp, URL url) {
-
-    // normalize to lower case; Locale.ROOT keeps the cache key independent
-    // of the default locale of the JVM
-    String protocol = url.getProtocol().toLowerCase(java.util.Locale.ROOT);
-    String host = url.getHost().toLowerCase(java.util.Locale.ROOT);
-    String cacheKey = protocol + ":" + host;
-
-    if (LOG.isTraceEnabled() && isWhiteListed(url)) {
-      LOG.trace("Ignoring robots.txt (host is whitelisted) for URL: {}", url);
-    }
-
-    BaseRobotRules robotRules = CACHE.get(cacheKey);
-
-    if (robotRules != null) {
-      return robotRules; // cached rule
-    }
-    LOG.trace("cache miss {}", url);
-
-    boolean cacheRule = true;
-
-    if (isWhiteListed(url)) {
-      // check in advance whether a host is whitelisted
-      // (we do not need to fetch robots.txt)
-      robotRules = EMPTY_RULES;
-      LOG.info("Whitelisted host found for: {}", url);
-      LOG.info("Ignoring robots.txt for all URLs from whitelisted host: {}", host);
-
-    } else {
-      try {
-        Text robotsUrl = new Text(new URL(url, "/robots.txt").toString());
-        ProtocolOutput output = ((Ftp) ftp).getProtocolOutput(robotsUrl,
-            new CrawlDatum());
-        ProtocolStatus status = output.getStatus();
-
-        if (status.getCode() == ProtocolStatus.SUCCESS) {
-          robotRules = parseRules(url.toString(), output.getContent()
-              .getContent(), CONTENT_TYPE, agentNames);
-        } else {
-          robotRules = EMPTY_RULES; // use default rules
-        }
-      } catch (Throwable t) {
-        // best effort: any failure to fetch or parse falls back to empty rules
-        LOG.info("Couldn't get robots.txt for {}: {}", url, t.toString());
-        cacheRule = false; // try again later to fetch robots.txt
-        robotRules = EMPTY_RULES;
-      }
-
-    }
-
-    if (cacheRule)
-      CACHE.put(cacheKey, robotRules); // cache rules for host
-
-    return robotRules;
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
deleted file mode 100644
index c68eac8..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/PrintCommandListener.java
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.nutch.protocol.ftp;
-
-import java.io.BufferedReader;
-import java.io.StringReader;
-import java.io.IOException;
-
-import org.slf4j.Logger;
-
-import org.apache.commons.net.ProtocolCommandEvent;
-import org.apache.commons.net.ProtocolCommandListener;
-
-/***
- * This is a support class for logging all ftp command/reply traffic.
- * 
- * @author John Xing
- ***/
-public class PrintCommandListener implements ProtocolCommandListener {
-  /** Destination of the logged ftp talk. */
-  private Logger talkLog;
-
-  /**
-   * @param logger
-   *          logger receiving every command and reply line at info level
-   */
-  public PrintCommandListener(Logger logger) {
-    this.talkLog = logger;
-  }
-
-  /** Logs a command sent to the server. */
-  public void protocolCommandSent(ProtocolCommandEvent event) {
-    try {
-      logLines(event);
-    } catch (IOException ioe) {
-      if (talkLog.isInfoEnabled()) {
-        talkLog.info("PrintCommandListener.protocolCommandSent(): " + ioe);
-      }
-    }
-  }
-
-  /** Logs a reply received from the server. */
-  public void protocolReplyReceived(ProtocolCommandEvent event) {
-    try {
-      logLines(event);
-    } catch (IOException ioe) {
-      if (talkLog.isInfoEnabled()) {
-        talkLog.info("PrintCommandListener.protocolReplyReceived(): " + ioe);
-      }
-    }
-  }
-
-  /**
-   * Writes the message of the event line by line, each line prefixed with
-   * "ftp> ". Does nothing when info logging is disabled.
-   */
-  private void logLines(ProtocolCommandEvent event) throws IOException {
-    if (talkLog.isInfoEnabled()) {
-      StringReader message = new StringReader(event.getMessage());
-      BufferedReader lines = new BufferedReader(message);
-      for (String current = lines.readLine(); current != null; current = lines
-          .readLine()) {
-        talkLog.info("ftp> " + current);
-      }
-    }
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html b/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html
deleted file mode 100644
index d936930..0000000
--- a/src/plugin/protocol-ftp/src/java/org/apache/nutch/protocol/ftp/package.html
+++ /dev/null
@@ -1,5 +0,0 @@
-<html>
-<body>
-<p>Protocol plugin which supports retrieving documents via the ftp protocol.</p><p></p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/build.xml b/src/plugin/protocol-htmlunit/build.xml
deleted file mode 100644
index 899214c..0000000
--- a/src/plugin/protocol-htmlunit/build.xml
+++ /dev/null
@@ -1,37 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-htmlunit" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Build compilation dependencies -->
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-http"/>
-    <ant target="jar" inheritall="false" dir="../lib-htmlunit"/>
-  </target>
-
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-http/*.jar" />
-      <include name="**/lib-htmlunit/*.jar" />
-    </fileset>
-    <pathelement location="${build.dir}/test/conf"/>
-  </path>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/ivy.xml b/src/plugin/protocol-htmlunit/ivy.xml
deleted file mode 100644
index 8aa78d2..0000000
--- a/src/plugin/protocol-htmlunit/ivy.xml
+++ /dev/null
@@ -1,38 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/plugin.xml b/src/plugin/protocol-htmlunit/plugin.xml
deleted file mode 100644
index 36bcb80..0000000
--- a/src/plugin/protocol-htmlunit/plugin.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="protocol-htmlunit"
-   name="HtmlUnit Protocol Plug-in"
-   version="1.0.0"
-   provider-name="nutch.apache.org">
-
-  <runtime>
-    <library name="protocol-htmlunit.jar">
-      <export name="*"/>
-    </library>
-  </runtime>
-
-  <requires>
-    <import plugin="nutch-extensionpoints"/>
-    <import plugin="lib-http"/>
-    <import plugin="lib-htmlunit"/>
-  </requires>
-
-  <extension id="org.apache.nutch.protocol.http"
-             name="HttpProtocol"
-             point="org.apache.nutch.protocol.Protocol">
-
-    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
-                    class="org.apache.nutch.protocol.htmlunit.Http">
-      <parameter name="protocolName" value="http"/>
-    </implementation>
-      
-    <implementation id="org.apache.nutch.protocol.htmlunit.Http"
-                    class="org.apache.nutch.protocol.htmlunit.Http">
-      <parameter name="protocolName" value="https"/>
-    </implementation>
-
-   </extension>
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
deleted file mode 100644
index c40ed69..0000000
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/Http.java
+++ /dev/null
@@ -1,63 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.htmlunit;
-
-import java.io.IOException;
-import java.net.URL;
-
-import org.apache.hadoop.conf.Configuration;
-
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.util.NutchConfiguration;
-
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-public class Http extends HttpBase {
-
-  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
-
-  /**
-   * Default constructor.
-   */
-  public Http() {
-    super(LOG);
-  }
-
-  /**
-   * Set the {@link org.apache.hadoop.conf.Configuration} object.
-   * 
-   * @param conf
-   */
-  public void setConf(Configuration conf) {
-    super.setConf(conf);
-  }
-
-  public static void main(String[] args) throws Exception {
-    Http http = new Http();
-    http.setConf(NutchConfiguration.create());
-    main(http, args);
-  }
-  
-  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
-      throws ProtocolException, IOException {
-    return new HttpResponse(this, url, datum);
-  }
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
deleted file mode 100644
index 8b1a031..0000000
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/HttpResponse.java
+++ /dev/null
@@ -1,573 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- * <p/>
- * http://www.apache.org/licenses/LICENSE-2.0
- * <p/>
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.htmlunit;
-
-import java.io.BufferedInputStream;
-import java.io.ByteArrayOutputStream;
-import java.io.EOFException;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.io.PushbackInputStream;
-import java.net.InetSocketAddress;
-import java.net.Socket;
-import java.net.URL;
-import java.util.Arrays;
-import java.util.HashSet;
-import java.util.Set;
-
-import javax.net.ssl.SSLSocket;
-import javax.net.ssl.SSLSocketFactory;
-
-import org.apache.hadoop.conf.Configuration;
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.metadata.Metadata;
-import org.apache.nutch.metadata.SpellCheckedMetadata;
-import org.apache.nutch.net.protocols.HttpDateFormat;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.protocol.http.api.HttpException;
-
-/**
- * An HTTP response.
- */
-public class HttpResponse implements Response {
-
-  private Configuration conf;
-  private HttpBase http;
-  private URL url;
-  private String orig;
-  private String base;
-  private byte[] content;
-  private int code;
-  private Metadata headers = new SpellCheckedMetadata();
-  // used for storing the http headers verbatim
-  private StringBuffer httpHeaders;
-
-  protected enum Scheme {
-    HTTP, HTTPS,
-  }
-
-  /**
-   * Default public constructor.
-   *
-   * @param http
-   * @param url
-   * @param datum
-   * @throws ProtocolException
-   * @throws IOException
-   */
-  public HttpResponse(HttpBase http, URL url, CrawlDatum datum)
-      throws ProtocolException, IOException {
-
-    this.http = http;
-    this.url = url;
-    this.orig = url.toString();
-    this.base = url.toString();
-
-    Scheme scheme = null;
-
-    if ("http".equals(url.getProtocol())) {
-      scheme = Scheme.HTTP;
-    } else if ("https".equals(url.getProtocol())) {
-      scheme = Scheme.HTTPS;
-    } else {
-      throw new HttpException("Unknown scheme (not http/https) for url:" + url);
-    }
-
-    if (Http.LOG.isTraceEnabled()) {
-      Http.LOG.trace("fetching " + url);
-    }
-
-    String path = "".equals(url.getFile()) ? "/" : url.getFile();
-
-    // some servers will redirect a request with a host line like
-    // "Host: <hostname>:80" to "http://<hostname>/<orig_path>" - they
-    // don't want the :80...
-
-    String host = url.getHost();
-    int port;
-    String portString;
-    if (url.getPort() == -1) {
-      if (scheme == Scheme.HTTP) {
-        port = 80;
-      } else {
-        port = 443;
-      }
-      portString = "";
-    } else {
-      port = url.getPort();
-      portString = ":" + port;
-    }
-    Socket socket = null;
-
-    try {
-      socket = new Socket(); // create the socket
-      socket.setSoTimeout(http.getTimeout());
-
-      // connect
-      String sockHost = http.useProxy(url) ? http.getProxyHost() : host;
-      int sockPort = http.useProxy(url) ? http.getProxyPort() : port;
-      InetSocketAddress sockAddr = new InetSocketAddress(sockHost, sockPort);
-      socket.connect(sockAddr, http.getTimeout());
-
-      if (scheme == Scheme.HTTPS) {
-        SSLSocketFactory factory = (SSLSocketFactory) SSLSocketFactory
-            .getDefault();
-        SSLSocket sslsocket = (SSLSocket) factory
-            .createSocket(socket, sockHost, sockPort, true);
-        sslsocket.setUseClientMode(true);
-
-        // Get the protocols and ciphers supported by this JVM
-        Set<String> protocols = new HashSet<String>(
-            Arrays.asList(sslsocket.getSupportedProtocols()));
-        Set<String> ciphers = new HashSet<String>(
-            Arrays.asList(sslsocket.getSupportedCipherSuites()));
-
-        // Intersect with preferred protocols and ciphers
-        protocols.retainAll(http.getTlsPreferredProtocols());
-        ciphers.retainAll(http.getTlsPreferredCipherSuites());
-
-        sslsocket.setEnabledProtocols(
-            protocols.toArray(new String[protocols.size()]));
-        sslsocket.setEnabledCipherSuites(
-            ciphers.toArray(new String[ciphers.size()]));
-
-        sslsocket.startHandshake();
-        socket = sslsocket;
-      }
-
-      this.conf = http.getConf();
-      if (sockAddr != null
-          && conf.getBoolean("store.ip.address", false) == true) {
-        headers.add("_ip_", sockAddr.getAddress().getHostAddress());
-      }
-
-      // make request
-      OutputStream req = socket.getOutputStream();
-
-      StringBuffer reqStr = new StringBuffer("GET ");
-      if (http.useProxy(url)) {
-        reqStr.append(url.getProtocol() + "://" + host + portString + path);
-      } else {
-        reqStr.append(path);
-      }
-
-      reqStr.append(" HTTP/1.0\r\n");
-
-      reqStr.append("Host: ");
-      reqStr.append(host);
-      reqStr.append(portString);
-      reqStr.append("\r\n");
-
-      reqStr.append("Accept-Encoding: x-gzip, gzip, deflate\r\n");
-
-      String userAgent = http.getUserAgent();
-      if ((userAgent == null) || (userAgent.length() == 0)) {
-        if (Http.LOG.isErrorEnabled()) {
-          Http.LOG.error("User-agent is not set!");
-        }
-      } else {
-        reqStr.append("User-Agent: ");
-        reqStr.append(userAgent);
-        reqStr.append("\r\n");
-      }
-
-      reqStr.append("Accept-Language: ");
-      reqStr.append(this.http.getAcceptLanguage());
-      reqStr.append("\r\n");
-
-      reqStr.append("Accept: ");
-      reqStr.append(this.http.getAccept());
-      reqStr.append("\r\n");
-
-      if (http.isIfModifiedSinceEnabled() && datum.getModifiedTime() > 0) {
-        reqStr.append("If-Modified-Since: " + HttpDateFormat
-            .toString(datum.getModifiedTime()));
-        reqStr.append("\r\n");
-      }
-      reqStr.append("\r\n");
-
-      // store the request in the metadata?
-      if (conf.getBoolean("store.http.request", false) == true) {
-        headers.add("_request_", reqStr.toString());
-      }
-
-      byte[] reqBytes = reqStr.toString().getBytes();
-
-      req.write(reqBytes);
-      req.flush();
-
-      PushbackInputStream in = // process response
-          new PushbackInputStream(
-              new BufferedInputStream(socket.getInputStream(),
-                  Http.BUFFER_SIZE), Http.BUFFER_SIZE);
-
-      StringBuffer line = new StringBuffer();
-
-      // store the http headers verbatim
-      if (conf.getBoolean("store.http.headers", false) == true) {
-        httpHeaders = new StringBuffer();
-      }
-
-      headers.add("nutch.fetch.time", Long.toString(System.currentTimeMillis()));
-
-      boolean haveSeenNonContinueStatus = false;
-      while (!haveSeenNonContinueStatus) {
-        // parse status code line
-        this.code = parseStatusLine(in, line);
-        if (httpHeaders != null)
-          httpHeaders.append(line).append("\n");
-        // parse headers
-        parseHeaders(in, line, httpHeaders);
-        haveSeenNonContinueStatus = code != 100; // 100 is "Continue"
-      }
-
-      // Get Content type header
-      String contentType = getHeader(Response.CONTENT_TYPE);
-
-      // handle with HtmlUnit only if the content type is HTML or XHTML
-      if (contentType != null) {
-        if (contentType.contains("text/html") || contentType.contains("application/xhtml")) {
-          readContentFromHtmlUnit(url);
-        } else {
-          String transferEncoding = getHeader(Response.TRANSFER_ENCODING);
-          if (transferEncoding != null && "chunked"
-              .equalsIgnoreCase(transferEncoding.trim())) {
-            readChunkedContent(in, line);
-          } else {
-            readPlainContent(in);
-          }
-
-          String contentEncoding = getHeader(Response.CONTENT_ENCODING);
-          if ("gzip".equals(contentEncoding) || "x-gzip".equals(contentEncoding)) {
-            content = http.processGzipEncoded(content, url);
-          } else if ("deflate".equals(contentEncoding)) {
-            content = http.processDeflateEncoded(content, url);
-          } else {
-            // store the headers verbatim only if the response was not compressed
-            // as the content length reported will not match otherwise
-            if (httpHeaders != null) {
-              headers.add("_response.headers_", httpHeaders.toString());
-            }
-            if (Http.LOG.isTraceEnabled()) {
-              Http.LOG.trace("fetched " + content.length + " bytes from " + url);
-            }
-          }
-        }
-      }
-
-    } finally {
-      if (socket != null)
-        socket.close();
-    }
-
-  }
-
-  /*
-   * ------------------------- * <implementation:Response> *
-   * -------------------------
-   */
-
-  public URL getUrl() {
-    return url;
-  }
-
-  public int getCode() {
-    return code;
-  }
-
-  public String getHeader(String name) {
-    return headers.get(name);
-  }
-
-  public Metadata getHeaders() {
-    return headers;
-  }
-
-  public byte[] getContent() {
-    return content;
-  }
-
-  /*
-   * ------------------------- * </implementation:Response> *
-   * -------------------------
-   */
-
-  private void readContentFromHtmlUnit(URL url) throws IOException {
-    String page = HtmlUnitWebDriver.getHtmlPage(url.toString(), conf);
-    content = page.getBytes("UTF-8");
-  }
-  
-  private void readPlainContent(InputStream in)
-      throws HttpException, IOException {
-
-    int contentLength = Integer.MAX_VALUE; // get content length
-    String contentLengthString = headers.get(Response.CONTENT_LENGTH);
-    if (contentLengthString != null) {
-      contentLengthString = contentLengthString.trim();
-      try {
-        if (!contentLengthString.isEmpty())
-          contentLength = Integer.parseInt(contentLengthString);
-      } catch (NumberFormatException e) {
-        throw new HttpException("bad content length: " + contentLengthString);
-      }
-    }
-    if (http.getMaxContent() >= 0 && contentLength > http
-        .getMaxContent()) // limit
-      // download
-      // size
-      contentLength = http.getMaxContent();
-
-    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
-    byte[] bytes = new byte[Http.BUFFER_SIZE];
-    int length = 0;
-
-    // do not try to read if the contentLength is 0
-    if (contentLength == 0) {
-      content = new byte[0];
-      return;
-    }
-
-    // read content
-    int i = in.read(bytes);
-    while (i != -1) {
-      out.write(bytes, 0, i);
-      length += i;
-      if (length >= contentLength) {
-        break;
-      }
-      if ((length + Http.BUFFER_SIZE) > contentLength) {
-        // reading next chunk may hit contentLength,
-        // must limit number of bytes read
-        i = in.read(bytes, 0, (contentLength - length));
-      } else {
-        i = in.read(bytes);
-      }
-    }
-    content = out.toByteArray();
-  }
-
-  /**
-   * @param in
-   * @param line
-   * @throws HttpException
-   * @throws IOException
-   */
-  private void readChunkedContent(PushbackInputStream in, StringBuffer line)
-      throws HttpException, IOException {
-    boolean doneChunks = false;
-    int contentBytesRead = 0;
-    byte[] bytes = new byte[Http.BUFFER_SIZE];
-    ByteArrayOutputStream out = new ByteArrayOutputStream(Http.BUFFER_SIZE);
-
-    while (!doneChunks) {
-      if (Http.LOG.isTraceEnabled()) {
-        Http.LOG.trace("Http: starting chunk");
-      }
-
-      readLine(in, line, false);
-
-      String chunkLenStr;
-      // if (LOG.isTraceEnabled()) { LOG.trace("chunk-header: '" + line + "'");
-      // }
-
-      int pos = line.indexOf(";");
-      if (pos < 0) {
-        chunkLenStr = line.toString();
-      } else {
-        chunkLenStr = line.substring(0, pos);
-        // if (LOG.isTraceEnabled()) { LOG.trace("got chunk-ext: " +
-        // line.substring(pos+1)); }
-      }
-      chunkLenStr = chunkLenStr.trim();
-      int chunkLen;
-      try {
-        chunkLen = Integer.parseInt(chunkLenStr, 16);
-      } catch (NumberFormatException e) {
-        throw new HttpException("bad chunk length: " + line.toString());
-      }
-
-      if (chunkLen == 0) {
-        doneChunks = true;
-        break;
-      }
-
-      if (http.getMaxContent() >= 0 && (contentBytesRead + chunkLen) > http
-          .getMaxContent())
-        chunkLen = http.getMaxContent() - contentBytesRead;
-
-      // read one chunk
-      int chunkBytesRead = 0;
-      while (chunkBytesRead < chunkLen) {
-
-        int toRead = (chunkLen - chunkBytesRead) < Http.BUFFER_SIZE ?
-            (chunkLen - chunkBytesRead) :
-            Http.BUFFER_SIZE;
-        int len = in.read(bytes, 0, toRead);
-
-        if (len == -1)
-          throw new HttpException("chunk eof after " + contentBytesRead
-              + " bytes in successful chunks" + " and " + chunkBytesRead
-              + " in current chunk");
-
-        // DANGER!!! Will print GZIPed stuff right to your
-        // terminal!
-        // if (LOG.isTraceEnabled()) { LOG.trace("read: " + new String(bytes, 0,
-        // len)); }
-
-        out.write(bytes, 0, len);
-        chunkBytesRead += len;
-      }
-
-      readLine(in, line, false);
-
-    }
-
-    if (!doneChunks) {
-      if (contentBytesRead != http.getMaxContent())
-        throw new HttpException("chunk eof: !doneChunk && didn't max out");
-      return;
-    }
-
-    content = out.toByteArray();
-    parseHeaders(in, line, null);
-
-  }
-
-  private int parseStatusLine(PushbackInputStream in, StringBuffer line)
-      throws IOException, HttpException {
-    readLine(in, line, false);
-
-    int codeStart = line.indexOf(" ");
-    int codeEnd = line.indexOf(" ", codeStart + 1);
-
-    // handle lines with no plaintext result code, ie:
-    // "HTTP/1.1 200" vs "HTTP/1.1 200 OK"
-    if (codeEnd == -1)
-      codeEnd = line.length();
-
-    int code;
-    try {
-      code = Integer.parseInt(line.substring(codeStart + 1, codeEnd));
-    } catch (NumberFormatException e) {
-      throw new HttpException(
-          "bad status line '" + line + "': " + e.getMessage(), e);
-    }
-
-    return code;
-  }
-
-  private void processHeaderLine(StringBuffer line)
-      throws IOException, HttpException {
-
-    int colonIndex = line.indexOf(":"); // key is up to colon
-    if (colonIndex == -1) {
-      int i;
-      for (i = 0; i < line.length(); i++)
-        if (!Character.isWhitespace(line.charAt(i)))
-          break;
-      if (i == line.length())
-        return;
-      throw new HttpException("No colon in header:" + line);
-    }
-    String key = line.substring(0, colonIndex);
-
-    int valueStart = colonIndex + 1; // skip whitespace
-    while (valueStart < line.length()) {
-      int c = line.charAt(valueStart);
-      if (c != ' ' && c != '\t')
-        break;
-      valueStart++;
-    }
-    String value = line.substring(valueStart);
-    headers.set(key, value);
-  }
-
-  // Adds headers to our headers Metadata
-  private void parseHeaders(PushbackInputStream in, StringBuffer line,
-      StringBuffer httpHeaders) throws IOException, HttpException {
-
-    while (readLine(in, line, true) != 0) {
-
-      if (httpHeaders != null)
-        httpHeaders.append(line).append("\n");
-
-      // handle HTTP responses with missing blank line after headers
-      int pos;
-      if (((pos = line.indexOf("<!DOCTYPE")) != -1) || (
-          (pos = line.indexOf("<HTML")) != -1) || ((pos = line.indexOf("<html"))
-          != -1)) {
-
-        in.unread(line.substring(pos).getBytes("UTF-8"));
-        line.setLength(pos);
-
-        try {
-          // TODO: (CM) We don't know the header names here
-          // since we're just handling them generically. It would
-          // be nice to provide some sort of mapping function here
-          // for the returned header names to the standard metadata
-          // names in the ParseData class
-          processHeaderLine(line);
-        } catch (Exception e) {
-          // fixme:
-          Http.LOG.warn("Error: ", e);
-        }
-        return;
-      }
-
-      processHeaderLine(line);
-    }
-  }
-
-  private static int readLine(PushbackInputStream in, StringBuffer line,
-      boolean allowContinuedLine) throws IOException {
-    line.setLength(0);
-    for (int c = in.read(); c != -1; c = in.read()) {
-      switch (c) {
-      case '\r':
-        if (peek(in) == '\n') {
-          in.read();
-        }
-      case '\n':
-        if (line.length() > 0) {
-          // at EOL -- check for continued line if the current
-          // (possibly continued) line wasn't blank
-          if (allowContinuedLine)
-            switch (peek(in)) {
-            case ' ':
-            case '\t': // line is continued
-              in.read();
-              continue;
-            }
-        }
-        return line.length(); // else complete
-      default:
-        line.append((char) c);
-      }
-    }
-    throw new EOFException();
-  }
-
-  private static int peek(PushbackInputStream in) throws IOException {
-    int value = in.read();
-    in.unread(value);
-    return value;
-  }
-
-}

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html b/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
deleted file mode 100644
index 4181951..0000000
--- a/src/plugin/protocol-htmlunit/src/java/org/apache/nutch/protocol/htmlunit/package.html
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-<html>
-<body>
-<p>Protocol plugin which supports retrieving documents via the HTTP protocol.</p><p></p>
-</body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/build.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/build.xml b/src/plugin/protocol-http/build.xml
deleted file mode 100755
index 30720f1..0000000
--- a/src/plugin/protocol-http/build.xml
+++ /dev/null
@@ -1,50 +0,0 @@
-<?xml version="1.0"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<project name="protocol-http" default="jar-core">
-
-  <import file="../build-plugin.xml"/>
-
-  <!-- Build compilation dependencies -->
-  <target name="deps-jar">
-    <ant target="jar" inheritall="false" dir="../lib-http"/>
-  </target>
-
-  <!-- Add compilation dependencies to classpath -->
-  <path id="plugin.deps">
-    <fileset dir="${nutch.root}/build">
-      <include name="**/lib-http/*.jar" />
-    </fileset>
-    <pathelement location="${build.dir}/test/conf"/>
-  </path>
-
-  <!-- Deploy Unit test dependencies -->
-  <target name="deps-test">
-    <ant target="deploy" inheritall="false" dir="../lib-http"/>
-    <ant target="deploy" inheritall="false" dir="../nutch-extensionpoints"/>
-    <copy toDir="${build.test}">
-      <fileset dir="${src.test}" excludes="**/*.java"/>
-    </copy>
-  </target>
-
-  <!-- for junit test -->
-  <mkdir dir="${build.test}/data" />
-  <copy todir="${build.test}/data">
-      <fileset dir="jsp"/>
-   </copy>
-
-</project>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/ivy.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/ivy.xml b/src/plugin/protocol-http/ivy.xml
deleted file mode 100644
index 1a86d68..0000000
--- a/src/plugin/protocol-http/ivy.xml
+++ /dev/null
@@ -1,41 +0,0 @@
-<?xml version="1.0" ?>
-
-<!--
-   Licensed to the Apache Software Foundation (ASF) under one or more
-   contributor license agreements.  See the NOTICE file distributed with
-   this work for additional information regarding copyright ownership.
-   The ASF licenses this file to You under the Apache License, Version 2.0
-   (the "License"); you may not use this file except in compliance with
-   the License.  You may obtain a copy of the License at
-
-       http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License.
--->
-
-<ivy-module version="1.0">
-  <info organisation="org.apache.nutch" module="${ant.project.name}">
-    <license name="Apache 2.0"/>
-    <ivyauthor name="Apache Nutch Team" url="http://nutch.apache.org"/>
-    <description>
-        Apache Nutch
-    </description>
-  </info>
-
-  <configurations>
-    <include file="../../..//ivy/ivy-configurations.xml"/>
-  </configurations>
-
-  <publications>
-    <!--get the artifact from our module name-->
-    <artifact conf="master"/>
-  </publications>
-
-  <dependencies>
-  </dependencies>
-  
-</ivy-module>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/basic-http.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/jsp/basic-http.jsp b/src/plugin/protocol-http/jsp/basic-http.jsp
deleted file mode 100644
index bf1f8bd..0000000
--- a/src/plugin/protocol-http/jsp/basic-http.jsp
+++ /dev/null
@@ -1,44 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%><%--
-  Example JSP Page to Test Protocol-Http Plugin  
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
-  <head>
-    <base href="<%=basePath%>">
-    
-    <title>HelloWorld</title>
-    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
-    <meta name="Language" content="en" />
-	<meta http-equiv="pragma" content="no-cache">
-	<meta http-equiv="cache-control" content="no-cache">
-	<meta http-equiv="expires" content="0">    
-	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
-	<meta http-equiv="description" content="This is my page">
-	<!--
-	<link rel="stylesheet" type="text/css" href="styles.css">
-	-->
-  </head>
-  
-  <body>
-    Hello World!!! <br>
-  </body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/brokenpage.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/jsp/brokenpage.jsp b/src/plugin/protocol-http/jsp/brokenpage.jsp
deleted file mode 100644
index f3f7c4a..0000000
--- a/src/plugin/protocol-http/jsp/brokenpage.jsp
+++ /dev/null
@@ -1,47 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%><%--
-  Example JSP Page to Test Protocol-Http Plugin
---%>
-
-@ page language="java" import="java.util.*" pageEncoding="UTF-8"
-
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
-  <head>
-    <base href="<%=basePath%>">
-    
-    <title>HelloWorld</title>
-    <meta http-equiv="content-type" content="text/html;charset=utf-8" />
-    <meta name="Language" content="en" />
-	<meta http-equiv="pragma" content="no-cache">
-	<meta http-equiv="cache-control" content="no-cache">
-	<meta http-equiv="expires" content="0">    
-	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
-	<meta http-equiv="description" content="This is my page">
-	<!--
-	<link rel="stylesheet" type="text/css" href="styles.css">
-	-->
-  </head>
-  
-  <body>
-    Hello World!!! <br>
-  </body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/redirect301.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/jsp/redirect301.jsp b/src/plugin/protocol-http/jsp/redirect301.jsp
deleted file mode 100644
index 1100b89..0000000
--- a/src/plugin/protocol-http/jsp/redirect301.jsp
+++ /dev/null
@@ -1,49 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%><%--
-  Example JSP Page to Test Protocol-Http Plugin
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
-  <head>
-    <base href="<%=basePath%>">
-    
-    <title>My JSP page</title>
-    
-	<meta http-equiv="pragma" content="no-cache">
-	<meta http-equiv="cache-control" content="no-cache">
-	<meta http-equiv="expires" content="0">    
-	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
-	<meta http-equiv="description" content="This is my page">
-	<!--
-	<link rel="stylesheet" type="text/css" href="styles.css">
-	-->
-
-  </head>
-  
-  <body>
-       <%
-	response.setStatus(301);
-	response.setHeader( "Location", "http://nutch.apache.org");
-	response.setHeader( "Connection", "close" );
-		%> 
-    You are redirected by JSP<br>
-  </body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/jsp/redirect302.jsp
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/jsp/redirect302.jsp b/src/plugin/protocol-http/jsp/redirect302.jsp
deleted file mode 100644
index 8a250d9..0000000
--- a/src/plugin/protocol-http/jsp/redirect302.jsp
+++ /dev/null
@@ -1,49 +0,0 @@
-<%--
-  Licensed to the Apache Software Foundation (ASF) under one or more
-  contributor license agreements.  See the NOTICE file distributed with
-  this work for additional information regarding copyright ownership.
-  The ASF licenses this file to You under the Apache License, Version 2.0
-  (the "License"); you may not use this file except in compliance with
-  the License.  You may obtain a copy of the License at
-  
-  http://www.apache.org/licenses/LICENSE-2.0
-  
-  Unless required by applicable law or agreed to in writing, software
-  distributed under the License is distributed on an "AS IS" BASIS,
-  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-  See the License for the specific language governing permissions and
-  limitations under the License.
---%><%--
-  Example JSP Page to Test Protocol-Http Plugin 
---%><%@ page language="java" import="java.util.*" pageEncoding="UTF-8"%><%
-String path = request.getContextPath();
-String basePath = request.getScheme()+"://"+request.getServerName()+":"+request.getServerPort()+path+"/";
-%>
-
-<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN">
-<html>
-  <head>
-    <base href="<%=basePath%>">
-    
-    <title>My JSP page</title>
-    
-	<meta http-equiv="pragma" content="no-cache">
-	<meta http-equiv="cache-control" content="no-cache">
-	<meta http-equiv="expires" content="0">    
-	<meta http-equiv="keywords" content="keyword1,keyword2,keyword3">
-	<meta http-equiv="description" content="This is my page">
-	<!--
-	<link rel="stylesheet" type="text/css" href="styles.css">
-	-->
-
-  </head>
-  
-  <body>
-       <%
-	response.setStatus(302);
-	response.setHeader( "Location", "http://nutch.apache.org");
-	response.setHeader( "Connection", "close" );
-		%> 
-    You are sucessfully redirected by JSP<br>
-  </body>
-</html>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/plugin.xml
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/plugin.xml b/src/plugin/protocol-http/plugin.xml
deleted file mode 100755
index 8770b10..0000000
--- a/src/plugin/protocol-http/plugin.xml
+++ /dev/null
@@ -1,51 +0,0 @@
-<?xml version="1.0" encoding="UTF-8"?>
-<!--
- Licensed to the Apache Software Foundation (ASF) under one or more
- contributor license agreements.  See the NOTICE file distributed with
- this work for additional information regarding copyright ownership.
- The ASF licenses this file to You under the Apache License, Version 2.0
- (the "License"); you may not use this file except in compliance with
- the License.  You may obtain a copy of the License at
-
-     http://www.apache.org/licenses/LICENSE-2.0
-
- Unless required by applicable law or agreed to in writing, software
- distributed under the License is distributed on an "AS IS" BASIS,
- WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- See the License for the specific language governing permissions and
- limitations under the License.
--->
-<plugin
-   id="protocol-http"
-   name="Http Protocol Plug-in"
-   version="1.0.0"
-   provider-name="nutch.org">
-
-   <runtime>
-      <library name="protocol-http.jar">
-         <export name="*"/>
-      </library>
-   </runtime>
-
-   <requires>
-      <import plugin="nutch-extensionpoints"/>
-      <import plugin="lib-http"/>
-   </requires>
-
-   <extension id="org.apache.nutch.protocol.http"
-              name="HttpProtocol"
-              point="org.apache.nutch.protocol.Protocol">
-
-      <implementation id="org.apache.nutch.protocol.http.Http"
-                      class="org.apache.nutch.protocol.http.Http">
-        <parameter name="protocolName" value="http"/>
-      </implementation>
-      
-      <implementation id="org.apache.nutch.protocol.http.Http"
-                       class="org.apache.nutch.protocol.http.Http">
-           <parameter name="protocolName" value="https"/>
-      </implementation>
-
-   </extension>
-
-</plugin>

http://git-wip-us.apache.org/repos/asf/nutch/blob/ffa16784/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
----------------------------------------------------------------------
diff --git a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java b/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
deleted file mode 100755
index 56f9f4f..0000000
--- a/src/plugin/protocol-http/src/java/org/apache/nutch/protocol/http/Http.java
+++ /dev/null
@@ -1,73 +0,0 @@
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.nutch.protocol.http;
-
-// JDK imports
-import java.io.IOException;
-import java.net.URL;
-
-// SLF4J logging imports
-import org.slf4j.Logger;
-import org.slf4j.LoggerFactory;
-
-// Hadoop imports
-import org.apache.hadoop.conf.Configuration;
-
-// Nutch imports
-import org.apache.nutch.crawl.CrawlDatum;
-import org.apache.nutch.net.protocols.Response;
-import org.apache.nutch.protocol.ProtocolException;
-import org.apache.nutch.protocol.http.api.HttpBase;
-import org.apache.nutch.util.NutchConfiguration;
-
-public class Http extends HttpBase {
-
-  public static final Logger LOG = LoggerFactory.getLogger(Http.class);
-
-  /**
-   * Public default constructor; hands this class's {@link #LOG} to the superclass.
-   */
-  public Http() {
-    super(LOG);
-  }
-
-  /**
-   * Set the {@link org.apache.hadoop.conf.Configuration} object.
-   * Only delegates to the superclass; the verbose log-level code below is disabled.
-   * @param conf configuration passed unchanged to {@code super.setConf}
-   */
-  public void setConf(Configuration conf) {
-    super.setConf(conf);
-    // Level logLevel = Level.WARNING;
-    // if (conf.getBoolean("http.verbose", false)) {
-    // logLevel = Level.FINE;
-    // }
-    // LOG.setLevel(logLevel);
-  }
-
-  public static void main(String[] args) throws Exception {
-    Http http = new Http();
-    http.setConf(NutchConfiguration.create());
-    main(http, args); // two-argument main is not declared here; presumably inherited from HttpBase (confirm)
-  }
-
-  protected Response getResponse(URL url, CrawlDatum datum, boolean redirect)
-      throws ProtocolException, IOException {
-    return new HttpResponse(this, url, datum); // the redirect flag is not used by this implementation
-  }
-
-}